In [1]:
from pathlib import Path

import pandas as pd

# Directory holding the two raw CSV exports. Defined once so the notebook can
# be pointed at another copy of the data by editing a single line instead of
# every read_csv call.
DATA_DIR = Path(r"C:\Users\early\Downloads")

df_shared = pd.read_csv(DATA_DIR / "shared_articles.csv")
df_user = pd.read_csv(DATA_DIR / "users_interactions.csv")

# NOTE(review): the rendered head shows an 'Unnamed: 0' column, which looks
# like a previously saved index; consider index_col=0 if it is not needed.
df_user.head()

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,


In [2]:
# The timestamp column is dropped from both frames before any filtering.
df_articles = df_shared.drop(columns=['timestamp'])

# Keep only the listed event types, then collapse repeated (personId, contentId)
# pairs to a single row, keeping the last occurrence in frame order.
# NOTE(review): if this is the public Deskdrop export, comment events may be
# labelled 'COMMENT CREATED', in which case 'COMMENT' matches nothing --
# confirm with df_user['eventType'].unique().
df_interactions = (
    df_user
    .drop(columns=['timestamp'])
    .loc[lambda events: events['eventType'].isin(['VIEW', 'LIKE', 'BOOKMARK', 'FOLLOW', 'COMMENT'])]
    .drop_duplicates(subset=['personId', 'contentId'], keep='last')
)

# Retain users that have at least 3 remaining (deduplicated) interactions.
user_activity = df_interactions.groupby('personId').size()
active_users = user_activity.loc[user_activity >= 3].index
df_interactions = df_interactions.loc[df_interactions['personId'].isin(active_users)]

# Retain articles with at least 9 interactions among those users. This filter
# is applied after the user filter and the user filter is not re-applied.
value_counts = df_interactions['contentId'].value_counts()
keep_list = value_counts.loc[value_counts >= 9]
df_interactions_reduced = df_interactions.loc[df_interactions['contentId'].isin(keep_list.index)]


In [3]:
# These figures describe df_interactions (after the active-user filter), not
# df_interactions_reduced, which is what the similarity matrix is built from.
print(f"Number of interactions: {len(df_interactions)}")
print(f"Number of articles: {df_interactions.contentId.nunique()}")
print(f"Number of users: {df_interactions.personId.nunique()}")

# Sparsity = share of (article, user) cells that have no recorded interaction.
observed_cells = len(df_interactions)
possible_cells = df_interactions.contentId.nunique() * df_interactions.personId.nunique()
sparsity = 1 - observed_cells / possible_cells
print(f"Sparsity: {sparsity:.4f}")

Number of interactions: 39995
Number of articles: 2985
Number of users: 1400
Sparsity: 0.9904


In [4]:
# Five-number summary of how many interactions each article received
# (computed on df_interactions, i.e. before the per-article threshold).
interactions_per_article = df_interactions['contentId'].value_counts()

print('Min:\t\t', interactions_per_article.min())
print('Quartile 1:\t', interactions_per_article.quantile(0.25))
print('Median:\t\t', interactions_per_article.quantile(0.5))
print('Quartile 3:\t', interactions_per_article.quantile(0.75))
print('Max:\t\t', interactions_per_article.max())


Min:		 1
Quartile 1:	 4.0
Median:		 9.0
Quartile 3:	 17.0
Max:		 237


In [5]:
# Five-number summary of how many interactions each user has in df_interactions.
interactions_per_user = df_interactions['personId'].value_counts()

print('Min:\t\t', interactions_per_user.min())
print('Quartile 1:\t', interactions_per_user.quantile(0.25))
print('Median:\t\t', interactions_per_user.quantile(0.5))
print('Quartile 3:\t', interactions_per_user.quantile(0.75))
print('Max:\t\t', interactions_per_user.max())


Min:		 3
Quartile 1:	 5.0
Median:		 11.0
Quartile 3:	 27.0
Max:		 961


In [6]:
from scipy.sparse import csr_matrix
import numpy as np

# Matrix dimensions: U distinct users (columns) and I distinct articles (rows).
U = df_interactions_reduced['personId'].nunique()
I = df_interactions_reduced['contentId'].nunique()

# np.unique returns the IDs sorted, which fixes the row/column ordering.
unique_person_ids = np.unique(df_interactions_reduced['personId'])
unique_content_ids = np.unique(df_interactions_reduced['contentId'])

# Forward mappers: raw ID -> matrix position.
user_mapper = {person_id: col for col, person_id in zip(range(U), unique_person_ids)}
item_mapper = {content_id: row for row, content_id in zip(range(I), unique_content_ids)}

# Inverse mappers: matrix position -> raw ID.
user_inv_mapper = {col: person_id for col, person_id in zip(range(U), unique_person_ids)}
item_inv_mapper = {row: content_id for row, content_id in zip(range(I), unique_content_ids)}

# Matrix coordinates of every interaction row, in frame order.
user_index = [user_mapper[person_id] for person_id in df_interactions_reduced['personId']]
item_index = [item_mapper[content_id] for content_id in df_interactions_reduced['contentId']]

# Item-by-user matrix with a 1.0 for every recorded interaction.
interaction_flags = np.ones(len(user_index))
X = csr_matrix((interaction_flags, (item_index, user_index)), shape=(I, U))


In [7]:
from sklearn.neighbors import NearestNeighbors

def recommend(itemId, X, item_mapper, item_inv_mapper, k, metric='cosine', messages=True):
    """Return the k items whose rows in X are nearest to the row of itemId.

    Parameters
    ----------
    itemId : hashable
        Raw item identifier; looked up in ``item_mapper``.
    X : matrix with one row per item
        Fitted as-is by ``NearestNeighbors`` (brute-force search).
    item_mapper : mapping
        Raw item ID -> row position in ``X``.
    item_inv_mapper : mapping
        Row position in ``X`` -> raw item ID.
    k : int
        Number of recommendations wanted. Fewer are returned when ``X`` has
        fewer than ``k + 1`` rows.
    metric : str, default 'cosine'
        Distance metric forwarded to ``NearestNeighbors``.
    messages : bool, default True
        When true, print the recommended indices, IDs and distances.

    Returns
    -------
    (rec_ids, rec_distances)
        ``rec_ids`` is a list of raw item IDs, nearest first;
        ``rec_distances`` is the matching array of distances.

    Raises
    ------
    KeyError
        If ``itemId`` is missing from ``item_mapper`` (plain lookup).
    """
    item = item_mapper[itemId]
    item_vector = X[item]

    # Ask for one extra neighbour because the query item is itself a row of X,
    # but never more than the rows available (NearestNeighbors rejects that).
    n_query = min(k + 1, X.shape[0])

    # The index is re-fitted on every call; fine for a demo-sized matrix.
    knn = NearestNeighbors(n_neighbors=n_query, algorithm="brute", metric=metric).fit(X)
    distances, indices = knn.kneighbors(item_vector.reshape(1, -1), return_distance=True)
    distances, indices = distances[0], indices[0]

    # Remove the query item by index rather than assuming it is in position 0:
    # another item with an identical row ties at distance 0 and may sort first.
    not_query = indices != item
    rec_indices = indices[not_query][:k]
    rec_distances = distances[not_query][:k]

    rec_ids = [item_inv_mapper[idx] for idx in rec_indices]

    if messages:
        print(f'List of recommended item indices:\n{rec_indices}\n')
        print(f'List of recommended contentIds:\n{rec_ids}\n')
        print(f'Similarity distances:\n{rec_distances}\n')

    return rec_ids, rec_distances


In [8]:
# Index the article metadata by contentId so titles can be fetched with .loc.
# Guarded so the cell can be re-run: once 'contentId' is the index it is no
# longer a column, and an unconditional set_index would raise KeyError.
if df_articles.index.name != 'contentId':
    df_articles = df_articles.set_index('contentId')

# Use the first article in the reduced interaction set as the example query.
article_id = df_interactions_reduced['contentId'].iloc[0]

rec_ids, rec_distances = recommend(article_id, X, item_mapper, item_inv_mapper, k=5)

# NOTE(review): assumes article_id is present (once) in df_articles; a missing
# ID raises KeyError here and a repeated ID would return a Series, not a string.
original_title = df_articles.loc[article_id, 'title']
print(f"If you like article ID {article_id} ({original_title}), you may also enjoy:\n")

# Only recommendations that have metadata can be shown with a title.
valid_rec_ids = [rid for rid in rec_ids if rid in df_articles.index]

print(df_articles.loc[valid_rec_ids][['title']])


List of recommended item indices:
[1234  612  648  115  107]

List of recommended contentIds:
[np.int64(5688279681867464747), np.int64(-1625217607550912638), np.int64(-1196068832249300490), np.int64(-8018088591294066196), np.int64(-8142426490949346803)]

Similarity distances:
[0.49748109 0.5545646  0.59798487 0.61509982 0.65573481]

If you like article ID 3460026829794173084 (TAM recebe primeira aeronave com o design e pintura da LATAM e avança na implantação da nova marca!), you may also enjoy:

                                                                  title
contentId                                                              
 5688279681867464747  Margaret Gould Stewart: How giant websites des...
-1625217607550912638  Google made a huge change to the way Google Ma...
-1196068832249300490                  The 'impossible' is now a reality
-8018088591294066196      LOGOBR School | Behance Reviews Campinas 2016
-8142426490949346803  A identidade visual de Budapeste para as Oli

In [9]:
import pickle

# Persist the three objects recommend() takes besides the query: the item-user
# matrix and both item ID mappers, stored together as one tuple.
model_artifacts = (X, item_mapper, item_inv_mapper)
with open('collaborative_model.sav', 'wb') as model_file:
    pickle.dump(model_artifacts, model_file)

In [10]:
# Scratch cell: a bare literal that only echoes itself. The same value is
# assigned to user_id in the following cell, so this cell can be deleted.
-1032019229384696495

-1032019229384696495

In [11]:
# personId whose interaction history seeds this example.
user_id = -1032019229384696495

# Every article this person interacted with in the reduced interaction set.
is_target_user = df_interactions_reduced['personId'] == user_id
user_articles = df_interactions_reduced.loc[is_target_user, 'contentId'].tolist()

print(f"User {user_id} has interacted with {len(user_articles)} articles.")
print("Recommending based on the first article they interacted with...\n")

# Seed with the first article in the list; other entries can be tried as well.
article_id = user_articles[0]

# Nearest neighbours of the seed article in the item-user matrix.
rec_ids, rec_distances = recommend(article_id, X, item_mapper, item_inv_mapper, k=5)

# Title lookup relies on df_articles already being indexed by contentId.
original_title = df_articles.loc[article_id, 'title']
print(f"If user {user_id} liked article ID {article_id} ({original_title}), they may also enjoy:\n")

# Show titles only for recommendations that have article metadata.
known_content_ids = df_articles.index
valid_rec_ids = [rec_id for rec_id in rec_ids if rec_id in known_content_ids]
print(df_articles.loc[valid_rec_ids, ['title']])


User -1032019229384696495 has interacted with 431 articles.
Recommending based on the first article they interacted with...

List of recommended item indices:
[1493 1191  717  452 1014]

List of recommended contentIds:
[np.int64(8890720798209849691), np.int64(5206308811707978799), np.int64(-447851796385928420), np.int64(-3750879736572068916), np.int64(3149164017776669829)]

Similarity distances:
[0.59490425 0.60196884 0.61256749 0.61710793 0.61885551]

If user -1032019229384696495 liked article ID -8518096793350810174 (Microsoft adquire LinkedIn por US$ 26,2 bilhões), they may also enjoy:

                                                                  title
contentId                                                              
 8890720798209849691                     Top 10 Intranet Trends of 2016
 5206308811707978799                                   The rise of APIs
-447851796385928420   MOBILE TIME - FlyHelo: app permite compartilha...
-3750879736572068916  No, Apple isn't the n

In [12]:
# One row per recommendation produced by the most recent recommend() call,
# tagged with the article that was used as the query.
# NOTE: the 'similarity' column holds the distances returned by recommend()
# (smaller means closer), not a similarity score.
n_recommendations = len(rec_ids)
recommendation_columns = {
    'input_contentId': [article_id for _ in range(n_recommendations)],
    'recommended_contentId': rec_ids,
    'similarity': rec_distances,
}
recommendations_df = pd.DataFrame(recommendation_columns)

recommendations_df.to_csv('collaborative_recommendations.csv', index=False)