## Content recommendation application

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.decomposition import PCA

from surprise import Dataset, Reader
from surprise.prediction_algorithms import SVD
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import implicit
from scipy.sparse import coo_matrix, csr_matrix

from helpers import *

  from .autonotebook import tqdm as notebook_tqdm


### Dataset: News Portal User Interactions by Globo.com

In [2]:
# Article metadata (one row per article) and the small click-log sample shipped with the dataset.
# Both paths are relative to the notebook's working directory.
metadata = pd.read_csv('news-portal-user-interactions-by-globocom/articles_metadata.csv')
clicks_sample = pd.read_csv('news-portal-user-interactions-by-globocom/clicks_sample.csv')

The dataset contains a sample of user interactions (page views) on the G1 news portal from Oct. 1 to 16, 2017. It includes about 3 million clicks, distributed across more than 1 million sessions from 314,000 users who read more than 46,000 different news articles during that period.

It is composed of three files/folders:

- clicks.zip - Folder with CSV files (one per hour), containing user sessions interactions in the news portal.
- articles_metadata.csv - CSV file with metadata information about all (364047) published articles
- articles_embeddings.pickle - Pickle (Python 3) of a NumPy matrix containing the Article Content Embeddings (250-dimensional vectors), trained on the articles' text and metadata by CHAMELEON's ACR module (see paper for details) for the 364047 published articles.

#### Metadata

In [3]:
metadata.head()

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,1513144419000,0,168
1,1,1,1405341936000,0,189
2,2,1,1408667706000,0,250
3,3,1,1408468313000,0,230
4,4,1,1407071171000,0,162


In [22]:
metadata.describe()

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
count,364047.0,364047.0,364047.0,364047.0,364047.0
mean,182023.0,283.108239,1474070000000.0,0.0,190.897727
std,105091.461061,136.72347,42930380000.0,0.0,59.502766
min,0.0,0.0,1159356000000.0,0.0,0.0
25%,91011.5,199.0,1444925000000.0,0.0,159.0
50%,182023.0,301.0,1489422000000.0,0.0,186.0
75%,273034.5,399.0,1509891000000.0,0.0,218.0
max,364046.0,460.0,1520943000000.0,0.0,6690.0


In [23]:
print("Unique articles :", metadata.article_id.nunique())
print("Unique categories :", metadata.category_id.nunique())

Unique articles : 364047
Unique categories : 461


#### Clicks

In [7]:
clicks_sample.head()

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2
1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2
2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2
3,1,1506825426267738,1506825426000,2,96663,1506827047951,4,1,17,1,16,2
4,2,1506825435299739,1506825435000,2,119592,1506827090575,4,1,17,1,24,2


In [13]:
clicks_sample.describe()

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,1883.0,1883.0,1883.0,1883.0,1883.0,1883.0,1883.0,1883.0,1883.0,1883.0,1883.0,1883.0
mean,355.893787,1506828000000000.0,1506828000000.0,3.459904,176717.848646,1506830000000.0,3.917153,2.305895,12.113648,1.491768,18.774827,1.764206
std,206.162865,867962800.0,867946.8,3.037467,82324.177259,10649380.0,0.410461,1.062301,7.825735,2.007772,7.0834,1.225679
min,0.0,1506825000000000.0,1506825000000.0,2.0,2137.0,1506827000000.0,1.0,1.0,2.0,1.0,1.0,1.0
25%,181.5,1506827000000000.0,1506827000000.0,2.0,108854.0,1506828000000.0,4.0,1.0,2.0,1.0,13.0,1.0
50%,353.0,1506828000000000.0,1506828000000.0,3.0,157541.0,1506828000000.0,4.0,3.0,17.0,1.0,21.0,1.0
75%,537.0,1506828000000000.0,1506828000000.0,4.0,236697.5,1506829000000.0,4.0,3.0,19.0,1.0,25.0,2.0
max,706.0,1506829000000000.0,1506829000000.0,24.0,363291.0,1506998000000.0,4.0,4.0,20.0,11.0,28.0,7.0


In [20]:
print("Unique user IDs :", clicks_sample.user_id.nunique())
print("Unique session IDs :", clicks_sample.session_id.nunique())

Unique user IDs : 707
Unique session IDs : 707


### Preprocessing

In [3]:
# Keep only the article columns needed downstream and persist them for reuse.
article_columns = ['article_id', 'category_id', 'words_count']
articles = metadata.loc[:, article_columns]
articles.to_csv('articles.csv', index=False)
articles.tail()

Unnamed: 0,article_id,category_id,words_count
364042,364042,460,144
364043,364043,460,463
364044,364044,460,177
364045,364045,460,126
364046,364046,460,479


In [6]:
clicks = get_click_data("news-portal-user-interactions-by-globocom/clicks")
clicks

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2
1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2
2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2
3,1,1506825426267738,1506825426000,2,96663,1506827047951,4,1,17,1,16,2
4,2,1506825435299739,1506825435000,2,119592,1506827090575,4,1,17,1,24,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2988176,10051,1508211372158328,1508211372000,2,84911,1508211557302,4,3,2,1,25,1
2988177,322896,1508211376302329,1508211376000,2,30760,1508211672520,4,1,17,1,25,2
2988178,322896,1508211376302329,1508211376000,2,157507,1508211702520,4,1,17,1,25,2
2988179,123718,1508211379189330,1508211379000,2,234481,1508211513583,4,3,2,1,25,2


In [7]:
clicks.describe()

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,2988181,2988181,2988181,2988181,2988181,2988181,2988181,2988181,2988181,2988181,2988181,2988181
unique,322897,1048594,646874,72,46033,2983198,3,5,8,11,28,7
top,5890,1507563657895091,1507563657000,2,160974,1506961009961,4,1,17,1,25,2
freq,1232,124,127,1260372,37213,3,2904478,1823162,1738138,2852406,804985,1602601


In [None]:
# One row per user (index = user_id): the list of every article the user clicked,
# in row order, and the categories returned for that list by the `get_cats` helper.
df = clicks.groupby('user_id').agg(LIST_articles=('click_article_id', list))
df['LIST_categories'] = df['LIST_articles'].apply(get_cats)

In [7]:
# Flatten the per-user frame: `user_id` is the groupby index of `df`, so reset_index()
# turns it into an ordinary column while keeping every list on its own user's row.
# Copying df's columns one by one into a fresh frame aligns on index labels instead,
# which is only correct when the user ids happen to be exactly 0..n-1.
df_users = df.reset_index()[['user_id', 'LIST_articles', 'LIST_categories']]

In [8]:
# Persist the per-user click history (lists are written as their string representation).
df_users.to_csv('user_clicks_articles_categories.csv', index=False)
df_users

Unnamed: 0,user_id,LIST_articles,LIST_categories
0,0,"[157541, 68866, 96755, 313996, 160158, 233470,...","[136, 186, 186, 209, 281, 281, 375, 431]"
1,1,"[235840, 96663, 59758, 160474, 285719, 156723,...","[43, 123, 209, 281, 281, 301, 331, 375, 375, 4..."
2,2,"[119592, 30970, 30760, 209122]","[26, 26, 247, 332]"
3,3,"[236065, 236294, 234686, 233769, 235665, 23513...","[249, 281, 375, 375, 375, 375, 375, 375, 375, ..."
4,4,"[48915, 44488, 195887, 195084, 63307, 336499, ...","[81, 92, 132, 317, 317, 399, 437]"
...,...,...,...
322892,322892,"[42567, 39894]","[66, 67]"
322893,322893,"[50644, 36162]","[43, 99]"
322894,322894,"[36162, 168401]","[43, 297]"
322895,322895,"[289197, 63746]","[133, 418]"


### Content-Based

In [127]:
# NOTE: pickle.load can execute arbitrary code; only use it on the trusted dataset file.
with open('news-portal-user-interactions-by-globocom/articles_embeddings.pickle', 'rb') as embeddings_file:
    embeddings_pickle = pickle.load(embeddings_file)

# Wrap the matrix in a DataFrame with one named column per embedding dimension.
embedding_columns = [f"embedding_{i}" for i in range(embeddings_pickle.shape[1])]
embeddings = pd.DataFrame(embeddings_pickle, columns=embedding_columns)
print("Shape of embeddings df: ", embeddings.shape)
embeddings.head()

Shape of embeddings df:  (364047, 250)


Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
0,-0.161183,-0.957233,-0.137944,0.050855,0.830055,0.901365,-0.335148,-0.559561,-0.500603,0.165183,...,0.321248,0.313999,0.636412,0.169179,0.540524,-0.813182,0.28687,-0.231686,0.597416,0.409623
1,-0.523216,-0.974058,0.738608,0.155234,0.626294,0.485297,-0.715657,-0.897996,-0.359747,0.398246,...,-0.487843,0.823124,0.412688,-0.338654,0.320787,0.588643,-0.594137,0.182828,0.39709,-0.834364
2,-0.619619,-0.97296,-0.20736,-0.128861,0.044748,-0.387535,-0.730477,-0.066126,-0.754899,-0.242004,...,0.454756,0.473184,0.377866,-0.863887,-0.383365,0.137721,-0.810877,-0.44758,0.805932,-0.285284
3,-0.740843,-0.975749,0.391698,0.641738,-0.268645,0.191745,-0.825593,-0.710591,-0.040099,-0.110514,...,0.271535,0.03604,0.480029,-0.763173,0.022627,0.565165,-0.910286,-0.537838,0.243541,-0.885329
4,-0.279052,-0.972315,0.685374,0.113056,0.238315,0.271913,-0.568816,0.341194,-0.600554,-0.125644,...,0.238286,0.809268,0.427521,-0.615932,-0.503697,0.61445,-0.91776,-0.424061,0.185484,-0.580292


In [134]:
# PCA keeping as many components as needed to explain 97% of the variance
pca = PCA(n_components=0.97)
reduced_embeddings = pca.fit(embeddings).transform(embeddings)

# Same column naming scheme as the original embeddings, over the reduced dimensions
pca_columns = [f"embedding_{i}" for i in range(reduced_embeddings.shape[1])]
embeddings_pca = pd.DataFrame(reduced_embeddings, columns=pca_columns)

print("Shape of PCA embeddings df: ", embeddings_pca.shape)
embeddings_pca.head()

Shape of PCA embeddings df:  (364047, 63)


Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_53,embedding_54,embedding_55,embedding_56,embedding_57,embedding_58,embedding_59,embedding_60,embedding_61,embedding_62
0,-2.176781,-1.316914,-1.029052,0.901915,-1.809547,2.0647,1.221907,0.024435,0.927239,0.66981,...,0.332383,-0.610108,-0.442511,-0.155561,-0.179347,0.393691,-0.222066,0.230603,-0.100332,0.048705
1,-1.735178,0.489895,3.268559,0.087855,1.473055,0.932716,-1.841625,0.881803,-0.207199,-0.81681,...,-0.016971,0.17787,-0.111902,-0.038589,0.167507,0.092142,-0.245996,-0.102199,0.083906,-0.035523
2,-0.91269,-2.089339,1.86587,-1.202525,2.53059,0.521978,-0.224346,-1.479933,-0.191864,-1.356812,...,0.069812,0.104525,0.204842,0.050869,-0.370885,0.186885,-0.019912,0.104944,0.102711,0.192421
3,1.096565,0.212958,4.183516,-0.649572,-0.130867,-1.126552,-1.063991,0.662894,0.348144,-1.463901,...,0.066608,0.483661,-0.3142,-0.159661,-0.129967,0.060394,0.244929,0.180022,-0.028637,-0.371275
4,0.193783,-0.263949,1.896583,-1.834346,1.270363,1.723299,-0.329007,-0.283798,-0.659794,-1.223744,...,0.015978,0.18145,0.009191,-0.165058,-0.091436,-0.05917,-0.186881,0.44874,-0.348886,0.222732


In [135]:
# Save the PCA-reduced embeddings to a file as a bare NumPy array
# (`.values` drops the DataFrame's column names and index).
with open('embeddings_pca.pickle', 'wb') as f:
    pickle.dump(embeddings_pca.values, f, protocol=pickle.HIGHEST_PROTOCOL)

The original embeddings file was 355 MB; the PCA-reduced file is 106 MB.

In [136]:
recommend_user_articles(embedding_matrix = embeddings, userId = 11, n_recommendations = 5)

[177140, 176700, 176586, 177148, 176555]

In [137]:
recommend_user_articles(embedding_matrix = embeddings_pca, userId = 11, n_recommendations = 5)

[177140, 176700, 176586, 177148, 176555]

### Collaborative Filtering

#### Feature engineering: appreciation rating

In [21]:
# Reload the full click log and order it by user.
# NOTE(review): no visible cell writes 'clicks.csv'; presumably it is produced by
# `get_click_data` or an earlier run — TODO confirm.
clicks = pd.read_csv('clicks.csv').sort_values(by='user_id')

In [22]:
# New feature: seconds elapsed between the start of the session and the click.
# Both source columns are epoch timestamps in milliseconds.
click_time = pd.to_datetime(clicks['click_timestamp'], unit='ms')
session_start_time = pd.to_datetime(clicks['session_start'], unit='ms')
clicks['time_since_session_start'] = (click_time - session_start_time).dt.total_seconds()

In [23]:
# One group per (user, article) pair
grouped = clicks.groupby(['user_id', 'click_article_id'])

# Click rate per article per user: summed session sizes divided by the summed
# seconds since session start. A pair whose summed delay is 0 yields inf here.
total_session_size = grouped['session_size'].sum()
total_delay_seconds = grouped['time_since_session_start'].sum()
weighted_click_rate = total_session_size / total_delay_seconds

In [24]:
# Turn the (user_id, click_article_id)-indexed series into a flat frame whose
# value column is called 'rating'
weighted_click_rate_df = weighted_click_rate.rename('rating').reset_index()
weighted_click_rate_df

Unnamed: 0,user_id,click_article_id,rating
0,0,68866,0.001394
1,0,87205,0.035325
2,0,87224,0.075140
3,0,96755,0.060323
4,0,157541,0.001423
...,...,...,...
2950705,322894,168401,0.011319
2950706,322895,63746,0.060811
2950707,322895,289197,0.692281
2950708,322896,30760,0.006745


In [25]:
# A (user, article) pair whose summed time since session start is 0 gets an infinite
# rate from the division. Report how many there are (a bare expression that is not the
# last line of a cell is silently discarded), then neutralise them to 0.
n_infinite_ratings = np.isinf(weighted_click_rate).sum()
print("Infinite ratings replaced by 0:", n_infinite_ratings)
weighted_click_rate_df['rating'] = weighted_click_rate_df['rating'].replace([np.inf, -np.inf], 0)

In [26]:
weighted_click_rate_df.rating.describe()

count    2.950710e+06
mean     2.791689e-01
std      1.267142e+01
min      0.000000e+00
25%      3.727070e-03
50%      1.118059e-02
75%      3.191676e-02
max      9.000000e+03
Name: rating, dtype: float64

In [27]:
# Standardise the raw rating, then rescale it into [0, 5] to match the rating scale
# given to the Surprise Reader.
# NOTE(review): the maximum rating is a far outlier (see the describe() output), so
# min-max scaling pushes almost every value very close to 0 — consider clipping or a
# log transform before scaling.
raw_ratings = weighted_click_rate_df.rating.values.reshape(-1, 1)
standardised_ratings = StandardScaler().fit_transform(raw_ratings)
scaled_ratings = MinMaxScaler(feature_range=(0, 5)).fit_transform(standardised_ratings)
weighted_click_rate_df['rating_scaled'] = pd.Series(scaled_ratings.reshape(-1))
weighted_click_rate_df
weighted_click_rate_df

Unnamed: 0,user_id,click_article_id,rating,rating_scaled
0,0,68866,0.001394,7.742827e-07
1,0,87205,0.035325,1.962504e-05
2,0,87224,0.075140,4.174442e-05
3,0,96755,0.060323,3.351263e-05
4,0,157541,0.001423,7.908152e-07
...,...,...,...,...
2950705,322894,168401,0.011319,6.288300e-06
2950706,322895,63746,0.060811,3.378367e-05
2950707,322895,289197,0.692281,3.846006e-04
2950708,322896,30760,0.006745,3.747171e-06


In [28]:
weighted_click_rate_df.rating_scaled.describe()

count    2.950710e+06
mean     1.550939e-04
std      7.039679e-03
min      0.000000e+00
25%      2.070595e-06
50%      6.211439e-06
75%      1.773153e-05
max      5.000000e+00
Name: rating_scaled, dtype: float64

#### Collaborative filtering with the Surprise library

In [32]:
df = weighted_click_rate_df[['user_id', 'click_article_id', 'rating_scaled']]

In [39]:
# The reader tells Surprise the bounds of the rating column
reader = Reader(rating_scale=(0, 5))

# Draw a fixed-seed 10% sample of the ratings and load it as a Surprise dataset
ratings_sample = df.sample(frac=0.1, random_state=11)
data = Dataset.load_from_df(ratings_sample, reader)

In [41]:
# Candidate SVD hyper-parameters: 3 values for each of 4 parameters
param_grid = {
    'n_factors': [20, 50, 100],
    'n_epochs': [20, 30, 40],
    'lr_all': [0.005, 0.01, 0.02],
    'reg_all': [0.02, 0.04, 0.06],
}

# 3-fold cross-validated grid search, scored on RMSE and MAE
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Report the parameter combination with the best RMSE, then that RMSE
best_rmse_params = gs.best_params['rmse']
best_rmse = gs.best_score['rmse']
print(best_rmse_params)
print(best_rmse)

{'n_factors': 20, 'n_epochs': 40, 'lr_all': 0.02, 'reg_all': 0.06}
0.010809402604548641


In [42]:
# The grid search ran on a sample; the final model is trained on every rating
data = Dataset.load_from_df(df, reader)
full_trainset = data.build_full_trainset()

# Parameter combination that achieved the best RMSE in the grid search
best_params = gs.best_params['rmse']

svd_algo = SVD(
    n_factors=best_params['n_factors'],
    n_epochs=best_params['n_epochs'],
    lr_all=best_params['lr_all'],
    reg_all=best_params['reg_all'],
)

# fit() is the last expression of the cell, so its return value is displayed
svd_algo.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20e5cd91240>

In [48]:
svd_algo.predict(uid=11, iid=100)

Prediction(uid=11, iid=100, r_ui=None, est=1.9114937288529295e-05, details={'was_impossible': False})

In [67]:
def recommend_articles(user_id, data, svd_algo, n_recommendations=5):
    """Recommend the unread articles with the highest predicted rating for a user.

    Parameters
    ----------
    user_id
        Raw user id, passed as-is to ``svd_algo.predict``.
    data : pandas.DataFrame
        Interactions with ``user_id`` and ``click_article_id`` columns, one row
        per observed (user, article) pair.
    svd_algo
        Fitted model whose ``predict(user_id, article_id)`` returns an object
        exposing the estimated rating as ``.est``.
    n_recommendations : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of (article_id, estimated_rating) tuples, highest estimate first.
    """
    # Articles this user has already clicked on.
    already_read = set(data.loc[data['user_id'] == user_id, 'click_article_id'].tolist())

    # Candidates are the *distinct* articles the user has not read. Dropping only the
    # user's own rows would keep every read article that another user also clicked,
    # and would score each article once per click.
    candidates = [
        article_id
        for article_id in data['click_article_id'].drop_duplicates().tolist()
        if article_id not in already_read
    ]

    predictions = [
        (article_id, svd_algo.predict(user_id, article_id).est)
        for article_id in candidates
    ]

    # Best estimated rating first
    predictions.sort(key=lambda x: x[1], reverse=True)
    return predictions[:n_recommendations]

In [None]:
recommend_articles(11, df, svd_algo)

In [65]:
# Given a user_id
user_id = 5

# Articles the user has already clicked on
read_articles = df.loc[df.user_id == user_id, 'click_article_id'].tolist()

# Candidate articles: every distinct article the user has NOT clicked on.
# Dropping only the user's own rows would keep any read article that another user
# also clicked, and would score each article once per click.
all_articles = df['click_article_id'].drop_duplicates()
unread_articles = all_articles[~all_articles.isin(read_articles)]

# Predict the rating of each article that the user has not rated yet
predictions = []
for article in unread_articles:
    pred = svd_algo.predict(user_id, article, verbose=False)
    predictions.append((article, pred.est))

# Sort the predictions by the estimated rating
predictions = sorted(predictions, key=lambda x: x[1], reverse=True)

# Print the top-5 articles (fewer if there are not 5 candidates)
for article, estimate in predictions[:5]:
    print(f'Article ID: {article}, estimated rating: {estimate:.2f}')

Article ID: 287083, estimated rating: 0.50
Article ID: 180792, estimated rating: 0.47
Article ID: 47077, estimated rating: 0.45
Article ID: 89670, estimated rating: 0.42
Article ID: 152975, estimated rating: 0.36


In [66]:
# Given a user_id
user_id = 500

# Articles the user has already clicked on
read_articles = df.loc[df.user_id == user_id, 'click_article_id'].tolist()

# Candidate articles: every distinct article the user has NOT clicked on.
# Dropping only the user's own rows would keep any read article that another user
# also clicked, and would score each article once per click.
all_articles = df['click_article_id'].drop_duplicates()
unread_articles = all_articles[~all_articles.isin(read_articles)]

# Predict the rating of each article that the user has not rated yet
predictions = []
for article in unread_articles:
    pred = svd_algo.predict(user_id, article, verbose=False)
    predictions.append((article, pred.est))

# Sort the predictions by the estimated rating
predictions = sorted(predictions, key=lambda x: x[1], reverse=True)

# Print the top-5 articles (fewer if there are not 5 candidates)
for article, estimate in predictions[:5]:
    print(f'Article ID: {article}, estimated rating: {estimate:.2f}')

Article ID: 287083, estimated rating: 0.50
Article ID: 180792, estimated rating: 0.47
Article ID: 47077, estimated rating: 0.45
Article ID: 89670, estimated rating: 0.42
Article ID: 152975, estimated rating: 0.36


#### Collaborative filtering with the Implicit library

In [2]:
clicks = pd.read_csv('clicks.csv')

In [3]:
clicks 

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2
1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2
2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2
3,1,1506825426267738,1506825426000,2,96663,1506827047951,4,1,17,1,16,2
4,2,1506825435299739,1506825435000,2,119592,1506827090575,4,1,17,1,24,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2988176,10051,1508211372158328,1508211372000,2,84911,1508211557302,4,3,2,1,25,1
2988177,322896,1508211376302329,1508211376000,2,30760,1508211672520,4,1,17,1,25,2
2988178,322896,1508211376302329,1508211376000,2,157507,1508211702520,4,1,17,1,25,2
2988179,123718,1508211379189330,1508211379000,2,234481,1508211513583,4,3,2,1,25,2


In [5]:
# Write the `clicks_user_article` frame to CSV without its index.
# NOTE(review): `clicks_user_article` is not assigned in any cell visible in this notebook,
# so this raises NameError on a fresh kernel unless `from helpers import *` provides it — TODO confirm.
pd.DataFrame.to_csv(clicks_user_article, 'clicks_user_article.csv', index=False)

In [144]:
# Binary user-item matrix: row index = user id, column index = article id, with a
# 1.0 for every (user, article) row of `df`; the rating columns are not used.
# The matrix shape is inferred from the largest ids present.
rows = df['user_id']
cols = df['click_article_id']
data = np.ones(df.shape[0])

interactions = coo_matrix((data, (rows, cols)))
user_item_matrix = interactions.tocsr()

In [75]:
# Fit ALS on the user-item matrix. The factors are initialised randomly, so the seed is
# fixed to make the fitted model (and the recommendations shown below) repeatable.
model = implicit.als.AlternatingLeastSquares(random_state=11)
model.fit(user_item_matrix)

100%|██████████| 15/15 [05:38<00:00, 22.55s/it]


In [139]:
with open('als_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [141]:
# define the function to recommend new articles to a user
# NOTE: this shares its name with the Surprise-based helper defined earlier in the
# notebook and replaces it once this cell has run.
def recommend_articles(user_id, model, user_item_matrix, n_articles=5):
    """Return the ids of the top ``n_articles`` articles recommended to a user.

    Parameters
    ----------
    user_id : int
        Row index of the user in ``user_item_matrix``.
    model
        Fitted model exposing ``recommend(user_id, user_items, N=...)``.
    user_item_matrix
        Sparse user-item matrix supporting ``getrow``.
    n_articles : int, default 5
        Number of recommendations requested from the model.

    Returns
    -------
    The first element of ``model.recommend``'s result (the recommended article ids).
    """
    # get the row of the user in the user-item matrix
    user_row = user_item_matrix.getrow(user_id)
    # honour the requested count instead of a hard-coded 5
    recommendations = model.recommend(user_id, user_row, N=n_articles)
    # get the article ids from the recommendations
    return recommendations[0]

In [142]:
# get recommendations for a user
user_id = 500
recommend_articles(user_id, model, user_item_matrix)

array([225010, 207603, 202388,  58619,  70591])