In [20]:
#import the libraries
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler

Data import and preprocessing

In [21]:
#import data
user_interactions = pd.read_csv('user-interactions.csv')
meta_data = pd.read_csv('metadata.csv')

In [22]:
# merge data   
combine_data = pd.merge(user_interactions,meta_data, 
                      on ='pratilipi_id', 
                      how ='inner')

In [23]:
#sorting data based on publication time 
combined_data = combine_data.sort_values("published_at")

In [24]:
#creating time_spent (read_percent*reading_time)
combine_data['time_spent'] = combine_data['read_percent']*combine_data['reading_time']/100

Creating sparse csr matrix

In [25]:
values = pd.DataFrame(combine_data['time_spent'].value_counts())
_len = combine_data.shape[0]
values = values.reset_index()

In [26]:
values['weights'] = (values['time_spent']/_len)*20
values

Unnamed: 0,index,time_spent,weights
0,314.000000,78408,0.098675
1,377.000000,75834,0.095436
2,374.000000,69396,0.087334
3,0.000000,57711,0.072628
4,376.000000,54949,0.069152
...,...,...,...
73753,91.014583,1,0.000001
73754,1665.653570,1,0.000001
73755,16.080116,1,0.000001
73756,31.779895,1,0.000001


In [27]:
read_percent_strength = {i: w for i,w in zip(values['index'], values['weights'])}
read_percent_strength

{314.0: 0.09867523761599528,
 377.0: 0.09543589900738939,
 374.0: 0.08733377703295082,
 0.0: 0.07262838789481563,
 376.0: 0.06915245423631931,
 308.0: 0.06501959176908474,
 320.0: 0.06142158513271944,
 318.0: 0.059390391459724134,
 458.0: 0.058402481278000885,
 608.0: 0.058077792326555536,
 316.0: 0.05800731720531158,
 387.0: 0.05650594542595384,
 311.0: 0.05625173159575245,
 313.0: 0.056156086788349935,
 317.0: 0.05582636389967288,
 807.0: 0.054156355223052816,
 305.0: 0.04819869050932307,
 323.0: 0.047929374867426544,
 378.0: 0.047423464175639606,
 554.0: 0.04673633174351108,
 307.0: 0.04637766371575169,
 315.0: 0.04567165401900424,
 428.0: 0.04512547182936363,
 309.0: 0.0440796713694757,
 310.0: 0.043771342714033414,
 375.0: 0.043689541234018116,
 341.0: 0.043680731843862626,
 306.0: 0.043454204668435636,
 322.0: 0.043172304183459825,
 332.0: 0.04298856547450238,
 335.0: 0.04247258690825203,
 368.0: 0.04244741722209347,
 319.0: 0.04243860783193797,
 373.0: 0.042287589714986654,
 536

In [28]:
combine_data['read_strength'] = combine_data['time_spent'].apply(lambda x: read_percent_strength[x])

In [29]:
combine_data['read_strength']

0           0.033289
1           0.033289
2           0.033289
3           0.033289
4           0.033289
              ...   
15892128    0.000422
15892129    0.034533
15892130    0.001143
15892131    0.001143
15892132    0.001143
Name: read_strength, Length: 15892133, dtype: float64

In [30]:
combine_data = combine_data.drop_duplicates()
grouped_df = combine_data.groupby(['pratilipi_id','user_id']).sum().reset_index()
grouped_df.sample(10)

Unnamed: 0.1,pratilipi_id,user_id,Unnamed: 0,read_percent,author_id,reading_time,time_spent,read_strength
1124368,1377786219491072,5506791953627976,21361578,300.0,-6810997050643986,3237,3237.0,0.013909
106353,1377786215757004,5506791976023367,12066028,150.0,-4540664705378744,372,279.0,0.000614
5731298,1377786228267665,5506791973896529,1349244,300.0,-6810997041001947,1161,1161.0,0.169518
2116372,1377786223106904,5506791969685487,2172444,300.0,-6810997027600407,1371,1371.0,0.078922
2461886,1377786223960506,5506791963444146,13710772,200.0,-4540664698040910,4582,4582.0,0.000146
5849176,1377786228292273,5506791961037165,6249204,300.0,-6810997057348656,1611,1611.0,0.057772
330994,1377786216791060,5506791995464889,9738385,100.0,-2270332348043639,191,191.0,0.010912
1343219,1377786220435544,5506791974986179,14503932,300.0,-6810997033444041,672,672.0,0.046181
3808699,1377786226396245,5506791985820225,5277190,200.0,-4540664678401692,926,926.0,0.056289
4145108,1377786226775496,5506791970671512,3470685,300.0,-6810996972711894,1137,1137.0,0.108903


In [31]:
grouped_df['pratilipi_id'] = grouped_df['pratilipi_id'].astype("category")
grouped_df['user_id'] = grouped_df['user_id'].astype("category")
grouped_df['user_id_1'] = grouped_df['user_id'].cat.codes
grouped_df['pratilipi_id_1'] = grouped_df['pratilipi_id'].cat.codes

sparse_pratilipi_user = sparse.csr_matrix((grouped_df['read_strength'].astype(float), (grouped_df['pratilipi_id_1'], grouped_df['user_id_1'])))
sparse_user_pratilipi = sparse.csr_matrix((grouped_df['read_strength'].astype(float), (grouped_df['user_id_1'], grouped_df['pratilipi_id_1'])))

model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

alpha = 15
data = (sparse_pratilipi_user * alpha).astype('double')
model.fit(data)

100%|██████████| 50/50 [00:55<00:00,  1.11s/it]


Similar users 

In [32]:
pratilipi_id_1 = 450
n_similar = 5

user_vecs = model.user_factors
pratilipi_vecs = model.item_factors

pratilipi_norms = np.sqrt((pratilipi_vecs * pratilipi_vecs).sum(axis=1))

scores = pratilipi_vecs.dot(pratilipi_vecs[pratilipi_id_1]) / pratilipi_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / pratilipi_norms[pratilipi_id_1]), key=lambda x: -x[1])

for pratilipi in similar:
    idx, score = pratilipi
    print(grouped_df.pratilipi_id.loc[grouped_df.pratilipi_id_1 == idx].iloc[0])

-414874382199096
1377786215494604
1377786224482499
1377786225823428
1377786228132478


similar books/recommendations

In [56]:
idxs = []
def recommend(user_id_1, sparse_user_pratilipi, pratilipi_vecs, user_vecs, num_contents=10):
    # Get the interactions scores from the sparse person content matrix
    person_interactions = sparse_user_pratilipi[:,user_id_1].toarray()
    
    # Add 1 to everything, so that articles with no interaction yet become equal to 1
    person_interactions = person_interactions.reshape(-1) + 1
    
    # Make articles already interacted zero
    person_interactions[person_interactions > 1] = 0
    
    # Get dot product of person vector and all content vectors
    rec_vector = user_vecs[user_id_1,:].dot(pratilipi_vecs.T).toarray()
    
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    
    # Content already interacted have their recommendation multiplied by zero
    recommend_vector = person_interactions * rec_vector_scaled
    
    # Sort the indices of the content into order of best recommendations
    content_idx = np.argsort(recommend_vector)[::-1][:num_contents]
    idxs = content_idx
    # Start empty list to store titles and scores
    titles = []
    scores = []

    for idx in content_idx:
        # Append titles and scores to the list
        if len(grouped_df.query(f"pratilipi_id_1 == {idx}")) > 0:
            titles.append(grouped_df.pratilipi_id.loc[grouped_df.pratilipi_id_1 == idx].iloc[0])
            scores.append(recommend_vector[idx])

    recommendations = pd.DataFrame({'title': titles, 'score': scores})

    return recommendations, idxs
    
# Get the trained person and content vectors. We convert them to csr matrices
person_vecs = sparse.csr_matrix(model.user_factors)
content_vecs = sparse.csr_matrix(model.item_factors)

# Create recommendations for person with id 50
user_id_1 = 1

recommendations, idxs = recommend(user_id_1=user_id_1, sparse_user_pratilipi=sparse_user_pratilipi, user_vecs=person_vecs, pratilipi_vecs=content_vecs)

print(recommendations)

              title     score
0  1377786225921976  1.000000
1  1377786221747059  0.926714
2  1377786224036770  0.913978
3  1377786220397600  0.913373
4  1377786224357848  0.904162
5  1377786224333253  0.885074
6  1377786222395755  0.849535
7  1377786221409036  0.847165
8  1377786222735257  0.845763
