In [None]:
#import the libraries
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler

Data import and preprocessing

In [None]:
# Load the two raw CSV inputs: per-user reading interactions and per-pratilipi metadata.
user_interactions, meta_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ds-assignment/user-interactions.csv',
        'ds-assignment/metadata.csv',
    )
)

In [8]:
# Attach metadata to every interaction row; the inner join keeps only
# pratilipi_ids that are present in both tables.
combine_data = user_interactions.merge(meta_data, how='inner', on='pratilipi_id')

In [9]:
# Sort the merged frame by publication time.
# The result is assigned back to `combine_data` (the name every later cell
# reads); the previous code stored it in a misspelled, unused variable, so the
# sort never took effect.
combine_data = combine_data.sort_values("published_at")

In [None]:
# Derive time_spent as read_percent (a percentage, hence the /100) times reading_time.
combine_data['time_spent'] = (
    combine_data['read_percent']
    .mul(combine_data['reading_time'])
    .div(100)
)

Creating sparse csr matrix

In [13]:
# Frequency table of the distinct time_spent values.
# The columns are named explicitly instead of relying on
# value_counts().reset_index(), whose resulting column names differ between
# pandas versions. Later cells expect:
#   'index'      -> the distinct time_spent value
#   'time_spent' -> how many rows carry that value
time_spent_counts = combine_data['time_spent'].value_counts()
values = pd.DataFrame({
    'index': time_spent_counts.index,
    'time_spent': time_spent_counts.to_numpy(),
})
# Total number of rows, used to turn the counts into relative frequencies.
_len = combine_data.shape[0]

In [None]:
# Weight of each distinct time_spent value: its share of all rows, scaled by 20.
values['weights'] = values['time_spent'].div(_len).mul(20)
values

In [None]:
# Lookup table: distinct time_spent value -> its weight.
read_percent_strength = dict(zip(values['index'], values['weights']))
read_percent_strength

In [16]:
# Translate every row's time_spent into its weight via the lookup table
# (a value missing from the table raises KeyError).
combine_data['read_strength'] = combine_data['time_spent'].apply(read_percent_strength.__getitem__)

In [None]:
# Remove exact duplicate rows, then collapse to one row per (pratilipi, user)
# pair by summing the numeric columns.
combine_data = combine_data.drop_duplicates()
# numeric_only=True is passed explicitly: newer pandas versions no longer drop
# non-numeric columns by default, so without it text/date columns would be
# "summed" as well (string concatenation or a TypeError).
grouped_df = (
    combine_data
    .groupby(['pratilipi_id', 'user_id'])
    .sum(numeric_only=True)
    .reset_index()
)
grouped_df.sample(10)

In [None]:
# Turn both id columns into categoricals; their integer codes become the
# row/column positions used in the sparse matrices below.
for id_col in ('pratilipi_id', 'user_id'):
    grouped_df[id_col] = grouped_df[id_col].astype("category")
grouped_df['user_id_1'] = grouped_df['user_id'].cat.codes
grouped_df['pratilipi_id_1'] = grouped_df['pratilipi_id'].cat.codes

# Shared ingredients of the two interaction matrices.
strength_values = grouped_df['read_strength'].astype(float)
user_codes = grouped_df['user_id_1']
pratilipi_codes = grouped_df['pratilipi_id_1']

# Same data in both orientations: pratilipi x user and user x pratilipi.
sparse_pratilipi_user = sparse.csr_matrix((strength_values, (pratilipi_codes, user_codes)))
sparse_user_pratilipi = sparse.csr_matrix((strength_values, (user_codes, pratilipi_codes)))

# NOTE(review): no random seed is set here, so the learned factors may differ
# between runs - confirm whether reproducibility is required.
model = implicit.als.AlternatingLeastSquares(
    factors=20,
    regularization=0.1,
    iterations=50,
)

# Scale the interaction weights by a constant factor before fitting.
alpha = 15
data = (sparse_pratilipi_user * alpha).astype('double')
# NOTE(review): the matrix handed to fit() is pratilipi x user; which
# orientation fit() expects depends on the installed implicit version - confirm.
model.fit(data)

Similar pratilipis

In [None]:
# Print the n_similar pratilipis whose factor vectors have the highest cosine
# similarity to the chosen pratilipi (given by its integer code).
pratilipi_id_1 = 450
n_similar = 5

user_vecs = model.user_factors
pratilipi_vecs = model.item_factors

# Euclidean length of every pratilipi factor vector.
pratilipi_norms = np.sqrt(np.square(pratilipi_vecs).sum(axis=1))

# Dot product with the query vector, normalised by each candidate's length ...
scores = pratilipi_vecs @ pratilipi_vecs[pratilipi_id_1] / pratilipi_norms
# ... pick the n_similar largest (unordered) ...
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
# ... then divide by the query's own length and order from best to worst.
similar = sorted(
    zip(top_idx, scores[top_idx] / pratilipi_norms[pratilipi_id_1]),
    key=lambda pair: -pair[1],
)

# Map each integer code back to the original pratilipi_id.
for idx, score in similar:
    print(grouped_df.loc[grouped_df['pratilipi_id_1'] == idx, 'pratilipi_id'].iloc[0])

Similar books / recommendations

In [None]:
def recommend(user_id_1, sparse_user_pratilipi, pratilipi_vecs, user_vecs, num_contents=10):
    """Score every pratilipi for one user and return the best ``num_contents``.

    Pratilipis the user has already interacted with get a score of zero, so
    they sink to the bottom of the ranking.

    Parameters
    ----------
    user_id_1 : int
        Row position of the user in ``sparse_user_pratilipi`` and ``user_vecs``.
    sparse_user_pratilipi : scipy.sparse matrix
        Interaction matrix indexed as ``[user, pratilipi]``.
    pratilipi_vecs : scipy.sparse matrix or numpy.ndarray
        Factor matrix with one row per pratilipi.
    user_vecs : scipy.sparse matrix or numpy.ndarray
        Factor matrix with one row per user.
    num_contents : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` (the ``pratilipi_id`` looked up in the module-level
        ``grouped_df``) and ``score`` (min-max scaled, zero for pratilipis
        already interacted with), ordered from best to worst.
    """
    # Interactions of THIS user with every pratilipi: users are the rows of
    # the user x pratilipi matrix, so take row user_id_1 (not a column).
    person_interactions = sparse_user_pratilipi[user_id_1, :].toarray()

    # Add 1 to everything, so that pratilipis with no interaction yet become equal to 1
    person_interactions = person_interactions.reshape(-1) + 1

    # Make pratilipis already interacted with zero
    person_interactions[person_interactions > 1] = 0

    # Dot product of the user's vector with all pratilipi vectors.
    # Sparse inputs give a sparse result that must be densified; dense inputs
    # already give an array.
    rec_vector = user_vecs[user_id_1, :].dot(pratilipi_vecs.T)
    if sparse.issparse(rec_vector):
        rec_vector = rec_vector.toarray()
    rec_vector = np.asarray(rec_vector)

    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]

    # Pratilipis already interacted with have their score multiplied by zero
    recommend_vector = person_interactions * rec_vector_scaled

    # Indices of the pratilipis, best recommendation first
    content_idx = np.argsort(recommend_vector)[::-1][:num_contents]

    titles = []
    scores = []
    for idx in content_idx:
        # Map the integer code back to the original pratilipi_id; the code
        # column in grouped_df is 'pratilipi_id_1'.
        titles.append(grouped_df.pratilipi_id.loc[grouped_df.pratilipi_id_1 == idx].iloc[0])
        scores.append(recommend_vector[idx])

    recommendations = pd.DataFrame({'title': titles, 'score': scores})

    return recommendations
    
# Wrap the trained user and item factor arrays as CSR matrices, the form in
# which they are handed to recommend().
person_vecs, content_vecs = (
    sparse.csr_matrix(factors)
    for factors in (model.user_factors, model.item_factors)
)

# Produce recommendations for the user whose integer code is 50.
user_id_1 = 50

recommendations = recommend(
    user_id_1=user_id_1,
    sparse_user_pratilipi=sparse_user_pratilipi,
    pratilipi_vecs=content_vecs,
    user_vecs=person_vecs,
)

print(recommendations)