In [46]:
import pandas as pd
import joblib

In [8]:
interaction_df = pd.read_csv("data/user_item_interactions.csv")
interaction_df.head()

Unnamed: 0,item_id,user_id,timestamp,click,purchase
0,6198,36c0daa2-5aad-4b0c-81e3-8d276b223573,1633082142,1,0
1,190,c9a9b40a-90ce-42c7-8b02-1301ad4914b1,1630125341,1,0
2,7330,83969882-083c-4150-b9eb-19bb3197720d,1630143229,1,0
3,8978,83969882-083c-4150-b9eb-19bb3197720d,1630143685,1,0
4,9263,83969882-083c-4150-b9eb-19bb3197720d,1630143781,1,0


In [16]:
interaction = interaction_df.copy()
interaction['purchase'] = interaction['purchase'].apply(lambda x: 2 if x==1 else 0)
interaction['rating'] = interaction['purchase'] + interaction['click']
interaction.head()

Unnamed: 0,item_id,user_id,timestamp,click,purchase,rating
0,6198,36c0daa2-5aad-4b0c-81e3-8d276b223573,1633082142,1,0,1
1,190,c9a9b40a-90ce-42c7-8b02-1301ad4914b1,1630125341,1,0,1
2,7330,83969882-083c-4150-b9eb-19bb3197720d,1630143229,1,0,1
3,8978,83969882-083c-4150-b9eb-19bb3197720d,1630143685,1,0,1
4,9263,83969882-083c-4150-b9eb-19bb3197720d,1630143781,1,0,1


In [21]:
dataset = pd.DataFrame()
dataset['user'] = interaction['user_id']
dataset['item'] = interaction['item_id']
dataset['rating'] = interaction['rating']
dataset['timestamp'] = interaction['timestamp']
dataset.head()

Unnamed: 0,user,item,rating,timestamp
0,36c0daa2-5aad-4b0c-81e3-8d276b223573,6198,1,1633082142
1,c9a9b40a-90ce-42c7-8b02-1301ad4914b1,190,1,1630125341
2,83969882-083c-4150-b9eb-19bb3197720d,7330,1,1630143229
3,83969882-083c-4150-b9eb-19bb3197720d,8978,1,1630143685
4,83969882-083c-4150-b9eb-19bb3197720d,9263,1,1630143781


In [68]:
dataset.to_csv('data/user_item_interactions.csv', index=False)

### N-Recommended Products

In [47]:
from surprise import KNNBaseline
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold
from surprise import SVD


reader = Reader(rating_scale=(1, 2))
data = Dataset.load_from_df(df=dataset[['user', 'item', 'rating']], reader=reader)

kf = KFold(n_splits=10)

svd = SVD()

for trainset, testset in kf.split(data):
    svd.fit(trainset)
    predictions = svd.test(testset)
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.1775
RMSE: 0.1789
RMSE: 0.1742
RMSE: 0.1774
RMSE: 0.1766
RMSE: 0.1762
RMSE: 0.1763
RMSE: 0.1747
RMSE: 0.1759
RMSE: 0.1766


In [66]:
from collections import defaultdict
import json
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.
    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.
    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n = get_top_n(predictions, n=10)
result = {}
for uid, user_ratings in top_n.items():
    result[uid] = [iid for (iid, _) in user_ratings]
with open("result.json", "w") as f:
    f.write(json.dumps(result))

In [51]:
joblib.dump(svd,'svd.joblib')

['svd.joblib']

### K-Similar Items

In [43]:
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
knn = KNNBaseline(sim_options=sim_options)
knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1203a4f40>

In [44]:
testset = trainset.build_testset()
predictions = knn.test(testset)
accuracy.rmse(predictions, verbose=True)

RMSE: 0.0465


0.046461337102403036

In [48]:
knn.get_neighbors(456, k=10)

[689, 735, 67, 3104, 365, 688, 2414, 3107, 6947, 1202]

In [93]:
joblib.dump(knn, 'knn.joblib')

['knn.joblib']

In [94]:
model = joblib.load('knn.joblib')
model.get_neighbors(456, k=10)

[689, 735, 67, 3104, 365, 688, 2414, 3107, 6947, 1202]

### Popular Recommended Products

In [92]:
popular_items = dataset.groupby(['item']).size().reset_index()
popular_items['count'] = popular_items[0]
popular_items.sort_values(by='count', ascending=False).head(10)
# popular_items[0]

Unnamed: 0,item,0,count
2511,2719,3409,3409
3188,3589,3081,3081
7610,8978,2867,2867
2845,3082,2612,2612
1451,1547,2075,2075
2509,2717,1842,1842
10290,13400,1569,1569
1188,1264,1489,1489
717,761,1467,1467
176,189,1458,1458


In [91]:
dataset[(dataset['item'] == 2719)]

Unnamed: 0,user,item,rating,timestamp
581,689e1169-d896-4523-8a97-c70649763e67,2719,1,1629074397
583,d7d13b02-7da2-475c-ba5b-698833c87ef9,2719,1,1629100051
642,689e1169-d896-4523-8a97-c70649763e67,2719,1,1629419326
1233,ef5a9bc2-fbcb-4466-a887-70dce4e7cf85,2719,1,1631925066
1295,11e464cf-31c0-4968-ae4d-188ee0aa8664,2719,1,1632423279
...,...,...,...,...
529308,3c4d0011-f473-44fa-9b65-e9118a51a955,2719,2,1624616329
539090,b5633cde-5f00-4e47-a1c4-01d2b4c56aff,2719,2,1635050924
550641,25670cd5-c9bf-4ec1-b532-eecdd293b812,2719,2,1635659599
551352,29396054-3374-471f-b4cd-b793750e5d41,2719,2,1636366687


### Recommendation using Implict

In [104]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler

In [111]:
dataset_df = dataset.copy()
dataset_df.rename(columns={'rating':'event_strength'}, inplace=True)

In [148]:
dataset_transformed = dataset_df.copy()
dataset_transformed['user'] = dataset_transformed['user'].astype("category")
dataset_transformed['user_id'] = dataset_transformed['user'].cat.codes

sparse_item_user = sparse.csr_matrix((dataset_transformed['event_strength'].astype(float), (dataset_transformed['item'], dataset_transformed['user_id'])))
sparse_user_item = sparse.csr_matrix((dataset_transformed['event_strength'].astype(float), (dataset_transformed['user_id'], dataset_transformed['item'])))


model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

alpha = 15
data = (sparse_item_user * alpha).astype('double')

model.fit(data)

  0%|          | 0/50 [00:00<?, ?it/s]

In [141]:
dataset_transformed.head()

Unnamed: 0,user,item,event_strength,timestamp,user_id
0,36c0daa2-5aad-4b0c-81e3-8d276b223573,6198,1,1633082142,4919
1,c9a9b40a-90ce-42c7-8b02-1301ad4914b1,190,1,1630125341,18023
2,83969882-083c-4150-b9eb-19bb3197720d,7330,1,1630143229,11813
3,83969882-083c-4150-b9eb-19bb3197720d,8978,1,1630143685,11813
4,83969882-083c-4150-b9eb-19bb3197720d,9263,1,1630143781,11813


#### Similar Items

In [138]:
item_id = 2719
n_similar = 10

user_vecs = model.user_factors
item_vecs = model.item_factors

item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

scores = item_vecs.dot(item_vecs[item_id]) / item_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / item_norms[item_id]), key=lambda x: -x[1])

for item in similar:
    idx, score = item
    print(idx, score)

2719 1.0000001
5720 0.87037086
15838 0.8476719
22584 0.84492004
9098 0.84096164
5885 0.8357961
8698 0.83492094
2176 0.8272938
22466 0.8253766
14947 0.821198


In [170]:
def recommend(user_id, sparse_user_item, user_vecs, item_vecs, num_contents=10):
    # Get the interactions scores from the sparse person content matrix
    user_interactions = sparse_user_item[user_id,:].toarray()
    # Add 1 to everything, so that articles with no interaction yet become equal to 1
    user_interactions = user_interactions.reshape(-1) + 1
    # Make articles already interacted zero
    user_interactions[user_interactions > 1] = 0
    # Get dot product of person vector and all content vectors
    rec_vector = user_vecs[user_id,:].dot(item_vecs.T).toarray()
    
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    # Content already interacted have their recommendation multiplied by zero
    recommend_vector = user_interactions * rec_vector_scaled
    # Sort the indices of the content into order of best recommendations
    content_idx = np.argsort(recommend_vector)[::-1][:num_contents]
    
    # Start empty list to store titles and scores
    items = []
    scores = []
    
    for idx in content_idx:
        items.append(idx)
        scores.append(recommend_vector[idx])

    recommendations = pd.DataFrame({'item': items, 'score': scores})

    return recommendations

user_vecs = sparse.csr_matrix(model.user_factors)
item_vecs = sparse.csr_matrix(model.item_factors)

user_id = 18023

recommendations = recommend(user_id, sparse_item_user, user_vecs, item_vecs)

print(recommendations)

    item     score
0  16534  1.000000
1   7116  0.959908
2    169  0.950384
3  21447  0.928597
4  22323  0.910530
5    325  0.898174
6   6341  0.897446
7  14359  0.894313
8  15797  0.890978
9  21660  0.876455
