In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import os
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise import KNNBasic
from surprise import KNNBaseline
from surprise import KNNWithMeans

In [29]:
data = pd.read_csv('data/1m_useratt.csv')

#for split
data['r_date'] = data['r_date'].astype('datetime64[ns]')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   mid              1000000 non-null  int64         
 1   cust_id          1000000 non-null  int64         
 2   rating           1000000 non-null  float64       
 3   r_date           1000000 non-null  datetime64[ns]
 4   m_decade         1000000 non-null  int64         
 5   m_avg_rating     1000000 non-null  float64       
 6   user_engagement  1000000 non-null  int64         
 7   adopters         1000000 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(5)
memory usage: 61.0 MB


In [2]:
train = pd.read_csv('data/train_1M.csv')
test = pd.read_csv('data/test_1M.csv')
holdout = pd.read_csv('data/ho_1M.csv')

print('holdout shape: ', holdout.shape)
print('test shape: ', test.shape)
print('train shape: ', train.shape)

holdout shape:  (100000, 9)
test shape:  (200000, 9)
train shape:  (700000, 9)


In [3]:
train.head()

Unnamed: 0,mid,cust_id,rating,r_date,m_decade,m_avg_rating,user_engagement,adopters,split
0,3113,510180,3.0,1999-12-12,4,3.187192,9,1,1.0
1,2953,510180,5.0,1999-12-16,3,3.662879,9,1,1.0
2,11242,830363,3.0,1999-12-21,3,3.843621,16,1,1.0
3,10152,1435350,5.0,1999-12-27,4,3.870432,2,1,1.0
4,17499,1394647,4.0,1999-12-29,4,3.942529,7,1,1.0


In [16]:
train['m_decade'].value_counts()

m_decade
5    364284
4    186429
3     80683
2     27997
0     21735
1     18872
Name: count, dtype: int64

In [26]:
train['m_decade'] = train['m_decade'].replace([0], 6)
test['m_decade'] = test['m_decade'].replace([0], 6)
holdout['m_decade'] = holdout['m_decade'].replace([0], 6)

In [23]:
train['m_decade'].value_counts()

m_decade
5    364284
4    186429
3     80683
2     27997
6     21735
1     18872
Name: count, dtype: int64

In [64]:
reader = Reader(rating_scale=(1,5))

train_data = Dataset.load_from_df(train[['cust_id','mid','rating']], reader)
test_data = Dataset.load_from_df(test[['cust_id','mid','rating']], reader)
ho_data = Dataset.load_from_df(holdout[['cust_id','mid','rating']], reader)
all_data = Dataset.load_from_df(data[['cust_id','mid','rating']], reader)

#correct surprise dataset format
train_sr = train_data.build_full_trainset()

test_sr1 = test_data.build_full_trainset()
test_sr = test_sr1.build_testset()

ho_sr1 = ho_data.build_full_trainset()
ho_sr = ho_sr1.build_testset()

all_sr = all_data.build_full_trainset()



In [8]:
sim_dict = {'name': 'cosine', 'user_based': False}
knn = KNNBasic(sim_options=sim_dict)
knn.fit(train_sr)
knn_preds = knn.test(test_sr)

accuracy.rmse(knn_preds)
accuracy.mae(knn_preds)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.2172
MAE:  0.9481


0.9480793984143315

In [11]:
knn.predict(510180, 17499)

Prediction(uid=510180, iid=17499, r_ui=None, est=3.3333333333333335, details={'actual_k': 3, 'was_impossible': False})

In [15]:
sim_dict = {'name': 'cosine', 'user_based': False}
knn_means = KNNWithMeans(sim_options=sim_dict)
knn_means.fit(train_sr)
knn_means_preds = knn_means.test(test_sr)

accuracy.rmse(knn_means_preds)
accuracy.mae(knn_means_preds)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.1319
MAE:  0.8815


0.8815136710150847

# hybrid: rating and decade

In [13]:
sim_dict = {'name': 'cosine', 'user_based': False}
knn_bsl = KNNBaseline(sim_options=sim_dict)
knn_bsl.fit(train_sr)
knn_bsl_preds = knn_bsl.test(test_sr)

accuracy.rmse(knn_bsl_preds)
accuracy.mae(knn_bsl_preds)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.1191
MAE:  0.8707


0.8706840387794991

In [36]:
neighbors = knn_bsl.get_neighbors(train_sr.to_inner_iid(2953), k=10)

In [37]:
neighbors

[0, 3, 4, 9, 12, 13, 16, 18, 22, 27]

In [31]:
knn_dec = KNNBaseline(sim_options=sim_dict_dec)
knn_dec.fit(train_sr_dec)
knn_preds_dec = knn_dec.test(test_sr_dec)

accuracy.rmse(knn_preds_dec)
accuracy.mae(knn_preds_dec)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.2809
MAE:  0.1133


0.11333856608924349

In [39]:
neighborsd = knn_dec.get_neighbors(train_sr.to_inner_iid(2953), k=10)
neighborsd

[0, 3, 4, 9, 11, 12, 13, 16, 17, 18]

In [43]:
resulting_list = neighbors + [i for i in neighborsd if i not in neighbors]
resulting_list

[0, 3, 4, 9, 12, 13, 16, 18, 22, 27, 11, 17]

In [56]:
rated = data[data['cust_id'] == 510180]['mid'].to_list()
rated[-1]

1428

In [57]:
rated

[3113, 2953, 13882, 16912, 10785, 7533, 17251, 12355, 1428]

In [None]:
rating_pred = knn_bsl.predict(510150, 2953).est
dec_pred = appear_model.predict(cust, mid).est
        
        # hybrid model
v2v_pred = (rating_pred * 0.5) + (dec_pred * 0.5)

In [71]:
def v2v_preds(user_list):
    
    '''
    this is an unpersonalized item based recommendation
    returns a movie that is similar to one that has been watched 
    but doesn't take into account any other user behavior
    '''
    
    v2v = []
    
    def v2v_hybrid(cust_id):
        
        # last movie they rated
        rated = data[data['cust_id'] == cust_id]['mid'].to_list()
        mid = rated[-1]
        
        # neighbors from both models
        neighbors = knn_bsl.get_neighbors(all_sr.to_inner_iid(mid), k=10)
        neighborsd = knn_dec.get_neighbors(all_sr.to_inner_iid(mid), k=10)
        
        #combine lists
        nearest_neighbors = neighbors + [i for i in neighborsd if i not in neighbors]

        return [(cust_id, mid, neighbor) for neighbor in nearest_neighbors]
    
    #for all users get the nearest neighbors to the last movie they rated
    for cust_id in user_list:
        user_recs = v2v_hybrid(cust_id)
        v2v.extend(user_recs)
    
    #new df of recs for analysis
    neighbors_df = pd.DataFrame(v2v, columns=["cust_id", "mid", 'recs'])
    return neighbors_df

In [72]:
test_list = [305344, 2439493, 1664010, 387418, 1933293, 834542]

In [73]:
test_df = v2v_preds(test_list)

In [74]:
test_df

Unnamed: 0,cust_id,mid,recs
0,305344,15587,0
1,305344,15587,3
2,305344,15587,11
3,305344,15587,13
4,305344,15587,29
...,...,...,...
61,834542,15702,44
62,834542,15702,14
63,834542,15702,19
64,834542,15702,25


In [75]:
test_df['recs'].value_counts()

recs
11      3
0       2
39      2
19      2
147     2
93      2
59      2
52      2
83      2
29      2
31      2
8773    1
97      1
6979    1
1       1
5846    1
42      1
71      1
95      1
18      1
157     1
2       1
12      1
3107    1
27      1
34      1
36      1
40      1
44      1
14      1
5343    1
130     1
1803    1
739     1
13      1
32      1
38      1
51      1
167     1
173     1
189     1
208     1
222     1
285     1
330     1
60      1
66      1
82      1
104     1
106     1
117     1
3       1
102     1
25      1
Name: count, dtype: int64

In [78]:
test_df.groupby('mid')['recs'].count()

mid
608      10
1535     10
7086     12
10607    10
15587    10
15702    14
Name: recs, dtype: int64