In [1]:
import pandas as pd
import numpy as np

import os
from surprise import Dataset
from surprise import Reader
from surprise import accuracy

from surprise import NormalPredictor

from surprise import BaselineOnly
from surprise import SVD
from surprise import SVDpp
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise.model_selection import RandomizedSearchCV

In [2]:
#notify me when a long running cell is complete
%load_ext jupyternotify
import time

<IPython.core.display.Javascript object>

In [3]:
data = pd.read_csv('data/1m_useratt.csv')

#for split
data['r_date'] = data['r_date'].astype('datetime64[ns]')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   mid              1000000 non-null  int64         
 1   cust_id          1000000 non-null  int64         
 2   rating           1000000 non-null  float64       
 3   r_date           1000000 non-null  datetime64[ns]
 4   m_decade         1000000 non-null  int64         
 5   m_avg_rating     1000000 non-null  float64       
 6   user_engagement  1000000 non-null  int64         
 7   adopters         1000000 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(5)
memory usage: 61.0 MB


In [4]:
testsize = round(len(data) * 0.2)
hosize = round(len(data) * 0.1)
data = data.sort_values(by=['cust_id'])

#splitting data into 3 groups based on how many reviews they've given
for cust in data['cust_id'].unique():
    i = data.index[data['cust_id'] == cust]
    
    #if more than 2, the final review will be in holdout, 2nd to last will be in the test
    if len(i)> 2:
        data.loc[i[-1], 'split'] = 3
        data.loc[i[-2], 'split'] = 2
        data.loc[i[:-2], 'split'] = 1
        
    #if 2, the final review will be in the test
    if len(i) == 2:
        data.loc[i[-1], 'split'] = 2
        data.loc[i[0], 'split'] = 1
        
    #everyone with one review gets a linear temporal split
    else:
        data.loc[i[0], 'split'] = 1
        
#sort values by date
data = data.sort_values(by=['r_date'])

In [None]:
ho2 = traintest.loc[traintest.index.isin(holdout.index)]
ho2.shape

In [None]:
#random sample of final reviews
holdout = data[(data['split'] == 3)].sample(n=hosize, random_state=1)

#using the rest of the data for train/test
traintest = data.loc[~data.index.isin(holdout.index)]

#2nd to last review and linear split
test = traintest[(traintest['split'] == 2)]
test1 = traintest.loc[~traintest.index.isin(test.index)][-(testsize - len(test)):]
test = pd.concat([test, test1])

#remainder in train
train = traintest.loc[~traintest.index.isin(test.index)]

print('holdout shape: ', holdout.shape)
print('test shape: ', test.shape)
print('train shape: ', train.shape)


In [None]:
#train.to_csv('data/train_1M.csv', index = False)
#test.to_csv('data/test_1M.csv', index=False)
#holdout.to_csv('data/ho_1M.csv', index=False)

# Prepare for Surprise

In [4]:
train = pd.read_csv('data/train_1M.csv')
test = pd.read_csv('data/test_1M.csv')
holdout = pd.read_csv('data/ho_1M.csv')

In [6]:
print('holdout shape: ', holdout.shape)
print('test shape: ', test.shape)
print('train shape: ', train.shape)

holdout shape:  (100000, 9)
test shape:  (200000, 9)
train shape:  (700000, 9)


In [7]:
reader = Reader(rating_scale=(1,5))

train_data = Dataset.load_from_df(train[['cust_id','mid','rating']], reader)
test_data = Dataset.load_from_df(test[['cust_id','mid','rating']], reader)
ho_data = Dataset.load_from_df(holdout[['cust_id','mid','rating']], reader)

#correct surprise dataset format
train_sr = train_data.build_full_trainset()

test_sr1 = test_data.build_full_trainset()
test_sr = test_sr1.build_testset()

ho_sr1 = ho_data.build_full_trainset()
ho_sr = ho_sr1.build_testset()

# Normal Predictor

In [8]:
np = NormalPredictor()
np.fit(train_sr)
np_preds = np.test(test_sr)

accuracy.mae(np_preds)

MAE:  1.1781


1.178090054630419

In [9]:
accuracy.rmse(np_preds)

RMSE: 1.4680


1.4680181988882528

# Bias Baseline

In [23]:
#using stochastic gradient descent bc it performed the best
bsl_options = {'method': 'sgd'}
bias_baseline = BaselineOnly(bsl_options)
bias_baseline.fit(train_sr)
predictions = bias_baseline.test(test_sr)

accuracy.mae(predictions)

Estimating biases using sgd...
MAE:  0.7921


0.7920659838316496

In [24]:
accuracy.rmse(predictions, verbose=True)

RMSE: 0.9912


0.9912218719062363

In [25]:
preds_bsl_ho = bias_baseline.test(ho_sr)
accuracy.rmse(preds_bsl_ho)

RMSE: 0.9571


0.9571389103325522

In [26]:
accuracy.mae(preds_bsl_ho)

MAE:  0.7639


0.7639382511529568

In [34]:
def get_recs_bsl(user_list):
    all_recommendations = []
    
    def rec_content(cust_id):
        # Get a list of all the movies the user has not rated
        all_content = data['mid'].unique()
        user_content = data[data['cust_id'] == cust_id]['mid'].unique()
        new_content = list(set(all_content) - set(user_content))

        # Predict the ratings for the new movies
        predictions = [bias_baseline.predict(cust_id, mid) for mid in new_content]

        # Sort the predictions by estimated rating
        predictions.sort(key=lambda x: x.est, reverse=True)

        # Get the top 15 recommendations
        top_recommendations = [prediction.iid for prediction in predictions[:15]]

        return [(cust_id, movie_id) for movie_id in top_recommendations]

    #for all users apply recs
    for cust_id in user_list:
        user_recs = rec_content(cust_id)
        all_recommendations.extend(user_recs)
    
    #new df of recs for analysis
    recommendations_df = pd.DataFrame(all_recommendations, columns=["cust_id", "mid"])
    return recommendations_df

# TESTING TOP 10 RECS

In [13]:
test_list = [305344, 2439493, 1664010, 387418, 1933293, 834542]

In [35]:
get_recs_bsl(test_list)

Unnamed: 0,cust_id,mid
0,305344,7230
1,305344,7833
2,305344,7057
3,305344,2102
4,305344,12834
...,...,...
85,834542,14550
86,834542,17219
87,834542,10080
88,834542,8571


In [36]:
testdf_bsl = get_recs(test_list)
testdf_bsl

Unnamed: 0,cust_id,mid
0,305344,5926
1,305344,12530
2,305344,14961
3,305344,8447
4,305344,8438
5,305344,3456
6,305344,1256
7,305344,16384
8,305344,12034
9,305344,13614


In [37]:
testdf_bsl['mid'].value_counts()

mid
7230     3
10080    3
16587    2
14961    2
3456     2
7833     2
8116     2
15861    1
14240    1
12731    1
13728    1
3864     1
6287     1
17157    1
8535     1
5926     1
17085    1
442      1
1476     1
5738     1
12293    1
12184    1
15296    1
16006    1
8571     1
14302    1
11521    1
359      1
417      1
6428     1
8447     1
8438     1
1256     1
16384    1
12034    1
13614    1
4427     1
16302    1
2803     1
12891    1
371      1
10643    1
16022    1
5        1
76       1
167      1
199      1
223      1
242      1
12530    1
2862     1
Name: count, dtype: int64

In [38]:
#list of all users
all_users = data['cust_id'].unique()
len(all_users)

290022

# TOP 10 RECS: Baseline

In [40]:
top10_allusers = get_recs_bsl(all_users)
top10_allusers

Unnamed: 0,cust_id,mid
0,510180,7230
1,510180,7833
2,510180,7057
3,510180,2102
4,510180,12834
...,...,...
4350325,883348,14550
4350326,883348,17219
4350327,883348,10080
4350328,883348,8571


In [41]:
top10_allusers['mid'].value_counts()

mid
2102     289705
4427     289509
7833     289035
8571     288958
8535     288894
          ...  
113           2
14240         1
57            1
180           1
203           1
Name: count, Length: 122, dtype: int64

In [61]:
top10_allusers.to_csv('data/bslrecs.csv')

In [None]:
%notify

# SVD++

In [47]:
svdpp = SVDpp(n_factors= 150, n_epochs= 20)
svdpp.fit(train_sr)
preds_svdpp = svdpp.test(test_sr)

accuracy.rmse(preds_svdpp)
accuracy.mae(preds_svdpp)

RMSE: 1.0306
MAE:  0.8285


0.8284991647974528

In [54]:
preds_svdpp_ho = svdpp.test(ho_sr)
accuracy.rmse(preds_svdpp_ho)
accuracy.mae(preds_svdpp_ho)

RMSE: 0.9094
MAE:  0.7011


0.7010662501871413

In [55]:
def get_recs(model, user_list):
    all_recommendations = []
    
    def rec_content(cust_id):
        # Get a list of all the movies the user has not rated
        all_content = data['mid'].unique()
        user_content = data[data['cust_id'] == cust_id]['mid'].unique()
        new_content = list(set(all_content) - set(user_content))

        # Predict the ratings for the new movies
        predictions = [model.predict(cust_id, mid) for mid in new_content]

        # Sort the predictions by estimated rating
        predictions.sort(key=lambda x: x.est, reverse=True)

        # Get the top 15 recommendations
        top_recommendations = [prediction.iid for prediction in predictions[:15]]

        return [(cust_id, movie_id) for movie_id in top_recommendations]

    #for all users apply recs
    for cust_id in user_list:
        user_recs = rec_content(cust_id)
        all_recommendations.extend(user_recs)
    
    #new df of recs for analysis
    recommendations_df = pd.DataFrame(all_recommendations, columns=["cust_id", "mid"])
    return recommendations_df

In [56]:
test_svdpp = get_recs(svdpp, test_list)

In [57]:
test_svdpp

Unnamed: 0,cust_id,mid
0,305344,5924
1,305344,3282
2,305344,4227
3,305344,13073
4,305344,8387
...,...,...
85,834542,7057
86,834542,5732
87,834542,8116
88,834542,14601


In [58]:
test_svdpp['mid'].value_counts()

mid
5837     2
8535     2
14961    2
8116     2
345      2
        ..
9395     1
10276    1
6196     1
10080    1
6450     1
Name: count, Length: 81, dtype: int64

In [60]:
top10_svdpp_allusers = get_recs(svdpp, all_users)

KeyboardInterrupt: 

In [None]:
%notify