## 1. Importing libraries

In [1]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, evaluate, accuracy, dump, get_dataset_dir
from surprise.model_selection import cross_validate, train_test_split
from surprise.model_selection import GridSearchCV
from surprise import SVD, NMF, KNNBasic, KNNWithMeans, KNNBaseline, KNNWithZScore
from surprise import SlopeOne, SVDpp, NormalPredictor, BaselineOnly, CoClustering

In [2]:
import io

In [3]:
from tqdm import tqdm_notebook as tqdm

## 2. Importing dataset

In [4]:
df = pd.read_csv('item_item_final.csv')
df.head(30)

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp
0,66,4,66,0.5,1145937915
1,90,4,181,0.5,1127946789
2,93,4,193,0.5,1113766123
3,159,4,546,0.5,1115782390
4,296,4,1760,0.5,1114050751
5,341,4,2152,0.5,1115781964
6,363,4,2381,0.5,1135543160
7,371,4,2450,0.5,1113796599
8,417,4,2816,2.5,1123989912
9,466,4,3268,0.5,1114050659


In [5]:
df.drop(['Unnamed: 0','timestamp'], inplace=True,axis=1)

In [6]:
df.head()

Unnamed: 0,userId,movieId,rating
0,4,66,0.5
1,4,181,0.5
2,4,193,0.5
3,4,546,0.5
4,4,1760,0.5


In [7]:
len(df)

216399

In [8]:
def rec_gridsearch(model,param_grid,cv):
    gs = GridSearchCV(model, param_grid, measures=['rmse', 'mae'], cv=cv, n_jobs=-1)
    gs.fit(data)
    # best RMSE score
    print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

## Setting a benchmark for user-based recommendation   

In [9]:
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df,reader)

In [11]:
# Massive Cross Validation of every model Surprise has. 
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=10, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse') 

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.999671,66.845375,0.753049
SVD,1.026698,12.790418,0.240849
SlopeOne,1.037818,0.996858,0.564639
BaselineOnly,1.056784,0.818183,0.164463
NMF,1.060784,25.028074,0.218789
CoClustering,1.075421,10.61119,0.209843
NormalPredictor,1.620468,0.380421,0.196912


In [None]:
KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore()

## Testing KNNs for item-item recommendation.

In [13]:
train2, test2 = train_test_split(data2, test_size=.2)

### KNN Baseline

In [26]:
sim_options = {'name': 'msd',
               'min_support': 5,
               'user_based': False}
base1 = KNNBaseline(k=30,sim_options=sim_options)

In [21]:
base1.fit(train2)
base1_preds = base1.test(test2)
accuracy.rmse(base1_preds)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0197


1.0196939903010263

In [22]:
sim_options = {'name': 'cosine',
               'min_support': 5,
               'user_based': False}
base2 = KNNBaseline(k=30,sim_options=sim_options)

In [23]:
base2.fit(train2)
base2_preds = base2.test(test2)
accuracy.rmse(base2_preds)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0241


1.024058366731553

In [27]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
base3 = KNNBaseline(k=30,sim_options=sim_options)

In [28]:
base3.fit(train2)
base3_preds = base3.test(test2)
accuracy.rmse(base3_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0098


1.0098460059651595

In [31]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
base4 = KNNBaseline(k=35,sim_options=sim_options)

In [32]:
base4.fit(train2)
base4_preds = base4.test(test2)
accuracy.rmse(base4_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0098


1.0098252215921415

In [33]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
base5 = KNNBaseline(k=40,sim_options=sim_options)

In [34]:
base5.fit(train2)
base5_preds = base5.test(test2)
accuracy.rmse(base5_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0098


1.0098465914432893

In [49]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
base6 = KNNBaseline(k=39,sim_options=sim_options)

In [50]:
base6.fit(train2)
base6_preds = base6.test(test2)
accuracy.rmse(base6_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0098


1.0098372002372284

In [51]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
base7 = KNNBaseline(k=37,sim_options=sim_options)

In [52]:
base7.fit(train2)
base7_preds = base7.test(test2)
accuracy.rmse(base7_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0098


1.0098314020112866

In [53]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
base8 = KNNBaseline(k=36,sim_options=sim_options)

In [54]:
base8.fit(train2)
base8_preds = base8.test(test2)
accuracy.rmse(base8_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0098


1.0098283820135368

In [55]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
base9 = KNNBaseline(k=33,sim_options=sim_options)

In [56]:
base9.fit(train2)
base9_preds = base9.test(test2)
accuracy.rmse(base9_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0098


1.0098364010834906

In [57]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
base10 = KNNBaseline(k=10,sim_options=sim_options)

In [58]:
base10.fit(train2)
base10_preds = base10.test(test2)
accuracy.rmse(base10_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0099


1.0098848285440225

In [59]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
base11 = KNNBaseline(k=20,sim_options=sim_options)

In [60]:
base11.fit(train2)
base11_preds = base11.test(test2)
accuracy.rmse(base11_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0098


1.0098036979411174

In [61]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
base12 = KNNBaseline(k=25,sim_options=sim_options)

In [62]:
base12.fit(train2)
base12_preds = base12.test(test2)
accuracy.rmse(base12_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0099


1.0098544168866566

In [63]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
base13 = KNNBaseline(k=21,sim_options=sim_options)

In [64]:
base13.fit(train2)
base13_preds = base13.test(test2)
accuracy.rmse(base13_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0098


1.009794033164408

### BEST MODEL ^^^^^^^^

In [67]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
base14 = KNNBaseline(k=22,sim_options=sim_options)

In [68]:
base14.fit(train2)
base14_preds = base14.test(test2)
accuracy.rmse(base14_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0098


1.009816116980724

### KNNMeans

In [38]:
sim_options = {'name': 'msd',
               'min_support': 5,
               'user_based': False}
means1 = KNNWithMeans(k=30,sim_options=sim_options)

In [36]:
means1.fit(train2)
means1_preds = means1.test(test2)
accuracy.rmse(means1_preds)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0294


1.0294320120270934

In [39]:
sim_options = {'name': 'cosine',
               'min_support': 5,
               'user_based': False}
means2 = KNNWithMeans(k=30,sim_options=sim_options)

In [40]:
means2.fit(train2)
means2_preds = means2.test(test2)
accuracy.rmse(means2_preds)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0337


1.0337170543942928

In [41]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
means3 = KNNWithMeans(k=30,sim_options=sim_options)

In [42]:
means3.fit(train2)
means3_preds = means3.test(test2)
accuracy.rmse(means3_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0203


1.0203067696333434

### KNN Basic

In [43]:
sim_options = {'name': 'msd',
               'min_support': 5,
               'user_based': False}
basic1 = KNNBasic(k=30,sim_options=sim_options)

In [44]:
basic1.fit(train2)
basic1_preds = basic1.test(test2)
accuracy.rmse(basic1_preds)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0450


1.045005286085737

In [45]:
sim_options = {'name': 'cosine',
               'min_support': 5,
               'user_based': False}
basic2 = KNNBasic(k=30,sim_options=sim_options)

In [46]:
basic2.fit(train2)
basic2_preds = basic2.test(test2)
accuracy.rmse(basic2_preds)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0508


1.0508391483421877

In [47]:
sim_options = {'name': 'pearson_baseline',
               'min_support': 5,
               'user_based': False}
basic3 = KNNBasic(k=30,sim_options=sim_options)

In [48]:
basic3.fit(train2)
basic3_preds = basic3.test(test2)
accuracy.rmse(basic3_preds)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0352


1.035231088637645