The penultimate algorithm I will be testing is Slope One, described as a simple
but effective collaborative filtering model. It is neither a nearest-neighbour nor a matrix factorisation algorithm, and it has no tunable parameters.

In [17]:
from surprise import Dataset, SlopeOne
from surprise.accuracy import rmse, mae
from surprise.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import time


In [18]:
# Hold out 20% of ml-100k as an unseen test set (B); the other 80% (A) is used for training.

data100k = Dataset.load_builtin('ml-100k')
raw_ratings = data100k.raw_ratings

# Fix the seeds of both `random` and NumPy so the in-place shuffle below
# produces the same ordering (and therefore the same A/B split) on every run.
np.random.seed(2001)
random.seed(2001)
random.shuffle(raw_ratings)

# First 80% of the shuffled ratings -> set A, remainder -> set B.
cutoff = int(len(raw_ratings) * 0.8)
A_raw, B_raw = raw_ratings[:cutoff], raw_ratings[cutoff:]

# From here on the dataset object only holds the set A ratings.
data100k.raw_ratings = A_raw

In [19]:
# Train Slope One on set A of ml-100k, then score it on the held-out set B.
# Durations are taken with time.perf_counter(): it is monotonic and uses the
# highest-resolution clock available, whereas time.time() is wall-clock time
# and can jump if the system clock is adjusted mid-measurement.
algo = SlopeOne()

trainset = data100k.build_full_trainset()
start_fit = time.perf_counter()
algo.fit(trainset)
fit_100k = time.perf_counter() - start_fit  # seconds spent in fit() only

testset = data100k.construct_testset(B_raw)  # testset is now the set B
start_predict = time.perf_counter()
predictions_100k = algo.test(testset)
test_100k = time.perf_counter() - start_predict  # seconds spent predicting set B

# rmse() prints its own "RMSE: ..." text, so the label is written first
# without a trailing newline and the two end up on the same output line.
print("Unbiased accuracy on 100k=,", end=" ")
rmse_100k = rmse(predictions_100k)
print("Fit time for 100k = "+ str(fit_100k))
print("Test time for 100k ="+ str(test_100k))

Unbiased accuracy on 100k=, RMSE: 0.9438
Fit time for 100k = 0.8432004451751709
Test time for 100k =2.370535373687744


In [20]:
# Train and test on ml-1m using a 75/25 random split (fixed random_state).
# `algo` is the SlopeOne instance created for the ml-100k run; it is fitted
# again here on the ml-1m training data.
# Durations are taken with time.perf_counter(): it is monotonic and uses the
# highest-resolution clock available, whereas time.time() is wall-clock time
# and can jump if the system clock is adjusted mid-measurement.
data1m = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data1m, test_size=0.25, random_state=1)
start_fit = time.perf_counter()
algo.fit(trainset)
fit_1m = time.perf_counter() - start_fit  # seconds spent in fit() only

start_predict = time.perf_counter()
predictions_1m = algo.test(testset)
test_1m = time.perf_counter() - start_predict  # seconds spent predicting the test split

# rmse() prints its own "RMSE: ..." text, so the label is written first
# without a trailing newline and the two end up on the same output line.
print("Unbiased accuracy on 1m=,", end=" ")
rmse_1m = rmse(predictions_1m)
print("Fit time for 1m = "+ str(fit_1m))
print("Test time for 1m ="+ str(test_1m))


Unbiased accuracy on 1m=, RMSE: 0.9065
Fit time for 1m = 13.68208909034729
Test time for 1m =54.40141558647156


In [21]:
# One-row summary tables (accuracy plus fit/predict timings), one per dataset.
data_100k = pd.DataFrame({
    'Algorithm': ['Slope One'],
    'RMSE': [rmse_100k],
    'Fit Time': [fit_100k],
    'Predict Time': [test_100k],
})

data_1m = pd.DataFrame({
    'Algorithm': ['Slope One'],
    'RMSE': [rmse_1m],
    'Fit Time': [fit_1m],
    'Predict Time': [test_1m],
})


In [22]:
# Write both summary tables under ./algo_data using to_csv's default settings.
for results_table, out_path in (
    (data_100k, './algo_data/slope_one_100k'),
    (data_1m, './algo_data/slope_one_1m'),
):
    results_table.to_csv(out_path)

In [23]:
from own_algorithms.top_n_list import get_top_n_list

# movies.dat has no header row and uses the multi-character separator '::',
# so column names are supplied explicitly and the python parsing engine is used.
movies_cols = ['movie_id', 'title', 'genres']
movies_df = pd.read_csv(
    './ml-1m/movies.dat',
    sep='::',
    names=movies_cols,
    engine='python',
    encoding='latin-1',
)

# Build the top-10 lists from the ml-1m predictions for users '398', '1' and '134'
# (calls are made in that order, one per user id).
movies_398, movies_1, movies_134 = (
    get_top_n_list(predictions_1m, 10, user_id, movies_df)
    for user_id in ('398', '1', '134')
)


In [24]:
# Display the Slope One top-10 list that was built for user '134'.
movies_134

['Braveheart (1995)',
 'In the Line of Fire (1993)',
 'Last of the Mohicans, The (1992)',
 'Austin Powers: International Man of Mystery (1997)',
 'Full Monty, The (1997)',
 'Office Space (1999)',
 'Being John Malkovich (1999)',
 'Toy Story 2 (1999)',
 'Chicken Run (2000)',
 'Almost Famous (2000)']

In [26]:
# Read each user's existing prediction table and set its 'Slope One' column
# to the list produced for that user (the column is added, or overwritten if
# the CSV already contains one).
df398 = pd.read_csv('./predictions/398.csv').assign(**{'Slope One': movies_398})
df1 = pd.read_csv('./predictions/1.csv').assign(**{'Slope One': movies_1})
df134 = pd.read_csv('./predictions/134.csv').assign(**{'Slope One': movies_134})

In [31]:
# Write the updated tables back over the original prediction files,
# leaving the DataFrame index out of the CSV.
for table, csv_path in (
    (df398, './predictions/398.csv'),
    (df1, './predictions/1.csv'),
    (df134, './predictions/134.csv'),
):
    table.to_csv(csv_path, index=False)