In [48]:
import pandas as pd
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import NormalPredictor
from surprise import Reader
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.prediction_algorithms.matrix_factorization import NMF
import sys
sys.path.append("./../")
from src.utils import percentileMetric
from surprise.prediction_algorithms.co_clustering import CoClustering
from surprise.dataset import DatasetAutoFolds

In [37]:
training = pd.read_csv('../data/training.csv')

In [54]:
full_testset = pd.read_csv('../data/requests.csv')

In [38]:
training.drop(columns='timestamp', inplace=True)

In [42]:
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   user    800000 non-null  int64
 1   movie   800000 non-null  int64
 2   rating  800000 non-null  int64
dtypes: int64(3)
memory usage: 18.3 MB


In [5]:
tuner = training.sample(n=100000, axis=0)

In [6]:
tuner

Unnamed: 0,user,movie,rating
235918,4369,1957,4
399028,3383,1296,5
747768,910,2124,3
547941,2379,527,5
546303,2391,3918,2
...,...,...,...
794029,1117,1042,2
269698,4111,1299,4
788204,689,168,3
661764,1943,1259,5


In [7]:
reader = Reader(rating_scale=(1, 5))
tuner_suprise = Dataset.load_from_df(tuner, reader)

In [8]:
tuner_train, tuner_test = train_test_split(tuner_suprise, test_size=.25)

In [10]:
algo = SVDpp()

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 0.9535
The Snyder Metric: 4.028195887216451


In [11]:
algo = SVDpp(n_epochs=50)

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 1.0139
The Snyder Metric: 3.9691686157771637


In [12]:
algo = SVDpp(n_epochs=10)

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 0.9536
The Snyder Metric: 4.051906779661017


In [13]:
algo = SVDpp(n_epochs=5)

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 0.9702
The Snyder Metric: 4.055049756510693


In [21]:
algo = SVDpp(n_epochs=4)

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 0.9779
The Snyder Metric: 4.04996824052509


In [22]:
algo = SVDpp(n_epochs=6)

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 0.9650
The Snyder Metric: 4.052932458183358


In [20]:
algo = SVDpp(n_epochs=1)

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 1.0412
The Snyder Metric: 4.030277366080881


In [14]:
algo = SVDpp(n_factors=40)

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 0.9576
The Snyder Metric: 4.038371846512614


In [23]:
algo = SVDpp(n_factors=10, n_epochs=5)

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 0.9695
The Snyder Metric: 4.059496082998095


In [24]:
algo = SVDpp(n_factors=5, n_epochs=5)

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 0.9692
The Snyder Metric: 4.060965283657917


In [26]:
algo = SVDpp(n_factors=1, n_epochs=5)

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 0.9690
The Snyder Metric: 4.055261486343426


In [27]:
algo = SVDpp(n_factors=6, n_epochs=5)

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 0.9696
The Snyder Metric: 4.0575783234547


In [28]:
algo = SVDpp(n_factors=4, n_epochs=5)

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 0.9694
The Snyder Metric: 4.054838026677959


In [29]:
algo = SVDpp(n_factors=6, n_epochs=6)

algo.fit(tuner_train)
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))

RMSE: 0.9642
The Snyder Metric: 4.058225704001694


In [50]:
reader = Reader(rating_scale=(1, 5))
suprise_train = Dataset.load_from_df(training, reader)
full_trainset = suprise_train.build_full_trainset()

In [51]:
type(full_trainset)

surprise.trainset.Trainset

In [67]:
algo = SVDpp()
algo.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fb9ee1e8890>

In [66]:
full_testset

Unnamed: 0,user,movie,predicted_rating
0,4958,1924,0
1,4958,3264,0
2,4958,2634,0
3,4958,1407,0
4,4958,2399,0
...,...,...,...
200204,1875,3793,0
200205,1875,2160,0
200206,1875,1035,0
200207,1875,1580,0


In [None]:
# for idx in full_testset.index:
#     full_testset['predicted_rating'][idx] = algo.predict(str(full_testset['user'][idx]), str(full_testset['movie'][idx]))

In [73]:
full_testset.loc[1, 'predicted_rating']

Prediction(uid='4958', iid='3264', r_ui=None, est=3.59047875, details={'was_impossible': False})

In [69]:
for idx in full_testset.index:
    full_testset.loc[idx, 'predicted_rating'] = algo.predict(str(full_testset['user'][idx]), str(full_testset['movie'][idx]))

KeyboardInterrupt: 

In [70]:
full_testset

Unnamed: 0,user,movie,predicted_rating
0,4958,1924,"(4958, 1924, None, 3.59047875, {'was_impossibl..."
1,4958,3264,"(4958, 3264, None, 3.59047875, {'was_impossibl..."
2,4958,2634,"(4958, 2634, None, 3.59047875, {'was_impossibl..."
3,4958,1407,"(4958, 1407, None, 3.59047875, {'was_impossibl..."
4,4958,2399,"(4958, 2399, None, 3.59047875, {'was_impossibl..."
...,...,...,...
200204,1875,3793,"(1875, 3793, None, 3.59047875, {'was_impossibl..."
200205,1875,2160,"(1875, 2160, None, 3.59047875, {'was_impossibl..."
200206,1875,1035,"(1875, 1035, None, 3.59047875, {'was_impossibl..."
200207,1875,1580,"(1875, 1580, None, 3.59047875, {'was_impossibl..."


In [None]:
predictions = algo.test(tuner_test)

accuracy.rmse(predictions)

pred_df = pd.DataFrame(data=predictions, columns=('user','movie','actualrating','predictedrating','details'))
print('The Snyder Metric:', percentileMetric(pred_df))