In [1]:
import pandas as pd

### Import Data

In [2]:
df_train = pd.read_csv('training.txt', header=None, names=['user_id', 'item_id', 'rating'])
df_test = pd.read_csv('testing.txt', header=None, names=['user_id', 'item_id', 'rating'])

In [3]:
df_train = df_train.sort_values(by='user_id')
df_test = df_test.sort_values(by='user_id')

### Checking if users in test are not in train

In [4]:
train_user_id = df_train['user_id'].drop_duplicates()
test_user_id = df_test['user_id'].drop_duplicates()

In [5]:
train_user_id.shape

(162541,)

In [6]:
check = test_user_id.isin(train_user_id)

In [7]:
test_user_id.shape[0] == test_user_id[check].shape[0]

True

all user_ids in test set are in training set

### Checking ratings

In [8]:
df_train['rating'].drop_duplicates().sort_values()

3876688     0.5
17166485    1.0
16312916    1.5
11389711    2.0
2446325     2.5
1398466     3.0
9461865     3.5
0           4.0
17127033    4.5
9470443     5.0
Name: rating, dtype: float64

In [9]:
from surprise import Reader, Dataset, accuracy, Reader
from surprise import SVD
from surprise.model_selection import cross_validate

In [10]:
reader = Reader(rating_scale=(1, 5))
data_train = Dataset.load_from_df(df_train, reader)

### Testing methods on MF

In [29]:
# SVD Method
print('running svd')
svd = SVD()
cross_validate(svd, data_train, measures=["RMSE"], cv=3, verbose=True)

running svd
Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8010  0.8003  0.8012  0.8008  0.0004  
Fit time          190.44  195.79  196.05  194.10  2.59    
Test time         106.66  114.65  134.44  118.58  11.68   


{'test_rmse': array([0.80095148, 0.80032362, 0.8011513 ]),
 'fit_time': (190.44161248207092, 195.79224038124084, 196.05382299423218),
 'test_time': (106.65508317947388, 114.65052008628845, 134.4395034313202)}

### 10 Fold Crossvalidation

In [11]:
import time

In [12]:
# SVD Tuning
start_time_svd_tune = time.time()

svd = SVD(n_factors=80, n_epochs = 60, lr_all=0.005, reg_all=0.05)
result = cross_validate(svd, data_train, measures=["RMSE"], cv=10, verbose=True)


end_time_svd_tune = time.time()
time.strftime('%H:%M:%S', time.gmtime(end_time_svd_tune - start_time_svd_tune))

Evaluating RMSE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.7782  0.7779  0.7783  0.7775  0.7775  0.7777  0.7774  0.7781  0.7790  0.7775  0.7779  0.0005  
Fit time          687.22  681.41  695.61  691.33  690.36  686.70  687.42  702.67  666.98  681.25  687.10  9.03    
Test time         49.57   43.97   47.96   44.85   48.12   40.19   39.92   49.96   42.25   38.39   44.52   4.04    


'02:09:43'

mean of 10-fold cross validation = 0.7779

In [14]:
data_train_fit = data_train.build_full_trainset()

In [15]:
svd = SVD(n_factors=80, n_epochs = 60, lr_all=0.005, reg_all=0.05)
svd.fit(data_train_fit)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f47e85ff730>

In [16]:
from surprise.dump import dump, load

In [None]:
dump('svd_final.pkl',None,svd,1)