In [57]:
import pandas as pd
import numpy as np

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import KNNWithMeans, SVDpp, SVD

In [2]:
ratings = pd.read_csv('ml-1m/ratings.dat', header=None, sep='::', engine='python',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

In [3]:
movies = pd.read_csv('ml-1m/movies.dat', header=None, sep='::', engine='python',
                      names=['MovieID', 'Title', 'Genres'])

In [4]:
movies_with_ratings = movies.join(ratings.set_index('MovieID'), on='MovieID').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [5]:
movies_with_ratings.head()

Unnamed: 0,MovieID,Title,Genres,UserID,Rating,Timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,5.0,978824268.0
1,1,Toy Story (1995),Animation|Children's|Comedy,6.0,4.0,978237008.0
2,1,Toy Story (1995),Animation|Children's|Comedy,8.0,4.0,978233496.0
3,1,Toy Story (1995),Animation|Children's|Comedy,9.0,5.0,978225952.0
4,1,Toy Story (1995),Animation|Children's|Comedy,10.0,5.0,978226474.0


In [6]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.UserID,
    'iid': movies_with_ratings.Title,
    'rating': movies_with_ratings.Rating
})

In [7]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),5.0
1,6.0,Toy Story (1995),4.0
2,8.0,Toy Story (1995),4.0
3,9.0,Toy Story (1995),5.0
4,10.0,Toy Story (1995),5.0


In [8]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

Алгоритм SVD со стандартными параметрами

In [9]:
algo_svd = SVD()

In [10]:
result_svd = cross_validate(algo_svd, data, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8740  0.8750  0.8721  0.8751  0.8733  0.8739  0.0011  
MAE (testset)     0.6867  0.6862  0.6847  0.6865  0.6860  0.6860  0.0007  
Fit time          49.22   48.15   49.01   46.10   47.22   47.94   1.16    
Test time         1.54    1.86    1.70    1.87    1.70    1.73    0.12    


In [11]:
np.mean(result_svd['test_rmse'])

0.8738965238958354

Немного меньше значение, если уменьшить значение параметра n_factors до 30

In [16]:
algo_svd3 = SVD(n_factors=30, n_epochs=20)

In [17]:
result_svd3 = cross_validate(algo_svd3, data, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8724  0.8672  0.8699  0.8706  0.8697  0.8700  0.0017  
MAE (testset)     0.6855  0.6822  0.6841  0.6837  0.6838  0.6839  0.0011  
Fit time          22.56   22.50   22.60   22.54   22.92   22.63   0.15    
Test time         1.96    1.92    1.92    1.90    1.84    1.91    0.04    


In [18]:
np.mean(result_svd3['test_rmse'])

0.8699655574932699

Еще чуть меньше значение, если немного увеличить значение параметра n_epochs

In [34]:
algo_svd4 = SVD(n_factors=30, n_epochs=25)

In [35]:
result_svd4 = cross_validate(algo_svd4, data, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8701  0.8682  0.8681  0.8660  0.8713  0.8688  0.0018  
MAE (testset)     0.6825  0.6804  0.6814  0.6792  0.6832  0.6813  0.0014  
Fit time          27.64   27.97   29.61   27.83   29.64   28.54   0.89    
Test time         2.20    2.24    2.21    2.31    2.36    2.26    0.06    


In [36]:
np.mean(result_svd4['test_rmse'])

0.868757261907553

Немного увеличим параметр reg_all

In [96]:
algo_svd6 = SVD(n_factors=30, n_epochs=25, reg_all=0.03)

In [97]:
result_svd6 = cross_validate(algo_svd6, data, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8673  0.8642  0.8621  0.8626  0.8631  0.8639  0.0019  
MAE (testset)     0.6823  0.6788  0.6784  0.6786  0.6786  0.6793  0.0015  
Fit time          25.60   25.86   26.00   26.84   26.57   26.17   0.46    
Test time         2.62    1.78    2.54    2.54    1.78    2.26    0.39    


In [98]:
np.mean(result_svd6['test_rmse'])

0.8638781894180099

И также увеличим параметр lr_all

In [103]:
algo_svd7 = SVD(n_factors=30, n_epochs=25, reg_all=0.03, lr_all=0.006)

In [104]:
result_svd7 = cross_validate(algo_svd7, data, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8593  0.8654  0.8597  0.8599  0.8649  0.8618  0.0027  
MAE (testset)     0.6753  0.6786  0.6753  0.6755  0.6789  0.6767  0.0017  
Fit time          26.00   26.07   26.84   25.99   25.91   26.16   0.34    
Test time         1.82    2.57    2.55    1.79    2.59    2.26    0.37    


In [105]:
np.mean(result_svd7['test_rmse'])

0.8618338760306161