## Automatic cross-validation 交叉验证与SVD 

In [1]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate


In [2]:
# Instantiate the SVD recommender with its default settings.
algo = SVD()

# Fetch the built-in MovieLens-100k ratings (offers to download them on first use).
data = Dataset.load_builtin('ml-100k')

# Evaluate with 5-fold cross-validation on both error measures. verbose=True
# prints the per-fold table; the returned dict is shown as the cell output.
eval_measures = ['RMSE', 'MAE']
cross_validate(algo, data, measures=eval_measures, cv=5, verbose=True)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to C:\Users\hey/.surprise_data/ml-100k
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9349  0.9388  0.9441  0.9333  0.9369  0.9376  0.0037  
MAE (testset)     0.7372  0.7406  0.7437  0.7354  0.7370  0.7388  0.0030  
Fit time          6.93    6.87    6.86    6.86    7.51    7.01    0.25    
Test time         0.84    0.17    0.22    0.23    0.23    0.34    0.25    


{'fit_time': (6.930888652801514,
  6.867884635925293,
  6.862859010696411,
  6.860879421234131,
  7.513324975967407),
 'test_mae': array([0.73722038, 0.74055314, 0.74370748, 0.73536637, 0.73697778]),
 'test_rmse': array([0.93488139, 0.93883547, 0.94410307, 0.93330198, 0.93689337]),
 'test_time': (0.8420693874359131,
  0.17210149765014648,
  0.21917152404785156,
  0.22614002227783203,
  0.2251572608947754)}

## Train-test split and the fit() method 设置训练与测试SVD

In [None]:
from surprise import SVD
from surprise import Dataset

In [3]:
from surprise import accuracy
from surprise.model_selection import train_test_split

# Fetch the built-in MovieLens-100k ratings (downloaded on first use).
data = Dataset.load_builtin('ml-100k')

# Randomly hold out a quarter of the ratings as the testset;
# the remaining ratings form the trainset.
trainset, testset = train_test_split(data, test_size=0.25)

# Fit a default SVD model on the trainset and predict every held-out rating.
# fit() hands back the fitted algorithm, so the two calls can be chained.
algo = SVD()
predictions = algo.fit(trainset).test(testset)

# Report the RMSE of those predictions (printed, and returned as the cell value).
accuracy.rmse(predictions)

RMSE: 0.9427


0.9427343662364416

## Train on a whole trainset and the predict() method 用全部数据建模KNN

In [None]:
from surprise import Dataset

In [4]:
from surprise import KNNBasic

In [5]:
# Create the basic k-nearest-neighbours recommender with its default options.
algo = KNNBasic()

# Fetch MovieLens-100k and turn all of its ratings into a single trainset
# (nothing is held out for testing here).
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

# Fit on the full trainset; fit() returns the algorithm, shown as the cell output.
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1f3385f1048>

In [6]:
# Predict one rating directly with predict(). The pair below is user 196 and
# item 302 (both must be present in the trainset), whose true rating r_ui is 4.
# Raw ids are **strings**, written exactly as they appear in the ratings file.
uid = '196'  # raw user id
iid = '302'  # raw item id

# verbose=True prints the prediction details; the result is also kept in `pred`.
pred = algo.predict(uid=uid, iid=iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 4.06   {'actual_k': 40, 'was_impossible': False}


## Use a custom dataset 使用自定义数据集

In [9]:
from surprise import Dataset
from surprise.model_selection import cross_validate
import os 

In [2]:
from surprise import BaselineOnly
from surprise import Reader

In [12]:
# Path to the raw MovieLens-100k ratings file. '~' is expanded to the current
# user's home directory, so the notebook is not tied to one machine's account.
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')

# As we're loading a custom dataset, we need to define a reader.
# Loading from a local file takes two arguments: the path of the data file
# and a Reader object describing how each line is laid out.
# In the movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
# line_format names the fields in the order they appear on a line: the user id,
# the item id, the rating that user gave the item and, optionally, a timestamp
# (the timestamp field may be left out).
# sep is the delimiter used to split a line, e.g. a space, a comma or (here) a tab.
reader = Reader(line_format='user item rating timestamp', sep='\t')

data = Dataset.load_from_file(file_path, reader=reader)

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(BaselineOnly(), data, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9482  0.9545  0.9403  0.9398  0.9365  0.9439  0.0066  
MAE (testset)     0.7523  0.7547  0.7447  0.7453  0.7434  0.7481  0.0045  
Fit time          0.27    0.28    0.25    0.26    0.26    0.26    0.01    
Test time         0.15    0.25    0.14    0.19    0.20    0.19    0.04    


{'fit_time': (0.27380967140197754,
  0.2751924991607666,
  0.25318241119384766,
  0.25818800926208496,
  0.2581608295440674),
 'test_mae': array([0.75226501, 0.75471259, 0.74472754, 0.74527895, 0.74340791]),
 'test_rmse': array([0.94818966, 0.9544741 , 0.94031428, 0.93977826, 0.9364955 ]),
 'test_time': (0.14783310890197754,
  0.253706693649292,
  0.14209747314453125,
  0.190138578414917,
  0.20316433906555176)}

In [13]:
import pandas as pd
from surprise import NormalPredictor

In [None]:
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

In [14]:
# Build a tiny ratings table by hand; the column names themselves are arbitrary.
ratings_dict = {
    'itemID': [1, 1, 1, 2, 2],
    'userID': [9, 32, 2, 45, 'user_foo'],
    'rating': [3, 2, 4, 3, 1],
}
df = pd.DataFrame(ratings_dict)

# A Reader is still required, but only its rating_scale parameter is needed.
reader = Reader(rating_scale=(1, 5))

# load_from_df expects the columns in this order: user id, item id, rating.
ordered_columns = ['userID', 'itemID', 'rating']
data = Dataset.load_from_df(df[ordered_columns], reader)

# Cross-validate a NormalPredictor with 2 folds; the result dict is the cell output.
cross_validate(NormalPredictor(), data, cv=2)

{'fit_time': (0.0, 0.001001596450805664),
 'test_mae': array([1.29247703, 0.88441249]),
 'test_rmse': array([1.52467677, 0.89469998]),
 'test_time': (0.02602076530456543, 0.0)}

In [17]:
# Display the hand-built ratings dataframe.
df

Unnamed: 0,itemID,rating,userID
0,1,3,9
1,1,2,32
2,1,4,2
3,2,3,45
4,2,1,user_foo


## Use cross-validation iterators 交叉验证

In [19]:
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold


In [None]:
from surprise import Dataset

In [20]:
# Let a cross-validation iterator split the complete dataset into folds,
# then train and evaluate on each fold in turn.
# Fetch the built-in MovieLens-100k ratings.
data = Dataset.load_builtin('ml-100k')

# The same SVD instance is re-fitted on every fold's trainset.
algo = SVD()

# Cross-validation iterator producing 3 train/test splits.
kf = KFold(n_splits=3)

for trainset, testset in kf.split(data):
    # Fit on this fold's trainset and predict its testset
    # (fit() returns the algorithm, so the calls are chained).
    predictions = algo.fit(trainset).test(testset)

    # Compute and print the fold's Root Mean Squared Error.
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9418
RMSE: 0.9474
RMSE: 0.9468


In [None]:
# Read a dataset that has already been split into train/test fold files,
# and evaluate on each predefined fold.

# `os` is imported here as well so this cell does not depend on an earlier
# cell having been run.
import os

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# Path to the dataset folder ('~' expands to the current user's home directory).
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
# train_file / test_file are path templates; '%d' is filled with the fold number.
train_file = os.path.join(files_dir, 'u%d.base')
test_file = os.path.join(files_dir, 'u%d.test')
folds_files = [(train_file % i, test_file % i) for i in range(1, 6)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):

    # Train on this fold's trainset, then predict its testset.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

## Tune algorithm parameters with GridSearchCV 算法参数

In [None]:
from surprise import SVD
from surprise import Dataset

In [21]:
from surprise.model_selection import GridSearchCV

In [22]:
# Fetch the built-in MovieLens-100k ratings.
data = Dataset.load_builtin('ml-100k')

# Grid of SVD parameters to search over: two candidate values for each of three
# parameters, i.e. 2 x 2 x 2 = 8 combinations.
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}

# Grid search with 3-fold cross-validation, measuring RMSE and MAE.
# The algorithm class itself (SVD), not an instance, is passed in.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Print the best RMSE score, then the parameter combination that produced it.
for best in (gs.best_score, gs.best_params):
    print(best['rmse'])

0.9642954850162692
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [23]:
# Take the algorithm the grid search recorded as best for RMSE and fit it on
# every available rating; fit() returns the algorithm, shown as the cell output.
algo = gs.best_estimator['rmse']
full_trainset = data.build_full_trainset()
algo.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2f4306e0ef0>