# Surprise

In [14]:
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly, KNNBasic, NormalPredictor
from surprise import accuracy
from surprise.model_selection import KFold

In [16]:
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('dataset/movies/ratings.csv', reader=reader)
train_set = data.build_full_trainset()

In [10]:
# Baseline 算法，使用 ALS 进行优化
bsl_options = {'method':'als', 'n_epochs':5, 'reg_u':12, 'reg_i':5}

# SGD 优化
# bsl_options = {'method':'sgd', 'n_epochs':5}

algo = BaselineOnly(bsl_options=bsl_options)

uid=str(196)
iid=str(302)
pred = 0

# 定义 K 折交叉验证迭代器， k=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # 训练并预测
    algo.fit(trainset)
    predictions = algo.test(testset)
    # 计算 RMSE
    accuracy.rmse(predictions, verbose=True)
    # 每个模型提供 1/3 的结果
    pred=algo.predict(uid, iid, r_ui=4, verbose=False)
    print(pred)


Estimating biases using als...
RMSE: 0.8646
user: 196        item: 302        r_ui = 4.00   est = 4.11   {'was_impossible': False}
Estimating biases using als...
RMSE: 0.8626
user: 196        item: 302        r_ui = 4.00   est = 4.24   {'was_impossible': False}
Estimating biases using als...
RMSE: 0.8645
user: 196        item: 302        r_ui = 4.00   est = 4.04   {'was_impossible': False}


## NormalPredictor 

In [11]:
algo = NormalPredictor()

# 定义 k 折交叉验证迭代器 k=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)
    pred=algo.predict(uid, iid, r_ui=4, verbose=False)
    print(pred)

RMSE: 1.4308
user: 196        item: 302        r_ui = 4.00   est = 3.41   {'was_impossible': False}
RMSE: 1.4325
user: 196        item: 302        r_ui = 4.00   est = 2.49   {'was_impossible': False}
RMSE: 1.4332
user: 196        item: 302        r_ui = 4.00   est = 3.88   {'was_impossible': False}


## SlopeOne

In [21]:
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise import KNNBaseline, SlopeOne
import io
import pandas as pd

In [24]:
# 数据读取
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('dataset/movies/ratings.csv', reader=reader)
train_set = data.build_full_trainset()

In [25]:
# 使用 SlopeOne 算法
algo = SlopeOne()
algo.fit(train_set)

# 对制定用户和商品进行评分预测
uid = str(196)
iid = str(302)

pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 4.32   {'was_impossible': False}


## KNNBaseline

In [49]:
df = pd.read_csv('dataset/movies/movies.csv')
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [28]:
def read_item_names():
    file_name = ('dataset/movies/movies.csv')
    data = pd.read_csv('dataset/movies/movies.csv')
    rid_to_name = {}
    name_to_rid = {}
    for i in range(len(data['movieId'])):
        rid_to_name[data['movieId'][i]] = data['title'][i]
        name_to_rid[data['title'][i]] = data['movieId'][i]

    return rid_to_name, name_to_rid

In [73]:
# 数据读取
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('dataset/movies/ratings.csv', reader=reader)
train_set = data.build_full_trainset()

# 相似度计算，使用皮尔逊相似度计算法，使用 ItemCF 相似度计算
sim_options = {'name':'pearson_baseline', 'user_based':False}

# 使用 KNNBaseline 算法，一种 CF 算法
algo = KNNBaseline(sim_options=sim_options)
algo.fit(train_set)

# 获得电影名称信息数据
rid_to_name, name_to_rid = read_item_names()

# 获得 Toy Story 电影的电影 ID
toy_story_raw_id = name_to_rid['Toy Story (1995)']
print(toy_story_raw_id)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
1


In [70]:
# 通过 Toy Story 电影的电影 ID 获取该电影的推荐内部 id
toy_story_inner_id = algo.trainset.to_inner_iid(str(toy_story_raw_id))
print('电影名称:{}, 电影 id:{}'.format(df.loc[df['movieId']==toy_story_inner_id]['title'].values, toy_story_inner_id))

电影名称:['Drop Zone (1994)'], 电影 id:227
