In [1]:
import numpy as np
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly, KNNBasic, NormalPredictor
from surprise import accuracy
from surprise import Trainset
from surprise.model_selection import KFold

In [2]:
# Load the four rating shards. Only the first two CSV fields are kept and named
# 'user ids' and 'ratings'; rows that are not ratings (the movie-header rows
# handled in the next cell) end up with NaN in 'ratings'.
df1, df2, df3, df4 = (
    pd.read_csv(
        f'./data/combined_data_{part}.txt',
        header=None,
        names=['user ids', 'ratings'],
        usecols=[0, 1],
    )
    for part in range(1, 5)
)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and chaining
# it re-copies the growing frame on every call. Concatenate once instead.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
df.shape

(100498277, 2)

In [3]:
# Some rows are not ratings but movie-header rows; they are the rows whose
# 'ratings' value is NaN. Turn them into an 'item ids' column on the rating rows.
# NOTE(review): header rows are assumed to look like "<movie id>:" (the same
# layout the probe file shows) -- confirm against the raw data files.

# Positions of the header rows.
movieID_index = np.flatnonzero(df['ratings'].isna().to_numpy())
if movieID_index.size == 0 or movieID_index[0] != 0:
    # Without a leading header the first rows would have no movie to belong to.
    raise ValueError("expected the data to start with a '<movie id>:' header row")

# Parse the real movie id out of every header row instead of assuming that the
# k-th header always belongs to movie k.
movie_ids = (
    df['user ids']
    .iloc[movieID_index]
    .astype(str)
    .str.rstrip(':')
    .astype('int64')
    .to_numpy()
)

# Each header covers every row up to the next header (or the end of the frame),
# so repeat its id once per covered row (the header row itself included).
rows_per_movie = np.diff(np.append(movieID_index, len(df)))
item_data = np.repeat(movie_ids, rows_per_movie)

# Keep only the complete rows (this drops the header rows) in a *new* frame and
# add the item ids there, so the frame loaded above is not mutated in place.
complete_rows = df.notna().all(axis=1).to_numpy()
df = df.loc[complete_rows].reset_index(drop=True)
df.insert(1, 'item ids', item_data[complete_rows])
df

Unnamed: 0,user ids,item ids,ratings
0,1488844,1,3.0
1,822109,1,5.0
2,885013,1,4.0
3,30878,1,4.0
4,823519,1,3.0
...,...,...,...
100480502,1790158,17770,4.0
100480503,1608708,17770,3.0
100480504,234275,17770,1.0
100480505,255278,17770,4.0


In [4]:
# Make the id columns integer typed: 'user ids' is the one still to be cast.
# Then print the frame summary (dtypes and memory usage).
user_id_dtype = {'user ids': 'int'}
df = df.astype(user_id_dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100480507 entries, 0 to 100480506
Data columns (total 3 columns):
 #   Column    Dtype  
---  ------    -----  
 0   user ids  int64  
 1   item ids  int64  
 2   ratings   float64
dtypes: float64(1), int64(2)
memory usage: 2.2 GB


In [5]:
# Load the test (probe) set. The single column mixes movie-header rows such as
# "1:" with plain user ids, and the next cell works on it with string methods,
# so force a string dtype instead of relying on read_csv's type inference
# (which does not guarantee an all-string column for mixed content).
df_test = pd.read_csv(
    './data/probe.txt',
    header=None,
    names=['user ids'],
    usecols=[0],
    dtype=str,
)
df_test

Unnamed: 0,user ids
0,1:
1,30878
2,2647871
3,1283744
4,2488120
...,...
1425328,1288730
1425329,2536567
1425330,1107317
1425331,9999:


In [6]:
# Header rows are the ones containing a colon. Give every row below a header
# that header's movie id, analogous to what was done for the ratings frame.
has_colon = df_test['user ids'].str.find(':') != -1
movieID_index_test = df_test.index.values[has_colon.values]

# NaN marks "no item id"; header rows keep it so that dropna removes them below.
item_data_test = np.full(df_test.shape[0], np.nan)

# Walk consecutive (header, next header) pairs and fill the rows in between.
for header_row, next_header_row in zip(movieID_index_test[:-1], movieID_index_test[1:]):
    movieID = int(df_test.loc[header_row, 'user ids'][:-1])  # strip the trailing ':'
    item_data_test[header_row + 1:next_header_row] = movieID

# The last header has no successor: it owns every remaining row.
last_header_row = movieID_index_test[-1]
movieID = int(df_test.loc[last_header_row, 'user ids'][:-1])
item_data_test[last_header_row + 1:] = movieID

df_test.insert(1, 'item ids', item_data_test)

# Drop the header rows, renumber the index and make both id columns integers.
df_test = (
    df_test
    .dropna(axis=0)
    .reset_index(drop=True)
    .astype({'item ids': 'int', 'user ids': 'int'})
)
df_test

Unnamed: 0,user ids,item ids
0,30878,1
1,2647871,1
2,1283744,1
3,2488120,1
4,317050,1
...,...,...
1408390,2328701,9997
1408391,1288730,9998
1408392,2536567,9998
1408393,1107317,9998


In [7]:
# Print a summary of the probe frame (columns, non-null counts, dtypes, memory usage).
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408395 entries, 0 to 1408394
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype
---  ------    --------------    -----
 0   user ids  1408395 non-null  int64
 1   item ids  1408395 non-null  int64
dtypes: int64(2)
memory usage: 21.5 MB


In [8]:
# Left-join the probe pairs onto the full ratings table on (user, item).
# indicator=True adds a '_merge' column: 'both' marks ratings whose pair also
# appears in the probe set, 'left_only' marks all the others.
df_train = df.merge(
    df_test,
    on=['user ids', 'item ids'],
    how='left',
    indicator=True,
)
df_train

Unnamed: 0,user ids,item ids,ratings,_merge
0,1488844,1,3.0,left_only
1,822109,1,5.0,left_only
2,885013,1,4.0,left_only
3,30878,1,4.0,both
4,823519,1,3.0,left_only
...,...,...,...,...
100480502,1790158,17770,4.0,left_only
100480503,1608708,17770,3.0,left_only
100480504,234275,17770,1.0,left_only
100480505,255278,17770,4.0,left_only


In [9]:
# df_true holds the actual ratings of the probe pairs; it is used later to
# compute the RMSE.
in_probe = df_train['_merge'] == 'both'
df_true = df_train.loc[in_probe].drop(columns='_merge').reset_index(drop=True)
df_true

Unnamed: 0,user ids,item ids,ratings
0,30878,1,4.0
1,2647871,1,4.0
2,1283744,1,3.0
3,2488120,1,5.0
4,317050,1,5.0
...,...,...,...
1408390,829192,17770,3.0
1408391,54864,17770,1.0
1408392,533482,17770,3.0
1408393,1196966,17770,2.0


In [10]:
# df_train is the data actually used for training: all ratings minus the probe
# set, so it should have 100480507 - 1408395 = 99072112 rows.
not_in_probe = df_train['_merge'] == 'left_only'
df_train = df_train[not_in_probe].drop(columns='_merge').reset_index(drop=True)

df_train

Unnamed: 0,user ids,item ids,ratings
0,1488844,1,3.0
1,822109,1,5.0
2,885013,1,4.0
3,823519,1,3.0
4,893988,1,3.0
...,...,...,...
99072107,1790158,17770,4.0
99072108,1608708,17770,3.0
99072109,234275,17770,1.0
99072110,255278,17770,4.0


In [11]:
# Rating prediction with Surprise.
# Dataset.load_from_df only takes the rating scale from the Reader; file-parsing
# options (line_format, sep, skip_lines) are not used for DataFrames, so state
# the scale explicitly instead. (1, 5) is also the Reader default, so results
# are unchanged.
reader = Reader(rating_scale=(1, 5))

# df_train's columns are already in the order load_from_df expects:
# user ids, item ids, ratings.
train_data = Dataset.load_from_df(df_train, reader=reader)
train_set = train_data.build_full_trainset()

# Baseline estimator fitted with ALS: 10 epochs, regularisation 12 for the user
# term and 5 for the item term.
bsl_options = {'method': 'als', 'n_epochs': 10, 'reg_u': 12, 'reg_i': 5}
algo = BaselineOnly(bsl_options=bsl_options)
algo.fit(train_set)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7f3765f0b550>

In [12]:
# Evaluation.
# The RMSE needs the true ratings, so the test set is built from df_true.
test_data = Dataset.load_from_df(df_true, reader=reader)
probe_trainset = test_data.build_full_trainset()
testset = probe_trainset.build_testset()

# Predict every probe pair, then report the RMSE against the true ratings.
predictions = algo.test(testset)
accuracy.rmse(predictions, verbose=True)

RMSE: 0.9827


0.9827216080797988

In [16]:
# Single prediction: print the estimated rating of user `uid` for item `iid`
# next to the known rating r_ui.
uid, iid = 1608708, 17770
pred = algo.predict(uid=uid, iid=iid, r_ui=3, verbose=True)

user: 1608708    item: 17770      r_ui = 3.00   est = 2.75   {'was_impossible': False}
