In [1]:
# 带用户和物品打分偏差的矩阵分解算法做评分预测

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [3]:
# 读取数据
rating_file = '../jupyter_files/ml-latest-small/ratings.csv'

data = pd.read_csv(rating_file)

In [4]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# 将Id 转换为用户矩阵/物品矩阵的行号
data['userId'] = data['userId'] - 1
data['movieId'] = LabelEncoder().fit_transform(data['movieId'])

In [6]:
data.drop("timestamp", axis=1, inplace=True)           # 去除不用的列
data = data.sample(frac=1, random_state=2020)          # 打乱顺序
data.reset_index(drop=True, inplace=True)

train = data.iloc[0:int(len(data)*0.7), :]            # 分割训练集合测试集
test = data.iloc[int(len(data)*0.7):, :]

In [7]:
train['movieId'].max()

9723

In [8]:
train['movieId'].nunique()                 # 注意到movieId的最大编号大于movieId的不同值的个数。这个在训练的时候要予以处理

8604

In [9]:
data['movieId'].max()

9723

# 定义模型参数

In [10]:
mu = train['rating'].mean()              # 全局偏差常数

In [11]:
bu = np.zeros(train['userId'].nunique())        # 定义用户偏差系数bu

In [12]:
bu.shape

(610,)

In [13]:
bi = np.zeros(train['movieId'].nunique())      # 定义物品偏差系数bi

In [14]:
bi.shape

(8604,)

In [15]:
pu = np.random.normal(0, 0.1, (train['userId'].nunique(), 30))       # 定义用户矩阵，设隐向量为30维

In [16]:
pu.shape

(610, 30)

In [17]:
qi = np.random.normal(0, 0.1, (train['movieId'].nunique(), 30)   )    # 定义物品矩阵

In [18]:
qi.shape

(8604, 30)

In [19]:
lr = 0.01      # 定义学习率。此处为统一的学习率，也可以给各参数分别定义。
reg = 0.02     # 定义正则系数。此处为统一的正则系数，也可以给各参数分别定义。

n_epochs = 5   # 最大迭代次数

In [20]:
qi[0]

array([-0.05048308,  0.0484573 ,  0.06287342,  0.03919684, -0.00537019,
       -0.05688397, -0.06798224,  0.02417511,  0.00727975, -0.01518131,
       -0.05024629,  0.06364183,  0.00076297, -0.09872642, -0.09570419,
       -0.19803726, -0.144549  , -0.05855031,  0.02669158,  0.11209512,
        0.0358852 ,  0.02974759, -0.0431942 , -0.02761993,  0.17365057,
        0.03895474, -0.00722383,  0.02591012,  0.11120256, -0.035517  ])

# 使用随机梯度下降求解各模型参数

In [21]:
for epoch in tqdm(range(n_epochs)):
    for row in train.itertuples():
        u = row[1]
        i = row[2]
        r= row[3]
        if i >= train['movieId'].nunique():
            continue
        
        # 计算拟合值
        r_hat = mu + bi[i] + bu[u] + np.dot(qi[i], pu[u])
        error = r - r_hat
        
        # 更新参数
        bi[i] += lr * (error - reg*bi[i])
        bu[u] += lr * (error - reg* bu[u])
        qi[i] += lr * (error*pu[u] - reg*qi[i])
        pu[u] += lr * (error*qi[i] - reg*pu[u])

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [06:33<00:00, 78.62s/it]


In [22]:
qi[0]

array([-0.03852267,  0.13175349,  0.10998723,  0.02241689, -0.04293674,
       -0.1399863 , -0.05067184, -0.01050995, -0.0189344 ,  0.04726878,
        0.0037312 ,  0.02849495,  0.05072748, -0.0337539 , -0.09962791,
       -0.11492175, -0.08481368, -0.00185224,  0.06110406,  0.07974761,
       -0.00803508,  0.09002603, -0.03589235,  0.0150733 ,  0.17847898,
        0.07641436, -0.0120292 ,  0.04476086,  0.06582417, -0.04614931])

In [23]:
# 计算测试集的MAE
count = 0
error_sum = 0
for row in test.itertuples():
    u = row[1]
    i = row[2]
    r = row[3]
    # 跳过未识别的物品或用户
    if i < train['movieId'].nunique() and u < train['userId'].nunique():
        r_hat = mu + bi[i] + bu[u] + np.dot(qi[i], pu[u])
        error_sum += abs(r - r_hat)
        count += 1

In [24]:
print("MAE: ", error_sum / count)

MAE:  0.6827952960632541
