In [1]:
import numpy as np
import pandas as pd

### 奇异值分解

In [2]:
#随机生成8个用户对4个物品的评分矩阵
A = np.random.randint(1, 10, (8, 4))
A

array([[9, 4, 9, 9],
       [2, 5, 8, 4],
       [7, 2, 7, 9],
       [1, 5, 8, 1],
       [3, 9, 3, 5],
       [2, 6, 9, 7],
       [5, 8, 5, 6],
       [7, 9, 3, 7]])

In [5]:
#实现矩阵分解
U, S, V = np.linalg.svd(A, full_matrices=False)
print(U.shape, S.shape, V.shape)

(8, 4) (4,) (4, 4)


In [6]:
U

array([[-0.46258098,  0.47852477, -0.14491341,  0.36752958],
       [-0.29419065, -0.02542881,  0.41631837,  0.01554843],
       [-0.37549051,  0.5375596 , -0.18771883, -0.2380621 ],
       [-0.23358352, -0.14457362,  0.59721348,  0.54200071],
       [-0.30002354, -0.52558728, -0.13776278, -0.2133592 ],
       [-0.37365907, -0.00093302,  0.38810369, -0.65554762],
       [-0.35872944, -0.26509575, -0.12843753,  0.07790508],
       [-0.38156819, -0.33760471, -0.4771557 ,  0.18140529]])

In [9]:
#因为是对角矩阵，这里进行了简写
S

array([33.34833138,  8.94702598,  8.55836735,  2.93153011])

In [8]:
V

array([[-0.41158403, -0.4943663 , -0.54621053, -0.53652084],
       [ 0.29136714, -0.86686748,  0.31140726,  0.25820821],
       [-0.56176144, -0.01408872,  0.75902239, -0.32880177],
       [ 0.65584134,  0.06280319,  0.1690106 , -0.73305069]])

In [11]:
#假设用户和物品的隐向量维度k为2，则用户隐向量矩阵为
user_matrix = U[:, 0:2]
user_matrix

array([[-0.46258098,  0.47852477],
       [-0.29419065, -0.02542881],
       [-0.37549051,  0.5375596 ],
       [-0.23358352, -0.14457362],
       [-0.30002354, -0.52558728],
       [-0.37365907, -0.00093302],
       [-0.35872944, -0.26509575],
       [-0.38156819, -0.33760471]])

In [12]:
#物品隐向量矩阵为
item_matrix = V[0:2,:]
item_matrix

array([[-0.41158403, -0.4943663 , -0.54621053, -0.53652084],
       [ 0.29136714, -0.86686748,  0.31140726,  0.25820821]])

### 梯度下降法

In [106]:
data = pd.DataFrame({'user_id':[1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2],
                    'item_id':[1119,167,6265,1440,1427,5404,259,4156,419,415,2834,228,107,440,44,455,44,44,44,44],
                    'score':[9.00,8.00,8.00,9.00,9.00,8.00,7.00,8.00,9.00,10.00,9.00,10.00,10.00,9.00,10.00,10.00,10.00,10.00,10.00,10.00]})

In [102]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [107]:
unique_user = data['user_id'].unique()
unique_item = data['item_id'].unique()
len_users = len(unique_user)
len_items =len(unique_item)
user_map = dict(zip(unique_user, range(len_users)))
item_map = dict(zip(unique_item, range(len_items)))
print(user_map)
print(item_map)
data['user_id'] = data['user_id'].apply(lambda x:user_map[x])
data['item_id'] = data['item_id'].apply(lambda x:item_map[x])

{1: 0, 2: 1}
{1119: 0, 167: 1, 6265: 2, 1440: 3, 1427: 4, 5404: 5, 259: 6, 4156: 7, 419: 8, 415: 9, 2834: 10, 228: 11, 107: 12, 440: 13, 44: 14, 455: 15}


In [83]:
data

Unnamed: 0,user_id,item_id,score
0,0,0,9.0
1,0,1,8.0
2,0,2,8.0
3,0,3,9.0
4,0,4,9.0
5,0,5,8.0
6,0,6,7.0
7,0,7,8.0
8,1,8,9.0
9,1,9,10.0


In [84]:
train, test = train_test_split(data, test_size=0.25)

In [85]:
print(train.shape, test.shape)

(15, 3) (5, 3)


In [108]:
class MatrixDecomposition():
    def __init__(self, lr, n_epochs, n_factors, lmd):
        self.lr = lr #梯度下降法的学习率
        self.n_epochs = n_epochs #梯度下降法的迭代次数
        self.n_factors = n_factors #分解的矩阵的秩，即影响用户打分的隐藏因子，用户向量和物品向量的维度
        self.lmd = lmd #正则化参数
    def fit(self, train):
        print('Fitting data...')
        #随机初始化u和p矩阵
        len_train_users = train['user_id'].nunique()
        len_train_items = train['item_id'].nunique()
        self.users = train['user_id'].unique()
        self.items = train['item_id'].unique()
        self.train_scores = train['score'].values
        u_value = np.random.normal(0, 0.1, (len_train_users, self.n_factors)) #均值为0，方差为0.1，(行数，列数)
        p_value = np.random.normal(0, 0.1, (len_train_items, self.n_factors))
        u = dict(zip(sorted(self.users), u_value))
        p = dict(zip(sorted(self.items), p_value))        
        
        #梯度下降法
        for _ in range(self.n_epochs):
            print('Round:', _)
            for i, j, score_ij in train.values:
                i = int(i)
                j = int(j)
                #套用上面的梯度下降计算公式
                err = score_ij - np.dot(u[i], p[j])
                u[i] -= -self.lr * err * p[j] + self.lr * self.lmd * u[i]
                p[j] -= -self.lr * err * u[i] + self.lr * self.lmd * p[j]
            #计算每轮的误差mse
            error = 0
            for i, j, score_ij in train.values:
                pred_score = np.dot(u[i], p[j])
                error = (pred_score - score_ij) ** 2
            print('train_error:', error / len(train))
        self.u, self.p = u, p
        print('End fitting!')
    def estimate(self, i, j):
        if i in self.users and j in self.items:
            return np.dot(self.u[i], self.p[j])
        else:
            return np.mean(self.train_scores)  #返回训练数据的打分平均值


In [109]:
algo = MatrixDecomposition(0.005, 100, 3, 0.2)
algo.fit(train)

Fitting data...
Round: 0
train_error: 6.686679083874237
Round: 1
train_error: 6.688432772274144
Round: 2
train_error: 6.690490162880588
Round: 3
train_error: 6.692321831614551
Round: 4
train_error: 6.693138644133296
Round: 5
train_error: 6.6916951156577
Round: 6
train_error: 6.686009402655358
Round: 7
train_error: 6.67296621704104
Round: 8
train_error: 6.6477768369349075
Round: 9
train_error: 6.60330853014129
Round: 10
train_error: 6.529400590335553
Round: 11
train_error: 6.412504063224415
Round: 12
train_error: 6.236325896465087
Round: 13
train_error: 5.984442891393699
Round: 14
train_error: 5.645497670319372
Round: 15
train_error: 5.2198403235027575
Round: 16
train_error: 4.723796349643274
Round: 17
train_error: 4.187221295763323
Round: 18
train_error: 3.6444170784847985
Round: 19
train_error: 3.1241933897313388
Round: 20
train_error: 2.6448947770879134
Round: 21
train_error: 2.2150699090823065
Round: 22
train_error: 1.8367409388821518
Round: 23
train_error: 1.5084795251270364
Round:

In [110]:
print(algo.u)
print(algo.p)

{0: array([-0.49013096,  0.7253354 ,  3.84417913]), 1: array([ 1.80577376,  1.47375112, -3.10494444])}
{0: array([-0.3874915 ,  0.2975015 ,  2.16270361]), 1: array([-0.25798899,  0.39343715,  1.90585049]), 2: array([-0.14546386,  0.4416525 ,  1.91258043]), 3: array([-0.36671139,  0.3749567 ,  2.14808294]), 4: array([-0.10810608,  0.35944011,  2.18499754]), 7: array([-0.28335907,  0.45634565,  1.89347671]), 8: array([ 1.03752673,  0.8277211 , -1.80343724]), 9: array([ 1.10587084,  0.93231241, -2.02814095]), 10: array([ 1.05646818,  0.68653602, -1.85911325]), 12: array([ 1.14123404,  1.15143614, -1.89780977]), 13: array([ 1.20960729,  0.84918809, -1.69337897]), 14: array([ 1.11602184,  0.93567402, -2.06841065])}


In [111]:
#预测test的打分
test['pred_score'] = 0.0
for i, j, tm, tm1 in test.values:
    i = int(i)
    j = int(j)
    pred_score = algo.estimate(i, j)
    print(pred_score)
    test.loc[(test['user_id']==i) & (test['item_id']==j), 'pred_score'] = pred_score

9.816533732556527
9.2
9.2
9.2
9.2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [112]:
test

Unnamed: 0,user_id,item_id,score,pred_score
19,1,14,10.0,9.816534
15,1,15,10.0,9.2
6,0,6,7.0,9.2
5,0,5,8.0,9.2
11,1,11,10.0,9.2


In [103]:
#计算测试集的mse误差
test_mse = mean_squared_error(test['score'], test['pred_score'])
test_mse

1.5187447547634745

In [54]:
np.dot(np.array([1,0,1]), np.array(np.array([0,0,1])))

1