In [2]:
import numpy as np
import math
import torch
import torch.nn as nn
import time
import torch.utils.data as Data
import torch.optim as optim

In [5]:
def get_data(path, num_users, num_items, num_total_ratings, train_ratio):
    fp = open(path + 'ratings.dat')
    
    user_train_set = set() #用户训练集
    user_test_set = set() #用户测试集
    item_train_set = set() #物品训练集
    item_test_set = set() #物品测试集
    
    train_r = np.zeros((num_users, num_items)) #训练评分矩阵
    test_r = np.zeros((num_users, num_items))  #测试评分矩阵
    
    train_mask_r = np.zeros((num_users, num_items))  #用来记录在训练集中已经评过分的电影
    test_mask_r = np.zeros((num_users, num_items))  #用来记录在测试集中已经评过分的电影
    
    random_perm_idx = np.random.permutation(num_total_ratings) #将评分编号进行打乱
    train_idx = random_perm_idx[0:int(num_total_ratings * train_ratio)] #将编号靠前的记为训练集编号
    test_idx = random_perm_idx[int(num_total_ratings * train_ratio):] #将编号靠后的记为测试集编号
    
    lines = fp.readlines() #按照行来读取文件
    
    #训练集
    for itr in train_idx:
        line = lines[itr]
        user, item, rating, _ = line.split("::")  #dat文件是用::隔开的
        user_idx = int(user) - 1  #dat文件中的ID是以1开始的
        item_idx = int(item) - 1
        train_r[user_idx, item_idx] = int(rating)  #构造训练集评分表，用户id为user_idx的用户对电影编号为item_idx的电影打了rating的分数
        train_mask_r[user_idx, item_idx] = 1  #代表用户id为user_idx的用户对电影编号为item_idx的电影进行了评分
        
        user_train_set.add(user_idx) #构造用户训练集
        item_train_set.add(item_idx) #构造物品训练集
    
    #测试集
    for itr in test_idx:
        line = lines[itr]
        user, item, rating, _ = line.split('::')
        user_idx = int(user) - 1
        item_idx = int(item) - 1
        test_r[user_idx, item_idx] = int(rating)
        test_mask_r[user_idx, item_idx] = 1
        
        user_test_set.add(user_idx) #构造用户测试集
        item_test_set.add(item_idx) #构造物品测试集
    return train_r, train_mask_r, test_r, test_mask_r, user_train_set, item_train_set, user_test_set, item_test_set

In [6]:
path = './'
num_users = 6040
num_items = 3952
num_total_ratings = 1000209
train_ratio = 0.9 #90%为训练集
train_r, train_mask_r, test_r, test_mask_r, user_train_set, item_train_set, user_test_set, item_test_set = get_data(path, num_users, 
                                                                                                                    num_items, 
                                                                                                                    num_total_ratings,
                                                                                                                    train_ratio)


In [13]:
print(train_r[0:2])
print(train_mask_r[0:2])
print(list(user_train_set)[0:2])
print(list(item_train_set)[0:2])
print(train_r.shape, train_mask_r.shape, test_r.shape, test_mask_r.shape, len(user_train_set), 
      len(item_train_set), len(user_test_set), len(item_test_set))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[0, 1]
[0, 1]
(6040, 3952) (6040, 3952) (6040, 3952) (6040, 3952) 6040 3697 5949 3338


In [25]:
class Autorec(nn.Module):
    def __init__(self, num_users, num_items, hidden_units, lambda_value):
        super(Autorec, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.hidden_units = hidden_units
        self.lambda_value = lambda_value
        
        self.encoder = nn.Sequential(nn.Linear(self.num_items, self.hidden_units),
                                    nn.Sigmoid())
        self.decoder = nn.Sequential(nn.Linear(self.hidden_units, self.num_items),)
    def forward(self, x):
        encoder = self.encoder(x)
        decoder = self.decoder(encoder)
        return decoder
    def loss(self, decoder, inputs, optimizer, mask_input):
        cost = 0
        temp2 = 0
        cost += ((decoder - inputs) * mask_input).pow(2).sum()
        rmse = cost
        
        for i in optimizer.param_groups:
            for j in i['params']:
                if j.data.dim() == 2:
                    temp2 += torch.t(j.data).pow(2).sum()  #正则化项 ，torch.t(j.data)矩阵的转置
        cost += temp2 * self.lambda_value * 0.5
        return cost, rmse
def train(epoch):
    RMSE = 0
    cost_all = 0
    for step, (batch_x, batch_mask_x, batch_y) in enumerate(loader):
        batch_x = batch_x.type(torch.FloatTensor)
        batch_mask_x = batch_mask_x.type(torch.FloatTensor)
        
        decoder = rec(batch_x)
        loss, rmse = rec.loss(decoder=decoder, inputs=batch_x, optimizer=optimizer, mask_input=batch_mask_x)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        cost_all += loss
        RMSE += rmse
    RMSE = np.sqrt(RMSE.detach().cpu().numpy() / (train_mask_r == 1).sum())
    print('epoch:', epoch, 'train RMSE:', RMSE)
def test(epoch):
    with torch.no_grad():
        test_r_tensor = torch.from_numpy(test_r).type(torch.FloatTensor)
        test_mask_r_tensor = torch.from_numpy(test_mask_r).type(torch.FloatTensor)
        
        decoder = rec(test_r_tensor)
        
        unseen_user_test_list = list(user_test_set - user_train_set) #得到未出现在训练集中的用户列表
        unseen_item_test_list = list(item_test_set - item_train_set) #得到未出现在训练集中的电影列表
        
        for user in unseen_user_test_list:
            for item in unseen_item_test_list:
                if test_mask_r[user][item] == 1:#如果在测试集中存在这条评分记录，则进行记录decoder[user,item]=3
                    decoder[user, item] = 3
        mse = ((decoder - test_r_tensor) * test_mask_r_tensor).pow(2).sum()
        RMSE = mse.cpu().numpy() / (test_mask_r==1).sum()
        RMSE = np.sqrt(RMSE)
        print('epoch:', epoch, ' test RMSE:', RMSE)

In [28]:
if __name__ == '__main__':
    path = './'
    num_users = 6040
    num_items = 3952
    num_total_ratings = 1000209
    train_ratio = 0.9 #90%为训练集
    train_r,train_mask_r,test_r,test_mask_r,user_train_set,item_train_set,user_test_set,\
    item_test_set = get_data(path, num_users, num_items, num_total_ratings, train_ratio)
    hidden_units = 512
    lambda_value = 0.5
    rec = Autorec(num_users, num_items, hidden_units, lambda_value)
    optimizer = optim.Adam(rec.parameters(), lr=1e-2, weight_decay=1e-4)
#     print("optimizer param_groups")
#     print(optimizer.param_groups)
    batch_size = 100
    num_batch = int(math.ceil(num_users / batch_size))
    
    torch_dataset = Data.TensorDataset(torch.from_numpy(train_r), torch.from_numpy(train_mask_r), torch.from_numpy(train_r))
    loader = Data.DataLoader(dataset=torch_dataset, batch_size=batch_size, shuffle=True)
    
    epochs = 10
    for epoch in range(epochs):
        train(epoch=epoch)
        test(epoch=epoch)

epoch: 0 train RMSE: 1.417969547989841
epoch: 0  test RMSE: 1.3086908516466031
epoch: 1 train RMSE: 1.199110749718336
epoch: 1  test RMSE: 1.3121022702719454
epoch: 2 train RMSE: 1.188925520397914
epoch: 2  test RMSE: 1.3133530406162874
epoch: 3 train RMSE: 1.1871914351767339
epoch: 3  test RMSE: 1.3278352525798602
epoch: 4 train RMSE: 1.2072828693658366
epoch: 4  test RMSE: 1.339874012697582
epoch: 5 train RMSE: 1.231349846568755
epoch: 5  test RMSE: 1.372903443832858
epoch: 6 train RMSE: 1.2613879583303345
epoch: 6  test RMSE: 1.3527416374397185
epoch: 7 train RMSE: 1.2893357877805633
epoch: 7  test RMSE: 1.366562926637915
epoch: 8 train RMSE: 1.3152140931908918
epoch: 8  test RMSE: 1.4106268341437103
epoch: 9 train RMSE: 1.3404981147913295
epoch: 9  test RMSE: 1.420777405499695
