In [1]:
import pandas as pd
import torch

import numpy as np
from sklearn.model_selection import train_test_split
import random
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, roc_auc_score
import torch.optim as optim

In [77]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
batch_size = 128
epochs = 10
weight_decay = 1e-5
num_classes = 2
lr = 2e-2
negative_sum = 200

## 构造数据

In [6]:
data = pd.read_csv('./ratings.dat', sep='::', names=['userId', 'movieId', 'rating', 'time'], usecols=[0,1])
dataList = data.groupby(by='userId').agg({'movieId':list})
dataList['userId'] = dataList.index
dataList.reset_index(drop=True)
movieIds = data.movieId.unique()

#负采样
negative = dict()
for userId in dataList['userId']:
    negatives = list()
    while len(negatives) < negative_sum:
        movieId = random.randint(1, 3952)
        if movieId not in dataList.loc[userId].movieId:
            negatives.append(movieId)
    negative[userId] = negatives

negative = pd.DataFrame.from_dict(negative, orient='index')
negative['userId'] = negative.index
negative['movieId'] = negative.apply(lambda x:[x[i] for i in range(negative_sum)], axis=1)
negative = negative[['userId', 'movieId']]
negative = negative.explode('movieId').reset_index(drop=True)
negative['label'] = 0
print(negative.head())

data = data.explode('movieId').reset_index(drop=True)
data['label'] = 1
print(data.head())

#测试集和训练集划分
data = pd.concat([data, negative]).astype(np.int32)
x_train, x_test, y_train, y_test = train_test_split(data.iloc[:, 0:2], data.iloc[:, 2], test_size=0.15, random_state=2022)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
print(x_train.head())
print(y_train.head())

  """Entry point for launching an IPython kernel.


   userId movieId  label
0       1    1873      0
1       1    3081      0
2       1    2643      0
3       1     332      0
4       1    2971      0
   userId  movieId  label
0       1     1193      1
1       1      661      1
2       1      914      1
3       1     3408      1
4       1     2355      1
(1876977, 2) (331232, 2) (1876977,) (331232,)
        userId  movieId
605734    3029     1628
290934    1455     2145
540295    2702     2678
329972    1947     1103
147465     949     1580
605734    0
290934    0
540295    0
329972    1
147465    1
Name: label, dtype: int32


In [9]:
user_num = data.userId.nunique()
movie_num = data.movieId.nunique()
print(user_num, movie_num)

6040 3952


In [7]:
class MovieDataset(Dataset):
    def __init__(self, x, y):
        super(MovieDataset, self).__init__()
        self.x = x
        self.y = y
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
    def __len__(self):
        return len(self.x)
    
train_dataset = MovieDataset(torch.tensor(x_train.values), torch.tensor(y_train.values))
test_dataset = MovieDataset(torch.tensor(x_test.values), torch.tensor(y_test.values))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

iter_x, iter_y = next(iter(train_loader))
print(iter_x.shape)
print(iter_y)

torch.Size([128, 2])
tensor([0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
        1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0,
        0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
        0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
        0, 1, 1, 0, 1, 0, 0, 0], dtype=torch.int32)


## 构建模型

In [70]:
class NeuralCF(nn.Module):
    def __init__(self, userIds, movieIds, mlp_hidden_units, gmf_embedding_num, mlp_embedding_num, dropout):
        super(NeuralCF, self).__init__()
        self.userIds = userIds
        self.movieIds = movieIds
        self.mlp_hidden_units = mlp_hidden_units
        self.gmf_embedding_num = gmf_embedding_num
        self.mlp_embedding_num = mlp_embedding_num
        self.dropout = dropout
        
        #gmf的embedding层
        self.gmf_user_embedding = nn.Embedding(userIds, gmf_embedding_num)
        self.gmf_movie_embedding = nn.Embedding(movieIds, gmf_embedding_num)
        
        #mlp的embedding层
        self.mlp_user_embedding = nn.Embedding(userIds, mlp_embedding_num)
        self.mlp_movie_embedding = nn.Embedding(movieIds, mlp_embedding_num)
        
        #mlp的多层全连接层
        mlp_layers = []
        input_size = 2 * self.mlp_embedding_num
        mlp_layers.append(nn.Linear(input_size, hidden_units[0]))
        mlp_layers.append(nn.Dropout(self.dropout))
        mlp_layers.append(nn.ReLU())
        for i in range(1, len(hidden_units)):
            input_units = hidden_units[i - 1]
            output_units = hidden_units[i]
            mlp_layers.append(nn.Linear(input_units, output_units))
            mlp_layers.append(nn.Dropout(self.dropout))
            mlp_layers.append(nn.ReLU())
        self.mlp_layers = nn.Sequential(*mlp_layers)
        
        output_layer_input_units = self.gmf_embedding_num + self.mlp_hidden_units[-1]
        self.output_layer = nn.Linear(output_layer_input_units, 1)
        self.sigmoid = nn.Sigmoid()
    def forward(self, x):
        users = x[:,0].long()
        movies = x[:,1].long()
        
        user_gmf_embedding = self.gmf_user_embedding(users)
        movie_gmf_embedding = self.gmf_movie_embedding(movies)
        
        user_mlp_embedding = self.mlp_user_embedding(users)
        movie_mlp_embedding = self.mlp_movie_embedding(movies)
        
        #gmf执行element-wise product操作
        gmf_output = user_gmf_embedding * movie_gmf_embedding
        
        #mlp块通过堆叠的全连接层+激活函数
        mlp_input = torch.cat([user_mlp_embedding, movie_mlp_embedding], dim=-1)
        mlp_output = self.mlp_layers(mlp_input)
        
        #将gmf和mlp的输出结果concat起来，送入最后的全连接层预测结果，并使用sigmoid函数将输出结果映射到0-1之间
        output = self.output_layer(torch.cat([gmf_output, mlp_output], dim=-1))
        output = self.sigmoid(output).squeeze()
        return output
    

In [85]:
hidden_units = [8, 2]
gmf_embedding_num = 2
mlp_embedding_num = 16
dropout = 0.
net = NeuralCF(user_num, movie_num, hidden_units, gmf_embedding_num, mlp_embedding_num, dropout)
net

NeuralCF(
  (gmf_user_embedding): Embedding(6040, 2)
  (gmf_movie_embedding): Embedding(3952, 2)
  (mlp_user_embedding): Embedding(6040, 16)
  (mlp_movie_embedding): Embedding(3952, 16)
  (mlp_layers): Sequential(
    (0): Linear(in_features=32, out_features=8, bias=True)
    (1): Dropout(p=0.0, inplace=False)
    (2): ReLU()
    (3): Linear(in_features=8, out_features=2, bias=True)
    (4): Dropout(p=0.0, inplace=False)
    (5): ReLU()
  )
  (output_layer): Linear(in_features=4, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

## 训练和评估模型

In [86]:
def train(epoch):
    #训练模型并输出测试集每一轮的loss
    criterion = nn.BCELoss()
    for t, (batch_x, batch_y) in enumerate(train_loader):
        batch_x = batch_x.float().to(device)
        batch_y = batch_y.float().to(device)
        optimizer.zero_grad()
        total = net.forward(batch_x)
        loss = criterion(total, batch_y)
        loss.backward()
        optimizer.step()
#         print(net.state_dict())

        
    r = test()
    print('Epoch %d, loss=%.4f' % (epoch, r))
def test():
    #测试集测试
    criterion = nn.BCELoss()
    all_loss = 0
    gt_labels = []
    pred_labels = []
    i = 0
    with torch.no_grad():
        for t, (batch_x, batch_y) in enumerate(test_loader):
            batch_x = batch_x.float().to(device)
            batch_y = batch_y.float().to(device)
            pred = net.forward(batch_x)
            gt_label = batch_y.cpu().data.numpy()
            pred_proba = pred.cpu().data.numpy()
            gt_labels.append(gt_label)
            pred_labels.append(pred_proba)
            loss = criterion(pred, batch_y)
            all_loss += loss.item()
            i += 1
        gt_labels, pred_labels = np.concatenate(gt_labels), np.concatenate(pred_labels)
        pred_labels = pred_labels.reshape(-1)
        auc = roc_auc_score(gt_labels, pred_labels)
        print('auc:', auc, 'gt_labels:', gt_labels.shape, 'pred_labels:', pred_labels.shape)
    return all_loss / i
def predict(x):
    with torch.no_grad():
        x = torch.from_numpy(x)
        x = x.float().to(device)
        out1 = net.forward(x)
        out = out1.cpu().data.numpy()
        out[out>=0.5] = 1.0
        out[out<0.5] = 0.0
        return out

In [88]:
net.to(device)
optimizer = optim.Adam(net.parameters(), lr=0.01, weight_decay=0)
for epoch in range(epochs):
    train(epoch)

auc: 0.5005493144428145 gt_labels: (331232,) pred_labels: (331232,)
Epoch 0, loss=0.7004
auc: 0.49961504681403346 gt_labels: (331232,) pred_labels: (331232,)
Epoch 1, loss=0.7004
auc: 0.5008445569355171 gt_labels: (331232,) pred_labels: (331232,)
Epoch 2, loss=0.7000
auc: 0.8890800000683443 gt_labels: (331232,) pred_labels: (331232,)
Epoch 3, loss=0.4307
auc: 0.9051854890691389 gt_labels: (331232,) pred_labels: (331232,)
Epoch 4, loss=0.3934
auc: 0.9082788126662582 gt_labels: (331232,) pred_labels: (331232,)
Epoch 5, loss=0.3861
auc: 0.9091886227642041 gt_labels: (331232,) pred_labels: (331232,)
Epoch 6, loss=0.3844
auc: 0.9097349545449724 gt_labels: (331232,) pred_labels: (331232,)
Epoch 7, loss=0.3815
auc: 0.9100320470032056 gt_labels: (331232,) pred_labels: (331232,)
Epoch 8, loss=0.3819
auc: 0.9086712755668553 gt_labels: (331232,) pred_labels: (331232,)
Epoch 9, loss=0.3871


In [89]:
#训练好的模型预测测试集
test_x = x_test.values
test_pred = predict(test_x)
test_label = y_test.values
acc = accuracy_score(test_pred, test_label)
print('accuracy:',acc)

accuracy: 0.826810211573761
