### import

In [11]:
import torch
import pickle
import numpy as np
import pandas as pd
from torch import nn
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from transformers import BertTokenizer, BertModel


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

True


### 数据预处理

In [12]:
# Tag 数据集
# 读tag_data取保存的 CSV 文件
tag_data = pd.read_csv('../data/selected_movie_top_1200_data_tag.csv')
# 引入用户评价数据
rating_data = pd.read_csv('../data/movie_score.csv')
# 删除 NaN 的行
rating_data.dropna(inplace=True)
# 将用户打的 Tag 加入到 tag_data['Tags'] 中
tag_data['Tags'] = tag_data['Movie'].map(rating_data.groupby('Movie')['Tags'].apply(list).to_dict())
# 对于每一行 Tags，将其转换为一整个字符串
tag_data['Tags'] = tag_data['Tags'].apply(lambda x: ','.join(x))
# 拆分为列表，去重，去除空字符串，'|', '...' 等无意义的 Tag
tag_data['Tags'] = tag_data['Tags'].apply(lambda x: list(set(x.split(','))))
tag_data['Tags'] = tag_data['Tags'].apply(lambda x: list(filter(lambda x: x not in ['', '|', '...'], x)))
# 保存为 CSV 文件
tag_data.to_csv('../res/selected_tags.csv', index=False)
print(tag_data)


# User 数据集
# 读user_data取保存的 CSV 文件
user_data = pd.read_csv('../data/movie_score.csv')
# 去除评分为 0 的行？
user_data = user_data[user_data['Rate'] > 0]
# 去除评价数据过少的用户
user_data = user_data.groupby('User').filter(lambda x: len(x) > 10)
# 去除不必要的列
user_data = user_data[['User', 'Movie', 'Rate']]
# 保存为 CSV 文件
user_data.to_csv('../res/selected_users.csv', index=False)
print(user_data)

        Movie                                               Tags
0     1292052  [弗兰克达拉邦特, 【美】, 释放, 监狱内外, 94, 自由与希望, 1990~1999,...
1     1295644  [吕克·贝松　让·雷诺　娜塔利波特曼, 法国, 娜塔莉波曼, 电影院, GaryOldman...
2     1292720  [阿甘正传, 记录米国, TOM, 【美】, 爱情, Zemeckis, D罗伯特·泽米吉斯...
3     3541415  [幻想科幻魔幻系, 科幻, UK, 2010-2019, 沟通障碍, 四星下, 李奥纳多, ...
4     3742360  [国货电影们, 红旗飘飘, 姜文, 四川话版, 沟通障碍, online.streaming...
...       ...                                                ...
1195  2357711  [日本动漫, 00, 东洋动画, 动漫, 亚洲作品, 动画, 卡通片, carton, 日本...
1196  1295873  [B&W, 烧钱CC版, 中文配音, 10年5月看过, EnzoStaiola, CSC, ...
1197  1997681  [大陆电影, 2005, 国货电影们, 中国内地, 找乐, 国产, 中国大陆, 电影频道, ...
1198  1827955  [動畫, 00, 动漫, 3星半, 喜欢的。, 亚洲作品, 动画, 卡通片, 动作, car...
1199  1307181  [2003年, 西方Movie, 政治, 威瑟斯彭, 爱情, .电影, 喜劇, ChickF...

[1200 rows x 2 columns]
           User    Movie  Rate
1       1386692  1986338     3
2       1386692  4268598     5
3       1386692  1851857     4
4       1386692  4023638     4
5       1386692  1305903     3
...      

### Embedding

In [13]:
# Bert 预训练模型
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese').cuda()

# 保存标签嵌入向量
tag_embedding_dict = {}
with torch.no_grad():
    for index, rows in tqdm(tag_data.iterrows()):
        # 将标签列表转换为字符串
        tags_str = "".join(rows.Tags)
        # 使用BERT中文模型对标签进行编码
        inputs = tokenizer(tags_str, truncation=True, return_tensors='pt')
        outputs = model(inputs.input_ids.cuda(), inputs.token_type_ids.cuda(), inputs.attention_mask.cuda())
        # 使用最后一层的平均隐藏状态作为标签的向量表示
        tag_embedding = outputs.last_hidden_state.mean(dim=1).cpu()
        tag_embedding_dict[rows.Movie] = tag_embedding

# 将映射表存储为二进制文件
with open('../res/tag_embedding_dict.pkl', 'wb') as f:
    pickle.dump(tag_embedding_dict, f)

1200it [00:31, 37.95it/s]


### 定义数据集类

In [14]:
# 定义数据集类
class MovieRatingDataset(Dataset):
    def __init__(self, data, user_to_idx, movie_to_idx, tag_embedding_dict):
        self.data = data
        self.user_to_idx = user_to_idx
        self.movie_to_idx = movie_to_idx
        self.tag_embedding_dict = tag_embedding_dict

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        user = self.user_to_idx[row['User']]
        movie = self.movie_to_idx[row['Movie']]
        rating = row['Rate'].astype('float32')
        text_embedding = self.tag_embedding_dict.get(row['Movie'])
        return user, movie, rating, text_embedding
    
# 创建索引映射
def create_id_mapping(id_list):
    # 从ID列表中删除重复项并创建一个排序的列表
    unique_ids = sorted(set(id_list))
    
    # 创建将原始ID映射到连续索引的字典
    id_to_idx = {id: idx for idx, id in enumerate(unique_ids)}
    
    # 创建将连续索引映射回原始ID的字典
    idx_to_id = {idx: id for id, idx in id_to_idx.items()}
    
    return id_to_idx, idx_to_id

user_ids = user_data['User'].unique()
movie_ids = user_data['Movie'].unique()
user_to_idx, idx_to_user = create_id_mapping(user_ids)
movie_to_idx, idx_to_movie = create_id_mapping(movie_ids)

### 定义模型

In [15]:
# 定义模型，引入 Item User 偏置提高效果
class MF(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim, init_std = 0.1):
        super(MF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)
        self.user_bias = nn.Embedding(num_users, 1)
        self.movie_bias = nn.Embedding(num_movies, 1)
        nn.init.normal_(self.user_embedding.weight, std = init_std)
        nn.init.normal_(self.movie_embedding.weight, std = init_std)
        nn.init.normal_(self.user_bias.weight, std = init_std)
        nn.init.normal_(self.movie_bias.weight, std = init_std)
        
    def forward(self, user, movie):
        user_embedding = self.user_embedding(user)
        movie_embedding = self.movie_embedding(movie)
        user_bias = self.user_bias(user)
        movie_bias = self.movie_bias(movie)
        dot = (user_embedding * movie_embedding).sum(1)
        return dot + user_bias.squeeze() + movie_bias.squeeze()

# 定义模型
model = MF(len(user_to_idx), len(movie_to_idx), 768).cuda()
# 定义损失函数
criterion = nn.MSELoss()
# 定义优化器
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

### 训练模型

In [16]:
# 按用户分组计算NDCG
def compute_ndcg(group):
    true_ratings = group['true'].tolist()
    pred_ratings = group['pred'].tolist()
    return ndcg_score([true_ratings], [pred_ratings], k = 50)

# 划分训练集和测试集
train_data, test_data = train_test_split(user_data, test_size=0.5, random_state=42)
# 创建训练集和测试集的数据集对象
train_dataset = MovieRatingDataset(train_data, user_to_idx, movie_to_idx, tag_embedding_dict)
test_dataset = MovieRatingDataset(test_data, user_to_idx, movie_to_idx, tag_embedding_dict)
# 创建训练集和测试集的数据加载器
train_dataloader = DataLoader(train_dataset, batch_size=4096, shuffle=True, drop_last = True)
test_dataloader = DataLoader(test_dataset, batch_size=4096, shuffle=False, drop_last = True)

# 训练模型
num_epochs = 20
lambda_u, lambda_b = 0.001, 0.001

for epoch in range(num_epochs):
    model.train()
    total_loss_train, total_loss_test = 0.0, 0.0
    for idx, (user_ids, movie_ids, ratings, tag_embedding) in tqdm(enumerate(train_dataloader)):
        # 使用user_ids, movie_ids, ratings进行训练
        optimizer.zero_grad()
        predictions = model(user_ids.to(device), movie_ids.to(device))
        loss = criterion(predictions, ratings.to(device)) + \
            lambda_u * (model.user_embedding.weight.norm(p = 2) + model.movie_embedding.weight.norm(p = 2)) + lambda_b * (model.user_bias.weight.norm(p = 2) + model.movie_bias.weight.norm(p = 2))
        loss.backward()
        optimizer.step()
        total_loss_train += loss.item()

    output_loss_train = total_loss_train / (idx + 1) 
    results = []
    model.eval()
    with torch.no_grad():
        for idx, (user_ids, item_ids, true_ratings, tag_embedding) in enumerate(test_dataloader):
            pred_ratings = model(user_ids.to(device), item_ids.to(device))
            loss = criterion(pred_ratings, ratings.to(device))
            total_loss_test += loss.item()
            # 将结果转换为 numpy arrays
            user_ids_np = user_ids.long().cpu().numpy().reshape(-1, 1)
            pred_ratings_np = pred_ratings.cpu().numpy().reshape(-1, 1)
            true_ratings_np = true_ratings.numpy().reshape(-1, 1)
            # 将这三个 arrays 合并成一个 2D array
            batch_results = np.column_stack((user_ids_np, pred_ratings_np, true_ratings_np))
            # 将这个 2D array 添加到 results
            results.append(batch_results)
        # 将结果的 list 转换为一个大的 numpy array
        results = np.vstack(results)
        # 将结果转换为DataFrame
        results_df = pd.DataFrame(results, columns=['user', 'pred', 'true'])
        results_df['user'] = results_df['user'].astype(int)
        # 按用户分组计算NDCG
        ndcg_scores = results_df.groupby('user').apply(compute_ndcg)
        # 计算平均NDCG
        avg_ndcg = ndcg_scores.mean()
        print(f'Epoch {epoch}, Train loss: {output_loss_train}, Test loss:, {total_loss_test / (idx + 1)}, Average NDCG: {avg_ndcg}')

63it [00:06,  9.15it/s]


Epoch 0, Train loss: 6.054512022979676, Test loss:, 1.4061590433120728, Average NDCG: 0.8829743819420974


63it [00:07,  8.61it/s]


Epoch 1, Train loss: 0.8401029687079172, Test loss:, 1.398927858897618, Average NDCG: 0.8918389251537128


63it [00:08,  7.60it/s]


Epoch 2, Train loss: 0.4745940693787166, Test loss:, 1.4090804467125544, Average NDCG: 0.8968032849112698


63it [00:08,  7.86it/s]


Epoch 3, Train loss: 0.3461354020096007, Test loss:, 1.3801041273843675, Average NDCG: 0.8969722755005403


63it [00:07,  8.12it/s]


Epoch 4, Train loss: 0.29199493688250344, Test loss:, 1.4091123531735132, Average NDCG: 0.8970383364984921


63it [00:07,  8.34it/s]


Epoch 5, Train loss: 0.2691051936338818, Test loss:, 1.4086632917797755, Average NDCG: 0.8977216694274244


63it [00:07,  8.99it/s]


Epoch 6, Train loss: 0.2586870997671097, Test loss:, 1.3667410214742024, Average NDCG: 0.8986278580910495


63it [00:07,  7.97it/s]


Epoch 7, Train loss: 0.2534109590545533, Test loss:, 1.3544818389983404, Average NDCG: 0.8996325808535107


63it [00:07,  8.17it/s]


Epoch 8, Train loss: 0.25037265604450587, Test loss:, 1.344119049253918, Average NDCG: 0.9006444676357661


63it [00:07,  8.68it/s]


Epoch 9, Train loss: 0.24828930743156918, Test loss:, 1.3704514219647361, Average NDCG: 0.901755235871216


63it [00:07,  8.29it/s]


Epoch 10, Train loss: 0.2462947080059657, Test loss:, 1.3622532117934454, Average NDCG: 0.9023203863048813


63it [00:07,  8.76it/s]


Epoch 11, Train loss: 0.24528758795488448, Test loss:, 1.3491712959985884, Average NDCG: 0.9036339944544485


63it [00:07,  8.40it/s]


Epoch 12, Train loss: 0.24496426587066952, Test loss:, 1.3249568163402496, Average NDCG: 0.9040977712929896


63it [00:07,  8.84it/s]


Epoch 13, Train loss: 0.2455528693066703, Test loss:, 1.3259062899483576, Average NDCG: 0.9048873460975086


63it [00:07,  8.62it/s]


Epoch 14, Train loss: 0.2467158500637327, Test loss:, 1.3467443311025227, Average NDCG: 0.9056492426958429


63it [00:07,  8.77it/s]


Epoch 15, Train loss: 0.24945490819121163, Test loss:, 1.2952807290213448, Average NDCG: 0.905833571444622


63it [00:07,  8.46it/s]


Epoch 16, Train loss: 0.25416723130241275, Test loss:, 1.3335483244487218, Average NDCG: 0.9058622164132282


63it [00:06,  9.05it/s]


Epoch 17, Train loss: 0.26187914941045976, Test loss:, 1.3459090145807417, Average NDCG: 0.9068220355050349


63it [00:07,  8.72it/s]


Epoch 18, Train loss: 0.27371161844995284, Test loss:, 1.318461393553113, Average NDCG: 0.9060660361338378


63it [00:07,  8.74it/s]


Epoch 19, Train loss: 0.2916186697899349, Test loss:, 1.3157087148181976, Average NDCG: 0.9052107788269304
