In [1]:
#link to google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch faiss-cpu pandas scikit-learn tqdm --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m111.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Step 0.1: 路径设置（统一用你的目录）
import os

BASE_DIR = "/content/drive/MyDrive/recommendation_system"
RAW_DIR = os.path.join(BASE_DIR, "data/raw")
RAW_ZIP = os.path.join(RAW_DIR, "ml-1m.zip")
RAW_FOLDER = os.path.join(RAW_DIR, "ml-1m")
RAW_RATINGS_PATH = os.path.join(RAW_FOLDER, "ratings.dat")

PROCESSED_DIR = os.path.join(BASE_DIR, "data/processed")
CHECKPOINT_PATH = os.path.join(BASE_DIR, "models/dssm_colab.pth")
INDEX_DIR = os.path.join(BASE_DIR, "indexes")

os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)
os.makedirs(INDEX_DIR, exist_ok=True)

print("BASE_DIR:", BASE_DIR)
print("RAW_RATINGS_PATH:", RAW_RATINGS_PATH)

BASE_DIR: /content/drive/MyDrive/recommendation_system
RAW_RATINGS_PATH: /content/drive/MyDrive/recommendation_system/data/raw/ml-1m/ratings.dat


In [4]:
# Step 1: 自动下载 MovieLens-1M 并解压
import urllib.request, zipfile

if not os.path.exists(RAW_RATINGS_PATH):
    url = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
    print("📥 正在下载 MovieLens-1M 数据集 ...")
    urllib.request.urlretrieve(url, RAW_ZIP)
    with zipfile.ZipFile(RAW_ZIP, "r") as zf:
        zf.extractall(RAW_DIR)
    print("✅ 已解压到:", RAW_FOLDER)
else:
    print("✅ 检测到现有数据集:", RAW_RATINGS_PATH)

✅ 检测到现有数据集: /content/drive/MyDrive/recommendation_system/data/raw/ml-1m/ratings.dat


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

print("RAW_RATINGS_PATH =", RAW_RATINGS_PATH)

# 2.1 读取原始数据
df = pd.read_csv(
    RAW_RATINGS_PATH,
    sep="::",
    engine="python",
    header=None,
    names=["user_raw", "item_raw", "rating", "timestamp"],
)

print("✅ 原始数据形状:", df.shape)

# 🔍 打印前几行样例
print("\n=== 原始数据样例 (前 5 行) ===")
print(df.head())

print("\n=== rating 分布统计 ===")
print(df["rating"].value_counts().sort_index())

RAW_RATINGS_PATH = /content/drive/MyDrive/recommendation_system/data/raw/ml-1m/ratings.dat
✅ 原始数据形状: (1000209, 4)

=== 原始数据样例 (前 5 行) ===
   user_raw  item_raw  rating  timestamp
0         1      1193       5  978300760
1         1       661       3  978302109
2         1       914       3  978301968
3         1      3408       4  978300275
4         1      2355       5  978824291

=== rating 分布统计 ===
rating
1     56174
2    107557
3    261197
4    348971
5    226310
Name: count, dtype: int64


In [7]:
# 2.2 ID 映射 & label 构造
user_map = {u: i for i, u in enumerate(df["user_raw"].unique())}
item_map = {m: i for i, m in enumerate(df["item_raw"].unique())}

df["user_id"] = df["user_raw"].map(user_map)
df["item_id"] = df["item_raw"].map(item_map)
df["label"]   = (df["rating"] >= 4).astype(int)

# 再打印一个映射后的样例
print("\n=== 映射后样本样例 (前 5 行) ===")
print(df[["user_id", "item_id", "rating", "label"]].head())

# 2.3 划分 train / test
train_df, test_df = train_test_split(
    df[["user_id", "item_id", "label"]],
    test_size=0.1,
    random_state=42
)

train_path = os.path.join(PROCESSED_DIR, "train.csv")
test_path  = os.path.join(PROCESSED_DIR, "test.csv")
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path,  index=False)

n_users, n_items = len(user_map), len(item_map)
print(f"\n✅ n_users={n_users}, n_items={n_items}")
print("✅ 训练集 / 测试集已保存：")
print("  train:", train_path)
print("  test :", test_path)

print("\n=== 训练集样例 (前 5 行) ===")
print(train_df.head())


=== 映射后样本样例 (前 5 行) ===
   user_id  item_id  rating  label
0        0        0       5      1
1        0        1       3      0
2        0        2       3      0
3        0        3       4      1
4        0        4       5      1

✅ n_users=6040, n_items=3706
✅ 训练集 / 测试集已保存：
  train: /content/drive/MyDrive/recommendation_system/data/processed/train.csv
  test : /content/drive/MyDrive/recommendation_system/data/processed/test.csv

=== 训练集样例 (前 5 行) ===
        user_id  item_id  label
647085     3893     1962      0
130254      842       15      1
232200     1407      166      0
61200       411      522      0
477192     2928      887      1


In [16]:
# ===== Block 1：定义 PyTorch Dataset & DataLoader =====
import torch
from torch.utils.data import Dataset, DataLoader

class MovieLensDataset(Dataset):
    """
    一个最基础的样本结构：
    - 输入: user_id, item_id
    - 标签: label (1 表示喜欢/点击，0 表示不喜欢)
    """
    def __init__(self, df):
        self.user_ids = df["user_id"].values
        self.item_ids = df["item_id"].values
        self.labels   = df["label"].values.astype("float32")

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        return {
            "user_id": torch.tensor(self.user_ids[idx], dtype=torch.long),
            "item_id": torch.tensor(self.item_ids[idx], dtype=torch.long),
            "label":   torch.tensor(self.labels[idx],   dtype=torch.float32),
        }

batch_size = 1024

train_dataset = MovieLensDataset(train_df)
test_dataset  = MovieLensDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("✅ 使用设备:", device)

✅ 使用设备: cuda


In [17]:
# ===== Block 2.1：定义召回模型（双塔 DSSM） =====
import torch.nn as nn
import torch.nn.functional as F

class DSSMRecall(nn.Module):
    """
    教学版 DSSM 召回模型：
    - user_id -> user embedding
    - item_id -> item embedding
    - 点积作为匹配得分 (logit)
    """
    def __init__(self, n_users, n_items, emb_dim=64):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.item_emb = nn.Embedding(n_items, emb_dim)

        # 初始化权重（小标准差的正态分布）
        nn.init.normal_(self.user_emb.weight, std=0.01)
        nn.init.normal_(self.item_emb.weight, std=0.01)

    def forward(self, user_ids, item_ids):
        """
        输入:
            user_ids: [B]
            item_ids: [B]
        输出:
            logits: [B] 点积得分
            u: [B, D] 用户向量
            v: [B, D] 物品向量
        """
        u = self.user_emb(user_ids)       # [B, D]
        v = self.item_emb(item_ids)       # [B, D]
        logits = (u * v).sum(dim=-1)      # 点积 -> [B]
        return logits, u, v

recall_model = DSSMRecall(n_users, n_items, emb_dim=64).to(device)

In [18]:
# ===== Block 2.2：训练召回模型（带 tqdm） =====
from torch.optim import Adam
from tqdm.auto import tqdm

def train_recall(model, train_loader, test_loader, epochs=5, lr=1e-3):
    """
    召回训练：
    - 损失: BCEWithLogitsLoss
    - 注意: 这里的 loss 只是用来学习 embedding，
            真正好不好要看后面的 Recall@K 指标
    """
    optimizer = Adam(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(1, epochs + 1):
        # ---------- Train ----------
        model.train()
        total_loss = 0.0
        for batch in tqdm(train_loader, desc=f"[Recall] Epoch {epoch} Train", leave=False):
            user_ids = batch["user_id"].to(device)
            item_ids = batch["item_id"].to(device)
            labels   = batch["label"].to(device)

            logits, _, _ = model(user_ids, item_ids)
            loss = criterion(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * user_ids.size(0)

        avg_train_loss = total_loss / len(train_loader.dataset)

        # ---------- Val ----------
        model.eval()
        total_val_loss = 0.0
        with torch.no_grad():
            for batch in tqdm(test_loader, desc=f"[Recall] Epoch {epoch} Val", leave=False):
                user_ids = batch["user_id"].to(device)
                item_ids = batch["item_id"].to(device)
                labels   = batch["label"].to(device)

                logits, _, _ = model(user_ids, item_ids)
                loss = criterion(logits, labels)
                total_val_loss += loss.item() * user_ids.size(0)

        avg_val_loss = total_val_loss / len(test_loader.dataset)
        print(f"[Recall][Epoch {epoch}] train_loss={avg_train_loss:.4f}, val_loss={avg_val_loss:.4f}")

train_recall(recall_model, train_loader, test_loader, epochs=5, lr=1e-3)

torch.save(recall_model.state_dict(), CHECKPOINT_PATH)
print("✅ 召回模型已保存:", CHECKPOINT_PATH)

[Recall] Epoch 1 Train:   0%|          | 0/880 [00:00<?, ?it/s]

[Recall] Epoch 1 Val:   0%|          | 0/98 [00:00<?, ?it/s]

[Recall][Epoch 1] train_loss=0.6258, val_loss=0.5544


[Recall] Epoch 2 Train:   0%|          | 0/880 [00:00<?, ?it/s]

[Recall] Epoch 2 Val:   0%|          | 0/98 [00:00<?, ?it/s]

[Recall][Epoch 2] train_loss=0.5305, val_loss=0.5318


[Recall] Epoch 3 Train:   0%|          | 0/880 [00:00<?, ?it/s]

[Recall] Epoch 3 Val:   0%|          | 0/98 [00:00<?, ?it/s]

[Recall][Epoch 3] train_loss=0.5110, val_loss=0.5245


[Recall] Epoch 4 Train:   0%|          | 0/880 [00:00<?, ?it/s]

[Recall] Epoch 4 Val:   0%|          | 0/98 [00:00<?, ?it/s]

[Recall][Epoch 4] train_loss=0.4977, val_loss=0.5198


[Recall] Epoch 5 Train:   0%|          | 0/880 [00:00<?, ?it/s]

[Recall] Epoch 5 Val:   0%|          | 0/98 [00:00<?, ?it/s]

[Recall][Epoch 5] train_loss=0.4846, val_loss=0.5159
✅ 召回模型已保存: /content/drive/MyDrive/recommendation_system/models/dssm_colab.pth


In [19]:
# ===== Block 2.3：构建 ANN 召回索引 & user_pos_dict =====
import numpy as np
from sklearn.neighbors import NearestNeighbors
import joblib

# 1）拿出所有 item 的 embedding，作为向量库
recall_model.eval()
with torch.no_grad():
    all_item_ids = torch.arange(n_items, device=device)
    all_item_embs = recall_model.item_emb(all_item_ids).cpu().numpy()  # [n_items, D]

print("all_item_embs shape:", all_item_embs.shape)

# 2）构建 NearestNeighbors 索引（教学版 ANN）
ann = NearestNeighbors(n_neighbors=200, metric="cosine")
ann.fit(all_item_embs)

# 保存，以后可以复用
index_path = os.path.join(INDEX_DIR, "item_ann_index.joblib")
emb_path   = os.path.join(INDEX_DIR, "item_embs.npy")
joblib.dump(ann, index_path)
np.save(emb_path, all_item_embs)
print("✅ ANN 索引与 item_embs 已保存:", index_path, emb_path)

# 3）构建 user_pos_dict：每个用户在 test 集中的正样本 item 列表
user_pos_dict = {}
for row in test_df.itertuples():
    u = row.user_id
    i = row.item_id
    y = row.label
    if y == 1:
        user_pos_dict.setdefault(u, []).append(i)

for u in test_df["user_id"].unique():
    user_pos_dict.setdefault(u, [])

print("✅ user_pos_dict 已构建，用户数:", len(user_pos_dict))

all_item_embs shape: (3706, 64)
✅ ANN 索引与 item_embs 已保存: /content/drive/MyDrive/recommendation_system/indexes/item_ann_index.joblib /content/drive/MyDrive/recommendation_system/indexes/item_embs.npy
✅ user_pos_dict 已构建，用户数: 5970


In [20]:
# ===== Block 2.4：评估召回质量（Recall@K / HitRate@K / NDCG@K） =====
def evaluate_recall(model, test_df, user_pos_dict, item_emb_matrix, ann, K=100):
    """
    对每个 user:
      - 用 ANN 从 item_emb 中召回 K 个候选
      - 看 test 集里的正样本有多少被召回
    指标:
      - Recall@K: 正样本被召回的比例
      - HitRate@K: 是否至少召回一个正样本
      - NDCG@K: 排名质量（越靠前越好）
    """
    model.eval()
    users = test_df["user_id"].unique()

    recalls = []
    hits = []
    ndcgs = []

    for u in tqdm(users, desc=f"[Eval Recall@{K}]"):
        pos_items = user_pos_dict[u]
        if len(pos_items) == 0:
            continue

        # user embedding
        with torch.no_grad():
            u_emb = model.user_emb(torch.tensor([u], device=device)).cpu().numpy()

        # ANN 召回 K 个 item
        distances, indices = ann.kneighbors(u_emb, n_neighbors=K)
        rec_items = indices[0]  # [K]

        # Recall@K
        hit_count = len(set(rec_items) & set(pos_items))
        recalls.append(hit_count / len(pos_items))

        # HitRate@K
        hits.append(1 if hit_count > 0 else 0)

        # NDCG@K（pos item 排在越前面贡献越大）
        ndcg = 0.0
        for rank, item in enumerate(rec_items):
            if item in pos_items:
                ndcg += 1.0 / np.log2(rank + 2)
        ndcgs.append(ndcg)

    print(f"📊 Recall@{K}:  {np.mean(recalls):.4f}")
    print(f"📊 HitRate@{K}: {np.mean(hits):.4f}")
    print(f"📊 NDCG@{K}:    {np.mean(ndcgs):.4f}")

item_embs = all_item_embs  # 方便后面用同名变量
evaluate_recall(recall_model, test_df, user_pos_dict, item_embs, ann, K=100)

[Eval Recall@100]:   0%|          | 0/5970 [00:00<?, ?it/s]

📊 Recall@100:  0.0838
📊 HitRate@100: 0.3977
📊 NDCG@100:    0.1650


In [21]:
# ===== Block 3.1：定义粗排模型（小 MLP） =====
class PreRankMLP(nn.Module):
    """
    粗排：
    - 输入: user_emb, item_emb, recall_logit
    - 输出: 粗粒度 CTR logit
    """
    def __init__(self, emb_dim=64, hidden_dim=128):
        super().__init__()
        in_dim = emb_dim * 2 + 1
        self.mlp = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, user_emb, item_emb, recall_logit):
        x = torch.cat([user_emb, item_emb, recall_logit.unsqueeze(-1)], dim=-1)
        logit = self.mlp(x).squeeze(-1)
        return logit

prerank_model = PreRankMLP(emb_dim=64, hidden_dim=128).to(device)

In [22]:
# ===== Block 3.2：训练粗排模型 =====
from sklearn.metrics import roc_auc_score

# 冻结召回模型参数
for p in recall_model.parameters():
    p.requires_grad = False
recall_model.eval()

def train_prerank(prerank_model, recall_model, train_loader, test_loader, epochs=3, lr=1e-3):
    optimizer = Adam(prerank_model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(1, epochs + 1):
        prerank_model.train()
        total_loss = 0.0

        # ----- Train -----
        for batch in tqdm(train_loader, desc=f"[PreRank] Epoch {epoch} Train", leave=False):
            u = batch["user_id"].to(device)
            i = batch["item_id"].to(device)
            y = batch["label"].to(device)

            with torch.no_grad():
                recall_logits, u_emb, v_emb = recall_model(u, i)

            logits = prerank_model(u_emb, v_emb, recall_logits)
            loss   = criterion(logits, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * u.size(0)

        avg_train_loss = total_loss / len(train_loader.dataset)

        # ----- Val: AUC + LogLoss -----
        prerank_model.eval()
        ys, ps = [], []
        with torch.no_grad():
            for batch in test_loader:
                u = batch["user_id"].to(device)
                i = batch["item_id"].to(device)
                y = batch["label"].cpu().numpy()

                recall_logits, u_emb, v_emb = recall_model(u, i)
                logit = prerank_model(u_emb, v_emb, recall_logits)
                p = torch.sigmoid(logit).cpu().numpy()

                ys.append(y)
                ps.append(p)

        ys = np.concatenate(ys)
        ps = np.concatenate(ps)
        auc = roc_auc_score(ys, ps)
        logloss = -np.mean(ys*np.log(ps+1e-9)+(1-ys)*np.log(1-ps+1e-9))

        print(f"[PreRank][Epoch {epoch}] train_loss={avg_train_loss:.4f}, AUC={auc:.4f}, LogLoss={logloss:.4f}")

train_prerank(prerank_model, recall_model, train_loader, test_loader, epochs=3, lr=1e-3)

[PreRank] Epoch 1 Train:   0%|          | 0/880 [00:00<?, ?it/s]

[PreRank][Epoch 1] train_loss=0.4651, AUC=0.8110, LogLoss=0.5288


[PreRank] Epoch 2 Train:   0%|          | 0/880 [00:00<?, ?it/s]

[PreRank][Epoch 2] train_loss=0.4421, AUC=0.8099, LogLoss=0.5367


[PreRank] Epoch 3 Train:   0%|          | 0/880 [00:00<?, ?it/s]

[PreRank][Epoch 3] train_loss=0.4315, AUC=0.8060, LogLoss=0.5378


In [23]:
# ===== Block 4.1：定义精排模型（更复杂 MLP） =====
class RankMLP(nn.Module):
    """
    精排：
    - 输入: user_emb, item_emb, recall_logit, prerank_logit
    - 输出: 最终 CTR logit
    """
    def __init__(self, emb_dim=64, hidden_dim=256):
        super().__init__()
        in_dim = emb_dim * 2 + 2
        self.mlp = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, user_emb, item_emb, recall_logit, prerank_logit):
        x = torch.cat(
            [user_emb, item_emb,
             recall_logit.unsqueeze(-1),
             prerank_logit.unsqueeze(-1)], dim=-1
        )
        logit = self.mlp(x).squeeze(-1)
        return logit

rank_model = RankMLP(emb_dim=64, hidden_dim=256).to(device)

In [24]:
# ===== Block 4.2：训练精排模型 =====
for p in recall_model.parameters():
    p.requires_grad = False
for p in prerank_model.parameters():
    p.requires_grad = False
recall_model.eval()
prerank_model.eval()

def train_rank(rank_model, recall_model, prerank_model, train_loader, test_loader, epochs=3, lr=1e-3):
    optimizer = Adam(rank_model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(1, epochs + 1):
        rank_model.train()
        total_loss = 0.0

        # ----- Train -----
        for batch in tqdm(train_loader, desc=f"[Rank] Epoch {epoch} Train", leave=False):
            u = batch["user_id"].to(device)
            i = batch["item_id"].to(device)
            y = batch["label"].to(device)

            with torch.no_grad():
                recall_logits, u_emb, v_emb = recall_model(u, i)
                prerank_logits = prerank_model(u_emb, v_emb, recall_logits)

            logits = rank_model(u_emb, v_emb, recall_logits, prerank_logits)
            loss   = criterion(logits, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * u.size(0)

        avg_train_loss = total_loss / len(train_loader.dataset)

        # ----- Val: AUC + ranking metrics -----
        rank_model.eval()
        ys, ps = [], []
        with torch.no_grad():
            for batch in test_loader:
                u = batch["user_id"].to(device)
                i = batch["item_id"].to(device)
                y = batch["label"].cpu().numpy()

                recall_logits, u_emb, v_emb = recall_model(u, i)
                prerank_logits = prerank_model(u_emb, v_emb, recall_logits)
                p = torch.sigmoid(rank_model(u_emb, v_emb, recall_logits, prerank_logits)).cpu().numpy()

                ys.append(y)
                ps.append(p)

        ys = np.concatenate(ys)
        ps = np.concatenate(ps)
        auc = roc_auc_score(ys, ps)
        logloss = -np.mean(ys*np.log(ps+1e-9)+(1-ys)*np.log(1-ps+1e-9))

        print(f"[Rank][Epoch {epoch}] train_loss={avg_train_loss:.4f}, AUC={auc:.4f}, LogLoss={logloss:.4f}")

train_rank(rank_model, recall_model, prerank_model, train_loader, test_loader, epochs=3, lr=1e-3)

[Rank] Epoch 1 Train:   0%|          | 0/880 [00:00<?, ?it/s]

[Rank][Epoch 1] train_loss=0.4150, AUC=0.7979, LogLoss=0.5600


[Rank] Epoch 2 Train:   0%|          | 0/880 [00:00<?, ?it/s]

[Rank][Epoch 2] train_loss=0.3884, AUC=0.7964, LogLoss=0.5710


[Rank] Epoch 3 Train:   0%|          | 0/880 [00:00<?, ?it/s]

[Rank][Epoch 3] train_loss=0.3735, AUC=0.7946, LogLoss=0.5949


In [25]:
# ===== Block 4.3：精排的 Precision@K & NDCG@K =====
def evaluate_rank_topk(rank_model, recall_model, prerank_model, test_df, K=20):
    users = test_df["user_id"].unique()
    precisions = []
    ndcgs = []

    rank_model.eval()
    with torch.no_grad():
        for u in tqdm(users, desc=f"[Eval Rank TopK@{K}]"):
            sub = test_df[test_df["user_id"] == u]
            if len(sub) < K:
                continue

            # 用真实 label 当作“理想排序”（教学简化）
            sub = sub.copy()
            # 当前 user 的所有候选 item
            item_ids = torch.tensor(sub["item_id"].values, device=device)
            u_tensor = torch.full_like(item_ids, u, device=device)

            recall_logits, u_emb, v_emb = recall_model(u_tensor, item_ids)
            prerank_logits = prerank_model(u_emb, v_emb, recall_logits)
            scores = rank_model(u_emb, v_emb, recall_logits, prerank_logits).cpu().numpy()

            # 根据 rank_model 分数排序
            order = np.argsort(-scores)
            pred_topk = sub.iloc[order[:K]]

            # 真正的正样本
            true_pos_items = set(sub[sub["label"] == 1]["item_id"].values)

            # Precision@K
            hit = pred_topk["item_id"].isin(true_pos_items).sum()
            precisions.append(hit / K)

            # NDCG@K
            ndcg = 0.0
            for rank, (_, row) in enumerate(pred_topk.iterrows()):
                if row["item_id"] in true_pos_items:
                    ndcg += 1.0 / np.log2(rank + 2)
            ndcgs.append(ndcg)

    print(f"📊 [Rank] Precision@{K}: {np.mean(precisions):.4f}")
    print(f"📊 [Rank] NDCG@{K}:      {np.mean(ndcgs):.4f}")

evaluate_rank_topk(rank_model, recall_model, prerank_model, test_df, K=20)

[Eval Rank TopK@20]:   0%|          | 0/5970 [00:00<?, ?it/s]

📊 [Rank] Precision@20: 0.7016
📊 [Rank] NDCG@20:      5.2356


In [26]:
# ===== Block 5.1：MMR 多样性重排 =====
def mmr_rerank(item_ids, scores, item_embs, lambda_mmr=0.5, top_k=20):
    """
    item_ids: [N]
    scores:   [N] 来自 rank_model 的相关性分数
    item_embs:[N, D] 这些 item 的 embedding（用来算相似度）
    lambda_mmr: 相关性 vs 多样性 权重
    top_k: 输出多少个
    """
    item_ids = np.array(item_ids)
    scores   = np.array(scores)
    item_embs = np.array(item_embs)

    N = len(item_ids)
    selected = []
    selected_idx = []

    # 单位化 embedding
    norm_embs = item_embs / (np.linalg.norm(item_embs, axis=1, keepdims=True) + 1e-8)

    while len(selected) < min(top_k, N):
        if not selected_idx:
            # 第一个：直接取 rank score 最大
            idx = np.argmax(scores)
            selected.append(item_ids[idx])
            selected_idx.append(idx)
        else:
            remain_idx = [i for i in range(N) if i not in selected_idx]
            best_score = -1e9
            best_i = None

            for i in remain_idx:
                rel = scores[i]
                sim = np.max(np.dot(norm_embs[i], norm_embs[selected_idx].T))
                mmr_score = lambda_mmr * rel - (1 - lambda_mmr) * sim
                if mmr_score > best_score:
                    best_score = mmr_score
                    best_i = i

            selected.append(item_ids[best_i])
            selected_idx.append(best_i)

    return selected

In [27]:
# ===== Block 5.2：Intra-list Diversity (ILD) =====
def evaluate_rerank_diversity(item_list, item_embs):
    """
    ILD: 1 - 平均相似度
    值越高：列表越多样
    """
    idx = np.array(item_list)
    embs = item_embs[idx]
    embs = embs / (np.linalg.norm(embs, axis=1, keepdims=True)+1e-8)

    N = len(idx)
    sims = 0.0
    cnt = 0
    for i in range(N):
        for j in range(i+1, N):
            sims += np.dot(embs[i], embs[j])
            cnt += 1
    mean_sim = sims / max(cnt, 1)
    ild = 1 - mean_sim
    print(f"📊 [ReRank] ILD = {ild:.4f}")

In [28]:
# ===== Block 6：给某个 user 做一次完整推荐 =====
def recommend_for_user(user_id, topk_recall=200, topk_final=20, lambda_mmr=0.5):
    """
    全流程：
      1) 召回：用 user_emb 去 ANN 检索 topk_recall 个 item
      2) 粗排 + 精排：对召回结果打分
      3) 重排：MMR 做多样性重排
    返回：最终推荐的 item_id 列表
    """
    recall_model.eval()
    prerank_model.eval()
    rank_model.eval()

    with torch.no_grad():
        # 1) user embedding
        u_emb = recall_model.user_emb(torch.tensor([user_id], device=device))  # [1, D]
        u_vec = u_emb.cpu().numpy()

        # 2) ANN 召回候选 item
        distances, indices = ann.kneighbors(u_vec, n_neighbors=topk_recall)
        cand_item_ids = indices[0].tolist()

        # 3) 对候选进行粗排 + 精排打分
        cand_item_ids_t = torch.tensor(cand_item_ids, device=device)
        cand_user_ids_t = torch.full_like(cand_item_ids_t, user_id)

        recall_logits, u_emb_batch, v_emb_batch = recall_model(cand_user_ids_t, cand_item_ids_t)
        prerank_logits = prerank_model(u_emb_batch, v_emb_batch, recall_logits)
        rank_logits    = rank_model(u_emb_batch, v_emb_batch, recall_logits, prerank_logits)
        rank_scores    = rank_logits.cpu().numpy()

        # 4) 重排 + 多样性评估
        cand_item_embs = item_embs[cand_item_ids]
        reranked_ids = mmr_rerank(
            item_ids=cand_item_ids,
            scores=rank_scores,
            item_embs=cand_item_embs,
            lambda_mmr=lambda_mmr,
            top_k=topk_final,
        )

    print("最终推荐的 item_id 列表:", reranked_ids)
    evaluate_rerank_diversity(reranked_ids, item_embs)
    return reranked_ids

# 随机挑一个用户试一下
sample_user = train_df["user_id"].iloc[0]
print("为用户", sample_user, "推荐：")
_ = recommend_for_user(sample_user, topk_recall=200, topk_final=20, lambda_mmr=0.5)

为用户 3893 推荐：
最终推荐的 item_id 列表: [np.int64(896), np.int64(13), np.int64(38), np.int64(1643), np.int64(259), np.int64(814), np.int64(1648), np.int64(2929), np.int64(917), np.int64(345), np.int64(1033), np.int64(453), np.int64(831), np.int64(530), np.int64(1838), np.int64(1671), np.int64(1487), np.int64(433), np.int64(1471), np.int64(1526)]
📊 [ReRank] ILD = 0.3158
