# 9. 序列建模（SASRec / DIEN）

推荐系统里“序列”几乎是标配：用户兴趣会随时间变化，单纯的静态特征不够。

本 notebook 目标：

1) 实现一个可训练的 **SASRec（Transformer）召回模型**，并用 FAISS 做 TopK 检索评估

2) 实现一个 **DIEN-like（GRU + 注意力）排序模型**，展示“兴趣抽取/演化”的思路

## 面试要点

- 序列推荐 vs 静态推荐：兴趣演化、短期意图
- Transformer 优势：可并行训练、能建模长距离依赖（但自注意力的计算/存储开销随序列长度平方增长，需要控制序列长度）
- 训练目标：next-item / contrastive / sampled softmax
- 线上：用户向量实时更新 + ANN 检索


In [None]:
import os
import math
import pickle
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
import faiss
from dotenv import find_dotenv, load_dotenv
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

tf.get_logger().setLevel('ERROR')


def find_repo_root(start: Path) -> Path:
    """Locate the repository root by walking upward from ``start``.

    A directory qualifies when it contains ``pyproject.toml`` or ``.git``.
    At most ten directories are inspected (``start`` itself plus up to nine
    ancestors). When none of them qualifies, ``start`` is returned unchanged.
    """
    markers = ('pyproject.toml', '.git')
    # ``start.parents`` stops at the filesystem root (or ``.`` for a relative
    # path), which is where a manual ``parent == self`` walk would stop too.
    for candidate in [start, *start.parents][:10]:
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    return start


# Resolve the repository root and load an optional .env file discovered from the working directory.
REPO_ROOT = find_repo_root(Path.cwd())
dotenv_path = find_dotenv(usecwd=True)
if dotenv_path:
    load_dotenv(dotenv_path)

# setdefault() keeps a value that is already in the environment and returns the effective one,
# so the default is registered and read back in a single step.
RAW_DATA_PATH = Path(os.environ.setdefault('FUNREC_RAW_DATA_PATH', str(REPO_ROOT / 'data')))
PROCESSED_DATA_PATH = Path(os.environ.setdefault('FUNREC_PROCESSED_DATA_PATH', str(REPO_ROOT / 'tmp')))

# Prefer <raw>/dataset/news_recommendation; fall back to <raw>/news_recommendation when it is absent.
DATA_PATH = RAW_DATA_PATH / 'dataset' / 'news_recommendation'
DATA_PATH = DATA_PATH if DATA_PATH.exists() else RAW_DATA_PATH / 'news_recommendation'

# Output locations for this project; the SASRec artifact directory is created eagerly.
PROJECT_PATH = PROCESSED_DATA_PATH / 'projects' / 'news_recommendation_system'
SASREC_DIR = PROJECT_PATH / 'artifacts' / 'sequence' / 'sasrec_inbatch'
SASREC_DIR.mkdir(parents=True, exist_ok=True)

DATA_PATH, PROJECT_PATH, SASREC_DIR


## Part A：SASRec（召回）


In [None]:
# Load the training click log, the validation clicks and the article table.
train_hist = pd.read_pickle(PROJECT_PATH / 'train_hist.pkl')
valid_last = pd.read_pickle(PROJECT_PATH / 'valid_last.pkl')
articles = pd.read_csv(DATA_PATH / 'articles.csv')

# In DEBUG mode the user population is capped at MAX_USERS to keep the notebook fast.
DEBUG = True
MAX_USERS = 20000
SEED = 42

if DEBUG:
    rng = np.random.default_rng(SEED)
    users = train_hist['user_id'].unique()
    if len(users) > MAX_USERS:
        # Sample users without replacement and restrict train and validation to the same subset.
        sample_users = rng.choice(users, size=MAX_USERS, replace=False)
        train_hist = train_hist[train_hist['user_id'].isin(sample_users)]
        valid_last = valid_last[valid_last['user_id'].isin(sample_users)]

# Sort clicks by (user, timestamp) so each user's list below is in chronological order.
train_hist = train_hist.sort_values(['user_id', 'click_timestamp'])
user_hist: Dict[int, List[int]] = train_hist.groupby('user_id')['click_article_id'].apply(list).to_dict()

len(user_hist), len(train_hist)


In [None]:
@dataclass(frozen=True)
class IdMap:
    """Immutable mapping from raw ids to contiguous integer codes.

    The id stored at position ``i`` of ``classes_`` is encoded as
    ``i + offset``; ids that are not in ``classes_`` are encoded as
    ``unknown_value``.
    """

    # Human-readable label for the id space (not used in the encoding itself).
    name: str
    # Known raw ids; must be unique for the lookup in ``transform`` to work.
    classes_: np.ndarray
    # Added to the position of a known id to obtain its code.
    offset: int = 1
    # Code returned for ids that are missing from ``classes_``.
    unknown_value: int = 0

    @property
    def vocab_size(self) -> int:
        """Number of codes needed for an embedding table: ``len(classes_) + offset``."""
        return int(len(self.classes_) + self.offset)

    def _get_index(self) -> pd.Index:
        """Return the lookup index over ``classes_``, building it on first use.

        ``transform`` is called once per sequence elsewhere in this file, so
        rebuilding the index (and its hash table) on every call would cost
        O(vocabulary) each time. The instance is frozen, hence the cache is
        written with ``object.__setattr__``.
        """
        cached = self.__dict__.get('_index_cache')
        if cached is None:
            cached = pd.Index(self.classes_)
            object.__setattr__(self, '_index_cache', cached)
        return cached

    def __getstate__(self):
        """Pickle only the declared fields; the lazily built index is derived data."""
        state = dict(self.__dict__)
        state.pop('_index_cache', None)
        return state

    def transform(self, values) -> np.ndarray:
        """Encode ``values`` (scalar, sequence or array) into int32 codes.

        The result has the same shape as ``np.asarray(values)``.
        """
        arr = np.asarray(values)
        idx = self._get_index().get_indexer(arr.reshape(-1))
        out = idx.astype(np.int64) + self.offset
        # get_indexer marks missing ids with -1; map those to the unknown code.
        out[idx < 0] = self.unknown_value
        return out.reshape(arr.shape).astype(np.int32)

    @classmethod
    def fit(cls, name: str, values, offset: int = 1) -> 'IdMap':
        """Build an ``IdMap`` from the distinct entries of ``values``.

        Distinct values are sorted when they are mutually comparable;
        otherwise they keep their first-seen order.
        """
        uniq = pd.unique(pd.Series(list(values)))
        try:
            uniq = np.array(sorted(uniq))
        except (TypeError, ValueError):
            # Values that cannot be ordered against each other (e.g. mixed types).
            uniq = np.array(list(uniq))
        return cls(name=name, classes_=uniq, offset=offset)


def pad_left(seqs: List[List[int]], max_len: int, pad_value: int = 0) -> np.ndarray:
    """Left-pad (and left-truncate) sequences into an int32 matrix.

    Each row keeps the LAST ``max_len`` elements of the corresponding
    sequence, right-aligned; unused leading slots hold ``pad_value``.

    Args:
        seqs: Sequences to pack. Empty (or ``None``) entries yield an
            all-padding row. Lists and 1-D NumPy arrays are accepted.
        max_len: Width of the output matrix.
        pad_value: Fill value for unused positions.

    Returns:
        Array of shape ``(len(seqs), max_len)`` with dtype ``int32``.
    """
    out = np.full((len(seqs), max_len), pad_value, dtype=np.int32)
    if max_len == 0:
        # ``seq[-0:]`` would keep the WHOLE sequence and the assignment below
        # would then fail on the zero-width row, so return the empty matrix.
        return out
    for row, seq in zip(out, seqs):
        # ``len(...)`` instead of truthiness so NumPy arrays work as sequences.
        if seq is None or len(seq) == 0:
            continue
        tail = np.asarray(seq[-max_len:], dtype=np.int32)
        # ``row`` is a view into ``out``; writing to it fills the matrix in place.
        row[-len(tail):] = tail
    return out


# Fit the item vocabulary on every distinct article id in the article table.
# With offset=1 the known items are encoded from 1 upward, leaving 0 as the unknown/padding code.
item_id_map = IdMap.fit('article_id', articles['article_id'].astype(int).unique(), offset=1)
item_id_map.vocab_size


In [None]:
# ==================== Build training samples (next-item) ====================
# Each sample is (history prefix of at most MAX_SEQ_LEN clicks, the click that follows it).
MAX_SEQ_LEN = 50
MAX_SAMPLES_PER_USER = 20 if DEBUG else 200

seq_samples: List[List[int]] = []
pos_items: List[int] = []

rng = np.random.default_rng(SEED)
for u, seq in tqdm(user_hist.items(), desc='build_seq_samples'):
    # A target needs at least one preceding click.
    if len(seq) < 2:
        continue
    # Every position from 1 onward can serve as a prediction target.
    positions = list(range(1, len(seq)))
    if len(positions) > MAX_SAMPLES_PER_USER:
        # Too many targets: draw MAX_SAMPLES_PER_USER of them (without replacement)
        # from the most recent 3 * MAX_SAMPLES_PER_USER positions, kept in time order.
        tail = positions[-MAX_SAMPLES_PER_USER * 3 :]
        positions = rng.choice(tail, size=MAX_SAMPLES_PER_USER, replace=False).tolist()
        positions.sort()

    for t in positions:
        # History is the window of up to MAX_SEQ_LEN clicks immediately before the target.
        hist = seq[max(0, t - MAX_SEQ_LEN) : t]
        target = int(seq[t])
        seq_samples.append([int(x) for x in hist])
        pos_items.append(target)

# Encode raw article ids to vocabulary codes and left-pad the histories to a fixed width.
X_seq = pad_left([item_id_map.transform(np.asarray(s, dtype=np.int64)).tolist() for s in seq_samples], max_len=MAX_SEQ_LEN)
X_pos = item_id_map.transform(np.asarray(pos_items, dtype=np.int64))

print('num_samples:', len(X_pos))
X_seq.shape, X_pos.shape


### SASRec（Transformer）实现（in-batch negatives）

为了让 notebook 在大 vocab（约 36 万个 item）下也能跑，这里不做全量 softmax，而是用 **in-batch negatives**（同一 batch 内其他样本的正例 item 充当负例）：

- user 表示来自 Transformer 对序列的编码
- item 表示来自 item embedding
- batch 内做对比学习（InfoNCE）


In [None]:
def build_sasrec_inbatch(
    item_vocab_size: int,
    max_seq_len: int = 50,
    emb_dim: int = 64,
    num_heads: int = 2,
    num_blocks: int = 2,
    ff_dim: int = 128,
    dropout: float = 0.2,
    temperature: float = 0.05,
):
    """Build a SASRec-style two-tower model trained with in-batch negatives.

    The user tower encodes an item-id sequence with ``num_blocks`` residual
    blocks (LayerNorm -> causal multi-head self-attention -> residual, then
    LayerNorm -> feed-forward -> residual) and uses the state at the last
    sequence position. The item tower embeds a single item id with the same
    embedding table. Both towers end in a linear projection followed by L2
    normalisation.

    Args:
        item_vocab_size: Size of the item embedding table.
        max_seq_len: Fixed input sequence length; also the size of the
            position embedding table.
        emb_dim: Embedding and model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        num_blocks: Number of attention + feed-forward blocks.
        ff_dim: Hidden width of the feed-forward sub-layer.
        dropout: Dropout rate used on the input, inside attention and in the FFN.
        temperature: Divisor applied to the user-item similarity matrix.

    Returns:
        ``(model, user_model, item_model)`` where ``model`` maps
        ``{'seq_ids', 'pos_item'}`` to the ``user_vec @ item_vec^T / temperature``
        logits matrix, ``user_model`` maps ``{'seq_ids'}`` to the normalised user
        vector and ``item_model`` maps ``{'pos_item'}`` to the normalised item
        vector. The three models share layers.

    Raises:
        ValueError: If ``emb_dim`` is not divisible by ``num_heads``.
    """
    if emb_dim % num_heads != 0:
        raise ValueError(f'emb_dim ({emb_dim}) must be divisible by num_heads ({num_heads})')

    seq_inp = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32, name='seq_ids')
    pos_inp = tf.keras.layers.Input(shape=(), dtype=tf.int32, name='pos_item')

    # One item table is shared by the sequence encoder and the item tower.
    item_emb = tf.keras.layers.Embedding(item_vocab_size, emb_dim, mask_zero=True, name='item_emb')
    pos_emb = tf.keras.layers.Embedding(max_seq_len, emb_dim, name='pos_emb')

    x = item_emb(seq_inp)  # [B, L, D]
    # Learned absolute position embeddings, broadcast over the batch dimension.
    positions = tf.range(max_seq_len)
    x = x + pos_emb(positions)
    x = tf.keras.layers.Dropout(dropout)(x)

    for i in range(num_blocks):
        # Pre-norm self-attention with a causal mask, wrapped in a residual connection.
        # NOTE(review): only the causal mask is passed explicitly; whether the
        # mask_zero padding mask still reaches the attention layer after the
        # `+`/Dropout ops depends on the Keras version — confirm.
        attn_in = tf.keras.layers.LayerNormalization()(x)
        attn_out = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=emb_dim // num_heads,
            dropout=dropout,
            name=f'mha_{i}',
        )(attn_in, attn_in, use_causal_mask=True)
        x = x + attn_out

        # Pre-norm position-wise feed-forward network, also residual.
        ffn_in = tf.keras.layers.LayerNormalization()(x)
        ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation='relu'),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(emb_dim),
        ], name=f'ffn_{i}')(ffn_in)
        x = x + ffn

    x = tf.keras.layers.LayerNormalization()(x)
    # Sequences are left-padded by the caller, so the last position holds the most recent item.
    user_vec = tf.keras.layers.Lambda(lambda t: t[:, -1, :], name='last_state')(x)
    user_vec = tf.keras.layers.Dense(emb_dim, activation=None)(user_vec)
    user_vec = tf.keras.layers.Lambda(lambda t: tf.nn.l2_normalize(t, axis=1), name='user_vec')(user_vec)

    # Item tower: shared embedding -> linear projection -> L2 normalisation.
    item_vec = tf.keras.layers.Flatten()(item_emb(pos_inp))
    item_vec = tf.keras.layers.Dense(emb_dim, activation=None)(item_vec)
    item_vec = tf.keras.layers.Lambda(lambda t: tf.nn.l2_normalize(t, axis=1), name='item_vec')(item_vec)

    # [B, B] similarity of every user in the batch against every positive item in the batch.
    logits = tf.keras.layers.Lambda(lambda z: tf.matmul(z[0], z[1], transpose_b=True) / temperature, name='logits')([user_vec, item_vec])

    model = tf.keras.Model(inputs={'seq_ids': seq_inp, 'pos_item': pos_inp}, outputs=logits, name='SASRecInBatch')
    user_model = tf.keras.Model(inputs={'seq_ids': seq_inp}, outputs=user_vec, name='sasrec_user')
    item_model = tf.keras.Model(inputs={'pos_item': pos_inp}, outputs=item_vec, name='sasrec_item')
    return model, user_model, item_model


def inbatch_symmetric_loss(y_true, logits):
    """Symmetric in-batch softmax cross-entropy over a square logits matrix.

    Row ``i`` is scored as a classification problem whose correct class is
    column ``i`` (user -> item); the transposed matrix gives the item -> user
    direction. The two per-example losses are summed, averaged over the batch
    and halved. ``y_true`` is ignored: the labels are the batch diagonal.
    """
    diag_labels = tf.range(tf.shape(logits)[0])
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits
    user_to_item = xent(labels=diag_labels, logits=logits)
    item_to_user = xent(labels=diag_labels, logits=tf.transpose(logits))
    return tf.reduce_mean(user_to_item + item_to_user) / 2.0


# Build the training model together with its user and item towers, then compile
# the training model with the in-batch symmetric loss.
model, sasrec_user, sasrec_item = build_sasrec_inbatch(
    item_vocab_size=item_id_map.vocab_size,
    max_seq_len=MAX_SEQ_LEN,
    emb_dim=64,
    num_heads=2,
    num_blocks=2,
    ff_dim=128,
    dropout=0.2,
    temperature=0.05,
)
model.compile(optimizer=tf.keras.optimizers.Adam(2e-4), loss=inbatch_symmetric_loss)
model.summary()


In [None]:
# The model emits a batch-by-batch logits matrix, so BATCH_SIZE also sets how many
# in-batch candidates each example is contrasted against.
BATCH_SIZE = 1024
EPOCHS = 3

train_X = {'seq_ids': X_seq, 'pos_item': X_pos}
# The loss ignores y_true (labels are the batch diagonal); zeros are passed only because fit() expects targets.
dummy_y = np.zeros(len(X_pos), dtype=np.float32)

model.fit(train_X, dummy_y, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1)


### FAISS 建索引与 Recall 评估


In [None]:
# Embeddings for the full item catalogue (the raw article_id is used as the FAISS id).
all_items_raw = articles['article_id'].astype(int).unique()
all_items_enc = item_id_map.transform(all_items_raw)

item_embs = sasrec_item.predict({'pos_item': all_items_enc}, batch_size=4096, verbose=0).astype('float32')
# Normalise in place so that inner-product search ranks by cosine similarity.
faiss.normalize_L2(item_embs)

# Exact inner-product index wrapped in an id map, so search returns raw article ids.
index = faiss.IndexIDMap2(faiss.IndexFlatIP(item_embs.shape[1]))
index.add_with_ids(item_embs, all_items_raw.astype('int64'))

# Persist the index and the embedding matrix next to the other SASRec artifacts.
faiss.write_index(index, str(SASREC_DIR / 'faiss_index.bin'))
np.save(SASREC_DIR / 'item_embeddings.npy', item_embs)

print('indexed items:', index.ntotal)


In [None]:
def evaluate_recall(user_hist: Dict[int, List[int]], valid_last: pd.DataFrame, topk: int = 20) -> Dict[str, float]:
    """Compute hit-rate@k and NDCG@k of the SASRec + FAISS retrieval.

    For every row of ``valid_last`` whose user has a non-empty history, the
    user's last ``MAX_SEQ_LEN`` clicks are encoded with the user tower, the
    nearest items are fetched from the module-level FAISS ``index``, items the
    user already clicked are dropped, and the first ``topk`` remaining items
    are compared with the held-out click.

    Args:
        user_hist: Raw article-id click history per user, oldest first.
        valid_last: Frame with ``user_id`` and ``click_article_id`` columns
            holding the evaluation target for each user.
        topk: Length of the recommendation list that is scored.

    Returns:
        ``{'hit_rate@k': ..., 'ndcg@k': ..., 'num_users': ...}``. When no
        user can be evaluated, both metrics are ``0.0`` and ``num_users`` is 0.
    """
    eval_users: List[int] = []
    eval_seqs: List[List[int]] = []
    eval_targets: List[int] = []
    user_ids = valid_last['user_id'].astype(int).tolist()
    target_ids = valid_last['click_article_id'].astype(int).tolist()
    for u, t in zip(user_ids, target_ids):
        seq = user_hist.get(u)
        # Users without any training history cannot be encoded; skip them.
        if not seq:
            continue
        eval_users.append(u)
        eval_seqs.append(seq[-MAX_SEQ_LEN:])
        eval_targets.append(t)

    if not eval_users:
        # Nothing to score: return the zero metrics instead of running the
        # model and the index on an empty batch.
        return {f'hit_rate@{topk}': 0.0, f'ndcg@{topk}': 0.0, 'num_users': 0}

    X_seq_enc = pad_left([item_id_map.transform(np.asarray(s, dtype=np.int64)).tolist() for s in eval_seqs], max_len=MAX_SEQ_LEN)
    user_embs = sasrec_user.predict({'seq_ids': X_seq_enc}, batch_size=4096, verbose=0).astype('float32')
    faiss.normalize_L2(user_embs)

    # Over-fetch so that `topk` items usually remain after removing seen items.
    # NOTE(review): the filter below uses the FULL history, which can exceed
    # MAX_SEQ_LEN + 10 items, so heavy users may end up with fewer than `topk`.
    search_k = topk + MAX_SEQ_LEN + 10
    _, neighbour_ids = index.search(user_embs, search_k)

    hit = 0
    ndcg = 0.0
    for u, target, neighbours in zip(eval_users, eval_targets, neighbour_ids.tolist()):
        seen = set(user_hist[u])
        recs: List[int] = []
        for item_id in neighbours:
            # FAISS pads missing results with -1; also drop already-clicked items.
            if item_id < 0 or item_id in seen:
                continue
            recs.append(item_id)
            if len(recs) >= topk:
                break
        if target in recs:
            hit += 1
            # Single relevant item: DCG is 1 / log2(rank + 2) with a 0-based rank.
            ndcg += 1.0 / math.log2(recs.index(target) + 2)

    total = len(eval_users)
    return {f'hit_rate@{topk}': hit / max(1, total), f'ndcg@{topk}': ndcg / max(1, total), 'num_users': total}


# Score the retrieval on the validation clicks with a top-20 recommendation list.
metrics = evaluate_recall(user_hist, valid_last, topk=20)
metrics


In [None]:
# Generate the recall candidate set (consumed by the ranking stage).
TOPK_CANDIDATES = 100
# Over-fetch so that enough candidates usually remain after removing already-clicked items.
SEARCH_K = TOPK_CANDIDATES + MAX_SEQ_LEN + 10

rows = []
users_all = list(user_hist.keys())
batch_size = 4096
# Process users in chunks: encode histories, embed with the user tower, search the index.
for start in tqdm(range(0, len(users_all), batch_size), desc='recall_all_users'):
    end = min(start + batch_size, len(users_all))
    u_raw_batch = users_all[start:end]
    seq_raw_batch = [user_hist[u][-MAX_SEQ_LEN:] for u in u_raw_batch]
    X_seq_enc = pad_left([item_id_map.transform(np.asarray(s, dtype=np.int64)).tolist() for s in seq_raw_batch], max_len=MAX_SEQ_LEN)
    user_embs = sasrec_user.predict({'seq_ids': X_seq_enc}, batch_size=4096, verbose=0).astype('float32')
    faiss.normalize_L2(user_embs)
    D, I = index.search(user_embs, SEARCH_K)

    for local_i, u in enumerate(u_raw_batch):
        hist_set = set(user_hist.get(u, []))
        # rank counts only the candidates that survive filtering, starting at 1.
        rank = 0
        for item_id, score in zip(I[local_i].tolist(), D[local_i].tolist()):
            item_id = int(item_id)
            # Negative ids mark empty result slots.
            if item_id < 0:
                continue
            # Never recommend an item the user has already clicked.
            if item_id in hist_set:
                continue
            rank += 1
            rows.append((int(u), int(item_id), float(score), int(rank)))
            if rank >= TOPK_CANDIDATES:
                break

# One row per (user, candidate) with the similarity score and the 1-based rank.
sasrec_recall_df = pd.DataFrame(rows, columns=['user_id', 'article_id', 'recall_score', 'recall_rank'])
out_path = PROJECT_PATH / 'recall_candidates_sasrec.pkl'
sasrec_recall_df.to_pickle(out_path)

out_path, sasrec_recall_df.head()


In [None]:
# Save the models and the metrics.
model.save(SASREC_DIR / 'sasrec_inbatch.keras')
sasrec_user.save(SASREC_DIR / 'user_tower.keras')
sasrec_item.save(SASREC_DIR / 'item_tower.keras')

# The id map is needed at inference time to encode raw article ids the same way as in training.
with open(SASREC_DIR / 'item_id_map.pkl', 'wb') as f:
    pickle.dump(item_id_map, f)
with open(SASREC_DIR / 'metrics.pkl', 'wb') as f:
    pickle.dump(metrics, f)

print('saved to:', SASREC_DIR)


## Part B：DIEN-like（排序）

DIEN 的核心：

- 用 GRU 从行为序列抽取兴趣状态（Interest Extractor）
- 用注意力让兴趣与当前候选 item 对齐
- 再做一层“兴趣演化”（这里实现 AIGRU：注意力加权后再 GRU）

为了避免重复构造训练集，这里直接复用基础版 5.feature_engineering 的输出：`rank_train.pkl`。

如果你还没生成它，请先运行：`news_recommendation_system/5.feature_engineering.ipynb`。


In [None]:
# The ranking training set is produced by an earlier notebook; fail early with a clear message if it is missing.
rank_path = PROJECT_PATH / 'rank_train.pkl'
if not rank_path.exists():
    raise FileNotFoundError(f'Missing {rank_path}. Run 5.feature_engineering.ipynb first.')

rank_df = pd.read_pickle(rank_path)

# Reload train_hist (do not reuse the train_hist/user_hist above, which were subsampled for SASRec).
train_hist_full = pd.read_pickle(PROJECT_PATH / 'train_hist.pkl').sort_values(['user_id', 'click_timestamp'])
user_hist_full: Dict[int, List[int]] = train_hist_full.groupby('user_id')['click_article_id'].apply(list).to_dict()

# Split train/validation by user (80/20) so that no user appears on both sides.
rng = np.random.default_rng(42)
users = rank_df['user_id'].unique()
rng.shuffle(users)
split = int(len(users) * 0.8)
train_users = set(users[:split])

train_r = rank_df[rank_df['user_id'].isin(train_users)].copy()
valid_r = rank_df[~rank_df['user_id'].isin(train_users)].copy()

# Optionally cap the number of training rows.
MAX_TRAIN_ROWS = 300000
if DEBUG and len(train_r) > MAX_TRAIN_ROWS:
    train_r = train_r.sample(MAX_TRAIN_ROWS, random_state=42)

train_r[['label']].value_counts(), valid_r[['label']].value_counts(), len(train_r), len(valid_r)


In [None]:
# Pick a handful of dense features (similar to the DIN setup).
DENSE_COLS = [
    'recall_score', 'recall_rank',
    'user_click_count', 'user_unique_items',
    'item_click_count', 'words_count',
    'item_age_hours', 'time_gap_hours',
    'emb_sim_last', 'is_same_category',
]

# Standardise the dense features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_dense = scaler.fit_transform(train_r[DENSE_COLS].fillna(0).values.astype('float32'))
X_valid_dense = scaler.transform(valid_r[DENSE_COLS].fillna(0).values.astype('float32'))

y_train = train_r['label'].values.astype('float32')
y_valid = valid_r['label'].values.astype('float32')

# user/item encoding (use the full vocabulary to avoid unknown ids in the history).
# User codes start at 1; 0 is the fallback for ids missing from the mapping.
all_users = pd.unique(pd.concat([train_r['user_id'], valid_r['user_id']]).astype(int))
raw_to_user_enc = {int(v): int(i + 1) for i, v in enumerate(np.sort(all_users))}
user_vocab_size = int(len(raw_to_user_enc) + 1)

train_user = np.asarray([raw_to_user_enc.get(int(x), 0) for x in train_r['user_id'].values], dtype=np.int32)
valid_user = np.asarray([raw_to_user_enc.get(int(x), 0) for x in valid_r['user_id'].values], dtype=np.int32)

train_item = item_id_map.transform(train_r['article_id'].astype(int).values)
valid_item = item_id_map.transform(valid_r['article_id'].astype(int).values)

# History sequences (taken from train_hist): last MAX_HIST_LEN clicks per user, encoded and left-padded.
MAX_HIST_LEN = 30
hist_map = {int(u): pad_left([item_id_map.transform(np.asarray(seq[-MAX_HIST_LEN:], dtype=np.int64)).tolist()], max_len=MAX_HIST_LEN)[0] for u, seq in user_hist_full.items()}

# One history row per ranking sample; users without any history get an all-zero (all-padding) row.
train_hist_mat = np.vstack([hist_map.get(int(u), np.zeros(MAX_HIST_LEN, dtype=np.int32)) for u in train_r['user_id'].values])
valid_hist_mat = np.vstack([hist_map.get(int(u), np.zeros(MAX_HIST_LEN, dtype=np.int32)) for u in valid_r['user_id'].values])

train_hist_mat.shape, train_item.shape


In [None]:
class SimpleAttention(tf.keras.layers.Layer):
    """Target-conditioned attention that returns softmax weights over a sequence.

    Every key is scored by an MLP applied to ``[q, k, q - k, q * k]`` where
    ``q`` is the query repeated along the sequence axis. Masked positions
    receive a large negative score before the softmax.

    Args:
        hidden_units: Widths of the ReLU layers in the scoring MLP. Defaults
            to ``[80, 40]`` when omitted.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, hidden_units: 'List[int] | None' = None, **kwargs):
        super().__init__(**kwargs)
        # A ``None`` sentinel replaces the former mutable list default; the
        # widths are stored as a tuple so ``get_config`` can report them.
        self.hidden_units = (80, 40) if hidden_units is None else tuple(int(u) for u in hidden_units)
        self.mlp = [tf.keras.layers.Dense(u, activation='relu') for u in self.hidden_units]
        self.out = tf.keras.layers.Dense(1, activation=None)

    def get_config(self):
        """Return the layer config including ``hidden_units`` so a saved model can rebuild the layer."""
        config = super().get_config()
        config.update({'hidden_units': list(self.hidden_units)})
        return config

    def call(self, inputs):
        """Compute attention weights.

        Args:
            inputs: ``(query, keys, mask)`` with query ``[B, D]``, keys
                ``[B, L, H]`` and a ``[B, L]`` mask whose positive entries
                mark valid positions. ``D`` must equal ``H`` because the
                two are subtracted and multiplied element-wise.

        Returns:
            Attention weights of shape ``[B, L]`` that sum to 1 over ``L``.
        """
        query, keys, mask = inputs  # query: [B, D], keys: [B, L, H]
        q = tf.expand_dims(query, axis=1)
        q = tf.tile(q, [1, tf.shape(keys)[1], 1])
        x = tf.concat([q, keys, q - keys, q * keys], axis=-1)
        for dense in self.mlp:
            x = dense(x)
        scores = tf.squeeze(self.out(x), axis=-1)
        # Push masked positions to a very negative score so they get ~0 weight.
        # NOTE: a row with no valid position ends up with uniform weights.
        paddings = tf.ones_like(scores) * (-1e9)
        scores = tf.where(mask > 0, scores, paddings)
        weights = tf.nn.softmax(scores, axis=-1)
        return weights  # [B, L]


def build_dien_like(user_vocab_size: int, item_vocab_size: int, hist_len: int, dense_dim: int, emb_dim: int = 16, gru_units: int = 32):
    """Build and compile a DIEN-like ranking model (GRU + attention + AIGRU).

    Pipeline: the history items are embedded and passed through a GRU
    (interest extractor); ``SimpleAttention`` scores each hidden state against
    the target item; the hidden states are scaled by those weights and fed to
    a second GRU (interest evolution). The evolved state is concatenated with
    the user embedding, the target embedding and the dense features and
    passed through an MLP with a sigmoid output.

    Args:
        user_vocab_size: Size of the user embedding table.
        item_vocab_size: Size of the item embedding table (shared by the
            history and the target item).
        hist_len: Fixed length of the history input.
        dense_dim: Number of dense features.
        emb_dim: Embedding width for users and items.
        gru_units: Hidden size of both GRUs. The target embedding is combined
            element-wise with the GRU states inside the attention layer.
            NOTE(review): with the defaults (16 vs 32) those widths differ —
            confirm the attention input shapes are compatible.

    Returns:
        A compiled ``tf.keras.Model`` taking the inputs ``user_id``,
        ``hist_items``, ``target_item`` and ``dense`` and returning a
        probability of shape ``[B, 1]``.
    """
    user_inp = tf.keras.layers.Input(shape=(), dtype=tf.int32, name='user_id')
    hist_inp = tf.keras.layers.Input(shape=(hist_len,), dtype=tf.int32, name='hist_items')
    item_inp = tf.keras.layers.Input(shape=(), dtype=tf.int32, name='target_item')
    dense_inp = tf.keras.layers.Input(shape=(dense_dim,), dtype=tf.float32, name='dense')

    user_emb = tf.keras.layers.Flatten()(tf.keras.layers.Embedding(user_vocab_size, emb_dim)(user_inp))
    item_emb_layer = tf.keras.layers.Embedding(item_vocab_size, emb_dim, mask_zero=True)
    target_emb = tf.keras.layers.Flatten()(item_emb_layer(item_inp))
    hist_emb = item_emb_layer(hist_inp)

    # Raw tf.* ops are wrapped in Lambda layers (as build_sasrec_inbatch does):
    # newer Keras versions reject TensorFlow functions applied directly to
    # symbolic Keras tensors.
    mask = tf.keras.layers.Lambda(
        lambda t: tf.cast(tf.not_equal(t, 0), tf.int32), name='hist_mask'
    )(hist_inp)  # [B, L], 1 for real items and 0 for padding

    # Interest Extractor: GRU hidden states
    hs = tf.keras.layers.GRU(gru_units, return_sequences=True)(hist_emb)

    # Attention weights conditioned on target
    att = SimpleAttention()([target_emb, hs, mask])  # [B, L]
    att = tf.keras.layers.Lambda(lambda t: tf.expand_dims(t, axis=-1), name='att_expand')(att)  # [B, L, 1]

    # AIGRU: attention * hidden states -> GRU
    aigru_inp = hs * att
    evolved = tf.keras.layers.GRU(gru_units, return_sequences=False)(aigru_inp)

    x = tf.keras.layers.Concatenate()([user_emb, target_emb, evolved, dense_inp])
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    out = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    out = tf.keras.layers.Flatten()(out)

    model = tf.keras.Model(inputs={'user_id': user_inp, 'hist_items': hist_inp, 'target_item': item_inp, 'dense': dense_inp}, outputs=out, name='DIEN_like')
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(name='auc')])
    return model


# Instantiate the ranking model; the item vocabulary is shared with Part A and the dense width follows the scaled feature matrix.
dien = build_dien_like(user_vocab_size=user_vocab_size, item_vocab_size=item_id_map.vocab_size, hist_len=MAX_HIST_LEN, dense_dim=X_train_dense.shape[1])
dien.summary()


In [None]:
# Input dicts keyed by the model's input layer names.
train_inputs = {'user_id': train_user, 'hist_items': train_hist_mat, 'target_item': train_item, 'dense': X_train_dense}
valid_inputs = {'user_id': valid_user, 'hist_items': valid_hist_mat, 'target_item': valid_item, 'dense': X_valid_dense}

# Train for two epochs and report loss/AUC on the held-out users after each one.
dien.fit(train_inputs, y_train, batch_size=2048, epochs=2, validation_data=(valid_inputs, y_valid), verbose=1)


In [None]:
def hit_ndcg_at_k(df: pd.DataFrame, preds: np.ndarray, k: int = 5) -> Dict[str, float]:
    """Compute per-user hit-rate@k and NDCG@k for ranked candidates.

    Candidates are grouped by ``user_id`` and sorted by predicted score in
    descending order. A user counts as a hit when any of the top ``k``
    candidates has a positive label; NDCG uses the position of the first
    such candidate. Users without a positive in their top ``k`` contribute
    zero to both metrics but still count in the denominator.

    Args:
        df: Frame with ``user_id`` and ``label`` columns, one row per candidate.
        preds: Scores aligned positionally with the rows of ``df``; shape
            ``(N,)`` or ``(N, 1)``.
        k: Cut-off for the ranked list.

    Returns:
        ``{'hit_rate@k': ..., 'ndcg@k': ..., 'num_users': ...}``.
    """
    scored = df[['user_id', 'label']].copy()
    # Model outputs are typically (N, 1); flatten so the column is 1-D.
    scored['pred'] = np.asarray(preds).reshape(-1)

    hit = 0
    ndcg = 0.0
    total = 0
    for _, grp in scored.groupby('user_id'):
        top_labels = grp.sort_values('pred', ascending=False)['label'].to_numpy()[:k]
        # Use the same "label > 0" test for both the hit and its rank, so a
        # positive label other than exactly 1 cannot cause a lookup failure.
        positives = np.flatnonzero(top_labels > 0)
        if positives.size:
            hit += 1
            ndcg += 1.0 / math.log2(int(positives[0]) + 2)
        total += 1
    return {f'hit_rate@{k}': hit / max(1, total), f'ndcg@{k}': ndcg / max(1, total), 'num_users': total}


# Score the validation candidates and report per-user hit-rate / NDCG at 5.
pred_valid = dien.predict(valid_inputs, batch_size=4096, verbose=0)
hit_ndcg_at_k(valid_r, pred_valid, k=5)


In [None]:
# Persist the ranking model together with the preprocessing state needed to reproduce its inputs.
DIEN_DIR = PROJECT_PATH / 'artifacts' / 'ranking' / 'dien_like'
DIEN_DIR.mkdir(parents=True, exist_ok=True)

dien.save(DIEN_DIR / 'dien_like.keras')
# The fitted scaler and the raw-user-id -> code mapping must match what the model was trained with.
with open(DIEN_DIR / 'scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open(DIEN_DIR / 'encoders.pkl', 'wb') as f:
    pickle.dump({'raw_to_user_enc': raw_to_user_enc}, f)

print('saved to:', DIEN_DIR)
