# 8. 深度排序（DeepFM / DIN）

这一节把基础版的 `LGBMRanker` 排序升级为深度学习排序（工业常见）：

- **DeepFM**：FM 二阶交叉 + DNN 高阶交叉
- **DIN**：对用户历史序列做注意力（兴趣与候选 item 相关）

并补齐更贴近生产的流程：

1) 召回候选集（可融合 ItemCF / 双塔 / 热门）

2) 特征工程（复用基础版逻辑）

3) 深度排序训练与评估（HitRate@K / NDCG@K）

## 面试要点

- 为什么要两阶段（Recall→Rank）：效率 vs 效果
- DeepFM vs DIN：特征交叉 vs 序列兴趣
- 训练样本怎么构造：候选集 + 正负样本
- 评估：按 user 分组的 TopK 指标（避免把样本当独立）


In [9]:
import os
import math
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from dotenv import find_dotenv, load_dotenv
from tqdm import tqdm

tf.get_logger().setLevel('ERROR')


def find_repo_root(start: Path) -> Path:
    """Walk upwards from *start* looking for a repository marker.

    A directory counts as the repository root when it contains either a
    ``pyproject.toml`` or a ``.git`` entry. At most ten directories are
    inspected (``start`` plus nine ancestors); if none qualifies, or the
    filesystem root is reached first, ``start`` itself is returned.
    """
    markers = ('pyproject.toml', '.git')
    candidate = start
    for _ in range(10):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
        parent = candidate.parent
        if parent == candidate:
            # Reached the filesystem root: nothing further to inspect.
            break
        candidate = parent
    return start


# Resolve the repository root from the directory the notebook runs in.
REPO_ROOT = find_repo_root(Path.cwd())
# Load a .env file only when find_dotenv returned a path (usecwd=True anchors the search at the cwd).
dotenv_path = find_dotenv(usecwd=True)
if dotenv_path:
    load_dotenv(dotenv_path)
# setdefault leaves values that are already present in the environment untouched.
os.environ.setdefault('FUNREC_RAW_DATA_PATH', str(REPO_ROOT / 'data'))
os.environ.setdefault('FUNREC_PROCESSED_DATA_PATH', str(REPO_ROOT / 'tmp'))

RAW_DATA_PATH = Path(os.getenv('FUNREC_RAW_DATA_PATH'))
PROCESSED_DATA_PATH = Path(os.getenv('FUNREC_PROCESSED_DATA_PATH'))

# Prefer <raw>/dataset/news_recommendation; fall back to <raw>/news_recommendation.
DATA_PATH = RAW_DATA_PATH / 'dataset' / 'news_recommendation'
if not DATA_PATH.exists():
    DATA_PATH = RAW_DATA_PATH / 'news_recommendation'

PROJECT_PATH = PROCESSED_DATA_PATH / 'projects' / 'news_recommendation_system'
# Output directory for everything this notebook saves; created eagerly.
ARTIFACTS_DIR = PROJECT_PATH / 'artifacts' / 'ranking' / 'deep_models'
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Cell output: show the resolved paths.
DATA_PATH, PROJECT_PATH, ARTIFACTS_DIR


(PosixPath('/Users/wangjunfei/Desktop/fun-rec/data/dataset/news_recommendation'),
 PosixPath('/Users/wangjunfei/Desktop/fun-rec/tmp/projects/news_recommendation_system'),
 PosixPath('/Users/wangjunfei/Desktop/fun-rec/tmp/projects/news_recommendation_system/artifacts/ranking/deep_models'))

In [10]:
# ==================== Load base data and the offline split ====================
train_hist = pd.read_pickle(PROJECT_PATH / 'train_hist.pkl')
valid_last = pd.read_pickle(PROJECT_PATH / 'valid_last.pkl')
articles = pd.read_csv(DATA_PATH / 'articles.csv')

# Per-user click history in timestamp order (consumed by DIN later on).
train_hist_sorted = train_hist.sort_values(['user_id', 'click_timestamp'])
user_hist: Dict[int, List[int]] = train_hist_sorted.groupby('user_id')['click_article_id'].apply(list).to_dict()

# Cell output: row counts of both frames and the number of users with a history.
len(train_hist), len(valid_last), len(user_hist)


(912623, 200000, 200000)

## 1) 召回候选集（可融合多路 recall）

你可以把候选集理解为“精排的检索空间”。本 notebook 支持：

- 基础版候选：`recall_candidates.pkl`（来自 4.recall）
- 双塔候选：`recall_candidates_two_tower.pkl`（来自 7.two_tower_recall）

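默认用 **RRF（倒数排名融合，Reciprocal Rank Fusion）** 合并：每路召回先在 user 内按 `recall_score` 排名，贡献 `w / (rank + 1)`，再对同一 (user, item) 的贡献求和。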


In [11]:
# Toggles for the recall sources that are fused below.
USE_BASELINE_RECALL = True
USE_TWO_TOWER_RECALL = True
# Passed as topk to rrf_merge: candidates kept per user after fusion.
MAX_CANDIDATES = 100


def load_recall_df(path: Path, name: str) -> pd.DataFrame:
    """Read one pickled recall result and tag every row with its source name.

    Args:
        path: Pickle file holding at least ``user_id``, ``article_id`` and
            ``recall_score`` columns.
        name: Label written to the ``source`` column.

    Returns:
        A copy restricted to the three recall columns plus ``source``. When
        *path* does not exist a ``[skip]`` message is printed and an empty
        frame with the same four columns is returned.
    """
    wanted = ['user_id', 'article_id', 'recall_score']
    if path.exists():
        frame = pd.read_pickle(path)[wanted].copy()
        frame['source'] = name
        return frame
    print(f'[skip] {name}: {path} not found')
    return pd.DataFrame(columns=wanted + ['source'])


def rrf_merge(
    dfs: List[pd.DataFrame],
    weights: Dict[str, float],
    topk: int = 100,
    rank_offset: float = 1.0,
) -> pd.DataFrame:
    """Fuse several recall lists with weighted Reciprocal Rank Fusion.

    Inside every source the candidates of a user are ranked by
    ``recall_score`` (1 = best, ties broken by row order). Each row then
    contributes ``w / (rank + rank_offset)`` and the contributions of the
    same ``(user_id, article_id)`` pair are summed across sources.

    Args:
        dfs: Recall frames with ``user_id``, ``article_id``, ``recall_score``
            and ``source`` columns. Empty frames are skipped. The source name
            is read from the first row of each frame.
        weights: Per-source weight looked up by source name; sources that are
            not listed get a weight of 1.0.
        topk: Maximum number of fused candidates kept per user.
        rank_offset: Smoothing constant added to the rank. The default of 1.0
            reproduces the previous hard-coded behaviour; larger values
            flatten the gap between top and tail ranks.

    Returns:
        A frame with ``user_id``, ``article_id``, the fused ``recall_score``
        and ``recall_rank`` (1 = best), restricted to the top *topk* rows per
        user. The index is whatever survives the filter; it is not reset.

    Raises:
        ValueError: If *rank_offset* is negative or every frame is empty.
    """
    if rank_offset < 0:
        raise ValueError('rank_offset must be non-negative.')

    parts = []
    for df in dfs:
        if df.empty:
            continue
        name = df['source'].iloc[0]
        w = float(weights.get(name, 1.0))
        rank = df.groupby('user_id')['recall_score'].rank(ascending=False, method='first')
        # Only the two key columns are carried forward; the input is never modified.
        parts.append(df[['user_id', 'article_id']].assign(rrf=w / (rank + rank_offset)))

    if not parts:
        raise ValueError('No recall sources available.')

    fused = pd.concat(parts, axis=0, ignore_index=True)
    fused = fused.groupby(['user_id', 'article_id'], as_index=False)['rrf'].sum()
    fused = fused.rename(columns={'rrf': 'recall_score'})
    fused['recall_rank'] = fused.groupby('user_id')['recall_score'].rank(ascending=False, method='first')
    return fused[fused['recall_rank'] <= topk]


# Collect the enabled recall sources; a source whose file is missing loads as
# an empty frame and is left out.
sources = []
weights = {}

if USE_BASELINE_RECALL:
    df_base = load_recall_df(PROJECT_PATH / 'recall_candidates.pkl', 'baseline')
    if not df_base.empty:
        sources.append(df_base)
        weights['baseline'] = 1.0

if USE_TWO_TOWER_RECALL:
    df_tt = load_recall_df(PROJECT_PATH / 'recall_candidates_two_tower.pkl', 'two_tower')
    if not df_tt.empty:
        sources.append(df_tt)
        weights['two_tower'] = 1.0

# rrf_merge raises ValueError when no source could be loaded.
recall_df = rrf_merge(sources, weights=weights, topk=MAX_CANDIDATES)
# Cell output: preview, number of users, number of candidate rows.
recall_df.head(), recall_df['user_id'].nunique(), len(recall_df)


(   user_id  article_id  recall_score  recall_rank
 2        0       16346      0.031250         61.0
 3        0       18187      0.142857         11.0
 4        0       22019      0.033333         57.0
 5        0       22024      0.200000          7.0
 6        0       22529      0.020833         93.0,
 20000,
 2000000)

## 2) 特征工程（与基础版一致）

深度排序与树模型一样，需要构造用户/物品/交互/召回特征。

为了保持可复现与可对比，这里尽量复用基础版 `5.feature_engineering.ipynb` 的逻辑。


In [12]:
# User features (all derived from train_hist only)
user_click_count = train_hist.groupby('user_id').size().rename('user_click_count')
user_unique_items = train_hist.groupby('user_id')['click_article_id'].nunique().rename('user_unique_items')
user_last_click_ts = train_hist.groupby('user_id')['click_timestamp'].max().rename('user_last_click_ts')

click_with_cat = train_hist.merge(articles, left_on='click_article_id', right_on='article_id', how='left')
# Most frequently clicked category per user.
# NOTE(review): idxmax raises on an empty value_counts, i.e. if a user has no clicked article with a category — confirm this cannot happen.
user_top_category = click_with_cat.groupby('user_id')['category_id'].agg(lambda x: x.value_counts().idxmax()).rename('user_top_category')

user_features = pd.concat([user_click_count, user_unique_items, user_last_click_ts, user_top_category], axis=1).reset_index()

# Item features
item_click_count = train_hist.groupby('click_article_id').size().rename('item_click_count')
item_last_click_ts = train_hist.groupby('click_article_id')['click_timestamp'].max().rename('item_last_click_ts')

item_features = (
    articles.merge(item_click_count, left_on='article_id', right_index=True, how='left')
    .merge(item_last_click_ts, left_on='article_id', right_index=True, how='left')
)
# Articles that never appear in train_hist get a click count and last-click timestamp of 0.
item_features['item_click_count'] = item_features['item_click_count'].fillna(0)
item_features['item_last_click_ts'] = item_features['item_last_click_ts'].fillna(0)

# Each user's last click (based on train_hist)
user_last_click = (
    train_hist.sort_values(['user_id', 'click_timestamp'])
    .groupby('user_id')
    .tail(1)[['user_id', 'click_article_id', 'click_timestamp']]
    .rename(columns={'click_article_id': 'last_click_article_id', 'click_timestamp': 'last_click_timestamp'})
)

# Assemble the candidate sample table: one row per (user, recalled article)
candidates = (
    recall_df.merge(user_features, on='user_id', how='left')
    .merge(user_last_click, on='user_id', how='left')
    .merge(item_features, left_on='article_id', right_on='article_id', how='left')
)

candidates['is_same_category'] = (candidates['category_id'] == candidates['user_top_category']).astype(int)
# Dividing by 3600_000 turns the differences into hours, presumably from millisecond timestamps — TODO confirm the unit.
candidates['item_age_hours'] = (candidates['last_click_timestamp'] - candidates['created_at_ts']) / 3600_000
candidates['time_gap_hours'] = (candidates['last_click_timestamp'] - candidates['item_last_click_ts']) / 3600_000
candidates[['item_age_hours', 'time_gap_hours']] = candidates[['item_age_hours', 'time_gap_hours']].fillna(0)

# label: the target item of each user taken from valid_last
target = valid_last[['user_id', 'click_article_id']].rename(columns={'click_article_id': 'target_article_id'})
candidates = candidates.merge(target, on='user_id', how='left')
# A candidate is positive only when it equals the user's held-out click.
candidates['label'] = (candidates['article_id'] == candidates['target_article_id']).astype(int)

candidates[['user_id', 'article_id', 'recall_score', 'recall_rank', 'label']].head()


Unnamed: 0,user_id,article_id,recall_score,recall_rank,label
0,0,16346,0.03125,61.0,0
1,0,18187,0.142857,11.0,0
2,0,22019,0.033333,57.0,0
3,0,22024,0.2,7.0,0
4,0,22529,0.020833,93.0,0


In [13]:
# Optional: content-embedding similarity feature (emb_sim_last)
USE_CONTENT_EMB_SIM = True

if USE_CONTENT_EMB_SIM:
    article_emb = pd.read_csv(DATA_PATH / 'articles_emb.csv')
    emb_cols = [c for c in article_emb.columns if c.startswith('emb_')]
    emb_matrix = article_emb[emb_cols].values.astype('float32')
    # L2-normalise each row so the dot product below is a cosine similarity;
    # the epsilon guards against all-zero rows.
    emb_matrix /= np.linalg.norm(emb_matrix, axis=1, keepdims=True) + 1e-12
    article_ids = article_emb['article_id'].values
    # article_id -> row position in emb_matrix
    id2idx = {int(a): int(i) for i, a in enumerate(article_ids)}

    cand_idx = candidates['article_id'].map(id2idx)
    last_idx = candidates['last_click_article_id'].map(id2idx)
    # Only rows where both the candidate and the last clicked article have an embedding.
    mask = cand_idx.notna() & last_idx.notna()

    # Rows without both embeddings keep a similarity of 0.
    sim = np.zeros(len(candidates), dtype='float32')
    sim[mask.values] = (
        emb_matrix[cand_idx[mask].astype(int)]
        * emb_matrix[last_idx[mask].astype(int)]
    ).sum(axis=1)
    candidates['emb_sim_last'] = sim
else:
    candidates['emb_sim_last'] = 0.0

candidates[['emb_sim_last']].describe()


Unnamed: 0,emb_sim_last
count,2000000.0
mean,0.3432449
std,0.2977172
min,-0.5161761
25%,0.1268502
50%,0.2505311
75%,0.5093505
max,1.0


## 3) 训练/验证划分（按 user 划分）


In [14]:
# Split by user: 80% of the shuffled users go to training, the rest to validation,
# so no user contributes rows to both sides.
rng = np.random.default_rng(42)
users = candidates['user_id'].unique()
rng.shuffle(users)
split = int(len(users) * 0.8)
train_users = set(users[:split])

train_df = candidates[candidates['user_id'].isin(train_users)].copy()
valid_df = candidates[~candidates['user_id'].isin(train_users)].copy()

# Optional: cap the training size (keeps the notebook from being too slow).
# The sample is drawn over rows, so a training user's candidate list may only be partially kept.
MAX_TRAIN_ROWS = 300000 if USE_CONTENT_EMB_SIM else 500000
if len(train_df) > MAX_TRAIN_ROWS:
    train_df = train_df.sample(MAX_TRAIN_ROWS, random_state=42)

# Cell output: label distribution and size of both splits.
train_df[['label']].value_counts(), valid_df[['label']].value_counts(), len(train_df), len(valid_df)


(label
 0        298809
 1          1191
 Name: count, dtype: int64,
 label
 0        398455
 1          1545
 Name: count, dtype: int64,
 300000,
 400000)

## 4) DeepFM

输入：

- sparse：user_id, article_id, category_id, user_top_category, last_click_article_id
- dense：recall_score, recall_rank, user/item 统计特征, 时间特征, emb_sim_last


In [15]:
# Categorical columns: each one is integer-encoded and fed to its own embedding.
SPARSE_COLS = ['user_id', 'article_id', 'category_id', 'user_top_category', 'last_click_article_id']
# Numeric columns: standardised and fed to the models as a single 'dense' input.
DENSE_COLS = [
    'recall_score',
    'recall_rank',
    'user_click_count',
    'user_unique_items',
    'item_click_count',
    'words_count',
    'item_age_hours',
    'time_gap_hours',
    'emb_sim_last',
    'is_same_category',
]


def encode_categorical(train_s: pd.Series, valid_s: pd.Series) -> Tuple[np.ndarray, np.ndarray, int, List[str]]:
    """Integer-encode a categorical column over train and validation jointly.

    Values are compared through their string form and the vocabulary is
    sorted, so codes follow lexicographic order. Codes start at 1; index 0
    is reserved and never produced here.

    Returns:
        ``(train_codes, valid_codes, vocab_size, uniques)`` where both code
        arrays are ``int32``, ``vocab_size`` is ``len(uniques) + 1`` and
        ``uniques`` lists the string values in code order.
    """
    n_train = len(train_s)
    combined = pd.concat([train_s, valid_s], axis=0).astype(str)
    raw_codes, vocab = pd.factorize(combined, sort=True)
    shifted = raw_codes.astype(np.int32) + 1  # shift so that 0 stays reserved
    return shifted[:n_train], shifted[n_train:], int(len(vocab) + 1), list(vocab)


# Encode the sparse columns
train_sparse = {}
valid_sparse = {}
vocab_sizes = {}
factor_uniques = {}
for col in SPARSE_COLS:
    # Missing values become 0 before encoding, so they share one code per column.
    tr, va, vs, uniq = encode_categorical(train_df[col].fillna(0), valid_df[col].fillna(0))
    train_sparse[col] = tr
    valid_sparse[col] = va
    vocab_sizes[col] = vs
    factor_uniques[col] = uniq

# Standardise the dense columns; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_dense = scaler.fit_transform(train_df[DENSE_COLS].fillna(0).values.astype('float32'))
X_valid_dense = scaler.transform(valid_df[DENSE_COLS].fillna(0).values.astype('float32'))

y_train = train_df['label'].values.astype('float32')
y_valid = valid_df['label'].values.astype('float32')

len(y_train), len(y_valid)


(300000, 400000)

In [16]:
def build_deepfm(sparse_vocab_sizes: Dict[str, int], dense_dim: int, emb_dim: int = 16, dnn_units: List[int] = [128, 64]):
    """Build and compile a DeepFM model (linear + FM second-order + DNN).

    Args:
        sparse_vocab_sizes: Feature name -> vocabulary size. Every entry gets
            a scalar integer input of that name, an ``emb_dim`` embedding and
            a 1-d embedding used as its linear term.
        dense_dim: Width of the float input named ``'dense'``.
        emb_dim: Embedding size shared by all sparse features.
        dnn_units: Hidden layer sizes of the deep tower; each layer is
            followed by dropout with rate 0.2. The default list is only read.

    Returns:
        A compiled ``tf.keras.Model`` named ``'DeepFM'`` whose inputs are a
        dict keyed by feature name plus ``'dense'`` and whose output is the
        sigmoid of the summed logits. It is compiled with binary
        cross-entropy, an AUC metric and Adam at learning rate 1e-3.
    """
    inputs = {}
    embed_vecs = []
    linear_terms = []

    for feat, vocab_size in sparse_vocab_sizes.items():
        inp = tf.keras.layers.Input(shape=(), dtype=tf.int32, name=feat)
        inputs[feat] = inp

        emb = tf.keras.layers.Embedding(vocab_size, emb_dim, name=f'emb_{feat}')(inp)
        emb = tf.keras.layers.Flatten()(emb)
        embed_vecs.append(emb)

        # First-order (linear) weight of the feature value: a 1-d embedding.
        lin = tf.keras.layers.Embedding(vocab_size, 1, name=f'lin_{feat}')(inp)
        lin = tf.keras.layers.Flatten()(lin)
        linear_terms.append(lin)

    dense_inp = tf.keras.layers.Input(shape=(dense_dim,), dtype=tf.float32, name='dense')
    inputs['dense'] = dense_inp

    # FM second-order interactions: 0.5 * sum((sum_f v_f)^2 - sum_f v_f^2)
    # Note: newer Keras versions do not allow calling tf.reduce_sum/tf.square directly on a KerasTensor; wrapping in Lambda is more robust.
    stack = tf.keras.layers.Lambda(lambda x: tf.stack(x, axis=1), name='fm_stack')(embed_vecs)  # [B, F, D]
    summed = tf.keras.layers.Lambda(lambda t: tf.reduce_sum(t, axis=1), name='fm_sum')(stack)  # [B, D]
    sum_square = tf.keras.layers.Lambda(lambda t: tf.square(t), name='fm_sum_square')(summed)  # [B, D]
    square_sum = tf.keras.layers.Lambda(lambda t: tf.reduce_sum(tf.square(t), axis=1), name='fm_square_sum')(stack)  # [B, D]
    fm = tf.keras.layers.Lambda(lambda z: 0.5 * tf.reduce_sum(z[0] - z[1], axis=1, keepdims=True), name='fm_logit')([sum_square, square_sum])  # [B, 1]

    # Deep part: all embeddings concatenated with the dense features
    dnn_inp = tf.keras.layers.Concatenate()(embed_vecs + [dense_inp])
    x = dnn_inp
    for i, units in enumerate(dnn_units):
        x = tf.keras.layers.Dense(units, activation='relu', name=f'deep_dense_{i}')(x)
        x = tf.keras.layers.Dropout(0.2, name=f'deep_dropout_{i}')(x)
    deep_logit = tf.keras.layers.Dense(1, activation=None, name='deep_logit')(x)

    # Linear part
    linear_sparse = tf.keras.layers.Add()(linear_terms) if len(linear_terms) > 1 else linear_terms[0]
    # The layer must be named explicitly here, otherwise it clashes with Input(name='dense') and raises ValueError
    linear_dense = tf.keras.layers.Dense(1, activation=None, name='linear_dense')(dense_inp)

    # Final logit is the sum of the linear, FM and deep logits.
    logit = tf.keras.layers.Add()([linear_sparse, linear_dense, fm, deep_logit])
    out = tf.keras.layers.Activation('sigmoid')(logit)
    out = tf.keras.layers.Flatten()(out)

    model = tf.keras.Model(inputs=inputs, outputs=out, name='DeepFM')
    # Prefer the legacy Adam optimizer when it can be constructed; otherwise use the current one.
    # NOTE(review): the broad `except Exception` hides why the legacy optimizer was unavailable.
    try:
        optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-3)
    except Exception:
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(name='auc')])
    return model


# Vocabulary sizes are passed in SPARSE_COLS order, which fixes the order of the model inputs.
deepfm = build_deepfm({k: vocab_sizes[k] for k in SPARSE_COLS}, dense_dim=X_train_dense.shape[1])
deepfm.summary()


Model: "DeepFM"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_id (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 article_id (InputLayer)     [(None,)]                    0         []                            
                                                                                                  
 category_id (InputLayer)    [(None,)]                    0         []                            
                                                                                                  
 user_top_category (InputLa  [(None,)]                    0         []                            
 yer)                                                                                        

In [17]:
# Inputs are fed by name: one code array per sparse feature plus the scaled 'dense' block.
train_inputs = dict(train_sparse, dense=X_train_dense)
valid_inputs = dict(valid_sparse, dense=X_valid_dense)

# Short two-epoch run with the held-out users as validation data.
deepfm.fit(
    train_inputs,
    y_train,
    validation_data=(valid_inputs, y_valid),
    batch_size=2048,
    epochs=2,
    verbose=1,
)


Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x30b1470d0>

In [18]:
def hit_ndcg_at_k(df: pd.DataFrame, preds: np.ndarray, k: int = 5) -> Dict[str, float]:
    """Compute user-level HitRate@k and NDCG@k.

    Rows are grouped by ``user_id`` and ordered by descending score. A user
    counts as a hit when one of their top-*k* rows has a positive label. The
    NDCG contribution is ``1 / log2(position + 2)`` for the best-ranked
    positive row (0-based position), which assumes one relevant item per user.

    Args:
        df: Frame with ``user_id`` and ``label`` columns, one row per scored
            candidate. Any ``label > 0`` is treated as relevant.
        preds: One score per row of *df*, in row order. Both ``(N,)`` and
            ``(N, 1)`` arrays (as returned by Keras ``predict``) are accepted.
        k: Cut-off of the ranking.

    Returns:
        ``{'hit_rate@k': ..., 'ndcg@k': ..., 'num_users': ...}`` averaged over
        all users in *df*. All values are 0 for an empty frame.

    Raises:
        ValueError: If the number of scores differs from the number of rows.
    """
    scores = np.asarray(preds).reshape(-1)
    if len(scores) != len(df):
        raise ValueError(f'preds has {len(scores)} scores but df has {len(df)} rows')

    ranked = pd.DataFrame(
        {
            'user_id': df['user_id'].to_numpy(),
            'label': df['label'].to_numpy(),
            'pred': scores,
        }
    ).sort_values(['user_id', 'pred'], ascending=[True, False])
    # 0-based position of each row inside its user's ranking.
    ranked['pos'] = ranked.groupby('user_id').cumcount()

    total = int(ranked['user_id'].nunique())
    # The same `label > 0` test decides both the hit and its position, so
    # graded labels cannot pass one check and fail the other.
    relevant_in_top_k = ranked[(ranked['pos'] < k) & (ranked['label'] > 0)]
    first_hit = relevant_in_top_k.groupby('user_id')['pos'].min()

    hit = len(first_hit)
    ndcg = float((1.0 / np.log2(first_hit.to_numpy(dtype='float64') + 2.0)).sum())
    denom = max(1, total)
    return {f'hit_rate@{k}': hit / denom, f'ndcg@{k}': ndcg / denom, 'num_users': total}


# Score every validation candidate, then evaluate the per-user top-5 ranking.
pred_valid_deepfm = deepfm.predict(valid_inputs, batch_size=4096, verbose=0)
hit_ndcg_at_k(valid_df, pred_valid_deepfm, k=5)


{'hit_rate@5': 0.2175, 'ndcg@5': 0.1475663304779215, 'num_users': 4000}

## 5) DIN（带序列注意力）

DIN 的关键是 attention pooling：对同一个用户的历史序列，会根据“当前候选 item”不同而产生不同的兴趣聚合。


In [19]:
# Number of most recent clicks kept per user for the DIN history input.
MAX_HIST_LEN = 30

# So that DIN can use the full click history, encode ALL article ids here (raw -> enc)
# vocab_size is about 360k, so the embedding table stays manageable (360k * 16 ≈ 5.76M parameters)
all_article_ids = articles['article_id'].astype(int).unique()
# Codes start at 1 in sorted id order; 0 is left free for padding / unknown ids.
raw_to_item_enc = {int(v): int(i + 1) for i, v in enumerate(np.sort(all_article_ids))}
item_vocab_size = int(len(raw_to_item_enc) + 1)

def encode_item_seq(seq: List[int], max_len: int) -> np.ndarray:
    """Encode a click sequence into a fixed-length, left-padded int32 array.

    Ids are mapped through ``raw_to_item_enc`` (unknown ids become 0), only
    the last *max_len* entries are kept, and shorter sequences are padded
    with zeros at the front so the most recent click sits at the end.
    """
    tail = [raw_to_item_enc.get(int(item), 0) for item in seq][-max_len:]
    padded = np.zeros(max_len, dtype=np.int32)
    n_kept = len(tail)
    if n_kept:
        padded[-n_kept:] = np.asarray(tail, dtype=np.int32)
    return padded


# Encode every user's history once; build_hist_matrix looks rows up by user id.
user_hist_enc = {int(u): encode_item_seq(seq, MAX_HIST_LEN) for u, seq in user_hist.items()}

def build_hist_matrix(df: pd.DataFrame) -> np.ndarray:
    """Stack the encoded history of each row's user into one matrix.

    The result has one row per row of *df* and ``MAX_HIST_LEN`` columns.
    Users without an entry in ``user_hist_enc`` get an all-zero row.
    """
    no_history = np.zeros(MAX_HIST_LEN, dtype=np.int32)
    rows = [user_hist_enc.get(int(uid), no_history) for uid in df['user_id'].values]
    return np.vstack(rows)


# History matrices aligned row-for-row with train_df / valid_df.
train_hist_mat = build_hist_matrix(train_df)
valid_hist_mat = build_hist_matrix(valid_df)

# Encode the target (candidate) item; ids missing from the vocabulary map to 0
train_target_item = np.asarray([raw_to_item_enc.get(int(x), 0) for x in train_df['article_id'].values], dtype=np.int32)
valid_target_item = np.asarray([raw_to_item_enc.get(int(x), 0) for x in valid_df['article_id'].values], dtype=np.int32)

print('item_vocab_size:', item_vocab_size)
train_hist_mat.shape, train_target_item.shape


item_vocab_size: 364048


((300000, 30), (300000,))

In [20]:
class DinAttention(tf.keras.layers.Layer):
    """DIN attention pooling over a user's history, conditioned on a query item.

    Every history position is scored by an MLP over
    ``[q, key, q - key, q * key]``. Padded positions are pushed to -1e9
    before the softmax, and the history embeddings are summed with the
    resulting weights.

    Args:
        hidden_units: Sizes of the ReLU layers of the scoring MLP.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, hidden_units: List[int] = [80, 40], **kwargs):
        super().__init__(**kwargs)
        # Copy so that neither the shared default list nor a caller's list is aliased.
        self.hidden_units = list(hidden_units)
        self.mlp = [tf.keras.layers.Dense(u, activation='relu') for u in self.hidden_units]
        self.out = tf.keras.layers.Dense(1, activation=None)

    def call(self, inputs):
        """Pool ``keys`` with attention weights derived from ``query``.

        Args:
            inputs: ``[query, keys, mask]`` with query ``[B, D]``, keys
                ``[B, L, D]`` and mask ``[B, L]`` (values > 0 mark real items).

        Returns:
            The attention-weighted sum of ``keys``, shape ``[B, D]``.
        """
        query, keys, mask = inputs  # query: [B, D], keys: [B, L, D], mask: [B, L]
        q = tf.expand_dims(query, axis=1)  # [B, 1, D]
        q = tf.tile(q, [1, tf.shape(keys)[1], 1])  # [B, L, D]
        x = tf.concat([q, keys, q - keys, q * keys], axis=-1)  # [B, L, 4D]
        for dense in self.mlp:
            x = dense(x)
        scores = tf.squeeze(self.out(x), axis=-1)  # [B, L]
        # Large negative score for padded positions so they vanish in the softmax.
        # If a row is entirely padding, all scores are equal and the weights become uniform.
        paddings = tf.ones_like(scores) * (-1e9)
        scores = tf.where(mask > 0, scores, paddings)
        weights = tf.nn.softmax(scores, axis=-1)  # [B, L]
        weights = tf.expand_dims(weights, axis=-1)  # [B, L, 1]
        return tf.reduce_sum(weights * keys, axis=1)  # [B, D]

    def get_config(self):
        """Return the layer config including ``hidden_units``.

        Without this override a saved model cannot restore a non-default
        ``hidden_units`` value when the layer is rebuilt from its config.
        """
        config = super().get_config()
        config.update({'hidden_units': list(self.hidden_units)})
        return config


def build_din(item_vocab_size: int, user_vocab_size: int, dense_dim: int, emb_dim: int = 16, dnn_units: List[int] = [128, 64]):
    """Build and compile a DIN model with attention over the click history.

    Args:
        item_vocab_size: Size of the item embedding table. It is shared by the
            target item and the history items; index 0 is padding.
        user_vocab_size: Size of the user embedding table.
        dense_dim: Width of the float input named ``'dense'``.
        emb_dim: Embedding size for users and items.
        dnn_units: Hidden layer sizes of the final MLP; each layer is followed
            by dropout with rate 0.2. The default list is only read.

    Returns:
        A compiled ``tf.keras.Model`` named ``'DIN'`` taking the inputs
        ``'user_id'``, ``'hist_items'`` (length ``MAX_HIST_LEN``),
        ``'target_item'`` and ``'dense'``, with a sigmoid output. It is
        compiled with binary cross-entropy, an AUC metric and Adam at
        learning rate 1e-3.
    """
    user_inp = tf.keras.layers.Input(shape=(), dtype=tf.int32, name='user_id')
    hist_inp = tf.keras.layers.Input(shape=(MAX_HIST_LEN,), dtype=tf.int32, name='hist_items')
    item_inp = tf.keras.layers.Input(shape=(), dtype=tf.int32, name='target_item')
    dense_inp = tf.keras.layers.Input(shape=(dense_dim,), dtype=tf.float32, name='dense')

    user_emb = tf.keras.layers.Embedding(user_vocab_size, emb_dim)(user_inp)
    user_emb = tf.keras.layers.Flatten()(user_emb)

    # One embedding table serves both the candidate item and the history items.
    item_emb_layer = tf.keras.layers.Embedding(item_vocab_size, emb_dim, mask_zero=True)
    target_emb = tf.keras.layers.Flatten()(item_emb_layer(item_inp))  # [B, D]
    hist_emb = item_emb_layer(hist_inp)  # [B, L, D]

    # Explicit padding mask handed to DinAttention (1 for real items, 0 for padding).
    # NOTE(review): this calls tf ops directly on a symbolic input; the comment in build_deepfm says newer Keras needs a Lambda wrapper for that — confirm against the target Keras version.
    mask = tf.cast(tf.not_equal(hist_inp, 0), tf.int32)  # [B, L]
    att_out = DinAttention()([target_emb, hist_emb, mask])

    x = tf.keras.layers.Concatenate()([user_emb, target_emb, att_out, dense_inp])
    for units in dnn_units:
        x = tf.keras.layers.Dense(units, activation='relu')(x)
        x = tf.keras.layers.Dropout(0.2)(x)
    out = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    out = tf.keras.layers.Flatten()(out)

    model = tf.keras.Model(inputs={'user_id': user_inp, 'hist_items': hist_inp, 'target_item': item_inp, 'dense': dense_inp}, outputs=out, name='DIN')
    # NOTE(review): build_deepfm tries the legacy Adam optimizer first; this model always uses the current one.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(name='auc')],
    )
    return model


# Encode user_id as well (used for the user embedding); codes start at 1, 0 is the fallback
all_user_ids = pd.unique(pd.concat([train_df['user_id'], valid_df['user_id']]).astype(int))
raw_to_user_enc = {int(v): int(i + 1) for i, v in enumerate(np.sort(all_user_ids))}
user_vocab_size = int(len(raw_to_user_enc) + 1)
train_user_enc = np.asarray([raw_to_user_enc.get(int(x), 0) for x in train_df['user_id'].values], dtype=np.int32)
valid_user_enc = np.asarray([raw_to_user_enc.get(int(x), 0) for x in valid_df['user_id'].values], dtype=np.int32)

# DIN reuses the scaled dense features prepared for DeepFM.
din = build_din(item_vocab_size=item_vocab_size, user_vocab_size=user_vocab_size, dense_dim=X_train_dense.shape[1])
din.summary()




Model: "DIN"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 target_item (InputLayer)    [(None,)]                    0         []                            
                                                                                                  
 hist_items (InputLayer)     [(None, 30)]                 0         []                            
                                                                                                  
 user_id (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 embedding_1 (Embedding)     multiple                     5824768   ['target_item[0][0]',         
                                                                     'hist_items[0][0]']        

In [21]:
# Input dicts keyed by the Input layer names declared in build_din.
din_train_inputs = {
    'user_id': train_user_enc,
    'hist_items': train_hist_mat,
    'target_item': train_target_item,
    'dense': X_train_dense,
}
din_valid_inputs = {
    'user_id': valid_user_enc,
    'hist_items': valid_hist_mat,
    'target_item': valid_target_item,
    'dense': X_valid_dense,
}

# Same batch size and epoch count as the DeepFM run, so the two models are comparable.
din.fit(
    din_train_inputs,
    y_train,
    batch_size=2048,
    epochs=2,
    validation_data=(din_valid_inputs, y_valid),
    verbose=1,
)


Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x17ff630d0>

In [22]:
# Score every validation candidate with DIN and evaluate with the same top-5 metrics as DeepFM.
pred_valid_din = din.predict(din_valid_inputs, batch_size=4096, verbose=0)
hit_ndcg_at_k(valid_df, pred_valid_din, k=5)


{'hit_rate@5': 0.213, 'ndcg@5': 0.14482263336251341, 'num_users': 4000}

## 6) 保存产物


In [23]:
# The .keras files are rewritten on every run.
deepfm.save(ARTIFACTS_DIR / 'deepfm.keras')
din.save(ARTIFACTS_DIR / 'din.keras')

# Additionally export a SavedModel (more portable, and avoids the Lambda deserialisation restriction of .keras under safe_mode)
deepfm_sm = ARTIFACTS_DIR / 'deepfm_savedmodel'
din_sm = ARTIFACTS_DIR / 'din_savedmodel'
# NOTE(review): the export is skipped when the directory already exists, so after a retrain the
# SavedModel on disk can be older than the .keras file written above — confirm this is intended.
if not deepfm_sm.exists():
    deepfm.save(deepfm_sm)
if not din_sm.exists():
    din.save(din_sm)

# Persist the preprocessing state needed to turn raw rows into model inputs.
import pickle
with open(ARTIFACTS_DIR / 'scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open(ARTIFACTS_DIR / 'deepfm_factorizers.pkl', 'wb') as f:
    pickle.dump({'sparse_cols': SPARSE_COLS, 'dense_cols': DENSE_COLS, 'uniques': factor_uniques}, f)
with open(ARTIFACTS_DIR / 'din_encoders.pkl', 'wb') as f:
    pickle.dump({'raw_to_item_enc': raw_to_item_enc, 'raw_to_user_enc': raw_to_user_enc}, f)

print('saved to:', ARTIFACTS_DIR)


saved to: /Users/wangjunfei/Desktop/fun-rec/tmp/projects/news_recommendation_system/artifacts/ranking/deep_models
