# 10. 多任务学习（Shared-Bottom / MMoE / ESMM / PLE）

工业推荐里经常是多目标：点击（CTR）、转化（CVR）、深度阅读/停留、点赞、收藏、分享……

这节我们在新闻数据上把“多任务建模”模块补齐：

- **Shared-Bottom**：共享底座 + 任务塔（基线）
- **MMoE**：多专家 + gate（常见面试题）
- **ESMM**：CTR/CVR 去偏建模（需要转化类标签）
- **PLE**：分层专家抽取（多任务更强的结构）

## 重要说明（关于标签）

公开新闻数据集通常只有点击日志，没有真实的“转化/停留/点赞”等多目标反馈。

为了让项目具备可运行的多任务训练流程，这里构造一个**可解释的 proxy 标签**：

- `ctr`：是否为验证目标点击（来自 `rank_train.pkl` 的 label）
- `ctcvr`：是否为“高价值点击”= `ctr==1` 且 `words_count >= THRESH_WORDS`

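你在简历/面试里要明确写清楚：

- 这是 proxy label（用于展示方法与工程能力）
- `words_count` 同时也是模型的 dense 输入特征（见 `DENSE_COLS`），因此在已点击的前提下 `ctcvr` 可以由输入特征直接推出，该任务的指标会偏乐观，只用于验证训练流程
- 真实业务应使用真实转化/停留等日志，并做样本选择偏差（SSB）处理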


In [None]:
import os
import pickle
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
from dotenv import find_dotenv, load_dotenv
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
# Restrict TensorFlow's Python logger to ERROR-level messages so that
# lower-severity log lines do not clutter the cell output.
tf.get_logger().setLevel('ERROR')


def find_repo_root(start: Path) -> Path:
    """Locate the repository root by walking upwards from ``start``.

    A directory counts as the root when it contains ``pyproject.toml`` or
    ``.git``. At most 10 directories are inspected (``start`` itself plus up
    to nine ancestors); the walk also stops once the filesystem root is hit.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first inspected directory that contains a marker, or ``start``
        unchanged when no marker is found within the search limit.
    """
    markers = ('pyproject.toml', '.git')
    candidate = start
    levels_left = 10
    while levels_left > 0:
        levels_left -= 1
        if any((candidate / marker).exists() for marker in markers):
            return candidate
        parent = candidate.parent
        if parent == candidate:
            # The parent of a root path is the path itself: nothing left to climb.
            break
        candidate = parent
    return start


# Anchor all default locations at the repository that contains the working directory.
REPO_ROOT = find_repo_root(Path.cwd())

# Load a .env file when find_dotenv (searching from the working directory)
# locates one; an empty result means there is nothing to load.
dotenv_path = find_dotenv(usecwd=True)
if dotenv_path:
    load_dotenv(dotenv_path)

# setdefault keeps any value that is already present in the environment and
# only falls back to the repo-relative directories otherwise.
os.environ.setdefault('FUNREC_RAW_DATA_PATH', str(REPO_ROOT / 'data'))
os.environ.setdefault('FUNREC_PROCESSED_DATA_PATH', str(REPO_ROOT / 'tmp'))

# Both variables are guaranteed to exist at this point.
RAW_DATA_PATH = Path(os.environ['FUNREC_RAW_DATA_PATH'])
PROCESSED_DATA_PATH = Path(os.environ['FUNREC_PROCESSED_DATA_PATH'])
PROJECT_PATH = PROCESSED_DATA_PATH / 'projects' / 'news_recommendation_system'

# Output directory for everything this notebook saves.
ARTIFACTS_DIR = PROJECT_PATH / 'artifacts' / 'multitask'
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

PROJECT_PATH, ARTIFACTS_DIR


In [None]:
# Ranking samples are read from rank_train.pkl inside the project directory.
rank_path = PROJECT_PATH / 'rank_train.pkl'
if not rank_path.exists():
    raise FileNotFoundError(f'Missing {rank_path}. Run 5.feature_engineering.ipynb first.')

df = pd.read_pickle(rank_path)

# Proxy labels:
#   ctr   - the original ranking label as an integer
#   ctcvr - clicked AND words_count >= THRESH_WORDS (missing counts count as 0)
THRESH_WORDS = 300
clicked = df['label'] == 1
long_enough = df['words_count'].fillna(0) >= THRESH_WORDS
df['ctr'] = df['label'].astype(int)
df['ctcvr'] = (clicked & long_enough).astype(int)

# Split train/validation by user so that no user contributes rows to both sets.
rng = np.random.default_rng(42)
users = df['user_id'].unique()
rng.shuffle(users)
split = int(len(users) * 0.8)
train_users = set(users[:split])

in_train = df['user_id'].isin(train_users)
train_df = df[in_train].copy()
valid_df = df[~in_train].copy()

# In debug mode the training set is capped with a seeded row sample.
DEBUG = True
MAX_TRAIN_ROWS = 300_000
if DEBUG and len(train_df) > MAX_TRAIN_ROWS:
    train_df = train_df.sample(n=MAX_TRAIN_ROWS, random_state=42)

train_df[['ctr', 'ctcvr']].mean(), valid_df[['ctr', 'ctcvr']].mean(), len(train_df), len(valid_df)


## 1) 特征准备

和深度排序一致：

- sparse：user/item/category 等
- dense：召回分数 + 统计/时间/相似度等


In [None]:
# Categorical columns: each one is integer-encoded and given its own embedding input.
SPARSE_COLS = [
    'user_id',
    'article_id',
    'category_id',
    'user_top_category',
    'last_click_article_id',
]

# Numeric columns: standardized together and fed to the models as one 'dense' input.
# NOTE(review): 'words_count' also defines the ctcvr proxy label
# (words_count >= THRESH_WORDS), so that task can be inferred from this input;
# confirm this is acceptable for the intended comparison.
DENSE_COLS = [
    'recall_score',
    'recall_rank',
    'user_click_count',
    'user_unique_items',
    'item_click_count',
    'words_count',
    'item_age_hours',
    'time_gap_hours',
    'emb_sim_last',
    'is_same_category',
]


def encode_categorical(train_s: pd.Series, valid_s: pd.Series) -> Tuple[np.ndarray, np.ndarray, int, List[str]]:
    """Integer-encode a categorical column with one vocabulary for both splits.

    Values are compared by their string form. The vocabulary is built from
    the training and validation values together and sorted, and every code
    is shifted up by one so that index 0 stays reserved.

    Args:
        train_s: Training values of the column.
        valid_s: Validation values of the column.

    Returns:
        A tuple ``(train_codes, valid_codes, vocab_size, uniques)``: the
        int32 codes of each split, the vocabulary size including the
        reserved index 0, and the sorted vocabulary as a list.
    """
    n_train = len(train_s)
    combined = pd.concat([train_s, valid_s], axis=0).astype(str)
    raw_codes, vocabulary = pd.factorize(combined, sort=True)
    shifted = raw_codes.astype(np.int32) + 1  # 0 reserved
    train_codes, valid_codes = shifted[:n_train], shifted[n_train:]
    return train_codes, valid_codes, int(len(vocabulary) + 1), list(vocabulary)


# Encode every sparse column; missing values are replaced by 0 before encoding.
train_sparse = {}
valid_sparse = {}
vocab_sizes = {}
factor_uniques = {}
for col in SPARSE_COLS:
    (
        train_sparse[col],
        valid_sparse[col],
        vocab_sizes[col],
        factor_uniques[col],
    ) = encode_categorical(train_df[col].fillna(0), valid_df[col].fillna(0))

# Standardize dense features with statistics fitted on the training rows only.
scaler = StandardScaler()
X_train_dense = scaler.fit_transform(train_df[DENSE_COLS].fillna(0).values.astype('float32'))
X_valid_dense = scaler.transform(valid_df[DENSE_COLS].fillna(0).values.astype('float32'))

# One float32 target vector per task.
y_train = {task: train_df[task].values.astype('float32') for task in ('ctr', 'ctcvr')}
y_valid = {task: valid_df[task].values.astype('float32') for task in ('ctr', 'ctcvr')}

# Model inputs: one entry per sparse column plus the dense matrix under 'dense'.
train_inputs = dict(train_sparse)
train_inputs['dense'] = X_train_dense
valid_inputs = dict(valid_sparse)
valid_inputs['dense'] = X_valid_dense

len(y_train['ctr']), len(y_train['ctcvr'])


In [None]:
def gauc(user_ids: np.ndarray, labels: np.ndarray, preds: np.ndarray) -> float:
    """Compute the group AUC: per-user ROC AUC averaged by user sample count.

    Users whose labels contain a single class are skipped, because ROC AUC
    is undefined for them. Rows are grouped with one stable sort instead of
    scanning the whole array once per user, so the cost no longer grows with
    ``n_users * n_rows``.

    Args:
        user_ids: Group key of every row.
        labels: Binary ground-truth label of every row.
        preds: Predicted score of every row. Column vectors such as the
            ``(N, 1)`` output of a Keras model are flattened.

    Returns:
        The sample-count-weighted mean of the per-user AUCs, or ``0.0`` when
        no user has both classes (including empty input).

    Raises:
        ValueError: If the three inputs do not contain the same number of
            elements.
    """
    user_ids = np.asarray(user_ids).reshape(-1)
    labels = np.asarray(labels).reshape(-1)
    preds = np.asarray(preds).reshape(-1)
    if not (len(user_ids) == len(labels) == len(preds)):
        raise ValueError(
            'user_ids, labels and preds must have the same number of elements, '
            f'got {len(user_ids)}, {len(labels)} and {len(preds)}.'
        )

    # After a stable sort every user occupies one contiguous slice.
    order = np.argsort(user_ids, kind='stable')
    sorted_users = user_ids[order]
    sorted_labels = labels[order]
    sorted_preds = preds[order]
    _, starts, counts = np.unique(sorted_users, return_index=True, return_counts=True)

    aucs = []
    weights = []
    for start, count in zip(starts, counts):
        stop = start + count
        y = sorted_labels[start:stop]
        if len(np.unique(y)) < 2:
            continue
        aucs.append(roc_auc_score(y, sorted_preds[start:stop]))
        weights.append(count)
    if not aucs:
        return 0.0
    return float(np.average(aucs, weights=weights))


## 2) Shared-Bottom（基线）


In [None]:
def build_shared_bottom(
    sparse_vocab_sizes: Dict[str, int],
    dense_dim: int,
    task_names: List[str],
    emb_dim: int = 16,
    shared_units: Optional[List[int]] = None,
    tower_units: Optional[List[int]] = None,
    dropout_rate: float = 0.2,
    learning_rate: float = 1e-3,
):
    """Build and compile a Shared-Bottom multi-task model.

    All tasks share one MLP over the concatenated embeddings and dense
    features; each task then gets its own tower ending in a sigmoid unit.

    Args:
        sparse_vocab_sizes: Vocabulary size per sparse feature. Every key
            becomes a scalar int32 input with its own embedding table.
        dense_dim: Width of the float32 input named ``'dense'``.
        task_names: Names of the tasks; used as output and loss keys.
        emb_dim: Embedding width of every sparse feature.
        shared_units: Hidden sizes of the shared MLP. Defaults to
            ``[128, 64]``.
        tower_units: Hidden sizes of every task tower. Defaults to ``[64]``.
        dropout_rate: Dropout rate applied after each shared layer.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` whose outputs form a dict keyed by
        task name, trained with binary cross-entropy and tracking AUC.

    Raises:
        ValueError: If ``task_names`` is empty.
    """
    # None sentinels replace the former mutable list defaults; the effective
    # default values are unchanged.
    shared_units = [128, 64] if shared_units is None else list(shared_units)
    tower_units = [64] if tower_units is None else list(tower_units)
    if not task_names:
        raise ValueError('task_names must contain at least one task name.')

    layers = tf.keras.layers
    inputs = {}
    embed_vecs = []
    for feat, vocab_size in sparse_vocab_sizes.items():
        inp = layers.Input(shape=(), dtype=tf.int32, name=feat)
        inputs[feat] = inp
        emb = layers.Flatten()(layers.Embedding(vocab_size, emb_dim)(inp))
        embed_vecs.append(emb)
    dense_inp = layers.Input(shape=(dense_dim,), dtype=tf.float32, name='dense')
    inputs['dense'] = dense_inp

    # Shared bottom: one MLP whose output feeds every task tower.
    x = layers.Concatenate()(embed_vecs + [dense_inp])
    for u in shared_units:
        x = layers.Dense(u, activation='relu')(x)
        x = layers.Dropout(dropout_rate)(x)

    outputs = {}
    for t in task_names:
        h = x
        for u in tower_units:
            h = layers.Dense(u, activation='relu')(h)
        out = layers.Dense(1, activation='sigmoid', name=t)(h)
        outputs[t] = layers.Flatten()(out)

    model = tf.keras.Model(inputs=inputs, outputs=outputs, name='SharedBottom')
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss={t: 'binary_crossentropy' for t in task_names},
        metrics={t: [tf.keras.metrics.AUC(name='auc')] for t in task_names},
    )
    return model


# Instantiate the baseline on the two proxy tasks and print its layer summary.
shared_bottom = build_shared_bottom(
    sparse_vocab_sizes={col: vocab_sizes[col] for col in SPARSE_COLS},
    dense_dim=X_train_dense.shape[1],
    task_names=['ctr', 'ctcvr'],
)
shared_bottom.summary()


In [None]:
# Train for two epochs while monitoring the held-out users.
shared_bottom.fit(
    train_inputs,
    y_train,
    batch_size=2048,
    epochs=2,
    validation_data=(valid_inputs, y_valid),
    verbose=1,
)

# Score the validation rows and report global AUC and per-user GAUC for both tasks.
pred = shared_bottom.predict(valid_inputs, batch_size=4096, verbose=0)
valid_user_ids = valid_df['user_id'].values
ctr_true, ctr_pred = y_valid['ctr'], pred['ctr']
ctcvr_true, ctcvr_pred = y_valid['ctcvr'], pred['ctcvr']
metrics_shared = {
    'ctr_auc': roc_auc_score(ctr_true, ctr_pred),
    'ctr_gauc': gauc(valid_user_ids, ctr_true, ctr_pred),
    'ctcvr_auc': roc_auc_score(ctcvr_true, ctcvr_pred),
    'ctcvr_gauc': gauc(valid_user_ids, ctcvr_true, ctcvr_pred),
}
metrics_shared


## 3) MMoE


In [None]:
def build_mmoe(
    sparse_vocab_sizes: Dict[str, int],
    dense_dim: int,
    task_names: List[str],
    emb_dim: int = 16,
    num_experts: int = 4,
    expert_units: Optional[List[int]] = None,
    tower_units: Optional[List[int]] = None,
    learning_rate: float = 1e-3,
):
    """Build and compile an MMoE (multi-gate mixture-of-experts) model.

    Every expert is an MLP over the concatenated embeddings and dense
    features. Each task owns a softmax gate over the experts; the
    gate-weighted sum of the expert outputs feeds that task's tower.

    The mixing uses only built-in layers (``Reshape``, ``Concatenate``,
    ``Dot``) rather than ``Lambda`` layers wrapping Python lambdas, so the
    saved model does not depend on serialized Python bytecode.

    Args:
        sparse_vocab_sizes: Vocabulary size per sparse feature. Every key
            becomes a scalar int32 input with its own embedding table.
        dense_dim: Width of the float32 input named ``'dense'``.
        task_names: Names of the tasks; used as output and loss keys.
        emb_dim: Embedding width of every sparse feature.
        num_experts: Number of experts shared by all tasks.
        expert_units: Hidden sizes of every expert. Defaults to
            ``[128, 64]``.
        tower_units: Hidden sizes of every task tower. Defaults to ``[64]``.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` whose outputs form a dict keyed by
        task name, trained with binary cross-entropy and tracking AUC.

    Raises:
        ValueError: If ``task_names`` is empty or ``num_experts`` is < 1.
    """
    # None sentinels replace the former mutable list defaults; the effective
    # default values are unchanged.
    expert_units = [128, 64] if expert_units is None else list(expert_units)
    tower_units = [64] if tower_units is None else list(tower_units)
    if not task_names:
        raise ValueError('task_names must contain at least one task name.')
    if num_experts < 1:
        raise ValueError('num_experts must be at least 1.')

    layers = tf.keras.layers
    inputs = {}
    embed_vecs = []
    for feat, vocab_size in sparse_vocab_sizes.items():
        inp = layers.Input(shape=(), dtype=tf.int32, name=feat)
        inputs[feat] = inp
        emb = layers.Flatten()(layers.Embedding(vocab_size, emb_dim)(inp))
        embed_vecs.append(emb)
    dense_inp = layers.Input(shape=(dense_dim,), dtype=tf.float32, name='dense')
    inputs['dense'] = dense_inp

    x = layers.Concatenate()(embed_vecs + [dense_inp])

    # Experts
    expert_slices = []
    for _ in range(num_experts):
        h = x
        for u in expert_units:
            h = layers.Dense(u, activation='relu')(h)
        # [B, H] -> [B, 1, H] so the experts can be joined along a new axis.
        expert_slices.append(layers.Reshape((1, h.shape[-1]))(h))
    if len(expert_slices) > 1:
        experts = layers.Concatenate(axis=1, name='experts_stack')(expert_slices)  # [B, E, H]
    else:
        # A single expert is already [B, 1, H]; nothing to concatenate.
        experts = expert_slices[0]

    outputs = {}
    for t in task_names:
        gate = layers.Dense(num_experts, activation='softmax', name=f'gate_{t}')(x)  # [B, E]
        # Contract the expert axis: sum_e gate[b, e] * experts[b, e, :] -> [B, H]
        task_inp = layers.Dot(axes=1, name=f'mix_{t}')([gate, experts])

        h = task_inp
        for u in tower_units:
            h = layers.Dense(u, activation='relu')(h)
        out = layers.Dense(1, activation='sigmoid', name=t)(h)
        outputs[t] = layers.Flatten()(out)

    model = tf.keras.Model(inputs=inputs, outputs=outputs, name='MMoE')
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss={t: 'binary_crossentropy' for t in task_names},
        metrics={t: [tf.keras.metrics.AUC(name='auc')] for t in task_names},
    )
    return model


# Instantiate MMoE on the two proxy tasks and print its layer summary.
mmoe = build_mmoe(
    sparse_vocab_sizes={col: vocab_sizes[col] for col in SPARSE_COLS},
    dense_dim=X_train_dense.shape[1],
    task_names=['ctr', 'ctcvr'],
)
mmoe.summary()


In [None]:
# Train for two epochs while monitoring the held-out users.
mmoe.fit(
    train_inputs,
    y_train,
    batch_size=2048,
    epochs=2,
    validation_data=(valid_inputs, y_valid),
    verbose=1,
)

# Score the validation rows and report global AUC and per-user GAUC for both tasks.
pred = mmoe.predict(valid_inputs, batch_size=4096, verbose=0)
valid_user_ids = valid_df['user_id'].values
ctr_true, ctr_pred = y_valid['ctr'], pred['ctr']
ctcvr_true, ctcvr_pred = y_valid['ctcvr'], pred['ctcvr']
metrics_mmoe = {
    'ctr_auc': roc_auc_score(ctr_true, ctr_pred),
    'ctr_gauc': gauc(valid_user_ids, ctr_true, ctr_pred),
    'ctcvr_auc': roc_auc_score(ctcvr_true, ctcvr_pred),
    'ctcvr_gauc': gauc(valid_user_ids, ctcvr_true, ctcvr_pred),
}
metrics_mmoe


## 4) ESMM（CTR + CVR → CTCVR）

ESMM 经典用于解决 CVR 的样本选择偏差（只在点击后才观测转化）。

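这里为了能跑通，我们用 `ctcvr` proxy 标签；保留 ESMM 的核心做法（在全样本空间上通过 `pCTCVR = pCTR * pCVR` 间接学习 CVR）：

- 输出 `pCTR`
- 内部计算 `pCVR`（名为 `cvr` 的中间层，不作为模型输出，也不直接监督）
- 输出 `pCTCVR = pCTR * pCVR`
- 用 `ctr` 和 `ctcvr` 两个 loss 训练

与原始 ESMM 的差异：这里为简化实现，CTR/CVR 共享整个底座 MLP，只有最后的输出头不同；原论文中两个塔只共享 embedding 层，各自拥有独立的 MLP。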


In [None]:
def build_esmm(
    sparse_vocab_sizes: Dict[str, int],
    dense_dim: int,
    emb_dim: int = 16,
    shared_units: Optional[List[int]] = None,
    dropout_rate: float = 0.2,
    learning_rate: float = 1e-3,
):
    """Build and compile a simplified ESMM-style model.

    A shared MLP over the concatenated embeddings and dense features feeds
    two linear heads. Their sigmoids give pCTR and pCVR, and the model
    predicts ``pCTCVR = pCTR * pCVR``. Only ``ctr`` and ``ctcvr`` are model
    outputs and receive a loss; pCVR exists only as the intermediate layer
    named ``'cvr'``.

    Args:
        sparse_vocab_sizes: Vocabulary size per sparse feature. Every key
            becomes a scalar int32 input with its own embedding table.
        dense_dim: Width of the float32 input named ``'dense'``.
        emb_dim: Embedding width of every sparse feature.
        shared_units: Hidden sizes of the shared MLP. Defaults to
            ``[128, 64]``.
        dropout_rate: Dropout rate applied after each shared layer.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` with outputs ``{'ctr', 'ctcvr'}``,
        trained with binary cross-entropy and tracking AUC on both.
    """
    # A None sentinel replaces the former mutable list default; the
    # effective default value is unchanged.
    shared_units = [128, 64] if shared_units is None else list(shared_units)

    layers = tf.keras.layers
    inputs = {}
    embed_vecs = []
    for feat, vocab_size in sparse_vocab_sizes.items():
        inp = layers.Input(shape=(), dtype=tf.int32, name=feat)
        inputs[feat] = inp
        emb = layers.Flatten()(layers.Embedding(vocab_size, emb_dim)(inp))
        embed_vecs.append(emb)
    dense_inp = layers.Input(shape=(dense_dim,), dtype=tf.float32, name='dense')
    inputs['dense'] = dense_inp

    x = layers.Concatenate()(embed_vecs + [dense_inp])
    for u in shared_units:
        x = layers.Dense(u, activation='relu')(x)
        x = layers.Dropout(dropout_rate)(x)

    # Two linear heads on the shared representation.
    ctr_logit = layers.Dense(1, activation=None)(x)
    cvr_logit = layers.Dense(1, activation=None)(x)
    pctr = layers.Activation('sigmoid', name='ctr')(ctr_logit)
    pcvr = layers.Activation('sigmoid', name='cvr')(cvr_logit)
    # pCVR is trained only through this product; it has no loss of its own.
    pctcvr = layers.Multiply(name='ctcvr')([pctr, pcvr])

    outputs = {
        'ctr': layers.Flatten()(pctr),
        'ctcvr': layers.Flatten()(pctcvr),
    }

    model = tf.keras.Model(inputs=inputs, outputs=outputs, name='ESMM')
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss={'ctr': 'binary_crossentropy', 'ctcvr': 'binary_crossentropy'},
        metrics={'ctr': [tf.keras.metrics.AUC(name='auc')], 'ctcvr': [tf.keras.metrics.AUC(name='auc')]},
    )
    return model


# Instantiate the ESMM-style model and print its layer summary.
esmm = build_esmm(
    sparse_vocab_sizes={col: vocab_sizes[col] for col in SPARSE_COLS},
    dense_dim=X_train_dense.shape[1],
)
esmm.summary()


In [None]:
# Train for two epochs while monitoring the held-out users.
esmm.fit(
    train_inputs,
    y_train,
    batch_size=2048,
    epochs=2,
    validation_data=(valid_inputs, y_valid),
    verbose=1,
)

# Score the validation rows and report global AUC and per-user GAUC for both tasks.
pred = esmm.predict(valid_inputs, batch_size=4096, verbose=0)
valid_user_ids = valid_df['user_id'].values
ctr_true, ctr_pred = y_valid['ctr'], pred['ctr']
ctcvr_true, ctcvr_pred = y_valid['ctcvr'], pred['ctcvr']
metrics_esmm = {
    'ctr_auc': roc_auc_score(ctr_true, ctr_pred),
    'ctr_gauc': gauc(valid_user_ids, ctr_true, ctr_pred),
    'ctcvr_auc': roc_auc_score(ctcvr_true, ctcvr_pred),
    'ctcvr_gauc': gauc(valid_user_ids, ctcvr_true, ctcvr_pred),
}
metrics_esmm


## 5) PLE（简化版：1 层）

PLE 在 MMoE 的基础上把专家显式拆分为“共享专家”和“任务专属专家”，常用于任务之间相关性复杂、容易出现负迁移（跷跷板现象）的场景；这里实现一个最小可用的单层版本（相当于论文中的 CGC 结构）：

- shared experts + task experts
- task gate 在 shared+task experts 上做混合


In [None]:
def build_ple(
    sparse_vocab_sizes: Dict[str, int],
    dense_dim: int,
    task_names: List[str],
    emb_dim: int = 16,
    num_shared_experts: int = 2,
    num_task_experts: int = 2,
    expert_units: Optional[List[int]] = None,
    tower_units: Optional[List[int]] = None,
    learning_rate: float = 1e-3,
):
    """Build and compile a single-layer PLE model.

    Shared experts are visible to every task, task experts only to their own
    task. Each task has a softmax gate over its shared and task experts; the
    gate-weighted sum of those expert outputs feeds the task tower.

    The mixing uses only built-in layers (``Reshape``, ``Concatenate``,
    ``Dot``) rather than ``Lambda`` layers wrapping Python lambdas, so the
    saved model does not depend on serialized Python bytecode.

    Args:
        sparse_vocab_sizes: Vocabulary size per sparse feature. Every key
            becomes a scalar int32 input with its own embedding table.
        dense_dim: Width of the float32 input named ``'dense'``.
        task_names: Names of the tasks; used as output and loss keys.
        emb_dim: Embedding width of every sparse feature.
        num_shared_experts: Number of experts shared by all tasks.
        num_task_experts: Number of experts owned by each task.
        expert_units: Hidden sizes of every expert. Defaults to
            ``[128, 64]``.
        tower_units: Hidden sizes of every task tower. Defaults to ``[64]``.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` whose outputs form a dict keyed by
        task name, trained with binary cross-entropy and tracking AUC.

    Raises:
        ValueError: If ``task_names`` is empty or a task would have no
            expert at all.
    """
    # None sentinels replace the former mutable list defaults; the effective
    # default values are unchanged.
    expert_units = [128, 64] if expert_units is None else list(expert_units)
    tower_units = [64] if tower_units is None else list(tower_units)
    if not task_names:
        raise ValueError('task_names must contain at least one task name.')
    if num_shared_experts + num_task_experts < 1:
        raise ValueError('Each task needs at least one shared or task expert.')

    layers = tf.keras.layers
    inputs = {}
    embed_vecs = []
    for feat, vocab_size in sparse_vocab_sizes.items():
        inp = layers.Input(shape=(), dtype=tf.int32, name=feat)
        inputs[feat] = inp
        emb = layers.Flatten()(layers.Embedding(vocab_size, emb_dim)(inp))
        embed_vecs.append(emb)
    dense_inp = layers.Input(shape=(dense_dim,), dtype=tf.float32, name='dense')
    inputs['dense'] = dense_inp

    x = layers.Concatenate()(embed_vecs + [dense_inp])

    # Width-only layer names clash when expert_units repeats a width (for
    # example [64, 64]); the depth is added in that case only, so names for
    # configurations with distinct widths stay as before.
    widths_are_unique = len(set(expert_units)) == len(expert_units)

    def make_expert(name: str):
        """Return one expert MLP output as a [B, 1, H] slice."""
        h = x
        for depth, u in enumerate(expert_units):
            layer_name = f'{name}_dense_{u}' if widths_are_unique else f'{name}_dense_{depth}_{u}'
            h = layers.Dense(u, activation='relu', name=layer_name)(h)
        # [B, H] -> [B, 1, H] so the experts can be joined along a new axis.
        return layers.Reshape((1, h.shape[-1]))(h)

    shared_experts = [make_expert(f'shared_{i}') for i in range(num_shared_experts)]

    outputs = {}
    for t in task_names:
        task_experts = [make_expert(f'{t}_expert_{i}') for i in range(num_task_experts)]
        all_experts = shared_experts + task_experts
        if len(all_experts) > 1:
            experts_stack = layers.Concatenate(axis=1, name=f'{t}_experts_stack')(all_experts)  # [B, E, H]
        else:
            # A single expert is already [B, 1, H]; nothing to concatenate.
            experts_stack = all_experts[0]

        gate = layers.Dense(len(all_experts), activation='softmax', name=f'{t}_gate')(x)  # [B, E]
        # Contract the expert axis: sum_e gate[b, e] * experts[b, e, :] -> [B, H]
        task_inp = layers.Dot(axes=1, name=f'{t}_mix')([gate, experts_stack])

        h = task_inp
        for u in tower_units:
            h = layers.Dense(u, activation='relu')(h)
        out = layers.Dense(1, activation='sigmoid', name=t)(h)
        outputs[t] = layers.Flatten()(out)

    model = tf.keras.Model(inputs=inputs, outputs=outputs, name='PLE_1layer')
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss={t: 'binary_crossentropy' for t in task_names},
        metrics={t: [tf.keras.metrics.AUC(name='auc')] for t in task_names},
    )
    return model


# Instantiate the single-layer PLE on the two proxy tasks and print its layer summary.
ple = build_ple(
    sparse_vocab_sizes={col: vocab_sizes[col] for col in SPARSE_COLS},
    dense_dim=X_train_dense.shape[1],
    task_names=['ctr', 'ctcvr'],
)
ple.summary()


In [None]:
# Train for two epochs while monitoring the held-out users.
ple.fit(
    train_inputs,
    y_train,
    batch_size=2048,
    epochs=2,
    validation_data=(valid_inputs, y_valid),
    verbose=1,
)

# Score the validation rows and report global AUC and per-user GAUC for both tasks.
pred = ple.predict(valid_inputs, batch_size=4096, verbose=0)
valid_user_ids = valid_df['user_id'].values
ctr_true, ctr_pred = y_valid['ctr'], pred['ctr']
ctcvr_true, ctcvr_pred = y_valid['ctcvr'], pred['ctcvr']
metrics_ple = {
    'ctr_auc': roc_auc_score(ctr_true, ctr_pred),
    'ctr_gauc': gauc(valid_user_ids, ctr_true, ctr_pred),
    'ctcvr_auc': roc_auc_score(ctcvr_true, ctcvr_pred),
    'ctcvr_gauc': gauc(valid_user_ids, ctcvr_true, ctcvr_pred),
}
metrics_ple


## 6) 总结与保存


In [None]:
# One row per model: the model name followed by its four validation metrics.
model_metrics = [
    ('shared_bottom', metrics_shared),
    ('mmoe', metrics_mmoe),
    ('esmm', metrics_esmm),
    ('ple', metrics_ple),
]
summary = pd.DataFrame([{'model': name, **scores} for name, scores in model_metrics])
summary


In [None]:
# Save each trained model under its own file name in the artifacts directory.
for file_name, model in (
    ('shared_bottom.keras', shared_bottom),
    ('mmoe.keras', mmoe),
    ('esmm.keras', esmm),
    ('ple.keras', ple),
):
    model.save(ARTIFACTS_DIR / file_name)

# Preprocessing state needed to rebuild the model inputs: the fitted scaler,
# the vocabulary sizes, and the column lists / vocabularies / label threshold.
preprocess_meta = {
    'sparse_cols': SPARSE_COLS,
    'dense_cols': DENSE_COLS,
    'uniques': factor_uniques,
    'thresh_words': THRESH_WORDS,
}
for file_name, payload in (
    ('scaler.pkl', scaler),
    ('vocab_sizes.pkl', vocab_sizes),
    ('preprocess.pkl', preprocess_meta),
):
    with open(ARTIFACTS_DIR / file_name, 'wb') as f:
        pickle.dump(payload, f)

# The metric comparison table is stored next to the models.
summary.to_csv(ARTIFACTS_DIR / 'multi_task_report.csv', index=False)

print('saved to:', ARTIFACTS_DIR)
