# 11. 统一评估（Recall / Ranking / 多样性 / GAUC）

本节对前面第 7–10 节（Notebook 7–10）产出的模型与候选集做统一评估，并把各项指标保存为 CSV 报告。

覆盖指标：

- Recall：HitRate@K、NDCG@K、MRR@K
- Ranking：按 user 分组的 HitRate@K / NDCG@K
- 多样性：Coverage（物品覆盖率）、Category Entropy / Gini（推荐列表内类别分布的分散程度），以及平均流行度与 Novelty
- 多任务：AUC + GAUC（按 user 分组 AUC）


In [7]:
import math
import os
import pickle
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
from dotenv import find_dotenv, load_dotenv
from sklearn.metrics import roc_auc_score

# Set TensorFlow's Python logger to the ERROR level to keep cell output short.
tf.get_logger().setLevel('ERROR')


def find_repo_root(start: Path) -> Path:
    """Locate the repository root by walking upwards from ``start``.

    A directory counts as the root when it contains ``pyproject.toml`` or
    ``.git``. At most 10 directories are inspected (``start`` itself plus up
    to nine ancestors); if none of them qualifies, ``start`` is returned.
    """
    markers = ('pyproject.toml', '.git')
    # `start` first, then its ancestors from nearest to farthest.
    for candidate in [start, *start.parents][:10]:
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    return start


# Resolve the repository root from the current working directory.
REPO_ROOT = find_repo_root(Path.cwd())
# Load a .env file when one can be found starting from the working directory.
dotenv_path = find_dotenv(usecwd=True)
if dotenv_path:
    load_dotenv(dotenv_path)
# Values already present in the environment win; otherwise fall back to
# <repo>/data and <repo>/tmp.
os.environ.setdefault('FUNREC_RAW_DATA_PATH', str(REPO_ROOT / 'data'))
os.environ.setdefault('FUNREC_PROCESSED_DATA_PATH', str(REPO_ROOT / 'tmp'))

RAW_DATA_PATH = Path(os.getenv('FUNREC_RAW_DATA_PATH'))
PROCESSED_DATA_PATH = Path(os.getenv('FUNREC_PROCESSED_DATA_PATH'))

# Prefer <raw>/dataset/news_recommendation; use <raw>/news_recommendation
# when the former does not exist.
DATA_PATH = RAW_DATA_PATH / 'dataset' / 'news_recommendation'
if not DATA_PATH.exists():
    DATA_PATH = RAW_DATA_PATH / 'news_recommendation'

# Project artifacts live under the processed-data directory; the evaluation
# output directory is created up front.
PROJECT_PATH = PROCESSED_DATA_PATH / 'projects' / 'news_recommendation_system'
EVAL_DIR = PROJECT_PATH / 'artifacts' / 'evaluation'
EVAL_DIR.mkdir(parents=True, exist_ok=True)

# Bare expression: displayed as the cell result in the notebook.
DATA_PATH, PROJECT_PATH, EVAL_DIR


(PosixPath('/Users/wangjunfei/Desktop/fun-rec/data/dataset/news_recommendation'),
 PosixPath('/Users/wangjunfei/Desktop/fun-rec/tmp/projects/news_recommendation_system'),
 PosixPath('/Users/wangjunfei/Desktop/fun-rec/tmp/projects/news_recommendation_system/artifacts/evaluation'))

In [8]:
# Load the training click history, the per-user validation click, and the
# article metadata table.
train_hist = pd.read_pickle(PROJECT_PATH / 'train_hist.pkl')
valid_last = pd.read_pickle(PROJECT_PATH / 'valid_last.pkl')
articles = pd.read_csv(DATA_PATH / 'articles.csv')

# user_id -> validation target article, and article_id -> category_id.
# If a user_id appears more than once in valid_last, the last row wins.
valid_last_map = dict(zip(valid_last['user_id'].astype(int), valid_last['click_article_id'].astype(int)))
item_to_cat = dict(zip(articles['article_id'].astype(int), articles['category_id'].astype(int)))

# Item popularity (used for the novelty / popularity-bias analysis)
item_pop = train_hist['click_article_id'].value_counts().to_dict()
total_clicks = float(len(train_hist))

# Bare expression: displayed as the cell result in the notebook.
len(valid_last_map), len(item_pop)


(200000, 26343)

## 1) Recall 评估（候选集）


In [9]:
def eval_recall_df(recall_df: pd.DataFrame, valid_map: Dict[int, int], k_list: Optional[List[int]] = None) -> Dict[str, float]:
    """Compute HitRate@K, NDCG@K and MRR@K for a recall candidate table.

    Args:
        recall_df: Candidate rows with ``user_id`` and ``article_id`` columns,
            plus either ``recall_rank`` (ascending = better) or
            ``recall_score`` (descending = better) to order each user's list.
        valid_map: Mapping ``user_id -> target article_id``. Users missing
            from this mapping are skipped entirely.
        k_list: Cut-offs to evaluate; defaults to ``[20, 50]``.

    Returns:
        A dict with ``hit_rate@k``, ``ndcg@k``, ``mrr@k`` and ``users@k`` for
        every ``k`` in ``k_list``. Each user has a single target, so NDCG
        reduces to ``1 / log2(rank + 2)`` with a 0-based rank. Metrics are
        averaged over the evaluated users (0.0 when there are none).
    """
    if k_list is None:
        k_list = [20, 50]

    # sort_values returns a new frame, so the caller's DataFrame is untouched.
    if 'recall_rank' in recall_df.columns:
        df = recall_df.sort_values(['user_id', 'recall_rank'])
    else:
        df = recall_df.sort_values(['user_id', 'recall_score'], ascending=[True, False])

    # Locate each user's target once; every cut-off is then a simple
    # comparison instead of a fresh groupby pass per k.
    target_positions = []  # 0-based rank of the target, or None when absent
    for u, g in df.groupby('user_id'):
        target = valid_map.get(int(u))
        if target is None:
            continue
        items = g['article_id'].astype(int).tolist()
        try:
            target_positions.append(items.index(target))
        except ValueError:
            target_positions.append(None)

    total = len(target_positions)
    denom = max(1, total)

    metrics = {}
    for k in k_list:
        hit = 0
        ndcg = 0.0
        mrr = 0.0
        for rank in target_positions:
            if rank is None or rank >= k:
                continue
            hit += 1
            ndcg += 1.0 / math.log2(rank + 2)
            mrr += 1.0 / float(rank + 1)

        metrics[f'hit_rate@{k}'] = hit / denom
        metrics[f'ndcg@{k}'] = ndcg / denom
        metrics[f'mrr@{k}'] = mrr / denom
        metrics[f'users@{k}'] = float(total)
    return metrics


def diversity_metrics(recall_df: pd.DataFrame, k: int = 20) -> Dict[str, float]:
    """Summarise diversity of each user's top-``k`` recall candidates.

    Candidates are ordered per user by ``recall_rank`` (ascending) when that
    column exists, otherwise by ``recall_score`` (descending). Reads the
    module-level ``articles``, ``item_to_cat``, ``item_pop`` and
    ``total_clicks`` objects.

    Returns a dict with:
        coverage@k        distinct recommended items / distinct items in ``articles``
        cat_entropy@k     mean per-user entropy (natural log) of category shares
        cat_gini@k        mean per-user ``1 - sum(p^2)`` of category shares
        avg_popularity@k  mean per-user average training click count
        novelty@k         mean per-user average ``-log((clicks + 1) / (total_clicks + 1))``
        users@k           number of users with at least one candidate
    """
    use_rank = 'recall_rank' in recall_df.columns
    order_cols = ['user_id', 'recall_rank' if use_rank else 'recall_score']
    order_asc = [True, True] if use_rank else [True, False]
    ranked = recall_df.copy().sort_values(order_cols, ascending=order_asc)

    # Coverage over the whole article catalogue.
    topk = ranked.groupby('user_id').head(k)
    coverage = topk['article_id'].nunique() / max(1, articles['article_id'].nunique())

    def mean_or_zero(values: List[float]) -> float:
        return float(np.mean(values)) if values else 0.0

    entropies = []
    ginis = []
    popularities = []
    novelties = []
    for _, group in topk.groupby('user_id'):
        item_ids = group['article_id'].astype(int).tolist()

        # Category distribution inside this user's list (unknown items -> -1).
        categories = [item_to_cat.get(int(i), -1) for i in item_ids]
        shares = pd.Series(categories).value_counts(normalize=True).values
        entropies.append(float(-(shares * np.log(shares + 1e-12)).sum()))
        ginis.append(float(1.0 - (shares ** 2).sum()))

        # Popularity and novelty from training click counts (unseen items -> 0).
        clicks = [item_pop.get(int(i), 0) for i in item_ids]
        popularities.append(float(np.mean(clicks)))
        self_info = [float(-math.log((c + 1.0) / (total_clicks + 1.0))) for c in clicks]
        novelties.append(float(np.mean(self_info)))

    return {
        f'coverage@{k}': float(coverage),
        f'cat_entropy@{k}': mean_or_zero(entropies),
        f'cat_gini@{k}': mean_or_zero(ginis),
        f'avg_popularity@{k}': mean_or_zero(popularities),
        f'novelty@{k}': mean_or_zero(novelties),
        f'users@{k}': float(len(entropies)),
    }


In [10]:
# Candidate files produced by the recall notebooks; missing ones are skipped.
recall_files = {
    'baseline': PROJECT_PATH / 'recall_candidates.pkl',
    'two_tower': PROJECT_PATH / 'recall_candidates_two_tower.pkl',
    'sasrec': PROJECT_PATH / 'recall_candidates_sasrec.pkl',
}

recall_reports = []
for name, path in recall_files.items():
    if not path.exists():
        continue
    df = pd.read_pickle(path)
    # Both helpers emit a 'users@20' key; the diversity value is unpacked
    # last and therefore is the one kept in the row.
    r = {'model': name, **eval_recall_df(df, valid_last_map, k_list=[20, 50]), **diversity_metrics(df, k=20)}
    recall_reports.append(r)

# Guard the empty case: sorting an empty frame by 'model' raises KeyError.
recall_report = pd.DataFrame(recall_reports).sort_values('model') if recall_reports else pd.DataFrame()
recall_report


Unnamed: 0,model,hit_rate@20,ndcg@20,mrr@20,users@20,hit_rate@50,ndcg@50,mrr@50,users@50,coverage@20,cat_entropy@20,cat_gini@20,avg_popularity@20,novelty@20
0,baseline,0.2822,0.142132,0.103172,20000.0,0.39325,0.164164,0.106705,20000.0,0.081234,1.93017,0.777242,2174.516187,9.737705
2,sasrec,0.0,0.0,0.0,20000.0,0.0,0.0,0.0,20000.0,0.002986,2.942505,0.946074,0.043968,13.703523
1,two_tower,0.0,0.0,0.0,20000.0,0.0,0.0,0.0,20000.0,0.001816,2.689413,0.9192,0.16189,13.666247


## 2) Ranking 评估（DeepFM / DIN / DIEN-like）

这里评估的是“同一个 user 的候选集排序质量”，因此必须按 user 分组计算 TopK 指标。

说明：为了严格复现训练时的输入编码，需要保存并复用编码器；
如果你中途改过 Notebook8/9 的编码逻辑，请重新训练并刷新 artifacts。


In [11]:
def hit_ndcg_at_k(df: pd.DataFrame, preds: np.ndarray, k: int = 5) -> Dict[str, float]:
    """Compute per-user HitRate@k and NDCG@k for ranking predictions.

    Args:
        df: Frame with ``user_id`` and ``label`` columns, one row per
            (user, candidate) pair. Any ``label > 0`` counts as a positive.
        preds: Scores aligned row-by-row with ``df``. Shapes ``(N,)`` and
            ``(N, 1)`` are both accepted.
        k: Number of top-scored candidates inspected per user.

    Returns:
        ``{'hit_rate@k', 'ndcg@k', 'num_users'}``. NDCG uses the first
        positive within the top-k: ``1 / log2(rank + 2)`` with a 0-based rank.
        Both metrics are averaged over all users in ``df``.

    Raises:
        ValueError: If ``preds`` does not contain exactly ``len(df)`` scores.
    """
    tmp = df[['user_id', 'label']].copy()
    # Flatten explicitly so column-vector model outputs are handled uniformly.
    tmp['pred'] = np.asarray(preds).reshape(-1)

    hit = 0
    ndcg = 0.0
    total = 0
    for _, g in tmp.groupby('user_id'):
        top_labels = g.sort_values('pred', ascending=False)['label'].to_numpy()[:k]
        # Use one positive criterion for both the hit test and the rank lookup;
        # mixing `> 0` with `== 1` fails on positive labels other than 1.
        positive_pos = np.flatnonzero(top_labels > 0)
        if positive_pos.size:
            hit += 1
            ndcg += 1.0 / math.log2(int(positive_pos[0]) + 2)
        total += 1
    return {f'hit_rate@{k}': hit / max(1, total), f'ndcg@{k}': ndcg / max(1, total), 'num_users': total}


# Ranking samples: one row per (user, candidate) pair with a `label` column.
rank_df = pd.read_pickle(PROJECT_PATH / 'rank_train.pkl')

# Same as the basic version: split train/validation by user (the random seed
# is fixed here so the evaluation is consistent).
# NOTE(review): this reproduces the training-time split only if the training
# notebooks shuffle the same user array with the same seed — confirm.
rng = np.random.default_rng(42)
users = rank_df['user_id'].unique()
rng.shuffle(users)
# The last 20% of the shuffled users form the validation set.
split = int(len(users) * 0.8)
valid_users = set(users[split:])
valid_r = rank_df[rank_df['user_id'].isin(valid_users)].copy()

# Bare expression: displayed as the cell result in the notebook.
len(valid_r), valid_r['user_id'].nunique()


(200000, 4000)

In [12]:
# ------- DeepFM -------
# Artifact locations: the Keras model file, the dense-feature scaler, and the
# pickled sparse-feature vocabularies.
deepfm_path = PROJECT_PATH / 'artifacts' / 'ranking' / 'deep_models' / 'deepfm.keras'
deepfm_scaler_path = PROJECT_PATH / 'artifacts' / 'ranking' / 'deep_models' / 'scaler.pkl'
deepfm_factorizer_path = PROJECT_PATH / 'artifacts' / 'ranking' / 'deep_models' / 'deepfm_factorizers.pkl'

def safe_load_model(path: Path, *, custom_objects=None):
    """Load a Keras model for inference only.

    The model is loaded with ``compile=False`` because evaluation only needs
    the forward pass. ``safe_mode=False`` is tried first; if that call raises
    ``TypeError`` the load is retried without the ``safe_mode`` argument
    (presumably for Keras versions whose ``load_model`` does not accept it).
    """
    load_kwargs = {'custom_objects': custom_objects, 'compile': False}
    try:
        return tf.keras.models.load_model(path, safe_mode=False, **load_kwargs)
    except TypeError:
        return tf.keras.models.load_model(path, **load_kwargs)

# Accumulates one metrics row per ranking model across the following cells.
ranking_reports = []

# Prefer the SavedModel directory when present, otherwise the .keras file.
deepfm_sm_path = PROJECT_PATH / 'artifacts' / 'ranking' / 'deep_models' / 'deepfm_savedmodel'
deepfm_load_path = deepfm_sm_path if deepfm_sm_path.exists() else deepfm_path

# Initialise first so the check below is well-defined even when an artifact is
# missing and the load is never attempted (same pattern as the DIN/DIEN cells).
deepfm = None
if deepfm_load_path.exists() and deepfm_scaler_path.exists() and deepfm_factorizer_path.exists():
    try:
        deepfm = safe_load_model(deepfm_load_path)
    except Exception as e:
        print('[skip] failed to load DeepFM:', type(e).__name__, e)
        print('提示：可重新运行 8.deep_ranking.ipynb 的保存单元，生成 SavedModel 后再评估')

if deepfm is not None:
    # Artifacts are pickles written by this project's own notebooks; do not
    # point these paths at untrusted files.
    with open(deepfm_scaler_path, 'rb') as f:
        scaler = pickle.load(f)
    with open(deepfm_factorizer_path, 'rb') as f:
        fac = pickle.load(f)

    sparse_cols = fac['sparse_cols']
    dense_cols = fac['dense_cols']
    uniques = fac['uniques']  # dict(col -> list[str])

    def encode_with_uniques(series: pd.Series, uniq_list: List[str]) -> np.ndarray:
        """Map values to 1-based positions in ``uniq_list``; unseen values map to 0."""
        idx = {v: i + 1 for i, v in enumerate(uniq_list)}
        return np.asarray([idx.get(str(x), 0) for x in series.values], dtype=np.int32)

    # One integer-encoded array per sparse column plus the scaled dense block.
    inputs = {}
    for c in sparse_cols:
        inputs[c] = encode_with_uniques(valid_r[c].fillna(0).astype(str), uniques[c])
    inputs['dense'] = scaler.transform(valid_r[dense_cols].fillna(0).values.astype('float32'))

    preds = deepfm.predict(inputs, batch_size=4096, verbose=0)
    m = hit_ndcg_at_k(valid_r, preds, k=5)
    ranking_reports.append({'model': 'deepfm', **m})

ranking_reports


[{'model': 'deepfm',
  'hit_rate@5': 0.21225,
  'ndcg@5': 0.14279763111664995,
  'num_users': 4000}]

In [13]:
# ------- DIN -------
# Artifact locations: the Keras model file, the dense-feature scaler, and the
# pickled user/item id encoders.
din_path = PROJECT_PATH / 'artifacts' / 'ranking' / 'deep_models' / 'din.keras'
din_scaler_path = PROJECT_PATH / 'artifacts' / 'ranking' / 'deep_models' / 'scaler.pkl'
din_enc_path = PROJECT_PATH / 'artifacts' / 'ranking' / 'deep_models' / 'din_encoders.pkl'

# Needed to load DIN (the custom attention layer from notebook 8)
class DinAttention(tf.keras.layers.Layer):
    """Attention pooling: score every key against the query, return the weighted sum of keys.

    NOTE(review): presumably this has to mirror the layer defined in the
    training notebook so the saved weights can be restored — confirm before
    changing anything here.
    """

    def __init__(self, hidden_units: List[int] = [80, 40], **kwargs):
        """Create one ReLU Dense layer per entry of ``hidden_units`` and a linear 1-unit scorer."""
        super().__init__(**kwargs)
        self.hidden_units = hidden_units
        self.mlp = [tf.keras.layers.Dense(u, activation='relu') for u in hidden_units]
        self.out = tf.keras.layers.Dense(1, activation=None)

    def call(self, inputs):
        """Pool ``keys`` with attention weights conditioned on ``query``.

        ``inputs`` is a ``(query, keys, mask)`` triple.
        Assumes query is (batch, dim), keys is (batch, seq, dim) and mask is
        (batch, seq) — TODO confirm against the model definition.
        """
        query, keys, mask = inputs
        # Insert an axis at position 1, then broadcast the query to the shape
        # of `keys` by adding zeros.
        q = tf.expand_dims(query, axis=1)
        q = q + tf.zeros_like(keys)
        # Interaction features: query, key, difference and element-wise product.
        x = tf.concat([q, keys, q - keys, q * keys], axis=-1)
        for dense in self.mlp:
            x = dense(x)
        scores = tf.squeeze(self.out(x), axis=-1)
        # Positions where mask <= 0 get a very negative score, so they receive
        # (almost) zero weight after the softmax.
        paddings = tf.ones_like(scores) * (-1e9)
        scores = tf.where(mask > 0, scores, paddings)
        weights = tf.nn.softmax(scores, axis=-1)
        weights = tf.expand_dims(weights, axis=-1)
        # Weighted sum of the keys over axis 1.
        return tf.reduce_sum(weights * keys, axis=1)


# Prefer the SavedModel directory when present, otherwise the .keras file.
din_sm_path = PROJECT_PATH / 'artifacts' / 'ranking' / 'deep_models' / 'din_savedmodel'
din_load_path = din_sm_path if din_sm_path.exists() else din_path

din = None
if din_load_path.exists() and din_scaler_path.exists() and din_enc_path.exists():
    try:
        din = safe_load_model(din_load_path, custom_objects={'DinAttention': DinAttention})
    except Exception as e:
        print('[skip] failed to load DIN:', type(e).__name__, e)
        print('提示：可重新运行 8.deep_ranking.ipynb 的保存单元，生成 SavedModel 后再评估')

if din is not None:
    # Artifacts are pickles written by this project's own notebooks; do not
    # point these paths at untrusted files.
    with open(din_scaler_path, 'rb') as f:
        scaler = pickle.load(f)
    with open(din_enc_path, 'rb') as f:
        enc = pickle.load(f)
    raw_to_item_enc = enc['raw_to_item_enc']
    raw_to_user_enc = enc['raw_to_user_enc']

    dense_cols = ['recall_score','recall_rank','user_click_count','user_unique_items','item_click_count','words_count','item_age_hours','time_gap_hours','emb_sim_last','is_same_category']
    X_dense = scaler.transform(valid_r[dense_cols].fillna(0).values.astype('float32'))

    # Build the per-user click history, ordered by time.
    train_hist_full = pd.read_pickle(PROJECT_PATH / 'train_hist.pkl').sort_values(['user_id','click_timestamp'])
    user_hist_full = train_hist_full.groupby('user_id')['click_article_id'].apply(list).to_dict()

    # Keep the history length consistent with training by reading it from the
    # model's `hist_items` input; 30 is the fallback when that is not possible.
    MAX_HIST_LEN = 30
    try:
        MAX_HIST_LEN = int([t.shape[1] for t in din.inputs if 'hist_items' in t.name][0])
    except Exception as e:
        print('[warn] could not infer hist_items length from DIN inputs, using', MAX_HIST_LEN, '-', type(e).__name__, e)

    def encode_seq(seq: List[int]) -> np.ndarray:
        """Encode the most recent MAX_HIST_LEN items, left-padded with 0 (unknown items -> 0)."""
        encoded = [raw_to_item_enc.get(int(x), 0) for x in seq][-MAX_HIST_LEN:]
        out = np.zeros(MAX_HIST_LEN, dtype=np.int32)
        if encoded:
            out[-len(encoded):] = np.asarray(encoded, dtype=np.int32)
        return out

    # Every row of a user shares the same history, so encode once per user
    # and reuse the result for all of that user's candidate rows.
    valid_user_ids = valid_r['user_id'].values
    hist_by_user = {int(u): encode_seq(user_hist_full.get(int(u), [])) for u in pd.unique(valid_user_ids)}
    hist_mat = np.vstack([hist_by_user[int(u)] for u in valid_user_ids])
    user_enc = np.asarray([raw_to_user_enc.get(int(u), 0) for u in valid_user_ids], dtype=np.int32)
    item_enc = np.asarray([raw_to_item_enc.get(int(i), 0) for i in valid_r['article_id'].values], dtype=np.int32)

    inputs = {'user_id': user_enc, 'hist_items': hist_mat, 'target_item': item_enc, 'dense': X_dense}
    preds = din.predict(inputs, batch_size=4096, verbose=0)
    m = hit_ndcg_at_k(valid_r, preds, k=5)
    ranking_reports.append({'model': 'din', **m})

ranking_reports


[{'model': 'deepfm',
  'hit_rate@5': 0.21225,
  'ndcg@5': 0.14279763111664995,
  'num_users': 4000},
 {'model': 'din',
  'hit_rate@5': 0.1875,
  'ndcg@5': 0.1259067905901265,
  'num_users': 4000}]

In [15]:
# ------- DIEN-like -------
# Artifact locations: the Keras model file, the dense-feature scaler, and the
# pickled encoders.
dien_path = PROJECT_PATH / 'artifacts' / 'ranking' / 'dien_like' / 'dien_like.keras'
dien_scaler_path = PROJECT_PATH / 'artifacts' / 'ranking' / 'dien_like' / 'scaler.pkl'
dien_enc_path = PROJECT_PATH / 'artifacts' / 'ranking' / 'dien_like' / 'encoders.pkl'

# Needed to load DIEN-like (the custom attention layer from notebook 9)
class SimpleAttention(tf.keras.layers.Layer):
    """Attention scorer: returns softmax weights over the keys (no pooling is done here).

    NOTE(review): presumably this has to mirror the layer defined in the
    training notebook so the saved weights can be restored — confirm before
    changing anything here.
    """

    def __init__(self, hidden_units: List[int] = [80, 40], **kwargs):
        """Create one ReLU Dense layer per entry of ``hidden_units`` and a linear 1-unit scorer."""
        super().__init__(**kwargs)
        self.hidden_units = hidden_units
        self.mlp = [tf.keras.layers.Dense(u, activation='relu') for u in hidden_units]
        self.out = tf.keras.layers.Dense(1, activation=None)

    def call(self, inputs):
        """Return attention weights for ``keys`` conditioned on ``query``.

        ``inputs`` is a ``(query, keys, mask)`` triple.
        Assumes query is (batch, dim), keys is (batch, seq, dim) and mask is
        (batch, seq) — TODO confirm against the model definition.
        """
        query, keys, mask = inputs
        # Insert an axis at position 1, then broadcast the query to the shape
        # of `keys` by adding zeros.
        q = tf.expand_dims(query, axis=1)
        q = q + tf.zeros_like(keys)
        # Interaction features: query, key, difference and element-wise product.
        x = tf.concat([q, keys, q - keys, q * keys], axis=-1)
        for dense in self.mlp:
            x = dense(x)
        scores = tf.squeeze(self.out(x), axis=-1)
        # Positions where mask <= 0 get a very negative score, so they receive
        # (almost) zero weight after the softmax.
        paddings = tf.ones_like(scores) * (-1e9)
        scores = tf.where(mask > 0, scores, paddings)
        weights = tf.nn.softmax(scores, axis=-1)
        return weights


# Prefer the SavedModel directory when present, otherwise the .keras file.
dien_sm_path = PROJECT_PATH / 'artifacts' / 'ranking' / 'dien_like' / 'dien_savedmodel'
dien_load_path = dien_sm_path if dien_sm_path.exists() else dien_path

dien = None
if dien_load_path.exists() and dien_scaler_path.exists() and dien_enc_path.exists():
    try:
        dien = safe_load_model(dien_load_path, custom_objects={'SimpleAttention': SimpleAttention})
    except Exception as e:
        print('[skip] failed to load DIEN-like:', type(e).__name__, e)
        print('提示：可重新运行 9.sequence_modeling.ipynb 的保存单元，生成 SavedModel 后再评估')

if dien is not None:
    # Artifacts are pickles written by this project's own notebooks; do not
    # point these paths at untrusted files.
    with open(dien_scaler_path, 'rb') as f:
        scaler = pickle.load(f)
    with open(dien_enc_path, 'rb') as f:
        enc = pickle.load(f)
    raw_to_user_enc = enc['raw_to_user_enc']

    dense_cols = ['recall_score','recall_rank','user_click_count','user_unique_items','item_click_count','words_count','item_age_hours','time_gap_hours','emb_sim_last','is_same_category']
    X_dense = scaler.transform(valid_r[dense_cols].fillna(0).values.astype('float32'))

    def build_article_index_encoder():
        """Rebuild an item encoder from `articles`: sorted article ids -> 1..N, unknown -> 0."""
        all_items = articles['article_id'].astype(int).unique()
        idx = {int(v): int(i + 1) for i, v in enumerate(np.sort(all_items))}

        def encode(arr):
            return np.asarray([idx.get(int(x), 0) for x in arr], dtype=np.int32)

        return encode

    # item_id_map comes from the SASRec artifacts; when it is missing or cannot
    # be used, the encoder is rebuilt from `articles`.
    sas_item_map_path = PROJECT_PATH / 'artifacts' / 'sequence' / 'sasrec_inbatch' / 'item_id_map.pkl'
    enc_item = None
    if sas_item_map_path.exists():
        try:
            with open(sas_item_map_path, 'rb') as f:
                item_id_map = pickle.load(f)
            if not hasattr(item_id_map, 'transform'):
                raise TypeError('item_id_map.pkl is not a compatible object (missing transform)')

            def enc_item(arr):
                return item_id_map.transform(np.asarray(arr, dtype=np.int64))
        except Exception as e:
            # In notebook 9 IdMap is a dataclass defined inside the notebook,
            # so unpickling it from another notebook fails.
            print('[warn] failed to load sasrec item_id_map.pkl, fallback to rebuild from articles:', type(e).__name__, e)
    if enc_item is None:
        enc_item = build_article_index_encoder()

    # Build the per-user click history, ordered by time.
    train_hist_full = pd.read_pickle(PROJECT_PATH / 'train_hist.pkl').sort_values(['user_id','click_timestamp'])
    user_hist_full = train_hist_full.groupby('user_id')['click_article_id'].apply(list).to_dict()

    # Read the history length from the model's `hist_items` input; 30 is the
    # fallback when that is not possible.
    MAX_HIST_LEN = 30
    try:
        MAX_HIST_LEN = int([t.shape[1] for t in dien.inputs if 'hist_items' in t.name][0])
    except Exception as e:
        print('[warn] could not infer hist_items length from DIEN-like inputs, using', MAX_HIST_LEN, '-', type(e).__name__, e)

    def encode_seq_raw(seq: List[int]) -> np.ndarray:
        """Encode the most recent MAX_HIST_LEN items, left-padded with 0."""
        encoded = enc_item(seq)[-MAX_HIST_LEN:]
        out = np.zeros(MAX_HIST_LEN, dtype=np.int32)
        if len(encoded) > 0:
            out[-len(encoded):] = encoded.astype(np.int32)
        return out

    # Every row of a user shares the same history, so encode once per user
    # and reuse the result for all of that user's candidate rows.
    valid_user_ids = valid_r['user_id'].values
    hist_by_user = {int(u): encode_seq_raw(user_hist_full.get(int(u), [])) for u in pd.unique(valid_user_ids)}
    hist_mat = np.vstack([hist_by_user[int(u)] for u in valid_user_ids])
    user_enc = np.asarray([raw_to_user_enc.get(int(u), 0) for u in valid_user_ids], dtype=np.int32)
    item_enc = enc_item(valid_r['article_id'].values)

    inputs = {'user_id': user_enc, 'hist_items': hist_mat, 'target_item': item_enc, 'dense': X_dense}
    preds = dien.predict(inputs, batch_size=4096, verbose=0)
    m = hit_ndcg_at_k(valid_r, preds, k=5)
    ranking_reports.append({'model': 'dien_like', **m})

ranking_reports


[warn] failed to load sasrec item_id_map.pkl, fallback to rebuild from articles: AttributeError Can't get attribute 'IdMap' on <module '__main__'>


[{'model': 'deepfm',
  'hit_rate@5': 0.21225,
  'ndcg@5': 0.14279763111664995,
  'num_users': 4000},
 {'model': 'din',
  'hit_rate@5': 0.1875,
  'ndcg@5': 0.1259067905901265,
  'num_users': 4000},
 {'model': 'dien_like',
  'hit_rate@5': 0.2245,
  'ndcg@5': 0.15424781275328858,
  'num_users': 4000}]

In [16]:
# One row per evaluated ranking model, ordered by model name; an empty frame
# when no model could be evaluated.
if ranking_reports:
    ranking_report = pd.DataFrame(ranking_reports).sort_values('model')
else:
    ranking_report = pd.DataFrame()
ranking_report


Unnamed: 0,model,hit_rate@5,ndcg@5,num_users
0,deepfm,0.21225,0.142798,4000
2,dien_like,0.2245,0.154248,4000
1,din,0.1875,0.125907,4000


## 3) 多任务评估（AUC / GAUC）


In [17]:
def gauc(user_ids: np.ndarray, labels: np.ndarray, preds: np.ndarray) -> float:
    """Compute group AUC: the per-user ROC AUC averaged with sample-count weights.

    Args:
        user_ids: User id of every sample.
        labels: Binary label of every sample.
        preds: Predicted score of every sample.
        All three are flattened to 1-D and must have the same length.

    Returns:
        The weighted mean of per-user AUCs, where each user's weight is that
        user's number of samples. Users whose labels contain fewer than two
        distinct values are skipped (their AUC is undefined). Returns 0.0 when
        no user can be scored.
    """
    user_ids = np.asarray(user_ids).reshape(-1)
    labels = np.asarray(labels).reshape(-1)
    preds = np.asarray(preds).reshape(-1)
    if user_ids.size == 0:
        return 0.0

    # Group rows with one stable sort instead of building a full boolean mask
    # for every user (which costs users x rows comparisons).
    _, inverse = np.unique(user_ids, return_inverse=True)
    inverse = np.asarray(inverse).reshape(-1)
    order = np.argsort(inverse, kind='stable')
    boundaries = np.cumsum(np.bincount(inverse))[:-1]

    aucs = []
    weights = []
    for rows in np.split(order, boundaries):
        y = labels[rows]
        if len(np.unique(y)) < 2:
            continue
        aucs.append(roc_auc_score(y, preds[rows]))
        weights.append(len(rows))
    if not aucs:
        return 0.0
    return float(np.average(aucs, weights=weights))


# Multitask artifacts; model files that do not exist are skipped.
mt_dir = PROJECT_PATH / 'artifacts' / 'multitask'
mt_models = {
    'shared_bottom': mt_dir / 'shared_bottom.keras',
    'mmoe': mt_dir / 'mmoe.keras',
    'esmm': mt_dir / 'esmm.keras',
    'ple': mt_dir / 'ple.keras',
}

mt_reports = []
# Default to an empty report so later cells can always save `mt_report`.
mt_report = pd.DataFrame()
rank_path = PROJECT_PATH / 'rank_train.pkl'
preprocess_path = mt_dir / 'preprocess.pkl'
scaler_path = mt_dir / 'scaler.pkl'

if not rank_path.exists():
    print('[skip] missing rank_train.pkl, run 6.ranking.ipynb (or 8.deep_ranking.ipynb) first')
elif not preprocess_path.exists() or not scaler_path.exists():
    print('[skip] missing multitask preprocess/scaler, run 10.multi_task.ipynb first')
else:
    df = pd.read_pickle(rank_path)

    # Artifacts are pickles written by this project's own notebooks; do not
    # point these paths at untrusted files.
    with open(preprocess_path, 'rb') as f:
        prep = pickle.load(f)
    with open(scaler_path, 'rb') as f:
        scaler = pickle.load(f)

    # Task labels: ctr is the click label; ctcvr is a click on an article
    # whose words_count reaches the threshold.
    thresh_words = int(prep.get('thresh_words', 300))
    df['ctr'] = df['label'].astype(int)
    df['ctcvr'] = ((df['label'] == 1) & (df['words_count'].fillna(0) >= thresh_words)).astype(int)

    # Fixed split (same as Notebook 10): last 20% of the shuffled users.
    rng = np.random.default_rng(42)
    users = df['user_id'].unique()
    rng.shuffle(users)
    split = int(len(users) * 0.8)
    valid_users = set(users[split:])
    valid_df = df[df['user_id'].isin(valid_users)].copy()

    sparse_cols = prep['sparse_cols']
    dense_cols = prep['dense_cols']
    uniques = prep['uniques']

    def encode_with_uniques(series: pd.Series, uniq_list: List[str]) -> np.ndarray:
        """Map values to 1-based positions in ``uniq_list``; unseen values map to 0."""
        idx = {v: i + 1 for i, v in enumerate(uniq_list)}
        return np.asarray([idx.get(str(x), 0) for x in series.values], dtype=np.int32)

    inputs = {c: encode_with_uniques(valid_df[c].fillna(0).astype(str), uniques[c]) for c in sparse_cols}
    inputs['dense'] = scaler.transform(valid_df[dense_cols].fillna(0).values.astype('float32'))

    y_ctr = valid_df['ctr'].values.astype(int)
    y_ctcvr = valid_df['ctcvr'].values.astype(int)
    valid_user_ids = valid_df['user_id'].values

    for name, path in mt_models.items():
        if not path.exists():
            continue
        # Load for inference only (no compile), like the ranking models, and
        # skip a model that fails to load instead of aborting the whole cell.
        try:
            model = safe_load_model(path)
        except Exception as e:
            print(f'[skip] failed to load {name}:', type(e).__name__, e)
            continue
        pred = model.predict(inputs, batch_size=4096, verbose=0)
        # Outputs come either as a dict keyed by task name or as a
        # [ctr, ctcvr] sequence.
        p_ctr = pred['ctr'] if isinstance(pred, dict) else pred[0]
        p_ctcvr = pred['ctcvr'] if isinstance(pred, dict) else pred[1]
        p_ctr = np.asarray(p_ctr).reshape(-1)
        p_ctcvr = np.asarray(p_ctcvr).reshape(-1)

        # AUC is undefined when only one class is present.
        ctr_auc = roc_auc_score(y_ctr, p_ctr) if len(np.unique(y_ctr)) > 1 else float('nan')
        ctcvr_auc = roc_auc_score(y_ctcvr, p_ctcvr) if len(np.unique(y_ctcvr)) > 1 else float('nan')
        mt_reports.append({
            'model': name,
            'ctr_auc': ctr_auc,
            'ctr_gauc': gauc(valid_user_ids, y_ctr, p_ctr),
            'ctcvr_auc': ctcvr_auc,
            'ctcvr_gauc': gauc(valid_user_ids, y_ctcvr, p_ctcvr),
        })

    if mt_reports:
        mt_report = pd.DataFrame(mt_reports).sort_values('model')

# Kept at top level so the notebook actually renders the report.
mt_report


[skip] missing multitask preprocess/scaler, run 10.multi_task.ipynb first


## 4) 保存报告


In [18]:
# Write the three report tables as CSV files (without the index column) into
# the evaluation artifact directory.
recall_report.to_csv(EVAL_DIR / 'recall_report.csv', index=False)
ranking_report.to_csv(EVAL_DIR / 'ranking_report.csv', index=False)
mt_report.to_csv(EVAL_DIR / 'multitask_report.csv', index=False)

print('saved to:', EVAL_DIR)


saved to: /Users/wangjunfei/Desktop/fun-rec/tmp/projects/news_recommendation_system/artifacts/evaluation
