# 排序模型与离线评估（增强版）

包含以下模型：
1. **LGBMRanker** - 基于 LambdaRank 的排序模型
2. **LGBMClassifier** - 二分类模型，预测点击概率
3. **DIN** - 深度兴趣网络（Deep Interest Network）

以及模型融合策略：
- 加权平均融合
- Stacking 融合

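评估指标：Hit Rate, MRR, NDCG, Coverage

> 注：本 notebook 当前仅包含 LGBMRanker 的训练与 Hit Rate@5 离线评估；上文列出的其余模型（LGBMClassifier、DIN）、融合策略以及 MRR / NDCG / Coverage 指标尚未在下方单元中实现。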

In [1]:
import os
import pickle
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from lightgbm import LGBMRanker, LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

from funrec.utils import load_env_with_fallback

load_env_with_fallback()


def _env_path(var_name):
    """Return the value of environment variable ``var_name`` as a ``Path``.

    Raises:
        RuntimeError: if the variable is not set. Without this check
            ``Path(None)`` fails with a ``TypeError`` that does not say
            which variable is missing.
    """
    raw_value = os.getenv(var_name)
    if raw_value is None:
        raise RuntimeError(
            f"Environment variable {var_name} is not set; "
            "define it (e.g. in the .env file) before running this notebook."
        )
    return Path(raw_value)


PROCESSED_DATA_PATH = _env_path('FUNREC_PROCESSED_DATA_PATH')
RAW_DATA_PATH = _env_path('FUNREC_RAW_DATA_PATH')
PROJECT_PATH = PROCESSED_DATA_PATH / 'projects' / 'news_recommendation_system'

# The raw dataset may live under either of two directory layouts; prefer the
# nested one and fall back to the flat one when it does not exist.
DATA_PATH = RAW_DATA_PATH / 'dataset' / 'news_recommendation'
if not DATA_PATH.exists():
    DATA_PATH = RAW_DATA_PATH / 'news_recommendation'

In [2]:
# ==================== Load data ====================

rank_df = pd.read_pickle(PROJECT_PATH / 'rank_train.pkl')

# NOTE(review): unpickling executes arbitrary code; only load artifacts that
# were produced by this project's own pipeline.
feature_cols = pickle.loads((PROJECT_PATH / 'feature_cols.pkl').read_bytes())

# Missing feature values are replaced by 0 before training.
filled_features = rank_df[feature_cols].fillna(0)
rank_df[feature_cols] = filled_features

print(
    f"数据集大小: {len(rank_df)}",
    f"特征数: {len(feature_cols)}",
    f"特征列表: {feature_cols[:10]}...",
    sep="\n",
)
rank_df[feature_cols].head()

数据集大小: 1000000
特征数: 13
特征列表: ['recall_score', 'recall_rank', 'user_click_count', 'user_unique_items', 'user_last_click_ts', 'item_click_count', 'item_last_click_ts', 'category_id', 'words_count', 'item_age_hours']...


Unnamed: 0,recall_score,recall_rank,user_click_count,user_unique_items,user_last_click_ts,item_click_count,item_last_click_ts,category_id,words_count,item_age_hours,time_gap_hours,is_same_category,emb_sim_last
0,1.0,1.0,11,11,1508160616342,1.0,1507301000000.0,301,254,8803.967595,238.673324,0,0.105229
1,0.8,2.0,11,11,1508160616342,1.0,1507585000000.0,331,277,7535.535928,159.822751,1,0.926856
2,0.5,3.0,11,11,1508160616342,3.0,1507665000000.0,228,183,423.125928,137.65713,0,0.515788
3,0.4,4.0,11,11,1508160616342,0.0,0.0,331,272,-158.512127,418933.504539,1,0.923254
4,0.333333,5.0,11,11,1508160616342,13.0,1507572000000.0,228,118,278.451762,163.425198,0,0.616363


In [3]:
# User-level 80/20 split: every row of a user lands on exactly one side.
rng = np.random.default_rng(42)
users = rank_df['user_id'].unique()
rng.shuffle(users)
split = int(len(users) * 0.8)
train_users = set(users[:split])

is_train = rank_df['user_id'].isin(train_users)

# LightGBM interprets `group` as the sizes of consecutive row runs, and
# make_group() reports sizes in ascending user_id order (groupby sorts keys).
# Sorting by user_id makes each user's rows contiguous and in that same order;
# the stable sort keeps the original within-user row order.
train_df = rank_df[is_train].sort_values('user_id', kind='stable')
valid_df = rank_df[~is_train].sort_values('user_id', kind='stable')

def make_group(df):
    """Return the number of rows per user as a plain list.

    Sizes are listed in ascending ``user_id`` order because ``groupby`` sorts
    its keys by default; the rows of ``df`` must be laid out in that same
    order for the list to be usable as a ranking ``group`` argument.
    """
    rows_per_user = df.groupby('user_id').size()
    return rows_per_user.tolist()

# Feature matrices and labels for both sides of the split.
X_train, y_train = train_df[feature_cols], train_df['label']
X_valid, y_valid = valid_df[feature_cols], valid_df['label']

# Per-user row counts (query sizes) for the ranker.
group_train, group_valid = make_group(train_df), make_group(valid_df)


In [4]:
# LambdaRank objective with NDCG as the tracked metric.
ranker_params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'n_estimators': 200,
    'learning_rate': 0.05,
    'num_leaves': 63,
    'random_state': 42,
}
model = LGBMRanker(**ranker_params)

# The validation users are passed as the evaluation set, scored at cutoff 5.
model.fit(
    X_train,
    y_train,
    group=group_train,
    eval_set=[(X_valid, y_valid)],
    eval_group=[group_valid],
    eval_at=[5],
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2359
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 13


0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.05
,n_estimators,200
,subsample_for_bin,200000
,objective,'lambdarank'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [5]:
# Work on a copy that carries the ranker score for every validation row.
valid_df = valid_df.assign(pred=model.predict(X_valid))

def hit_rate_at_k(df, k=5):
    """Share of users with at least one positive label among their top-``k`` rows.

    Args:
        df: frame with ``user_id``, ``pred`` (score, higher is better) and
            ``label`` columns.
        k: number of highest-scored rows inspected per user.

    Returns:
        Hits divided by the number of users, or ``0.0`` when ``df`` has no users.
    """
    user_hits = [
        bool(rows.sort_values('pred', ascending=False).head(k)['label'].max() > 0)
        for _, rows in df.groupby('user_id')
    ]
    if not user_hits:
        return 0.0
    return sum(user_hits) / len(user_hits)

# Hit Rate@5 of the ranker scores on the validation frame.
hit_rate_at_k(valid_df, k=5)


0.3235

In [6]:
# Persist the trained booster in LightGBM's text format.
ranker_model_path = PROJECT_PATH / 'lgb_ranker.txt'
model.booster_.save_model(ranker_model_path)

# Store the feature list the ranker was trained on alongside the model.
with open(PROJECT_PATH / 'feature_cols.pkl', 'wb') as feature_file:
    pickle.dump(feature_cols, feature_file)


## 可选：生成测试集推荐结果


In [7]:
RUN_INFERENCE = False  # set to True to build the test-set submission

if RUN_INFERENCE:
    # These imports only run when inference is enabled.
    import math
    from collections import defaultdict
    import faiss
    from tqdm import tqdm

    # Resolve the raw dataset directory, trying the nested layout first.
    RAW_DATA_PATH = Path(os.getenv('FUNREC_RAW_DATA_PATH'))
    nested_dir = RAW_DATA_PATH / 'dataset' / 'news_recommendation'
    DATA_PATH = nested_dir if nested_dir.exists() else RAW_DATA_PATH / 'news_recommendation'

    train_click, test_click, articles, article_emb = (
        pd.read_csv(DATA_PATH / csv_name)
        for csv_name in (
            'train_click_log.csv',
            'testA_click_log.csv',
            'articles.csv',
            'articles_emb.csv',
        )
    )

    # NOTE(review): unpickling executes arbitrary code; only load artifacts
    # written by this project's own recall notebooks.
    popular_items = pickle.loads((PROJECT_PATH / 'popular_items.pkl').read_bytes())

    # The ItemCF similarity table is optional; None disables that recall channel.
    i2i_path = PROJECT_PATH / 'itemcf_i2i.pkl'
    i2i_sim = pickle.loads(i2i_path.read_bytes()) if i2i_path.exists() else None

    # Chronological click history per test user: {user_id: [article_id, ...]}.
    test_click = test_click.sort_values(['user_id', 'click_timestamp'])
    clicks_by_user = test_click.groupby('user_id')['click_article_id']
    test_user_hist = clicks_by_user.apply(list).to_dict()

    def popular_recall(hist_items, k=50):
        """Recall items from ``popular_items`` that the user has not clicked.

        Walks ``popular_items`` in order, skips anything in ``hist_items`` and
        stops once ``k`` items are collected. Every item gets the constant
        score ``1.0``.

        Returns:
            List of ``(item, 1.0)`` tuples in ``popular_items`` order.
        """
        seen = set(hist_items)
        picked = []
        for candidate in popular_items:
            if candidate not in seen:
                picked.append((candidate, 1.0))
                if len(picked) >= k:
                    break
        return picked

    def itemcf_recall(hist_items, sim_topk=20, recall_num=50):
        """Item-CF recall: sum neighbour similarities over the click history.

        For every clicked item the ``sim_topk`` most similar items from
        ``i2i_sim`` contribute their similarity to the neighbour's score;
        already clicked items are skipped. If fewer than ``recall_num``
        candidates result, unseen items from ``popular_items`` are appended
        with score ``-1.0`` so they rank after all ItemCF candidates with a
        higher score.

        Returns:
            Up to ``recall_num`` ``(item, score)`` tuples sorted by score,
            descending; an empty list when no similarity table is loaded.
        """
        if i2i_sim is None:
            return []

        seen = set(hist_items)
        scores = defaultdict(float)
        for clicked in hist_items:
            neighbours = sorted(
                i2i_sim.get(clicked, {}).items(), key=lambda pair: pair[1], reverse=True
            )[:sim_topk]
            for neighbour, similarity in neighbours:
                if neighbour not in seen:
                    scores[neighbour] += similarity

        if len(scores) < recall_num:
            # Pad with popular items that are neither recalled nor clicked.
            for filler in popular_items:
                if filler in scores or filler in seen:
                    continue
                scores[filler] = -1.0
                if len(scores) >= recall_num:
                    break

        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:recall_num]

    # Article embeddings, L2-normalised row by row so that the inner-product
    # index below scores pairs by cosine similarity.
    emb_cols = [name for name in article_emb.columns if name.startswith('emb_')]
    emb_matrix = article_emb[emb_cols].values.astype('float32')
    row_norms = np.linalg.norm(emb_matrix, axis=1, keepdims=True)
    emb_matrix /= row_norms + 1e-12  # epsilon avoids division by zero

    # article_id <-> row position lookups into emb_matrix.
    article_ids = article_emb['article_id'].values
    id2idx = dict(zip(article_ids, range(len(article_ids))))

    index = faiss.IndexFlatIP(emb_matrix.shape[1])
    index.add(emb_matrix)

    def emb_recall(last_item, hist_items, topk=50):
        """Recall the nearest embedding neighbours of the user's last click.

        Args:
            last_item: article id used as the query.
            hist_items: the user's clicked article ids; never recommended.
            topk: maximum number of items to return.

        Returns:
            List of ``(article_id, similarity)`` tuples in the order returned
            by the index; empty when ``last_item`` has no embedding row.
        """
        idx = id2idx.get(last_item)
        if idx is None:
            return []
        hist_set = set(hist_items)
        query = emb_matrix[idx].reshape(1, -1)
        # Over-fetch by the history length so that filtering clicked items
        # can still leave topk results.
        scores, indices = index.search(query, topk + len(hist_items))
        recs = []
        for score, j in zip(scores[0], indices[0]):
            if j < 0:
                # faiss pads with -1 when fewer than k vectors are available;
                # article_ids[-1] would otherwise recommend the last article.
                continue
            item_id = int(article_ids[j])
            if item_id in hist_set:
                continue
            recs.append((item_id, float(score)))
            if len(recs) >= topk:
                break
        return recs

    def merge_recall(recall_dicts, weights, topk=50):
        """Fuse several recall channels with weighted reciprocal-rank scores.

        Args:
            recall_dicts: ``{channel: {user: [(item, score), ...]}}``; only the
                position of an item in its list is used, not its score.
            weights: ``{channel: weight}``; channels not listed weigh ``1.0``.
            topk: number of fused items kept per user.

        Returns:
            ``{user: [(item, fused_score), ...]}`` sorted by fused score,
            descending. Users without any recalled item are omitted.
        """
        fused = {}
        for channel, per_user in recall_dicts.items():
            channel_weight = weights.get(channel, 1.0)
            for user, ranked_items in per_user.items():
                for position, (item, _score) in enumerate(ranked_items, start=1):
                    user_scores = fused.setdefault(user, {})
                    user_scores[item] = user_scores.get(item, 0.0) + channel_weight / position
        return {
            user: sorted(user_scores.items(), key=lambda pair: pair[1], reverse=True)[:topk]
            for user, user_scores in fused.items()
        }

    # Run the three recall channels for every test user.
    recall_pop, recall_itemcf, recall_emb = {}, {}, {}
    for uid, clicked in test_user_hist.items():
        recall_pop[uid] = popular_recall(clicked, k=50)
        recall_itemcf[uid] = itemcf_recall(clicked, sim_topk=20, recall_num=50)
        # The embedding channel queries with the most recent click.
        recall_emb[uid] = emb_recall(clicked[-1], clicked, topk=50)

    # Fuse the channels; the weights set how much each channel's ranks count.
    channel_weights = {'pop': 0.2, 'itemcf': 1.0, 'emb': 0.8}
    recall_merged = merge_recall(
        {'pop': recall_pop, 'itemcf': recall_itemcf, 'emb': recall_emb},
        weights=channel_weights,
        topk=50,
    )

    def recall_to_df(recall_dict):
        """Flatten ``{user: [(item, score), ...]}`` into a long DataFrame.

        Returns:
            Frame with one row per (user, item) pair and the columns
            ``user_id``, ``article_id`` and ``recall_score``.
        """
        records = [
            (user, item, score)
            for user, scored_items in recall_dict.items()
            for item, score in scored_items
        ]
        return pd.DataFrame(records, columns=['user_id', 'article_id', 'recall_score'])

    recall_df = recall_to_df(recall_merged)

    # Rank within each user by descending fused score; ties are broken by row
    # order (method='first'), so ranks are unique per user.
    per_user_scores = recall_df.groupby('user_id')['recall_score']
    recall_df['recall_rank'] = per_user_scores.rank(method='first', ascending=False)

    # User-side features are derived from test_click, the same log that
    # supplies each candidate user's history (test_user_hist) and last click.
    # Deriving them from train_click leaves every user absent from that log
    # with all-zero features, and decouples user_last_click_ts from the
    # timestamp that the time_gap_hours feature is computed from.
    # NOTE(review): assumes the training features were built from each user's
    # own history log in the same way - confirm against the feature notebook.
    user_clicks = test_click.groupby('user_id')
    user_click_count = user_clicks.size().rename('user_click_count')
    user_unique_items = user_clicks['click_article_id'].nunique().rename('user_unique_items')
    user_last_click_ts = user_clicks['click_timestamp'].max().rename('user_last_click_ts')

    click_with_cat = test_click.merge(
        articles, left_on='click_article_id', right_on='article_id', how='left'
    )

    def _most_frequent(values):
        """Return the most frequent non-null value, or NaN when there is none."""
        counts = values.value_counts()
        # idxmax() raises on an empty Series (all clicked articles unknown).
        return counts.idxmax() if len(counts) else np.nan

    user_top_category = (
        click_with_cat.groupby('user_id')['category_id']
        .agg(_most_frequent)
        .rename('user_top_category')
    )

    # One row per user with all user-level features.
    user_features = pd.concat(
        [user_click_count, user_unique_items, user_last_click_ts, user_top_category],
        axis=1,
    ).reset_index()

    # Item-side statistics computed from the training click log.
    item_clicks = train_click.groupby('click_article_id')
    item_click_count = item_clicks.size().rename('item_click_count')
    item_last_click_ts = item_clicks['click_timestamp'].max().rename('item_last_click_ts')

    # Attach the statistics to the article metadata; articles never clicked in
    # train_click get 0 for both statistics.
    item_stats = pd.concat([item_click_count, item_last_click_ts], axis=1)
    item_features = articles.merge(
        item_stats, left_on='article_id', right_index=True, how='left'
    )
    item_features = item_features.fillna({'item_click_count': 0, 'item_last_click_ts': 0})

    # Most recent test click of every user (article and timestamp).
    latest_click_rows = (
        test_click.sort_values(['user_id', 'click_timestamp']).groupby('user_id').tail(1)
    )
    last_click_names = {
        'click_article_id': 'last_click_article_id',
        'click_timestamp': 'last_click_timestamp',
    }
    user_last_click = latest_click_rows[
        ['user_id', 'click_article_id', 'click_timestamp']
    ].rename(columns=last_click_names)

    # One row per (user, candidate article) with user, last-click and item columns.
    candidates = recall_df.merge(user_features, on='user_id', how='left')
    candidates = candidates.merge(user_last_click, on='user_id', how='left')
    candidates = candidates.merge(item_features, on='article_id', how='left')

    # 1 when the candidate's category equals the user's most frequent category.
    candidates['is_same_category'] = (
        candidates['category_id'].eq(candidates['user_top_category']).astype(int)
    )

    # Both gaps are measured from the user's last click; the divisor converts
    # milliseconds to hours. Missing values become 0.
    ms_per_hour = 3600_000
    reference_ts = candidates['last_click_timestamp']
    candidates['item_age_hours'] = (
        (reference_ts - candidates['created_at_ts']) / ms_per_hour
    ).fillna(0)
    candidates['time_gap_hours'] = (
        (reference_ts - candidates['item_last_click_ts']) / ms_per_hour
    ).fillna(0)

    # Embedding rows of the candidate and of the user's last click; NaN when
    # an article has no embedding.
    cand_emb_rows = candidates['article_id'].map(id2idx)
    last_emb_rows = candidates['last_click_article_id'].map(id2idx)
    has_both = (cand_emb_rows.notna() & last_emb_rows.notna()).to_numpy()

    # Row-wise dot product of the normalised vectors; pairs with a missing
    # embedding keep similarity 0.
    cand_vectors = emb_matrix[cand_emb_rows[has_both].astype(int).to_numpy()]
    last_vectors = emb_matrix[last_emb_rows[has_both].astype(int).to_numpy()]
    sim = np.zeros(len(candidates), dtype='float32')
    sim[has_both] = (cand_vectors * last_vectors).sum(axis=1)
    candidates['emb_sim_last'] = sim

    # Same missing-value handling as at training time, then score with the ranker.
    model_inputs = candidates[feature_cols].fillna(0)
    candidates[feature_cols] = model_inputs
    candidates['pred'] = model.predict(candidates[feature_cols])

    # Keep the topk highest-scored candidates of every user.
    topk = 5
    ranked = candidates.sort_values(['user_id', 'pred'], ascending=[True, False])
    recs = ranked.groupby('user_id').head(topk)

    # Spread each user's ranked list into article_1 .. article_{topk} columns;
    # users with fewer than topk candidates are padded with -1.
    submit = recs.groupby('user_id')['article_id'].apply(list).reset_index()
    for i in range(topk):
        submit[f'article_{i+1}'] = submit['article_id'].apply(
            lambda x: x[i] if i < len(x) else -1
        )
    submit = submit.drop(columns=['article_id'])
    submit.to_csv(PROJECT_PATH / 'submission.csv', index=False)

    # A bare expression nested inside `if` is not rendered by Jupyter, so the
    # preview has to be displayed explicitly.
    display(submit.head())
