# Baseline：离线划分与热门召回

将每个用户的最后一次点击留作验证目标，其余点击作为训练历史；在此离线划分上构造一个简单的热门召回基线，并用 HitRate@20 进行评估。


In [1]:
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from funrec.utils import load_env_with_fallback

# NOTE(review): presumably loads FUNREC_* variables from an env file — confirm in funrec.utils.
load_env_with_fallback()

# os.getenv returns None for an unset variable and Path(None) fails with an
# opaque TypeError, so fail early and name the variable(s) that are missing.
_missing_env = [
    name
    for name in ('FUNREC_RAW_DATA_PATH', 'FUNREC_PROCESSED_DATA_PATH')
    if os.getenv(name) is None
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

RAW_DATA_PATH = Path(os.environ['FUNREC_RAW_DATA_PATH'])
PROCESSED_DATA_PATH = Path(os.environ['FUNREC_PROCESSED_DATA_PATH'])

# Prefer <raw>/dataset/news_recommendation; fall back to <raw>/news_recommendation
# when the first layout is not present on disk.
DATA_PATH = RAW_DATA_PATH / 'dataset' / 'news_recommendation'
if not DATA_PATH.exists():
    DATA_PATH = RAW_DATA_PATH / 'news_recommendation'

# Output directory for the artifacts written by later cells; created if absent.
PROJECT_PATH = PROCESSED_DATA_PATH / 'projects' / 'news_recommendation_system'
PROJECT_PATH.mkdir(parents=True, exist_ok=True)


In [2]:
# ==================== Configuration ====================
# DEBUG = False -> use the full click log (about 200k users, takes longer).
# DEBUG = True  -> use a random sample of MAX_USERS users (fast debugging).

DEBUG = False  # set to True to run on a user sample instead of the full data
MAX_USERS = 30000  # number of users sampled when DEBUG is True
RANDOM_SEED = 42

# Build the label outside the print call: a plain string literal nested inside
# an f-string expression is not itself formatted, so "{MAX_USERS}" would be
# printed verbatim instead of the configured number.
run_mode = f"调试模式 (采样 {MAX_USERS} 用户)" if DEBUG else "全量模式 (所有用户)"

print("=" * 50)
print(f"运行模式: {run_mode}")
print("=" * 50)

# Load the raw training click log from the dataset directory.
train_click = pd.read_csv(DATA_PATH / 'train_click_log.csv')
print(f"原始数据: {len(train_click)} 条点击, {train_click['user_id'].nunique()} 用户")

if DEBUG:
    # Seeded generator so the sampled user set is the same on every run.
    rng = np.random.default_rng(RANDOM_SEED)
    users = train_click['user_id'].unique()
    # Only sample when there are more users than requested; otherwise keep all.
    if MAX_USERS < len(users):
        sample_users = rng.choice(users, size=MAX_USERS, replace=False)
        # Keep every click of the sampled users (sampling is per user, not per row).
        train_click = train_click[train_click['user_id'].isin(sample_users)]
    print(f"采样后: {len(train_click)} 条点击, {train_click['user_id'].nunique()} 用户")

# Preview the first rows as the cell output.
train_click.head()

运行模式: 全量模式 (所有用户)
原始数据: 1112623 条点击, 200000 用户


Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,199999,160417,1507029570190,4,1,17,1,13,1
1,199999,5408,1507029571478,4,1,17,1,13,1
2,199999,50823,1507029601478,4,1,17,1,13,1
3,199998,157770,1507029532200,4,1,17,1,25,5
4,199998,96613,1507029671831,4,1,17,1,25,5


In [3]:
def build_offline_split(click_df):
    """Split a click log into per-user history and a held-out last click.

    Rows are ordered by ``user_id`` and then ``click_timestamp``. The final
    row of each user becomes that user's validation target; all earlier rows
    form the training history. Users with no remaining history (a single
    click) are removed from the validation targets.

    Args:
        click_df: DataFrame with at least ``user_id`` and ``click_timestamp``
            columns. It is not modified.

    Returns:
        Tuple ``(hist, last_click)`` of DataFrames, both sorted by user and
        timestamp and both carrying a fresh 0..n-1 index.
    """
    # Discard the incoming index after sorting: the held-out rows are removed
    # by index label below, and duplicate labels (e.g. from concatenated logs)
    # would make drop() delete history rows as well.
    ordered = click_df.sort_values(['user_id', 'click_timestamp']).reset_index(drop=True)

    last_click = ordered.groupby('user_id').tail(1)
    hist = ordered.drop(last_click.index)

    # A validation target is only usable when the user still has history.
    users_with_hist = hist['user_id'].unique()
    last_click = last_click[last_click['user_id'].isin(users_with_hist)]

    return hist.reset_index(drop=True), last_click.reset_index(drop=True)

# Hold out each user's last click for validation; earlier clicks become the history.
train_hist, valid_last = build_offline_split(train_click)

# Row counts of the history and held-out frames, shown as the cell output.
len(train_hist), len(valid_last)


(912623, 200000)

In [4]:
# Order the history by user and click time so each per-user list is chronological.
train_hist = train_hist.sort_values(by=['user_id', 'click_timestamp'])

# user_id -> list of clicked article ids, in click order.
_clicks_by_user = train_hist.groupby('user_id')['click_article_id']
user_hist = _clicks_by_user.apply(list).to_dict()

# user_id -> held-out (last) clicked article id.
valid_last_map = dict(
    zip(valid_last['user_id'], valid_last['click_article_id'])
)

# Persist the two DataFrames with pandas' own pickle writer.
for _frame, _frame_file in (
    (train_hist, 'train_hist.pkl'),
    (valid_last, 'valid_last.pkl'),
):
    _frame.to_pickle(PROJECT_PATH / _frame_file)

# Persist the two lookup dicts with the standard pickle module.
for _mapping, _mapping_file in (
    (user_hist, 'user_hist.pkl'),
    (valid_last_map, 'valid_last_map.pkl'),
):
    with open(PROJECT_PATH / _mapping_file, 'wb') as f:
        pickle.dump(_mapping, f)


In [5]:
# Number of recommendations per user used by the evaluation at the end of this cell.
topk = 20
# Every article id seen in the training history, ordered by click count
# (value_counts sorts by descending frequency by default). The list is not
# truncated to topk; truncation happens in popular_recall.
popular_items = train_hist['click_article_id'].value_counts().index.tolist()

# Persist the popularity ranking alongside the other project artifacts.
with open(PROJECT_PATH / 'popular_items.pkl', 'wb') as f:
    pickle.dump(popular_items, f)

def popular_recall(hist_items, k, candidates=None):
    """Return up to ``k`` popular items the user has not already clicked.

    Args:
        hist_items: Iterable of item ids the user has already clicked; these
            are excluded from the result.
        k: Maximum number of items to return. Values <= 0 yield an empty list.
        candidates: Optional iterable of item ids in ranking order. Defaults
            to the notebook-level ``popular_items`` list.

    Returns:
        List of at most ``k`` item ids, in the order they appear in
        ``candidates``.
    """
    if candidates is None:
        candidates = popular_items
    # Guard first: the length check in the loop runs after an append, so
    # without this a non-positive k would still return one item.
    if k <= 0:
        return []

    seen = set(hist_items)
    recs = []
    for item in candidates:
        if item in seen:
            continue
        recs.append(item)
        # Stop scanning the ranking as soon as enough items are collected.
        if len(recs) >= k:
            break
    return recs

def evaluate_hit_rate(user_hist, target_map, k=20):
    """Compute the fraction of users whose target is among their top-``k`` recalls.

    Args:
        user_hist: Mapping of user id to the list of items in that user's
            history; users missing from it are treated as having no history.
        target_map: Mapping of user id to the held-out target item.
        k: Number of recommendations generated per user.

    Returns:
        Hits divided by the number of evaluated users, or 0.0 when
        ``target_map`` yields no users.
    """
    # One boolean per user: was the held-out item recalled?
    outcomes = [
        target in popular_recall(user_hist.get(user, []), k)
        for user, target in target_map.items()
    ]
    if not outcomes:
        return 0.0
    return sum(outcomes) / len(outcomes)

# Hit rate of the popularity baseline at topk on the held-out last clicks.
evaluate_hit_rate(user_hist, valid_last_map, k=topk)


0.14989