# 特征工程（增强版）

包含以下特征类型：
1. **用户特征** - 点击统计、活跃度、类别偏好、时间习惯
2. **物品特征** - 热度、新鲜度、类别、字数
3. **交互特征** - Last-N 相似度、时间差、字数差
4. **召回特征** - 召回分数、召回排名

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

from funrec.utils import load_env_with_fallback

def _env_path(var_name):
    """Return the value of environment variable ``var_name`` as a ``Path``.

    Raises:
        RuntimeError: if the variable is not set. Without this check
            ``Path(None)`` fails with a ``TypeError`` that does not say
            which variable is missing.
    """
    raw_value = os.getenv(var_name)
    if raw_value is None:
        raise RuntimeError(
            f'Environment variable {var_name} is not set; '
            'define it before running this notebook.'
        )
    return Path(raw_value)


# Project helper; presumably populates the FUNREC_* variables read below
# (its implementation is not visible here -- confirm in funrec.utils).
load_env_with_fallback()
RAW_DATA_PATH = _env_path('FUNREC_RAW_DATA_PATH')
PROCESSED_DATA_PATH = _env_path('FUNREC_PROCESSED_DATA_PATH')

# Prefer <raw>/dataset/news_recommendation; fall back to
# <raw>/news_recommendation when the former does not exist.
DATA_PATH = RAW_DATA_PATH / 'dataset' / 'news_recommendation'
if not DATA_PATH.exists():
    DATA_PATH = RAW_DATA_PATH / 'news_recommendation'

# Directory holding this project's intermediate artifacts (read and written below).
PROJECT_PATH = PROCESSED_DATA_PATH / 'projects' / 'news_recommendation_system'


In [2]:
# Pickled frames from PROJECT_PATH: click history, held-out last clicks,
# and recall candidates. NOTE: pickle files must come from a trusted source.
train_hist, valid_last, recall_df = (
    pd.read_pickle(PROJECT_PATH / pkl_name)
    for pkl_name in ('train_hist.pkl', 'valid_last.pkl', 'recall_candidates.pkl')
)

# Raw article metadata and article embeddings from DATA_PATH.
articles, article_emb = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in ('articles.csv', 'articles_emb.csv')
)


In [3]:
def _most_frequent(values):
    """Return the most frequent non-null value in ``values``, or NaN if none exists."""
    counts = values.value_counts()
    # value_counts() drops nulls, so `counts` is empty when every value is
    # missing (e.g. none of a user's clicked articles matched a row in
    # `articles`); idxmax() on an empty Series raises ValueError.
    return counts.idxmax() if not counts.empty else np.nan


# Per-user click statistics computed from the training history.
user_click_count = train_hist.groupby('user_id').size().rename('user_click_count')
user_unique_items = (
    train_hist.groupby('user_id')['click_article_id'].nunique().rename('user_unique_items')
)
user_last_click_ts = train_hist.groupby('user_id')['click_timestamp'].max().rename('user_last_click_ts')

# Attach article metadata to every click so clicks can be counted per category.
click_with_cat = train_hist.merge(
    articles, left_on='click_article_id', right_on='article_id', how='left'
)
# Category each user clicked most often (NaN when no category is known).
user_top_category = (
    click_with_cat.groupby('user_id')['category_id']
    .agg(_most_frequent)
    .rename('user_top_category')
)

# One row per user; the four Series are aligned on their shared user_id index.
user_features = pd.concat(
    [user_click_count, user_unique_items, user_last_click_ts, user_top_category],
    axis=1,
).reset_index()

user_features.head()


Unnamed: 0,user_id,user_click_count,user_unique_items,user_last_click_ts,user_top_category
0,0,1,1,1508211672520,26
1,1,1,1,1508211316889,418
2,2,1,1,1508211438695,43
3,3,1,1,1508211359672,99
4,4,1,1,1508211625466,67


In [4]:
# Per-article click statistics from the training history.
clicks_by_item = train_hist.groupby('click_article_id')
item_click_count = clicks_by_item.size().rename('item_click_count')
item_last_click_ts = clicks_by_item['click_timestamp'].max().rename('item_last_click_ts')

# Left-join the statistics onto the article metadata: every article keeps a
# row, including articles that never appear in train_hist.
item_features = articles.join(item_click_count, on='article_id', how='left').join(
    item_last_click_ts, on='article_id', how='left'
)

# Articles without any click get 0 for both statistics.
for stat_col in ('item_click_count', 'item_last_click_ts'):
    item_features[stat_col] = item_features[stat_col].fillna(0)

item_features.head()


Unnamed: 0,article_id,category_id,created_at_ts,words_count,item_click_count,item_last_click_ts
0,0,0,1513144419000,168,0.0,0.0
1,1,1,1405341936000,189,0.0,0.0
2,2,1,1408667706000,250,0.0,0.0
3,3,1,1408468313000,230,0.0,0.0
4,4,1,1407071171000,162,0.0,0.0


In [5]:
# Order clicks by user and time so that the final row of each user's group
# is that user's most recent click.
clicks_in_time_order = train_hist.sort_values(['user_id', 'click_timestamp'])
latest_click_rows = clicks_in_time_order.groupby('user_id').tail(1)

# Keep only the identifying columns and prefix them with "last_click_".
last_click_names = {
    'click_article_id': 'last_click_article_id',
    'click_timestamp': 'last_click_timestamp',
}
user_last_click = latest_click_rows[
    ['user_id', 'click_article_id', 'click_timestamp']
].rename(columns=last_click_names)

user_last_click.head()


Unnamed: 0,user_id,last_click_article_id,last_click_timestamp
0,0,30760,1508211672520
1,1,289197,1508211316889
2,2,36162,1508211438695
3,3,50644,1508211359672
4,4,42567,1508211625466


In [6]:
MAX_CANDIDATES = 100

# Rank each user's recalled articles by descending recall_score;
# method='first' breaks ties by row order, so ranks within a user are distinct.
recall_df['recall_rank'] = recall_df.groupby('user_id')['recall_score'].rank(
    method='first', ascending=False
)
# Keep at most MAX_CANDIDATES candidates per user.
recall_df = recall_df.loc[recall_df['recall_rank'] <= MAX_CANDIDATES]

# Attach user features, the user's last click, and item features to every
# (user, candidate article) pair.
candidates = recall_df.merge(user_features, how='left', on='user_id')
candidates = candidates.merge(user_last_click, how='left', on='user_id')
candidates = candidates.merge(item_features, how='left', on='article_id')

# 1 when the candidate's category equals the user's most-clicked category.
candidates['is_same_category'] = (
    candidates['category_id'].eq(candidates['user_top_category']).astype(int)
)

# Timestamps are divided by 3600_000 to express differences in hours
# (i.e. they are treated as milliseconds).
ms_per_hour = 3600_000
last_click_ts = candidates['last_click_timestamp']

# Article age at the time of the user's last click; missing values become 0.
candidates['item_age_hours'] = (
    (last_click_ts - candidates['created_at_ts']) / ms_per_hour
).fillna(0)

# Gap between the user's last click and the article's last recorded click;
# missing values become 0.
# NOTE(review): item_features stores item_last_click_ts = 0 for articles with
# no clicks in train_hist, so for those rows this value is the raw
# last_click_timestamp in hours rather than a real gap -- confirm this is
# intended before changing it, since other notebooks may rebuild this feature.
candidates['time_gap_hours'] = (
    (last_click_ts - candidates['item_last_click_ts']) / ms_per_hour
).fillna(0)


In [7]:
# Build a row-normalised float32 embedding matrix from the emb_* columns.
emb_cols = [col for col in article_emb.columns if col.startswith('emb_')]
emb_matrix = article_emb[emb_cols].to_numpy().astype('float32')
row_norms = np.linalg.norm(emb_matrix, axis=1, keepdims=True)
emb_matrix = emb_matrix / (row_norms + 1e-12)  # epsilon guards all-zero rows

# article_id -> row position in emb_matrix.
article_ids = article_emb['article_id'].values
id2idx = dict(zip(article_ids, range(len(article_ids))))

# Row positions of the candidate article and of the user's last-clicked
# article; NaN where the id has no embedding row.
cand_idx = candidates['article_id'].map(id2idx)
last_idx = candidates['last_click_article_id'].map(id2idx)
mask = cand_idx.notna() & last_idx.notna()

# Dot product of the two normalised vectors (cosine similarity) for rows where
# both embeddings exist; every other row keeps the default 0.
cand_vecs = emb_matrix[cand_idx[mask].astype(int).to_numpy()]
last_vecs = emb_matrix[last_idx[mask].astype(int).to_numpy()]
sim = np.zeros(len(candidates), dtype='float32')
sim[mask.to_numpy()] = (cand_vecs * last_vecs).sum(axis=1)

candidates['emb_sim_last'] = sim


In [8]:
# Held-out click per user, renamed so it does not collide with candidate columns.
target = valid_last.loc[:, ['user_id', 'click_article_id']].rename(
    columns={'click_article_id': 'target_article_id'}
)
candidates = pd.merge(candidates, target, how='left', on='user_id')

# Positive label when the candidate is the user's held-out article; users
# without a target row get NaN there and therefore label 0.
# NOTE(review): target_article_id remains a column of `candidates` -- make sure
# it is excluded from model features downstream to avoid label leakage.
candidates['label'] = (
    candidates['article_id'].eq(candidates['target_article_id']).astype(int)
)

candidates['label'].value_counts()


label
0    992135
1      7865
Name: count, dtype: int64

In [9]:
# Persist the ranking training set and the feature tables under PROJECT_PATH.
outputs = {
    'rank_train.pkl': candidates,
    'user_features.pkl': user_features,
    'item_features.pkl': item_features,
    'user_last_click.pkl': user_last_click,
}
for pkl_name, frame in outputs.items():
    frame.to_pickle(PROJECT_PATH / pkl_name)
