## 1 - 加载数据集并转换为 Torch Dataset

In [None]:
from pathlib import Path
from src.data.load import load_dataset
from src.data.utils import load_compressed, save_compressed
from src.data.post_sequence import build_user_historical_sequences, PostSequenceDataset

# Load the raw dataset once; the cells in this notebook index it by split
# name ('train' here, 'valid' further down).
raw_data = load_dataset()
def load_histories(dataset_path='./data/histories.pkl.gz'):
    """Return the per-user historical sequences, using an on-disk cache.

    If ``dataset_path`` already exists it is read back with
    ``load_compressed``. Otherwise the sequences are built from the
    module-level ``raw_data['train']``, written to ``dataset_path`` with
    ``save_compressed`` and returned.

    Args:
        dataset_path: Location of the compressed cache file.

    Returns:
        Whatever ``build_user_historical_sequences`` produced (freshly built
        or restored from the cache).
    """
    cache_file = Path(dataset_path)
    if not cache_file.exists():
        # Cache miss: build from the training split and persist for next run.
        histories = build_user_historical_sequences(raw_data['train'])
        save_compressed(histories, dataset_path)
        return histories
    return load_compressed(dataset_path)
# Fetch the (possibly cached) user histories and wrap them in the dataset
# class used for training.
user_histories = load_histories()
training_dataset = PostSequenceDataset(user_histories)
# Trailing expression so the notebook displays the number of samples.
len(training_dataset)

## 2 - 创建模型和相关参数

In [None]:
from src.models.recurrent import RNN
from src.trainer import Trainer, TrainingArguments

# Input width is read from the last dimension of the first element of the
# first training sample.
feature_size = training_dataset[0][0].shape[-1]
seq_reg_model = RNN(feature_size, hidden_size=16)

# Hyper-parameters for the training run.
training_args = TrainingArguments(
    epochs=30,
    batch_size=128,
    learning_rate=0.05,
    lr_gamma=0.9,
)

# NOTE(review): the two None positionals are presumably the validation and
# test datasets -- confirm against the Trainer signature.
trainer = Trainer(training_dataset, None, None, seq_reg_model, training_args)

## 3 - 训练循环网络和预测器

In [None]:
# Run training with the Trainer built in the previous cell (it was given
# training_dataset and seq_reg_model).
trainer.train()

## 4 - 利用循环网络提取用户特征

In [None]:
import torch
from tqdm import tqdm

# Maps each user's uid to the flattened final hidden state the model produces
# for that user's history.
user_feature: dict[str, torch.FloatTensor] = {}
seq_reg_model.to('cpu')
seq_reg_model.eval()
# Pure inference: without no_grad every stored hidden state would keep its
# autograd graph (and all intermediate activations) alive, wasting memory and
# time for tensors that are only ever used as fixed features afterwards.
with torch.no_grad():
    for user_sample in tqdm(user_histories):
        # Not yet clear how to batch/parallelize this; expect roughly 1 minute.
        # unsqueeze(1) inserts a size-1 axis at dim 1 (presumably the batch
        # axis of a sequence-first layout -- TODO confirm against RNN.forward).
        x = user_sample['x_tensor'].unsqueeze(1)
        x_len = user_sample['x_len']
        y = user_sample['y'].unsqueeze(1)
        model_outputs = seq_reg_model(x, x_len, y)
        # Flatten to 1-D and copy so the entry does not alias model buffers.
        user_feature[user_sample['uid']] = model_outputs.last_hidden_state.view(-1).clone()

## 5 - 在验证集和测试集上进行回归预测

In [None]:
from src.data.features import extract_features
from src.data.process import exp_targets, extract_targets

# Derive features and regression targets from the validation split.
# NOTE(review): valid_set_feature is not referenced by the cells visible
# here; the meaning of the None / 'linear' arguments is defined by the
# src.data helpers -- confirm there.
valid_set_feature = extract_features(raw_data['valid'], None)
valid_set_targets = extract_targets(raw_data['valid'], 'linear')

In [None]:
import numpy as np
# Predict one validation row at a time, seeding the model with the hidden
# state extracted for that row's user (or zeros when the user is unknown).
all_pred = []
for _, post in raw_data['valid'].iterrows():
    uid = post['uid']
    if uid not in user_feature:
        # No stored state for this user: start from an all-zero hidden state.
        hidden = torch.zeros((1, seq_reg_model.hidden_size))
    else:
        hidden = user_feature[uid].unsqueeze(0)

    # Model input is the content features followed by the datetime features,
    # with a leading size-1 axis added.
    feature_values = post['feature_content'].tolist() + post['feature_datetime'].tolist()
    x_in = torch.Tensor(feature_values).unsqueeze(0)

    with torch.no_grad():
        model_out = seq_reg_model(x=x_in, last_hidden=hidden)
    all_pred.append(model_out.logits.numpy())

# Stack the per-row outputs along axis 0 and pass them through exp_targets.
predicts = exp_targets(np.concatenate(all_pred, axis=0))

In [None]:
from src.metric import compute_metrics

# Compare `predicts` (the output of exp_targets) with the targets extracted
# from the validation split; as the cell's last expression, the result is
# displayed by the notebook.
compute_metrics(predicts, valid_set_targets)

## 