In [1]:
import numpy as np
import pandas as pd
import torch

for m in [np, pd, torch]:
    print(m.__name__, m.__version__)

from torch import nn, optim
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from funcs.factor_utils import FactorUtil

import warnings
warnings.filterwarnings('ignore')

numpy 1.26.4
pandas 2.2.3
torch 2.2.2


In [2]:
# -----------------------------
# 1. 读取数据集
# -----------------------------
bar_df = FactorUtil.get_bar_df('FG2409_MIN_1', '20240813', batch_size=364)
bar_df.columns

Index(['O', 'H', 'L', 'C', 'V', 'AP', 'AV', 'BP', 'BV'], dtype='object')

In [3]:
bar_df.shape

(364, 9)

In [4]:
# 0-5行数据集，最后一行第3列
bar_df.values[0:5][-1,3]

1280.0

In [5]:
# 第5行第3列
bar_df.values[4][3]

1280.0

In [6]:
seq_len = 5
returns = []

for i in range(seq_len, len(bar_df) - 1):
    c_start = bar_df.iloc[i - seq_len]['C']      # 第一个时间步的 C
    c_end = bar_df.iloc[i + 1]['C']              # 第六个时间步的 C
    returns.append((c_end - c_start) / c_start)  # 


In [7]:
# 总行数
len(returns)

358

In [8]:
df_returns = bar_df.iloc[seq_len:-1].copy()
df_returns['return'] = returns

In [9]:
# 分桶
bins = [-np.inf, -0.003, -0.001, 0.001, 0.003, np.inf]
df_returns['label'] = pd.cut(df_returns['return'], bins=bins, labels=[0, 1, 2, 3, 4]).astype(int)

In [10]:
print(df_returns['label'].value_counts().sort_index())

label
0     75
1     66
2    131
3     53
4     33
Name: count, dtype: int64


In [11]:
df_returns.head()

Unnamed: 0_level_0,O,H,L,C,V,AP,AV,BP,BV,return,label
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-08-13 09:05:00,1280.0,1282.0,1280.0,1282.0,1451,1281.261,61719,1280.326,56268,0.004702,4
2024-08-13 09:06:00,1282.0,1282.0,1280.0,1282.0,2363,1281.779,69101,1280.77,80040,0.00313,4
2024-08-13 09:07:00,1281.0,1283.0,1281.0,1282.0,2462,1282.058,134440,1281.035,87557,0.004695,4
2024-08-13 09:08:00,1282.0,1285.0,1282.0,1284.0,5782,1284.238,45260,1283.091,62190,0.001561,3
2024-08-13 09:09:00,1284.0,1284.0,1283.0,1283.0,1575,1284.026,55508,1283.003,124716,0.002344,3


In [12]:
df_returns['return'].describe()

count    358.000000
mean      -0.000562
std        0.002703
min       -0.007223
25%       -0.002370
50%       -0.000779
75%        0.000797
max        0.008554
Name: return, dtype: float64

In [69]:
class TimeSeriesDataset(Dataset):
    def __init__(self, df, feature_cols, seq_len=5):
        self.df = df
        
        self.feature_cols = feature_cols
        self.seq_len = seq_len
        
        self.features = df[feature_cols].values
        self.labels = self._create_labels()

    def __len__(self):
        # 总长度 - 时间步长 - 1 (去掉首尾)
        return len(self.features) - self.seq_len - 1

    def __getitem__(self, idx):
        # 特征列 当前索引到当前索引+时间步长 的矩阵
        X = self.features[idx: idx + self.seq_len]
        # 标签列 当前索引的标签 (为特征列第一行/下一个时间步长快第一行)
        y = self.labels[idx]
        return torch.tensor(X, dtype=torch.float), torch.tensor(y, dtype=torch.long)

    def _create_labels(self):
        labels = []
        for i in range(len(self.features) - self.seq_len - 1):
            # 第一行
            c_start = self.features[i][0]
            # 第六行
            c_end = self.features[i + self.seq_len + 1][0]
            c_delta = (c_end - c_start) / c_start
            labels.append(self._get_label(c_delta))
        return labels

    def _get_label(self, delta):
        if delta < -0.003:
            return 0
        if delta >= -0.003 and delta < -0.001:
            return 1
        if delta >= -0.001 and delta <= 0.001:
            return 2
        if delta > 0.001 and delta <= 0.003:
            return 3
        if delta > 0.003:
            return 4

In [70]:
feature_cols = bar_df.columns.to_list()

In [71]:
# 检查数据集
dataset = TimeSeriesDataset(df_returns, feature_cols)
dataloader = DataLoader(dataset, batch_size=5, shuffle=False)

for i, (x_batch, y_batch) in enumerate(dataloader):
    print(f"Batch {i+1}:", "-" * 30)
    print("X shape:", x_batch.shape)
    print("y shape:", y_batch.shape)
    print("Sample X:\n", x_batch)
    print("Sample y:", y_batch)
    if i >= 0:
        break

Batch 1: ------------------------------
X shape: torch.Size([5, 5, 9])
y shape: torch.Size([5])
Sample X:
 tensor([[[  1280.0000,   1282.0000,   1280.0000,   1282.0000,   1451.0000,
            1281.2610,  61719.0000,   1280.3260,  56268.0000],
         [  1282.0000,   1282.0000,   1280.0000,   1282.0000,   2363.0000,
            1281.7791,  69101.0000,   1280.7700,  80040.0000],
         [  1281.0000,   1283.0000,   1281.0000,   1282.0000,   2462.0000,
            1282.0580, 134440.0000,   1281.0350,  87557.0000],
         [  1282.0000,   1285.0000,   1282.0000,   1284.0000,   5782.0000,
            1284.2380,  45260.0000,   1283.0909,  62190.0000],
         [  1284.0000,   1284.0000,   1283.0000,   1283.0000,   1575.0000,
            1284.0260,  55508.0000,   1283.0031, 124716.0000]],

        [[  1282.0000,   1282.0000,   1280.0000,   1282.0000,   2363.0000,
            1281.7791,  69101.0000,   1280.7700,  80040.0000],
         [  1281.0000,   1283.0000,   1281.0000,   1282.0000,  

In [72]:
len(dataset)

352

In [73]:
# -----------------------------
# 2. 拆分数据集
# -----------------------------
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
# 不打乱顺序
train_loader = DataLoader(train_dataset, batch_size=5, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=5, shuffle=False)

In [74]:
len(train_dataset), len(val_dataset)

(281, 71)

In [75]:
# 第0批次中的特征列形状
train_dataset[0][0].shape, train_dataset[0][0][0].shape

(torch.Size([5, 9]), torch.Size([9]))

In [76]:
# 第0批次中的标签列形状 - 为一个标量
train_dataset[0][1].shape

torch.Size([])

In [81]:
# -----------------------------
# 2. 位置编码（Positional Encoding）
# -----------------------------
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=500):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

# -----------------------------
# 3. 模型搭建
# -----------------------------
class TimeSeriesTransformerClassifier(nn.Module):
    def __init__(self, seq_len, input_dim, model_dim, num_heads, num_layers, num_classes, dropout=0.5):
        super().__init__()
        self.model_dim = model_dim

        # 输入映射层
        self.input_map = nn.Linear(input_dim, model_dim)

        # 位置编码
        self.pos_encoder = PositionalEncoding(model_dim, dropout)

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, dim_feedforward=2 * model_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # 分类头
        self.classifier = nn.Sequential(
            nn.Linear(model_dim * seq_len, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes)
        )

    def forward(self, src):
        # src shape: (batch_size, seq_len, input_dim)
        # print(f"in forward: src.shape = {src.dtype}")
        src = self.input_map(src)  # (batch_size, seq_len, model_dim)
        src = src.permute(1, 0, 2)  # (seq_len, batch_size, model_dim)
        
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)  # (seq_len, batch_size, model_dim)
        
        output = output.permute(1, 0, 2)  # (batch_size, seq_len, model_dim)
        output = output.reshape(output.size(0), -1)  # (batch_size, seq_len * model_dim)
        
        logits = self.classifier(output)  # (batch_size, num_classes)
        
        return logits


In [85]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -----------------------------
# 4. 训练函数
# -----------------------------
def train_model(model, train_loader, criterion, optimizer, epochs=20):
    model.to(device)
    best_acc = 0.0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        all_preds, all_labels = [], []

        for inputs, labels in train_loader:
            # inputs 的维度顺序为 [B,S,M]
            # B - batch_size
            # S - seq_len
            # M - model_dim
            inputs, labels = inputs.to(device), labels.to(device)
            # print(f"when train: inputs.shape = {inputs.shape}, labels.shape = {labels.shape}")

            # 需要转换 input 得维度 - 维度重排 [S,B,M]
            # inputs = inputs.permute(1, 0, 2)
            # print(inputs.shape, labels.shape)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        acc = accuracy_score(all_labels, all_preds)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {acc*100:.2f}%")

# -----------------------------
# 4. 验证函数
# -----------------------------        
def eval_model(model, val_loader):
    # 验证
    model.eval()
    with torch.no_grad():
        val_preds, val_labels = [], []
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
        val_acc = accuracy_score(val_labels, val_preds)
        print(f"Validation Accuracy: {val_acc*100:.2f}%\n")

In [86]:
seq_len = 5            # 时间序列长度
input_dim = 9          # 每个时间步的特征维度
num_classes = 5        # 分类数（0,1,2,3,4）

# 初始化模型
model = TimeSeriesTransformerClassifier(
    seq_len=seq_len,
    input_dim=input_dim,
    model_dim=64,
    num_heads=4,
    num_layers=2,
    num_classes=num_classes
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [88]:
# 开始训练
train_model(model, train_loader, criterion, optimizer, epochs=20)

Epoch 1/20, Loss: 90.5212, Accuracy: 28.83%
Epoch 2/20, Loss: 87.1765, Accuracy: 31.32%
Epoch 3/20, Loss: 88.6485, Accuracy: 37.37%
Epoch 4/20, Loss: 88.6165, Accuracy: 32.38%
Epoch 5/20, Loss: 88.4407, Accuracy: 33.45%
Epoch 6/20, Loss: 87.1830, Accuracy: 36.30%
Epoch 7/20, Loss: 86.5914, Accuracy: 36.30%
Epoch 8/20, Loss: 86.2741, Accuracy: 35.59%
Epoch 9/20, Loss: 87.3806, Accuracy: 37.37%
Epoch 10/20, Loss: 86.4071, Accuracy: 34.52%
Epoch 11/20, Loss: 86.5005, Accuracy: 37.01%
Epoch 12/20, Loss: 84.2455, Accuracy: 37.72%
Epoch 13/20, Loss: 85.7422, Accuracy: 37.37%
Epoch 14/20, Loss: 85.1909, Accuracy: 37.72%
Epoch 15/20, Loss: 85.0313, Accuracy: 35.94%
Epoch 16/20, Loss: 84.8671, Accuracy: 38.08%
Epoch 17/20, Loss: 84.7494, Accuracy: 37.37%
Epoch 18/20, Loss: 85.1331, Accuracy: 34.16%
Epoch 19/20, Loss: 87.5351, Accuracy: 37.01%
Epoch 20/20, Loss: 85.0583, Accuracy: 37.37%
