In [1]:
import numpy as np
import pandas as pd
import torch

for m in [np, pd, torch]:
    print(m.__name__, m.__version__)

from torch import nn, optim
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from funcs.factor_utils import FactorUtil

import warnings
warnings.filterwarnings('ignore')

numpy 1.26.4
pandas 2.2.3
torch 2.2.2


In [2]:
# -----------------------------
# 1. 读取数据集
# -----------------------------
bar_df = FactorUtil.get_bar_df('FG2409_MIN_1', '20240813', batch_size=364)
bar_df.columns

Index(['O', 'H', 'L', 'C', 'V', 'AP', 'AV', 'BP', 'BV'], dtype='object')

In [3]:
bar_df.shape

(364, 9)

In [4]:
bar_df.values[0:5][-1,3]

1280.0

In [5]:
window_size = 5
returns = []

for i in range(window_size, len(bar_df)):
    c_start = bar_df.iloc[i - window_size]['C']  # 第一个时间步的 C
    c_end = bar_df.iloc[i]['C']                  # 第五个时间步的 C
    returns.append((c_end - c_start) / c_start)


In [6]:
len(returns)

359

In [7]:
df_returns = bar_df.iloc[window_size:].copy()
df_returns['return'] = returns

In [8]:
bins = [-np.inf, -0.003, -0.001, 0.001, 0.003, np.inf]
df_returns['label'] = pd.cut(df_returns['return'], bins=bins, labels=[0, 1, 2, 3, 4]).astype(int)

In [9]:
print(df_returns['label'].value_counts().sort_index())

label
0     64
1     72
2    138
3     55
4     30
Name: count, dtype: int64


In [10]:
df_returns.head()

Unnamed: 0_level_0,O,H,L,C,V,AP,AV,BP,BV,return,label
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-08-13 09:05:00,1280.0,1282.0,1280.0,1282.0,1451,1281.261,61719,1280.326,56268,0.004702,4
2024-08-13 09:06:00,1282.0,1282.0,1280.0,1282.0,2363,1281.779,69101,1280.77,80040,0.00313,4
2024-08-13 09:07:00,1281.0,1283.0,1281.0,1282.0,2462,1282.058,134440,1281.035,87557,0.00313,4
2024-08-13 09:08:00,1282.0,1285.0,1282.0,1284.0,5782,1284.238,45260,1283.091,62190,0.002342,3
2024-08-13 09:09:00,1284.0,1284.0,1283.0,1283.0,1575,1284.026,55508,1283.003,124716,0.002344,3


In [11]:
df_returns['return'].describe()

count    359.000000
mean      -0.000455
std        0.002506
min       -0.007223
25%       -0.002330
50%       -0.000779
75%        0.000798
max        0.008143
Name: return, dtype: float64

In [12]:
class TimeSeriesDataset(Dataset):
    def __init__(self, df, feature_cols, label_col='label', window_size=5):
        self.df = df
        self.feature_cols = feature_cols
        self.label_col = label_col
        self.window_size = window_size

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        features = row[self.feature_cols].values.astype(np.float32)
        label = row[self.label_col]
        return torch.tensor(features), torch.tensor(label, dtype=torch.long)

In [13]:
feature_cols = bar_df.columns.to_list()

In [14]:
dataset = TimeSeriesDataset(df_returns, feature_cols)
dataloader = DataLoader(dataset, batch_size=5, shuffle=False)


In [15]:
for i, (x_batch, y_batch) in enumerate(dataloader):
    print(f"Batch {i+1}:")
    print("X shape:", x_batch.shape)
    print("y shape:", y_batch.shape)
    print("Sample X:\n", x_batch[0])
    print("Sample y:", y_batch[0].item())
    if i >= 1:
        break

Batch 1:
X shape: torch.Size([5, 9])
y shape: torch.Size([5])
Sample X:
 tensor([ 1280.0000,  1282.0000,  1280.0000,  1282.0000,  1451.0000,  1281.2610,
        61719.0000,  1280.3260, 56268.0000])
Sample y: 4
Batch 2:
X shape: torch.Size([5, 9])
y shape: torch.Size([5])
Sample X:
 tensor([ 1283.0000,  1284.0000,  1282.0000,  1283.0000,  3855.0000,  1283.1071,
        78977.0000,  1282.1650, 88897.0000])
Sample y: 2


In [16]:
len(dataset)

359

In [28]:
# -----------------------------
# 2. 拆分数据集
# -----------------------------
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=10, shuffle=False)

In [29]:
len(train_dataset), len(val_dataset)

(287, 72)

In [30]:
train_dataset[0][0].shape

torch.Size([9])

In [31]:
# -----------------------------
# 2. 位置编码（Positional Encoding）
# -----------------------------
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=500):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

# -----------------------------
# 3. 模型搭建
# -----------------------------
class TimeSeriesTransformerClassifier(nn.Module):
    def __init__(self, seq_len, input_dim, model_dim, num_heads, num_layers, num_classes, dropout=0.5):
        super().__init__()
        self.model_dim = model_dim

        # 输入映射层
        self.input_map = nn.Linear(input_dim, model_dim)

        # 位置编码
        self.pos_encoder = PositionalEncoding(model_dim, dropout)

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, dim_feedforward=2 * model_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # 分类头
        self.classifier = nn.Sequential(
            nn.Linear(model_dim * seq_len, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes)
        )

    def forward(self, src):
        # src shape: (batch_size, seq_len, input_dim)
        src = self.input_map(src)  # (batch_size, seq_len, model_dim)
        src = src.permute(1, 0, 2)  # (seq_len, batch_size, model_dim)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)  # (seq_len, batch_size, model_dim)
        output = output.permute(1, 0, 2)  # (batch_size, seq_len, model_dim)
        output = output.reshape(output.size(0), -1)  # (batch_size, seq_len * model_dim)
        logits = self.classifier(output)  # (batch_size, num_classes)
        return logits


In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -----------------------------
# 4. 训练函数
# -----------------------------
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20):
    model.to(device)
    best_acc = 0.0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        all_preds, all_labels = [], []

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            print(inputs.shape, labels.shape)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        acc = accuracy_score(all_labels, all_preds)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {acc*100:.2f}%")

        # 验证
        model.eval()
        with torch.no_grad():
            val_preds, val_labels = [], []
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                preds = torch.argmax(outputs, dim=1)
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())
            val_acc = accuracy_score(val_labels, val_preds)
            print(f"Validation Accuracy: {val_acc*100:.2f}%\n")

In [33]:
seq_len = 5            # 时间序列长度
input_dim = 9          # 每个时间步的特征维度
num_classes = 5        # 分类数（0,1,2,3,4）

# 初始化模型
model = TimeSeriesTransformerClassifier(
    seq_len=seq_len,
    input_dim=input_dim,
    model_dim=64,
    num_heads=4,
    num_layers=2,
    num_classes=num_classes
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)



In [34]:
# 开始训练
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20)

torch.Size([10, 9]) torch.Size([10])


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 2 is not equal to len(dims) = 3