In [1]:
import numpy as np
import pandas as pd
import torch

for m in [np, pd, torch]:
    print(m.__name__, m.__version__)

numpy 1.26.4
pandas 2.2.3
torch 2.2.2


In [2]:
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

from funcs.factor_utils import FactorUtil

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# -----------------------------
# 1. 读取数据集
# -----------------------------
bar_df = FactorUtil.get_window_bar_df('FG2409_MIN_1', '20240813', window_period=20)
bar_df.columns

20240812 22:41:00


Index(['O_19', 'H_19', 'L_19', 'C_19', 'V_19', 'AP_19', 'AV_19', 'BP_19',
       'BV_19', 'O_18',
       ...
       'BV_1', 'O', 'H', 'L', 'C', 'V', 'AP', 'AV', 'BP', 'BV'],
      dtype='object', length=180)

In [5]:
# -----------------------------
# 2. 制作标签列
# -----------------------------
bar_df['R'] = (bar_df['C'] - bar_df['C_5']) / bar_df['C_5']
min_val, max_val = bar_df['R'].min(), bar_df['R'].max()
print(min_val, max_val)

bins = pd.interval_range(start=min_val, end=max_val, periods=10)
print(bins)

bar_df['K'] = pd.cut(bar_df['R'], bins=bins, labels=range(10), include_lowest=True).cat.codes
bar_df['K'].value_counts().sort_index()

-0.009060955518945634 0.0113544201135442
IntervalIndex([  (-0.009060955518945634, -0.00701941795569665],
                 (-0.00701941795569665, -0.004977880392447667],
               (-0.004977880392447667, -0.0029363428291986834],
                 (-0.0029363428291986834, -0.0008948052659497],
                  (-0.0008948052659497, 0.0011467322972992834],
                 (0.0011467322972992834, 0.003188269860548267],
                   (0.003188269860548267, 0.00522980742379725],
                   (0.00522980742379725, 0.007271344987046234],
                  (0.007271344987046234, 0.009312882550295219],
                    (0.009312882550295219, 0.0113544201135442]],
              dtype='interval[float64, right]')


K
-1       1
 0      20
 1      46
 2     217
 3     565
 4    1579
 5     515
 6     161
 7      30
 8       5
 9       2
Name: count, dtype: int64

In [6]:
# -----------------------------
# 3. 自定义 Dataset 类
# -----------------------------
class DataFrameDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, feature_cols, label_col, device='cpu'):
        self.device = device
        self.features = torch.tensor(dataframe[feature_cols].values, dtype=torch.float32).to(device)
        if label_col is not None:
            self.labels = torch.tensor(dataframe[label_col].values, dtype=torch.long).to(device)
        else:
            self.labels = None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        else:
            return self.features[idx]

In [7]:
FEA_COLS = bar_df.columns.to_list()[:-2]
print(FEA_COLS)

dataset = DataFrameDataset(
    dataframe=bar_df[bar_df['K'] >= 0],
    feature_cols=FEA_COLS,
    label_col='K',
    device='cuda' if torch.cuda.is_available() else 'cpu'
)
len(dataset)


['O_19', 'H_19', 'L_19', 'C_19', 'V_19', 'AP_19', 'AV_19', 'BP_19', 'BV_19', 'O_18', 'H_18', 'L_18', 'C_18', 'V_18', 'AP_18', 'AV_18', 'BP_18', 'BV_18', 'O_17', 'H_17', 'L_17', 'C_17', 'V_17', 'AP_17', 'AV_17', 'BP_17', 'BV_17', 'O_16', 'H_16', 'L_16', 'C_16', 'V_16', 'AP_16', 'AV_16', 'BP_16', 'BV_16', 'O_15', 'H_15', 'L_15', 'C_15', 'V_15', 'AP_15', 'AV_15', 'BP_15', 'BV_15', 'O_14', 'H_14', 'L_14', 'C_14', 'V_14', 'AP_14', 'AV_14', 'BP_14', 'BV_14', 'O_13', 'H_13', 'L_13', 'C_13', 'V_13', 'AP_13', 'AV_13', 'BP_13', 'BV_13', 'O_12', 'H_12', 'L_12', 'C_12', 'V_12', 'AP_12', 'AV_12', 'BP_12', 'BV_12', 'O_11', 'H_11', 'L_11', 'C_11', 'V_11', 'AP_11', 'AV_11', 'BP_11', 'BV_11', 'O_10', 'H_10', 'L_10', 'C_10', 'V_10', 'AP_10', 'AV_10', 'BP_10', 'BV_10', 'O_9', 'H_9', 'L_9', 'C_9', 'V_9', 'AP_9', 'AV_9', 'BP_9', 'BV_9', 'O_8', 'H_8', 'L_8', 'C_8', 'V_8', 'AP_8', 'AV_8', 'BP_8', 'BV_8', 'O_7', 'H_7', 'L_7', 'C_7', 'V_7', 'AP_7', 'AV_7', 'BP_7', 'BV_7', 'O_6', 'H_6', 'L_6', 'C_6', 'V_6', 'AP

3140

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [9]:
dataset[0]

(tensor([1.2740e+03, 1.2740e+03, 1.2720e+03, 1.2720e+03, 6.8500e+02, 1.2731e+03,
         5.1006e+04, 1.2720e+03, 4.2873e+04, 1.2720e+03, 1.2730e+03, 1.2710e+03,
         1.2710e+03, 7.6800e+02, 1.2725e+03, 6.2708e+04, 1.2714e+03, 6.9209e+04,
         1.2710e+03, 1.2720e+03, 1.2710e+03, 1.2720e+03, 1.2810e+03, 1.2720e+03,
         7.3884e+04, 1.2710e+03, 9.7186e+04, 1.2720e+03, 1.2720e+03, 1.2700e+03,
         1.2700e+03, 2.4050e+03, 1.2719e+03, 3.2778e+04, 1.2708e+03, 1.1439e+05,
         1.2700e+03, 1.2720e+03, 1.2700e+03, 1.2710e+03, 2.1420e+03, 1.2713e+03,
         5.3205e+04, 1.2702e+03, 1.3337e+05, 1.2710e+03, 1.2710e+03, 1.2700e+03,
         1.2710e+03, 7.6000e+02, 1.2710e+03, 6.0735e+04, 1.2700e+03, 1.6694e+05,
         1.2700e+03, 1.2710e+03, 1.2690e+03, 1.2690e+03, 3.2650e+03, 1.2708e+03,
         6.0905e+04, 1.2698e+03, 1.0815e+05, 1.2690e+03, 1.2710e+03, 1.2690e+03,
         1.2700e+03, 2.1810e+03, 1.2704e+03, 7.1849e+04, 1.2693e+03, 6.5230e+04,
         1.2700e+03, 1.2720e

In [10]:
dataset[0][0].shape

torch.Size([180])

In [11]:
# 划分训练集和测试集
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
print(train_size, test_size)

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
print(len(train_dataset), len(test_dataset))

# 创建 DataLoader
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

2512 628
2512 628


In [12]:
# -----------------------------
# 4. 定义 Transformer MLP 分类模型
# -----------------------------
class TransformerMLPClassifier(nn.Module):
    def __init__(self, input_dim, num_classes, d_model=128, nhead=4, num_layers=2, dim_feedforward=256, dropout=0.3):
        super().__init__()
        # 投影输入到 Transformer 维度
        self.input_proj = nn.Linear(input_dim, d_model)

        # Positional Encoding（简单实现）
        self.pos_enc = nn.Parameter(torch.zeros(1, 1, d_model))

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation='relu'
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # 分类头
        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, num_classes)
        )

    def forward(self, x):
        # x shape: (batch_size, input_dim)
        x = self.input_proj(x)  # -> (batch_size, d_model)
        x = x.unsqueeze(1)      # -> (batch_size, 1, d_model)
        x = x + self.pos_enc    # 加位置编码
        x = self.transformer_encoder(x)  # -> (batch_size, 1, d_model)
        x = x.squeeze(1)        # -> (batch_size, d_model)
        logits = self.classifier(x)     # -> (batch_size, num_classes)
        return logits


In [13]:
# -----------------------------
# 5. 初始化模型、损失函数和优化器
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

inmput_dim = len(FEA_COLS)
num_classes = 10

model = TransformerMLPClassifier(input_dim=inmput_dim, num_classes=10)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)


In [14]:
# -----------------------------
# 6. 训练函数
# -----------------------------
def train_model(model, train_loader, criterion, optimizer, device, epochs=20):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        all_preds = []
        all_labels = []

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        acc = accuracy_score(all_labels, all_preds)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {acc*100:.2f}%")
        scheduler.step(total_loss)


In [15]:
# -----------------------------
# 7. 开始训练
# -----------------------------
train_model(model, train_loader, criterion, optimizer, device, epochs=30)

Epoch 1/30, Loss: 66.3387, Accuracy: 47.09%
Epoch 2/30, Loss: 60.2056, Accuracy: 49.64%
Epoch 3/30, Loss: 59.8390, Accuracy: 48.61%
Epoch 4/30, Loss: 58.9027, Accuracy: 49.96%
Epoch 5/30, Loss: 58.5115, Accuracy: 50.12%
Epoch 6/30, Loss: 58.4457, Accuracy: 49.80%
Epoch 7/30, Loss: 58.6865, Accuracy: 50.16%
Epoch 8/30, Loss: 57.8782, Accuracy: 50.00%
Epoch 9/30, Loss: 57.9784, Accuracy: 49.72%
Epoch 10/30, Loss: 57.2713, Accuracy: 50.16%
Epoch 11/30, Loss: 55.7845, Accuracy: 50.16%
Epoch 12/30, Loss: 55.9626, Accuracy: 50.24%
Epoch 13/30, Loss: 54.9701, Accuracy: 49.76%
Epoch 14/30, Loss: 55.2902, Accuracy: 49.72%
Epoch 15/30, Loss: 54.8433, Accuracy: 50.88%
Epoch 16/30, Loss: 54.6328, Accuracy: 51.19%
Epoch 17/30, Loss: 53.7464, Accuracy: 49.92%
Epoch 18/30, Loss: 53.8345, Accuracy: 50.20%
Epoch 19/30, Loss: 53.0135, Accuracy: 50.84%
Epoch 20/30, Loss: 52.6485, Accuracy: 50.92%
Epoch 21/30, Loss: 52.3998, Accuracy: 51.11%
Epoch 22/30, Loss: 51.1828, Accuracy: 51.47%
Epoch 23/30, Loss: 

In [16]:
# -----------------------------
# 8. 验证函数
# -----------------------------
def evaluate_model(model, test_loader, device='cpu'):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    print(f"\nValidation Accuracy: {acc*100:.2f}%")


In [None]:
evaluate_model(model, test_loader)