In [1]:
import numpy as np
import pandas as pd
import torch

for m in [np, pd, torch]:
    print(m.__name__, m.__version__)

numpy 1.26.4
pandas 2.2.3
torch 2.2.2


In [2]:
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

from funcs.factor_utils import FactorUtil

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [29]:
# -----------------------------
# 1. 读取数据集
# -----------------------------
bar_df = FactorUtil.get_window_bar_df('FG2409_MIN_1', '20240813', window_period=10)
bar_df.columns

20240812 22:51:00


Index(['O_9', 'H_9', 'L_9', 'C_9', 'V_9', 'AP_9', 'AV_9', 'BP_9', 'BV_9',
       'O_8', 'H_8', 'L_8', 'C_8', 'V_8', 'AP_8', 'AV_8', 'BP_8', 'BV_8',
       'O_7', 'H_7', 'L_7', 'C_7', 'V_7', 'AP_7', 'AV_7', 'BP_7', 'BV_7',
       'O_6', 'H_6', 'L_6', 'C_6', 'V_6', 'AP_6', 'AV_6', 'BP_6', 'BV_6',
       'O_5', 'H_5', 'L_5', 'C_5', 'V_5', 'AP_5', 'AV_5', 'BP_5', 'BV_5',
       'O_4', 'H_4', 'L_4', 'C_4', 'V_4', 'AP_4', 'AV_4', 'BP_4', 'BV_4',
       'O_3', 'H_3', 'L_3', 'C_3', 'V_3', 'AP_3', 'AV_3', 'BP_3', 'BV_3',
       'O_2', 'H_2', 'L_2', 'C_2', 'V_2', 'AP_2', 'AV_2', 'BP_2', 'BV_2',
       'O_1', 'H_1', 'L_1', 'C_1', 'V_1', 'AP_1', 'AV_1', 'BP_1', 'BV_1', 'O',
       'H', 'L', 'C', 'V', 'AP', 'AV', 'BP', 'BV'],
      dtype='object')

In [32]:
# -----------------------------
# 2. 制作标签列
# -----------------------------
bar_df['R'] = (bar_df['C'] - bar_df['C_5']) / bar_df['C_5']
min_val, max_val = bar_df['R'].min(), bar_df['R'].max()
print(min_val, max_val)

bin_count = 5
bins = pd.interval_range(start=min_val, end=max_val, periods=bin_count)
print(bins)

bar_df['K'] = pd.cut(bar_df['R'], bins=bins, labels=range(bin_count), include_lowest=True).cat.codes
bar_df['K'].value_counts().sort_index()

-0.009060955518945634 0.0113544201135442
IntervalIndex([(-0.009060955518945634, -0.004977880392447667],
                 (-0.004977880392447667, -0.0008948052659497],
                  (-0.0008948052659497, 0.003188269860548267],
                 (0.003188269860548267, 0.007271344987046234],
                   (0.007271344987046234, 0.0113544201135442]],
              dtype='interval[float64, right]')


K
-1       1
 0      66
 1     782
 2    2094
 3     191
 4       7
Name: count, dtype: int64

In [33]:
# -----------------------------
# 3. 自定义 Dataset 类
# -----------------------------
class DataFrameDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, feature_cols, label_col, device='cpu'):
        self.device = device
        self.features = torch.tensor(dataframe[feature_cols].values, dtype=torch.float32).to(device)
        if label_col is not None:
            self.labels = torch.tensor(dataframe[label_col].values, dtype=torch.long).to(device)
        else:
            self.labels = None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        else:
            return self.features[idx]

In [34]:
FEA_COLS = bar_df.columns.to_list()[:-2]
print(FEA_COLS)

dataset = DataFrameDataset(
    dataframe=bar_df[bar_df['K'] >= 0],
    feature_cols=FEA_COLS,
    label_col='K',
    device='cuda' if torch.cuda.is_available() else 'cpu'
)
len(dataset)


['O_9', 'H_9', 'L_9', 'C_9', 'V_9', 'AP_9', 'AV_9', 'BP_9', 'BV_9', 'O_8', 'H_8', 'L_8', 'C_8', 'V_8', 'AP_8', 'AV_8', 'BP_8', 'BV_8', 'O_7', 'H_7', 'L_7', 'C_7', 'V_7', 'AP_7', 'AV_7', 'BP_7', 'BV_7', 'O_6', 'H_6', 'L_6', 'C_6', 'V_6', 'AP_6', 'AV_6', 'BP_6', 'BV_6', 'O_5', 'H_5', 'L_5', 'C_5', 'V_5', 'AP_5', 'AV_5', 'BP_5', 'BV_5', 'O_4', 'H_4', 'L_4', 'C_4', 'V_4', 'AP_4', 'AV_4', 'BP_4', 'BV_4', 'O_3', 'H_3', 'L_3', 'C_3', 'V_3', 'AP_3', 'AV_3', 'BP_3', 'BV_3', 'O_2', 'H_2', 'L_2', 'C_2', 'V_2', 'AP_2', 'AV_2', 'BP_2', 'BV_2', 'O_1', 'H_1', 'L_1', 'C_1', 'V_1', 'AP_1', 'AV_1', 'BP_1', 'BV_1', 'O', 'H', 'L', 'C', 'V', 'AP', 'AV', 'BP', 'BV']


3140

In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [36]:
dataset[0]

(tensor([  1272.0000,   1274.0000,   1271.0000,   1273.0000,   2017.0000,
           1273.3580,  60302.0000,   1272.2410,  37211.0000,   1273.0000,
           1274.0000,   1272.0000,   1274.0000,   2116.0000,   1273.9310,
          56694.0000,   1272.8330,  57743.0000,   1273.0000,   1276.0000,
           1273.0000,   1275.0000,   3327.0000,   1274.9189,  51697.0000,
           1273.4810,  65016.0000,   1276.0000,   1276.0000,   1274.0000,
           1275.0000,   2732.0000,   1275.9180,  42182.0000,   1274.1510,
          56885.0000,   1274.0000,   1276.0000,   1274.0000,   1276.0000,
           1764.0000,   1275.9100,  55445.0000,   1274.7820,  59781.0000,
           1275.0000,   1276.0000,   1274.0000,   1275.0000,   1332.0000,
           1275.9860,  89067.0000,   1274.9301,  57241.0000,   1275.0000,
           1276.0000,   1275.0000,   1275.0000,    495.0000,   1276.0000,
         123868.0000,   1275.0000, 130325.0000,   1275.0000,   1277.0000,
           1275.0000,   1277.0000,   3

In [37]:
dataset[0][0].shape

torch.Size([90])

In [38]:
# 划分训练集和测试集
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
print(train_size, test_size)

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
print(len(train_dataset), len(test_dataset))

# 创建 DataLoader
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

2512 628
2512 628


In [39]:
# -----------------------------
# 4. 定义 Transformer MLP 分类模型
# -----------------------------
class TransformerMLPClassifier(nn.Module):
    def __init__(self, input_dim, num_classes, d_model=128, nhead=4, num_layers=2, dim_feedforward=256, dropout=0.3):
        super().__init__()
        # 投影输入到 Transformer 维度
        self.input_proj = nn.Linear(input_dim, d_model)

        # Positional Encoding（简单实现）
        self.pos_enc = nn.Parameter(torch.zeros(1, 1, d_model))

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation='relu'
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # 分类头
        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, num_classes)
        )

    def forward(self, x):
        # x shape: (batch_size, input_dim)
        x = self.input_proj(x)  # -> (batch_size, d_model)
        x = x.unsqueeze(1)      # -> (batch_size, 1, d_model)
        x = x + self.pos_enc    # 加位置编码
        x = self.transformer_encoder(x)  # -> (batch_size, 1, d_model)
        x = x.squeeze(1)        # -> (batch_size, d_model)
        logits = self.classifier(x)     # -> (batch_size, num_classes)
        return logits


In [40]:
# -----------------------------
# 5. 初始化模型、损失函数和优化器
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

inmput_dim = len(FEA_COLS)
num_classes = bin_count

model = TransformerMLPClassifier(input_dim=inmput_dim, num_classes=num_classes)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)


In [41]:
# -----------------------------
# 6. 训练函数
# -----------------------------
def train_model(model, train_loader, criterion, optimizer, device, epochs=20):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        all_preds = []
        all_labels = []

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        acc = accuracy_score(all_labels, all_preds)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {acc*100:.2f}%")
        scheduler.step(total_loss)


In [42]:
# -----------------------------
# 7. 开始训练
# -----------------------------
train_model(model, train_loader, criterion, optimizer, device, epochs=30)

Epoch 1/30, Loss: 41.4194, Accuracy: 63.22%
Epoch 2/30, Loss: 36.5530, Accuracy: 67.08%
Epoch 3/30, Loss: 35.7870, Accuracy: 67.28%
Epoch 4/30, Loss: 36.2385, Accuracy: 67.00%
Epoch 5/30, Loss: 35.7909, Accuracy: 67.28%
Epoch 6/30, Loss: 36.0362, Accuracy: 67.24%
Epoch 7/30, Loss: 35.5597, Accuracy: 67.20%
Epoch 8/30, Loss: 35.6263, Accuracy: 67.28%
Epoch 9/30, Loss: 35.6536, Accuracy: 67.20%
Epoch 10/30, Loss: 35.3387, Accuracy: 67.12%
Epoch 11/30, Loss: 35.8570, Accuracy: 67.20%
Epoch 12/30, Loss: 35.5782, Accuracy: 67.24%
Epoch 13/30, Loss: 35.2230, Accuracy: 67.24%
Epoch 14/30, Loss: 35.4550, Accuracy: 67.24%
Epoch 15/30, Loss: 35.2798, Accuracy: 67.16%
Epoch 16/30, Loss: 35.2658, Accuracy: 67.28%
Epoch 17/30, Loss: 35.2949, Accuracy: 67.16%
Epoch 18/30, Loss: 35.4229, Accuracy: 67.24%
Epoch 19/30, Loss: 35.8071, Accuracy: 67.24%
Epoch 20/30, Loss: 35.4612, Accuracy: 67.24%
Epoch 21/30, Loss: 35.3565, Accuracy: 67.24%
Epoch 22/30, Loss: 34.9627, Accuracy: 67.20%
Epoch 23/30, Loss: 

In [27]:
# -----------------------------
# 8. 验证函数
# -----------------------------
def evaluate_model(model, test_loader, device='cpu'):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    print(f"\nValidation Accuracy: {acc*100:.2f}%")


In [28]:
evaluate_model(model, test_loader)


Validation Accuracy: 52.23%


In [43]:
X = np.random.randn(10, 5, 3).astype(np.float32)
X

array([[[-9.7138441e-01, -8.2046250e-03, -4.1831445e-02],
        [-8.3449727e-01,  5.9086430e-01,  1.5920448e-01],
        [-1.8103427e+00,  4.2159006e-01,  6.8547958e-01],
        [ 1.5280780e+00,  7.0662692e-02, -6.8673782e-02],
        [-4.8375028e-01, -1.4277188e+00,  3.2848552e-01]],

       [[ 1.7822380e+00,  4.3391126e-01, -6.9245410e-01],
        [ 1.7672516e-01, -2.5685616e+00, -5.5315411e-01],
        [ 2.6226959e-01,  3.0881274e-01, -4.5001021e-01],
        [-8.6859959e-01,  4.0157357e-01, -7.8219491e-01],
        [-1.9691814e+00, -6.3047439e-01, -1.8294540e-01]],

       [[-2.7137694e-01,  1.9064783e+00, -9.6114504e-01],
        [ 1.1163582e+00, -8.3831050e-02, -1.8813393e+00],
        [ 1.8819986e-02, -6.2149698e-01,  1.2566860e+00],
        [ 1.9507569e+00,  2.5069902e-02,  1.1206604e-01],
        [-1.1334939e-01, -1.5709992e-01,  8.3822805e-01]],

       [[-4.3285406e-01, -8.1039345e-01,  8.6012238e-01],
        [-5.1928300e-01, -2.5286639e-01,  1.8393505e-01],
        