In [1]:
import numpy as np
import pandas as pd
import torch

for m in [np, pd, torch]:
    print(m.__name__, m.__version__)


numpy 1.26.4
pandas 2.2.3
torch 2.2.2


In [2]:
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.preprocessing import LabelEncoder

from funcs.factor_utils import FactorUtil

In [6]:
# -----------------------------
# 1. 读取数据集
# -----------------------------
bar_df = FactorUtil.get_window_bar_df('FG2409_MIN_1', '20240813')
bar_df.columns

20240812 22:51:00


Index(['O_9', 'H_9', 'L_9', 'C_9', 'V_9', 'AP_9', 'AV_9', 'BP_9', 'BV_9',
       'O_8', 'H_8', 'L_8', 'C_8', 'V_8', 'AP_8', 'AV_8', 'BP_8', 'BV_8',
       'O_7', 'H_7', 'L_7', 'C_7', 'V_7', 'AP_7', 'AV_7', 'BP_7', 'BV_7',
       'O_6', 'H_6', 'L_6', 'C_6', 'V_6', 'AP_6', 'AV_6', 'BP_6', 'BV_6',
       'O_5', 'H_5', 'L_5', 'C_5', 'V_5', 'AP_5', 'AV_5', 'BP_5', 'BV_5',
       'O_4', 'H_4', 'L_4', 'C_4', 'V_4', 'AP_4', 'AV_4', 'BP_4', 'BV_4',
       'O_3', 'H_3', 'L_3', 'C_3', 'V_3', 'AP_3', 'AV_3', 'BP_3', 'BV_3',
       'O_2', 'H_2', 'L_2', 'C_2', 'V_2', 'AP_2', 'AV_2', 'BP_2', 'BV_2',
       'O_1', 'H_1', 'L_1', 'C_1', 'V_1', 'AP_1', 'AV_1', 'BP_1', 'BV_1', 'O',
       'H', 'L', 'C', 'V', 'AP', 'AV', 'BP', 'BV'],
      dtype='object')

In [7]:
bar_df['R'] = (bar_df['C'] - bar_df['C_5']) / bar_df['C_5']
min_val, max_val = bar_df['R'].min(), bar_df['R'].max()
min_val, max_val

(-0.009060955518945634, 0.0113544201135442)

In [8]:
bins = pd.interval_range(start=min_val, end=max_val, periods=10)
bins

IntervalIndex([  (-0.009060955518945634, -0.00701941795569665],
                 (-0.00701941795569665, -0.004977880392447667],
               (-0.004977880392447667, -0.0029363428291986834],
                 (-0.0029363428291986834, -0.0008948052659497],
                  (-0.0008948052659497, 0.0011467322972992834],
                 (0.0011467322972992834, 0.003188269860548267],
                   (0.003188269860548267, 0.00522980742379725],
                   (0.00522980742379725, 0.007271344987046234],
                  (0.007271344987046234, 0.009312882550295219],
                    (0.009312882550295219, 0.0113544201135442]],
              dtype='interval[float64, right]')

In [9]:
# -----------------------------
# 2. 制作标签列
# -----------------------------
bar_df['K'] = pd.cut(bar_df['R'], bins=bins, labels=range(10), include_lowest=True).cat.codes
bar_df['K'].value_counts().sort_index()

K
-1       1
 0      20
 1      46
 2     217
 3     565
 4    1579
 5     515
 6     161
 7      30
 8       5
 9       2
Name: count, dtype: int64

In [10]:
type(bar_df['K'].iloc[0])

numpy.int8

In [11]:
# 转换标签为数字（必须）
le = LabelEncoder()
encoded_labels = le.fit_transform(bar_df['K'])
le.classes_

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9], dtype=int8)

In [51]:
# -----------------------------
# 2. 自定义 Dataset 类
# -----------------------------
class DataFrameDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, feature_cols, label_col, device='cpu'):
        """
        构造一个 PyTorch Dataset，用于训练模型
        
        参数:
            dataframe (pd.DataFrame): 包含特征和标签的 DataFrame
            feature_cols (list): 用作输入特征的列名列表
            label_col (str or None): 标签列名（如果是无监督任务可设为 None）
            device (str): 'cuda' 或 'cpu'
        """
        self.device = device

        # 提取特征张量
        self.features = torch.tensor(dataframe[feature_cols].values, dtype=torch.float32).to(device)

        # 提取标签张量
        if label_col is not None:
            self.labels = torch.tensor(dataframe[label_col].values, dtype=torch.long).to(device)
        else:
            self.labels = None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        else:
            return self.features[idx]

In [52]:
FEA_COLS = ['O_9', 'H_9', 'L_9', 'C_9', 'V_9', 'AP_9', 'AV_9', 'BP_9', 'BV_9',
            'O_8', 'H_8', 'L_8', 'C_8', 'V_8', 'AP_8', 'AV_8', 'BP_8', 'BV_8',
            'O_7', 'H_7', 'L_7', 'C_7', 'V_7', 'AP_7', 'AV_7', 'BP_7', 'BV_7',
            'O_6', 'H_6', 'L_6', 'C_6', 'V_6', 'AP_6', 'AV_6', 'BP_6', 'BV_6',
            'O_5', 'H_5', 'L_5', 'C_5', 'V_5', 'AP_5', 'AV_5', 'BP_5', 'BV_5',
            'O_4', 'H_4', 'L_4', 'C_4', 'V_4', 'AP_4', 'AV_4', 'BP_4', 'BV_4',
            'O_3', 'H_3', 'L_3', 'C_3', 'V_3', 'AP_3', 'AV_3', 'BP_3', 'BV_3',
            'O_2', 'H_2', 'L_2', 'C_2', 'V_2', 'AP_2', 'AV_2', 'BP_2', 'BV_2',
            'O_1', 'H_1', 'L_1', 'C_1', 'V_1', 'AP_1', 'AV_1', 'BP_1', 'BV_1', 
            'O', 'H', 'L', 'C', 'V', 'AP', 'AV', 'BP', 'BV']

dataset = DataFrameDataset(
    dataframe=bar_df[bar_df['K'] >= 0],
    feature_cols=FEA_COLS,
    label_col='K',
    device='cuda' if torch.cuda.is_available() else 'cpu'
)
len(dataset)

3140

In [53]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [54]:
dataset[0]

(tensor([  1272.0000,   1274.0000,   1271.0000,   1273.0000,   2017.0000,
           1273.3580,  60302.0000,   1272.2410,  37211.0000,   1273.0000,
           1274.0000,   1272.0000,   1274.0000,   2116.0000,   1273.9310,
          56694.0000,   1272.8330,  57743.0000,   1273.0000,   1276.0000,
           1273.0000,   1275.0000,   3327.0000,   1274.9189,  51697.0000,
           1273.4810,  65016.0000,   1276.0000,   1276.0000,   1274.0000,
           1275.0000,   2732.0000,   1275.9180,  42182.0000,   1274.1510,
          56885.0000,   1274.0000,   1276.0000,   1274.0000,   1276.0000,
           1764.0000,   1275.9100,  55445.0000,   1274.7820,  59781.0000,
           1275.0000,   1276.0000,   1274.0000,   1275.0000,   1332.0000,
           1275.9860,  89067.0000,   1274.9301,  57241.0000,   1275.0000,
           1276.0000,   1275.0000,   1275.0000,    495.0000,   1276.0000,
         123868.0000,   1275.0000, 130325.0000,   1275.0000,   1277.0000,
           1275.0000,   1277.0000,   3

In [55]:
len(dataset), len(dataset[0]), len(dataset[0][0])

(3140, 2, 90)

In [56]:
# 划分训练集和测试集
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
print(train_size, test_size)

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
print(len(train_dataset), len(test_dataset))

# 创建 DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

2512 628
2512 628


In [57]:
train_features, train_labels = next(iter(train_dataloader))
print(f"feature batch shape: {train_features.size()}")
print(f"labels batch shape: {train_labels.size()}")

feature batch shape: torch.Size([64, 90])
labels batch shape: torch.Size([64])


In [58]:
x = nn.Flatten()(train_features)
x

tensor([[ 1171.0000,  1171.0000,  1170.0000,  ..., 10456.0000,  1166.5050,
          5410.0000],
        [ 1221.0000,  1222.0000,  1220.0000,  ...,  5099.0000,  1220.9180,
          8777.0000],
        [ 1229.0000,  1230.0000,  1228.0000,  ...,  5886.0000,  1225.0150,
         28191.0000],
        ...,
        [ 1214.0000,  1214.0000,  1214.0000,  ...,  9462.0000,  1214.8270,
          9728.0000],
        [ 1245.0000,  1245.0000,  1243.0000,  ..., 39123.0000,  1242.9460,
         38292.0000],
        [ 1171.0000,  1171.0000,  1170.0000,  ..., 10891.0000,  1168.0250,
          8730.0000]])

In [59]:
# -----------------------------
# 3. 定义神经网络模型
# -----------------------------
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        logits = self.model(x)
        return logits


In [60]:
model = NeuralNetwork(input_dim=90, num_classes=10)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


NeuralNetwork(
  (model): Sequential(
    (0): Linear(in_features=90, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Linear(in_features=64, out_features=10, bias=True)
  )
)

In [61]:
# -----------------------------
# 4. 设置损失函数和优化器
# -----------------------------
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [62]:
# -----------------------------
# 5. 训练模型
# -----------------------------
def train_model(model, train_loader, criterion, optimizer, epochs=20):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        accuracy = 100. * correct / total
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {accuracy:.2f}%")



In [63]:
train_model(model, train_dataloader, criterion, optimizer, epochs=20)


Epoch 1/20, Loss: 25062.3834, Accuracy: 31.37%
Epoch 2/20, Loss: 7253.6832, Accuracy: 30.49%
Epoch 3/20, Loss: 3632.8298, Accuracy: 29.82%
Epoch 4/20, Loss: 1569.8861, Accuracy: 21.18%
Epoch 5/20, Loss: 621.4439, Accuracy: 26.79%
Epoch 6/20, Loss: 335.5015, Accuracy: 42.83%
Epoch 7/20, Loss: 248.8836, Accuracy: 47.17%
Epoch 8/20, Loss: 182.5912, Accuracy: 47.81%
Epoch 9/20, Loss: 140.1641, Accuracy: 49.56%
Epoch 10/20, Loss: 129.1226, Accuracy: 49.16%
Epoch 11/20, Loss: 108.6476, Accuracy: 49.44%
Epoch 12/20, Loss: 119.7519, Accuracy: 49.24%
Epoch 13/20, Loss: 90.7290, Accuracy: 49.96%
Epoch 14/20, Loss: 92.0454, Accuracy: 49.92%
Epoch 15/20, Loss: 86.2290, Accuracy: 49.96%
Epoch 16/20, Loss: 84.0033, Accuracy: 49.72%
Epoch 17/20, Loss: 83.3054, Accuracy: 50.04%
Epoch 18/20, Loss: 83.1913, Accuracy: 50.24%
Epoch 19/20, Loss: 76.9574, Accuracy: 50.16%
Epoch 20/20, Loss: 83.4599, Accuracy: 49.96%


In [65]:
# -----------------------------
# 6. 测试模型（可选）
# -----------------------------
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    accuracy = 100. * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")



In [66]:
evaluate_model(model, test_dataloader)

Test Accuracy: 50.32%
