In [1]:
from torch.utils.data import Dataset  
import torch  
from torch.utils.data import Sampler
from torch.utils.data import DataLoader
from qlib.data.dataset.processor import RobustZScoreNorm, Fillna, DropnaLabel, CSZScoreNorm
import pandas as pd
import numpy as np
from qlib.data.dataset import TSDataSampler

In [2]:
import warnings  
warnings.filterwarnings("ignore", category=FutureWarning) 

In [3]:
cp = pd.read_parquet('D:\CX_code\Graph series\MASTER\data\\000300.XSHG_component.pq')

In [4]:
print(cp)

ticker      000001.XSHE  000002.XSHE  000008.XSHE  000009.XSHE  000027.XSHE  \
date                                                                          
2016-01-04          1.0          1.0          NaN          1.0          1.0   
2016-01-05          1.0          1.0          NaN          1.0          1.0   
2016-01-06          1.0          1.0          NaN          1.0          1.0   
2016-01-07          1.0          1.0          NaN          1.0          1.0   
2016-01-08          1.0          1.0          NaN          1.0          1.0   
...                 ...          ...          ...          ...          ...   
2024-06-26          1.0          1.0          NaN          NaN          NaN   
2024-06-27          1.0          1.0          NaN          NaN          NaN   
2024-06-28          1.0          1.0          NaN          NaN          NaN   
2024-07-01          1.0          1.0          NaN          NaN          NaN   
2024-07-02          1.0          1.0          NaN   

In [18]:
# step1:获取数据并生成复制
df_test = pd.read_parquet('combined_data.pq')

# step2:设置双重列索引
level0 = ['feature'] * 44 + ['label']  # 第一层索引：前44列为'feature'，最后一列为'label'
level1 = list(df_test.keys())
multi_index = pd.MultiIndex.from_tuples(list(zip(level0, level1)))
df_test.columns = multi_index
# print(df_test.head())

# step3:设置双重行索引
# print("Original index names:", df_test.index.names)
df_test.index.names = ['datetime', 'instrument']
# print("New index names:", df_test.index.names)
# print(df_test.index)
df_sorted = df_test.sort_index()
# print(df_sorted.index)
df_sorted = df_sorted[df_sorted['label']["label"] != 0]

# step4:数据预处理
# print(df_test.index.get_level_values(0).unique())
# row_counts = df_test.groupby(level=0).size()
# print(row_counts)
RobustZScoreNorm = RobustZScoreNorm(fit_start_time='20160101', fit_end_time='20170501', fields_group='feature', clip_outlier=True)
Fillna = Fillna(fields_group='feature')
CSZScoreNorm = CSZScoreNorm(fields_group='label')

RobustZScoreNorm.fit(df_sorted)
for process_func in [RobustZScoreNorm, Fillna, CSZScoreNorm]:
    df_process = process_func(df_sorted)


In [19]:
feature_mean = df_process['label'].mean()
feature_std = df_process['label'].std()

print(f"Mean of 'feature_column' after preprocessing: {feature_mean}")
print(f"Standard deviation of 'feature_column' after preprocessing: {feature_std}")

Mean of 'feature_column' after preprocessing: label    2.121197e-19
dtype: float64
Standard deviation of 'feature_column' after preprocessing: label    0.999819
dtype: float64


In [20]:
print(df_test.shape)
print(df_process.shape)

(1495317, 45)
(1339890, 45)


In [21]:
df_test['label']

Unnamed: 0_level_0,Unnamed: 1_level_0,label
datetime,instrument,Unnamed: 2_level_1
2016-01-29,000001.XSHE,0.001013
2016-01-29,000004.XSHE,-0.030452
2016-01-29,000005.XSHE,-0.020498
2016-01-29,000006.XSHE,-0.061911
2016-01-29,000008.XSHE,0.025485
...,...,...
2017-12-29,002920.XSHE,0.100000
2017-12-29,600145.XSHG,0.000000
2017-12-29,601313.XSHG,0.000000
2017-12-29,601360.XSHG,0.000000


In [22]:
print(df_process.shape)

(1339890, 45)


In [23]:
# 定义时间范围  
train_range = pd.date_range('20160101', '20170331')  
val_range = pd.date_range('20170401', '20170630')  
test_range = pd.date_range('20170701', '20171231')  

# 使用IndexSlice选择子集  
idx = pd.IndexSlice  
train_set = df_process.loc[idx[train_range, :], :]  
val_set = df_process.loc[idx[val_range, :], :]  
test_set = df_process.loc[idx[test_range, :], :]  

# 展示各子集的信息  
print("Training set shape:", train_set.shape)  
print("Validation set shape:", val_set.shape)  
print("Test set shape:", test_set.shape)

Training set shape: (784696, 45)
Validation set shape: (173490, 45)
Test set shape: (381704, 45)


In [24]:
class TSdataset(Dataset):  
    def __init__(self, dataframe, seq_len=8):  
        self.dataframe = dataframe  
        self.seq_len = seq_len  
        self.data = self.dataframe
  
    def __len__(self):  
        return len(self.data) - self.seq_len
  
    def __getitem__(self, index):  
        # 返回从idx开始，长度为window_size的序列作为输入，以及下一个值作为目标  
        return (  
            torch.tensor(self.data[idx:index + self.seq_len], dtype=torch.float32),  
            torch.tensor(self.data[index + self.seq_len], dtype=torch.float32)  
        )  
  

train_dataset = TSdataset(train_set)  
val_dataset = TSdataset(val_set)  
test_dataset = TSdataset(test_set) 

In [31]:
class DailyBatchSamplerRandom(Sampler):
    def __init__(self, data_source, shuffle=False):
        self.data_source = data_source
        self.shuffle = shuffle
        # calculate number of samples in each batch
        self.daily_count = pd.Series(index=self.data_source.get_index()).groupby("datetime").size().values
        self.daily_index = np.roll(np.cumsum(self.daily_count), 1)  # calculate begin index of each batch
        self.daily_index[0] = 0

    def __iter__(self):
        if self.shuffle:
            index = np.arange(len(self.daily_count))
            np.random.shuffle(index)
            for i in index:
                yield np.arange(self.daily_index[i], self.daily_index[i] + self.daily_count[i])
        else:
            for idx, count in zip(self.daily_index, self.daily_count):
                yield np.arange(idx, idx + count)

    def __len__(self):
        return len(self.data_source)

In [26]:
def _init_data_loader(data, shuffle=True, drop_last=True):
    sampler = DailyBatchSamplerRandom(data, shuffle)
    data_loader = DataLoader(data, sampler=sampler, drop_last=drop_last)
    return data_loader

In [27]:
train_loader = _init_data_loader(train_set)

In [28]:
for data in train_loader:
    data = torch.squeeze(data, dim=0)
    feature = data[:, :, 0:-1]
    label = data[:, -1, -1]

KeyError: '[395906 395907 395908 ... 398509 398510 398511] not in index'

In [36]:
sampler = TSDataSampler(train_set, '20160101', '20170331', step_len=8, fillna_type="ffill+bfill")
train_loader = _init_data_loader(sampler)
for data in train_loader:
    data = torch.squeeze(data, dim=0)
    print(data.shape)

torch.Size([2576, 8, 45])
torch.Size([2720, 8, 45])
torch.Size([2756, 8, 45])
torch.Size([2502, 8, 45])
torch.Size([2630, 8, 45])
torch.Size([2520, 8, 45])
torch.Size([2553, 8, 45])
torch.Size([2573, 8, 45])
torch.Size([2517, 8, 45])
torch.Size([2524, 8, 45])
torch.Size([2544, 8, 45])
torch.Size([2590, 8, 45])
torch.Size([2651, 8, 45])
torch.Size([2550, 8, 45])
torch.Size([2637, 8, 45])
torch.Size([2578, 8, 45])
torch.Size([2665, 8, 45])
torch.Size([2509, 8, 45])
torch.Size([2516, 8, 45])
torch.Size([2636, 8, 45])
torch.Size([2491, 8, 45])
torch.Size([2435, 8, 45])
torch.Size([2514, 8, 45])
torch.Size([2638, 8, 45])
torch.Size([2494, 8, 45])
torch.Size([2644, 8, 45])
torch.Size([2556, 8, 45])
torch.Size([2583, 8, 45])
torch.Size([2583, 8, 45])
torch.Size([2855, 8, 45])
torch.Size([2671, 8, 45])
torch.Size([2484, 8, 45])
torch.Size([2812, 8, 45])
torch.Size([2509, 8, 45])
torch.Size([2601, 8, 45])
torch.Size([2563, 8, 45])
torch.Size([2483, 8, 45])
torch.Size([2557, 8, 45])
torch.Size([