In [4]:
import pickle
import os, datetime
import pandas as pd
import numpy as np
from tqdm import tqdm

In [5]:
train_folders = [
    r'D:\code\forecast_model\notebook\20241026_1_part_data',
    r'D:\code\forecast_model\notebook\20241027_1_base_part_bin',
    r'D:\code\forecast_model\notebook\20241028_1_base_part_bin_mid_std',
    r'D:\code\forecast_model\notebook\20241027_1_base_deeplob',
    r'D:\code\forecast_model\notebook\20241030_1_base_axiallob',
]

# Number of best-scoring model families kept for the meta dataset.
TOP_N = 10

# Collect every sub-directory of the training folders (one per trained run).
# sorted() keeps the order -- and therefore the order of the feature columns built
# from it later -- independent of the filesystem's listing order.
_model_folders = []
for train_folder in train_folders:
    for entry in sorted(os.listdir(train_folder)):
        candidate = os.path.join(train_folder, entry)
        if os.path.isdir(candidate):
            _model_folders.append(candidate)

# Score every run by the mean per-class test F1 found in its result.csv.
# Frames are collected in a list and concatenated once (no quadratic re-copying).
score_frames = []
for model_folder in _model_folders:
    result_file = os.path.join(model_folder, 'result.csv')
    if not os.path.isfile(result_file):
        # A directory without results cannot be ranked; skip it instead of aborting.
        continue

    _data = pd.read_csv(result_file)
    if _data.empty:
        continue

    if 'test_class_f1_0' in _data.columns:
        mean_f1 = (_data['test_class_f1_0'] + _data['test_class_f1_1']) / 2
    elif 'test_final_class_f1_0' in _data.columns:
        # Use the "final" metrics: average classes 0 .. y_n-2. The last class is left
        # out, matching the two-column average of the branch above.
        n_scored = int(_data['y_n'].iloc[0]) - 1
        f1_cols = [f'test_final_class_f1_{k}' for k in range(n_scored)]
        # skipna=False: a missing per-class score yields NaN, exactly like chained `+`.
        mean_f1 = _data[f1_cols].sum(axis=1, skipna=False) / n_scored
    else:
        # No recognised F1 columns in this result file.
        continue

    score_frames.append(pd.DataFrame({
        'test_mean_class_f1': mean_f1,
        # Runs that differ only by seed share one family path.
        'path': model_folder.split('_seed')[0],
    }))

if not score_frames:
    raise FileNotFoundError('no result.csv with test F1 columns found under train_folders')

# Average the score over all rows/seeds of a family and rank families best-first.
data = (
    pd.concat(score_frames, ignore_index=True)
    .groupby('path', as_index=False)['test_mean_class_f1']
    .mean()
    .sort_values('test_mean_class_f1', ascending=False)
    .reset_index(drop=True)
)
print(data.head(TOP_N))

# Keep only the TOP_N best families.
model_folders = data['path'].head(TOP_N).tolist()

print('使用模型种类', len(model_folders))
model_folders

                                                path  test_mean_class_f1
0  D:\code\forecast_model\notebook\20241028_1_bas...            0.635850
1  D:\code\forecast_model\notebook\20241028_1_bas...            0.635000
2  D:\code\forecast_model\notebook\20241027_1_bas...            0.628750
3  D:\code\forecast_model\notebook\20241027_1_bas...            0.623425
4  D:\code\forecast_model\notebook\20241026_1_par...            0.606425
5  D:\code\forecast_model\notebook\20241026_1_par...            0.577425
6  D:\code\forecast_model\notebook\20241026_1_par...            0.572700
7  D:\code\forecast_model\notebook\20241030_1_bas...            0.569087
8  D:\code\forecast_model\notebook\20241027_1_bas...            0.467650
使用模型种类 9


['D:\\code\\forecast_model\\notebook\\20241028_1_base_part_bin_mid_std\\base_binbtabl_mid_std_top5_predict_n100',
 'D:\\code\\forecast_model\\notebook\\20241028_1_base_part_bin_mid_std\\base_binctabl_mid_std_top5_predict_n100',
 'D:\\code\\forecast_model\\notebook\\20241027_1_base_part_bin\\base_binctabl_top5_predict_n100',
 'D:\\code\\forecast_model\\notebook\\20241027_1_base_part_bin\\base_binbtabl_top5_predict_n100',
 'D:\\code\\forecast_model\\notebook\\20241026_1_part_data\\train_depth_only_depth_predict_n100',
 'D:\\code\\forecast_model\\notebook\\20241026_1_part_data\\train_depth_only_deal_predict_n100',
 'D:\\code\\forecast_model\\notebook\\20241026_1_part_data\\train_depth_only_order_predict_n100',
 'D:\\code\\forecast_model\\notebook\\20241030_1_base_axiallob\\base_axiallob_top5_predict_n100',
 'D:\\code\\forecast_model\\notebook\\20241027_1_base_deeplob\\base_deeplob_top5_predict_n100']

In [6]:
# Root folder that receives the merged per-date pickles; the export loop creates one
# sub-folder per split underneath it. Created up front if it does not exist yet.
out_folder = r'D:\L2_DATA_T0_ETF\train_data/meta_data_all'
os.makedirs(out_folder, exist_ok=True)

In [7]:
# Runs that feed the meta dataset, ordered family by family (best family first).
# A run belongs to a family when the part of its path before '_seed' equals the family
# path exactly; a plain prefix test would also pull in e.g. '..._n1000_seed0' for a
# family named '..._n100'.
used_folders = [
    folder
    for main_folder in model_folders
    for folder in _model_folders
    if folder.split('_seed')[0] == main_folder
]
if not used_folders:
    raise ValueError('no model folder matches any entry of model_folders')

for _type in ['train', 'val', 'test']:
    # Dates for which every used run has written a prediction csv (set intersection).
    dates = None
    for folder in used_folders:
        model_output_folder = os.path.join(folder, 'model_final', _type)
        _dates = {
            os.path.splitext(name)[0]
            for name in os.listdir(model_output_folder)
            if name.endswith('.csv')
        }
        dates = _dates if dates is None else dates & _dates
    # Sorted so the processing order is deterministic.
    dates = sorted(dates)
    print(dates)

    # The output folder depends only on the split, so create it once per split.
    _out_folder = os.path.join(out_folder, _type)
    os.makedirs(_out_folder, exist_ok=True)

    for date in tqdm(dates):
        data = None

        # Read every run's predictions for this date and join them column-wise on 'id'.
        for folder in used_folders:
            model_name = os.path.basename(folder)
            file = os.path.join(folder, 'model_final', _type, date + '.csv')
            # Prefix the class-probability columns with the run name to keep them unique.
            _data = pd.read_csv(file).rename(columns={
                '0': f'{model_name}_0',
                '1': f'{model_name}_1',
                '2': f'{model_name}_2',
            })

            if data is None:
                data = _data
            else:
                # The target comes from the first run only; id/target agreement between
                # runs is not verified here. The inner join keeps ids present in all runs.
                _data = _data.loc[:, [col for col in _data.columns if 'target' not in col]]
                data = data.merge(_data, on=['id'], how='inner')

        # Stored layout: (ids, mean_std, x, y, raw)
        ids = data['id'].to_list()
        # Identity normalisation: features are stored as-is.
        mean_std = [(0, 1) for _ in ids]
        # Each sample is exactly one row of `raw`: rows [i, i + 1).
        x = [(i, i + 1) for i in range(len(ids))]
        y = data['target'].to_list()
        # Drops the first two columns -- presumably 'id' and 'target'; verify csv layout.
        raw = data.iloc[:, 2:]

        # Context manager guarantees the file is flushed and closed.
        with open(os.path.join(_out_folder, date + '.pkl'), 'wb') as pkl_file:
            pickle.dump((ids, mean_std, x, y, raw), pkl_file)


['20240412', '20231219', '20240703', '20240531', '20240625', '20240104', '20231103', '20240126', '20240711', '20240624', '20240603', '20240125', '20240613', '20240116', '20231130', '20240618', '20240418', '20240131', '20240514', '20240808', '20240229', '20240327', '20231206', '20240729', '20240129', '20231213', '20240813', '20240604', '20240306', '20240223', '20240328', '20240529', '20240705', '20240122', '20231201', '20240508', '20240207', '20240419', '20240402', '20231208', '20240701', '20240530', '20240828', '20231113', '20240326', '20240731', '20240409', '20240715', '20240730', '20240823', '20240105', '20240112', '20240513', '20231127', '20240321', '20240612', '20240805', '20240801', '20240410', '20240524', '20240806', '20240220', '20240416', '20231108', '20231222', '20231218', '20240815', '20240305', '20240115', '20231129', '20240429', '20240510', '20240205', '20240614', '20240222', '20240422', '20240708', '20240626', '20240528', '20240619', '20240816', '20231110', '20240227', '20

100%|██████████| 161/161 [13:57<00:00,  5.20s/it]  


['20240628', '20240109', '20240627', '20240110', '20231214', '20240124', '20240314', '20240219', '20240822', '20231123', '20240611', '20240621', '20240819']


100%|██████████| 13/13 [00:52<00:00,  4.07s/it]


['20240905', '20240830', '20240903', '20240902', '20240904', '20240829']


100%|██████████| 6/6 [00:22<00:00,  3.71s/it]


# 测试

In [8]:
# Load one exported day back to sanity-check the stored tuple.
file = os.path.join(out_folder, 'train', '20231103.pkl')
# Alternative sample from another dataset:
# file = r"D:\L2_DATA_T0_ETF\train_data\depth_deal_order_data\train\20231103.pkl"

# NOTE: pickle.load executes arbitrary code; only load files produced by this notebook.
with open(file, 'rb') as pkl_file:
    ids, mean_std, x, y, raw = pickle.load(pkl_file)
print(len(ids))

20112


In [9]:
# Look at one sample: its id, its (mean, std) pair and its label.
idx = 0
_id, _mean_std, _y = ids[idx], mean_std[idx], y[idx]
(_id, _mean_std, _y)

('159941_1698975384', (0, 1), 2)

In [10]:
# x[idx] holds the [start, stop) row range of sample `idx` inside `raw`.
a, b = x[idx]
data = raw.iloc[slice(a, b)]
data

Unnamed: 0,base_binbtabl_mid_std_top5_predict_n100_seed0_T4x2_0,base_binbtabl_mid_std_top5_predict_n100_seed0_T4x2_1,base_binbtabl_mid_std_top5_predict_n100_seed0_T4x2_2,base_binbtabl_mid_std_top5_predict_n100_seed1_T4x2_0,base_binbtabl_mid_std_top5_predict_n100_seed1_T4x2_1,base_binbtabl_mid_std_top5_predict_n100_seed1_T4x2_2,base_binbtabl_mid_std_top5_predict_n100_seed2_T4x2_0,base_binbtabl_mid_std_top5_predict_n100_seed2_T4x2_1,base_binbtabl_mid_std_top5_predict_n100_seed2_T4x2_2,base_binbtabl_mid_std_top5_predict_n100_seed3_T4x2_0,...,base_deeplob_top5_predict_n100_seed0_T4x2_2,base_deeplob_top5_predict_n100_seed1_T4x2_0,base_deeplob_top5_predict_n100_seed1_T4x2_1,base_deeplob_top5_predict_n100_seed1_T4x2_2,base_deeplob_top5_predict_n100_seed2_T4x2_0,base_deeplob_top5_predict_n100_seed2_T4x2_1,base_deeplob_top5_predict_n100_seed2_T4x2_2,base_deeplob_top5_predict_n100_seed3_T4x2_0,base_deeplob_top5_predict_n100_seed3_T4x2_1,base_deeplob_top5_predict_n100_seed3_T4x2_2
0,0.227967,0.218834,0.553199,0.25011,0.228442,0.521448,0.249771,0.219628,0.530601,0.233565,...,0.773212,0.250454,0.000307,0.749239,0.00251,0.000748,0.996742,0.025631,0.002588,0.97178


In [11]:
# Column labels of the sample frame, i.e. the feature names.
data.columns.tolist()

['base_binbtabl_mid_std_top5_predict_n100_seed0_T4x2_0',
 'base_binbtabl_mid_std_top5_predict_n100_seed0_T4x2_1',
 'base_binbtabl_mid_std_top5_predict_n100_seed0_T4x2_2',
 'base_binbtabl_mid_std_top5_predict_n100_seed1_T4x2_0',
 'base_binbtabl_mid_std_top5_predict_n100_seed1_T4x2_1',
 'base_binbtabl_mid_std_top5_predict_n100_seed1_T4x2_2',
 'base_binbtabl_mid_std_top5_predict_n100_seed2_T4x2_0',
 'base_binbtabl_mid_std_top5_predict_n100_seed2_T4x2_1',
 'base_binbtabl_mid_std_top5_predict_n100_seed2_T4x2_2',
 'base_binbtabl_mid_std_top5_predict_n100_seed3_T4x2_0',
 'base_binbtabl_mid_std_top5_predict_n100_seed3_T4x2_1',
 'base_binbtabl_mid_std_top5_predict_n100_seed3_T4x2_2',
 'base_binctabl_mid_std_top5_predict_n100_seed0_T4x2_0',
 'base_binctabl_mid_std_top5_predict_n100_seed0_T4x2_1',
 'base_binctabl_mid_std_top5_predict_n100_seed0_T4x2_2',
 'base_binctabl_mid_std_top5_predict_n100_seed1_T4x2_0',
 'base_binctabl_mid_std_top5_predict_n100_seed1_T4x2_1',
 'base_binctabl_mid_std_top5_pr

: 