In [1]:
# 导入包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import zipfile
import pickle
from collections import defaultdict
from tqdm import tqdm

# 导入自定义包
sys.path.append("work")
from candle2 import Canva

# 数据读取

In [2]:
# Archive shipped by the competition and the CSV members it contains.
zip_file_path = 'data/data285396/初赛数据集.zip'
train_file_name = '数据集/初赛-训练集.csv'
test_file_name = '数据集/初赛-测试集.csv'

# Read the training CSV straight out of the archive (no extraction to disk).
# The file is GBK-encoded, so the encoding has to be passed explicitly.
with zipfile.ZipFile(zip_file_path) as archive, archive.open(train_file_name) as train_csv:
    train_df = pd.read_csv(train_csv, encoding="gbk")

# 数据预处理

In [3]:
# Technical indicators and the training label, all computed per stock and written
# back onto train_df as new columns.
#
# NOTE(review): rolling / EWM / shift operate on row order, so this assumes each
# stock's rows are already in chronological order — confirm against the CSV.
# NOTE: the order in which columns are added matters, because a later cell renames
# the columns of the merged frame by position.

# Assumed number of float shares used for the turnover proxy below. This is a
# placeholder shared by every stock; adjust it when real float-share data is available.
ASSUMED_FLOAT_SHARES = 10000

# Per-stock views of the two source columns, built once and reused by every indicator.
close_by_stock = train_df.groupby('股票')['收盘价']
volume_by_stock = train_df.groupby('股票')['成交量']


def _rolling_mean(window):
    """Return a per-group function computing the trailing mean over `window` rows."""
    # min_periods=1 produces a value from the first row on (using a shorter window)
    # instead of NaN for the first window-1 rows.
    return lambda series: series.rolling(window=window, min_periods=1).mean()


def _ema(span):
    """Return a per-group function computing the adjust=False EMA with the given span."""
    return lambda series: series.ewm(span=span, adjust=False).mean()


# Moving averages of the closing price.
for window in (5, 10, 20, 30):
    train_df[f'MA_{window}'] = close_by_stock.transform(_rolling_mean(window))

# Moving averages of the volume.
for window in (5, 10):
    train_df[f'Volume_MA_{window}'] = volume_by_stock.transform(_rolling_mean(window))

# Turnover proxy in percent: volume relative to the assumed float-share count.
train_df['换手率'] = (train_df['成交量'] / ASSUMED_FLOAT_SHARES) * 100

# MACD: DIFF = EMA12 - EMA26, DEA = 9-span EMA of DIFF, histogram = DIFF - DEA.
for span in (12, 26):
    train_df[f'EMA_{span}'] = close_by_stock.transform(_ema(span))
train_df['DIFF'] = train_df['EMA_12'] - train_df['EMA_26']
# DIFF was just created, so it needs its own grouped view.
train_df['DEA'] = train_df.groupby('股票')['DIFF'].transform(_ema(9))
train_df['MACD Histogram'] = train_df['DIFF'] - train_df['DEA']

# Label (the original author's note says this follows the organizers' formula):
# relative change from the close one row ahead to the close six rows ahead, within
# the same stock. It is built from FUTURE closes (negative shifts), so it must only
# be used as the prediction target, never as an input feature. The last six rows of
# every stock have no label (NaN).
close_t_6 = close_by_stock.shift(-6)
close_t_1 = close_by_stock.shift(-1)

train_df['label'] = (close_t_6 - close_t_1) / close_t_1

# 对数据分组并保存

In [4]:
# Build a date-code -> group-id table. Consecutive date codes (difference of exactly 1)
# share a group; any other difference starts a new group. Per the original author's
# note, Monday..Friday codes differ by 1 while Friday -> next Monday does not, so a
# group corresponds to one trading week.
# (An alternative grouping would be fixed blocks of 20 trading dates:
#  np.arange(len(date_codes)) // 20.)
#
# The codes are sorted explicitly because np.diff only makes sense on ascending
# values and unique() returns values in order of first appearance.
date_codes = np.sort(train_df["日期代码"].unique())

# The first date always opens group 0; every later date gets the number of gaps seen
# before it. Building the ids directly (instead of shift + bfill/ffill) keeps the
# first and the last date correct when a gap sits right next to them.
# Ids are kept as floats: the previous shift-based construction produced floats, and
# the ids later become dictionary keys.
group_ids = np.zeros(len(date_codes), dtype=float)
group_ids[1:] = (np.diff(date_codes) != 1).cumsum()

grouper = pd.DataFrame({'日期代码': date_codes, '组别': group_ids})

# date_codes is unique, so each training row matches exactly one grouper row.
merged_train = pd.merge(train_df, grouper, on='日期代码', how='left', validate='many_to_one')

# Rename all columns by POSITION. The list has 22 names: 13 columns added during
# preprocessing, the group column, and therefore 8 columns coming from the CSV.
# Assumes the CSV order is stock, date code, open, high, low, close, volume, amount
# — TODO confirm. Pandas raises ValueError if the column count does not match.
# NOTE(review): 'Trunover' looks like a misspelling of 'Turnover'; it is left as is
# because code outside this notebook may read the column under this name.
merged_train.columns = [
    'Stock_name', 'Data_time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Money',
    'MA_5', 'MA_10', 'MA_20', 'MA_30',
    'Volume_MA_5', 'Volume_MA_10',
    'Trunover',
    'EMA_12', 'EMA_26', 'DIFF', 'DEA', 'MACD Histogram',
    'Label', 'Group',
]

# One group per (stock, week-like group id).
grouped_train = merged_train.groupby(['Stock_name', 'Group'])

In [5]:
# Collect the per-(stock, group) frames into a two-level mapping:
# grouped_dict[stock][group] -> DataFrame indexed by placeholder dates.
grouped_dict = defaultdict(dict)

# Only complete groups (5 rows, i.e. a full Monday-Friday week per the grouping
# above) are kept.
DAYS_PER_WEEK = 5

# Placeholder calendar dates used as the index (the original note says mpf plotting
# needs a regular date index). Every kept group has exactly DAYS_PER_WEEK rows, so
# the dates are the same for all of them and are built once, outside the loop.
week_dates = pd.date_range('1/10/2021', periods=DAYS_PER_WEEK, freq='D')

for (stock, group), sub_df in tqdm(grouped_train):
    # Skip incomplete groups before doing any per-group work.
    if len(sub_df) != DAYS_PER_WEEK:
        continue
    # assign() returns a new frame, so the groupby slice itself is never written to
    # (which can trigger pandas' chained-assignment warning); the date column then
    # becomes the index.
    grouped_dict[stock][group] = sub_df.assign(Data_time=week_dates).set_index('Data_time')

  0%|          | 3721/1081000 [00:13<1:03:43, 281.72it/s]


KeyboardInterrupt: 

In [None]:
# Persist the nested stock -> group -> DataFrame mapping as a pickle file in the
# current working directory.
output_path = 'train_grouped_dict.pkl'
with open(output_path, 'wb') as pkl_file:
    pickle.dump(grouped_dict, pkl_file)