In [1]:
import pandas as pd
import numpy as np
from common_variables import *


In [2]:
#参数含义：读取路径，读取指定列，index_col指定首列，将字符串转换成日期格式(数据合并后自动充当首列)
df_csv = pd.read_csv(full_time_series_path, usecols=['Date','Time','High','Low'],parse_dates=[['Date','Time']])
df_csv['Date_Time'] = pd.to_datetime(start_date)
# 去除有空格的行
df_csv.dropna(how='any', inplace=True)
df = df_csv
df.head(100)

Unnamed: 0,Date_Time,High,Low
0,2001-01-01,114.41,114.41
2,2001-01-01,114.37,114.37
4,2001-01-01,114.42,114.42
6,2001-01-01,114.40,114.35
8,2001-01-01,114.42,114.37
...,...,...,...
190,2001-01-01,114.59,114.50
192,2001-01-01,114.59,114.54
194,2001-01-01,114.60,114.52
196,2001-01-01,114.60,114.52


In [3]:
# HLAvg 最高价和最低价之间的平均值
df["HLAvg"] = df['High'].add(df['Low']).div(2)
del df['High']
del df['Low']
# Simple Moving Average
# 移动平均线周期
# rolling 向后回滚
# ma_periods表示计算统计量的观测值的数量即向前几个数据
df['MA'] = df['HLAvg'].rolling(window=ma_periods).mean()
# Log Returns
# log取对数，df['MA'].shift(1) 数据向下移动一行，索引不变
# 产看汇率趋势
df['Returns'] = np.log(df['MA']/df['MA'].shift(1))
df.head(100)

Unnamed: 0,Date_Time,HLAvg,MA,Returns
0,2001-01-01,114.410,,
2,2001-01-01,114.370,,
4,2001-01-01,114.420,,
6,2001-01-01,114.375,,
8,2001-01-01,114.395,,
...,...,...,...,...
190,2001-01-01,114.545,114.486429,0.000084
192,2001-01-01,114.565,114.492500,0.000053
194,2001-01-01,114.560,114.499643,0.000062
196,2001-01-01,114.560,114.507143,0.000066


In [4]:
#dropna 删除缺失值所造的行，how='any'表示只要这行的某一列缺失，就删除
df.dropna(how='any', inplace=True)
# df[0:2] 选取前三列 ，这里余数为0，所以没有变化
df = df[df.shape[0] % batch_size:]
df

Unnamed: 0,Date_Time,HLAvg,MA,Returns
64,2001-01-01,114.460,114.508929,-0.000062
66,2001-01-01,114.495,114.505714,-0.000028
68,2001-01-01,114.435,114.494643,-0.000097
70,2001-01-01,114.430,114.482500,-0.000106
72,2001-01-01,114.395,114.478571,-0.000034
...,...,...,...,...
1048566,2001-01-01,124.780,124.765714,0.000003
1048568,2001-01-01,124.770,124.764643,-0.000009
1048570,2001-01-01,124.770,124.763929,-0.000006
1048572,2001-01-01,124.755,124.762857,-0.000009


In [5]:
#划分训练集
df_train = df[:- validation_size - test_size]
#划分验证集
df_validation = df[- validation_size - test_size - window_size:- test_size]
#测试集
df_test = df[- test_size - window_size:]
print(f'df_train.shape {df_train.shape}, df_validation.shape {df_validation.shape}, df_test.shape {df_test.shape}')

df_train.shape (513888, 4), df_validation.shape (5440, 4), df_test.shape (5440, 4)


In [6]:
#生成csv文件
df_train.to_csv(train_time_series_path)
df_validation.to_csv(validate_time_series_path)
df_test.to_csv(test_time_series_path)