# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import os

# Read the raw parquet file and give the two return columns shorter names.
column_aliases = {'return_T+1_T+11': 'ret10', 'return_T+1_T+6': 'ret5'}
raw_data = pd.read_parquet('data/raw_data.par').rename(columns=column_aliases)
raw_data.head()

Unnamed: 0,datetime,instrument,high,open,low,close,volume,vwap,is_st,ret10,ret5
0,2014-01-02,SH600000,68.873001,68.800102,67.706902,67.998398,9580077.0,68.100788,False,-0.002189,0.02954
1,2014-01-02,SH600004,10.8385,10.7763,10.7296,10.7918,1791940.0,10.766718,False,-0.040697,-0.045053
2,2014-01-02,SH600005,11.5404,11.3845,11.1765,11.2805,5773659.0,11.304832,False,-0.032856,-0.032856
3,2014-01-02,SH600006,10.1198,10.1198,10.0154,10.085,1037137.0,10.058204,False,-0.024397,-0.052271
4,2014-01-02,SH600007,16.898701,16.898701,16.6912,16.818899,266118.1,16.773777,False,-0.070106,-0.052575


In [2]:
# Keep only the rows whose `is_st` flag is False, then discard the flag column.
not_st_mask = ~raw_data['is_st']
raw_data = raw_data.loc[not_st_mask].drop(columns='is_st')
raw_data.head()

Unnamed: 0,datetime,instrument,high,open,low,close,volume,vwap,ret10,ret5
0,2014-01-02,SH600000,68.873001,68.800102,67.706902,67.998398,9580077.0,68.100788,-0.002189,0.02954
1,2014-01-02,SH600004,10.8385,10.7763,10.7296,10.7918,1791940.0,10.766718,-0.040697,-0.045053
2,2014-01-02,SH600005,11.5404,11.3845,11.1765,11.2805,5773659.0,11.304832,-0.032856,-0.032856
3,2014-01-02,SH600006,10.1198,10.1198,10.0154,10.085,1037137.0,10.058204,-0.024397,-0.052271
4,2014-01-02,SH600007,16.898701,16.898701,16.6912,16.818899,266118.1,16.773777,-0.070106,-0.052575


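Reshape the raw data to match the expected input shape: the features become a (stock, datetime, feature) array and each label becomes a (stock, datetime) matrix.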

In [3]:
# Axis labels for the reshaped arrays: every distinct datetime and every
# distinct instrument, in order of first appearance in `raw_data`.
sample_datetime = raw_data.datetime.unique()
sample_stock = raw_data.instrument.unique()

# exist_ok=True creates the directory only when it is missing, avoiding the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('data/processed_data', exist_ok=True)

np.save('data/processed_data/sample_stock.npy', sample_stock)
np.save('data/processed_data/sample_datetime.npy', sample_datetime)

print(sample_datetime.shape, sample_stock.shape)

(2498,) (5272,)


In [4]:
features = ['open', 'high', 'low', 'close', 'volume', 'vwap']
labels = ['ret10', 'ret5']


def _to_panel(column):
    """Return `column` of `raw_data` as an (instrument x datetime) frame.

    `DataFrame.pivot` returns its index and columns in sorted order, whereas
    `sample_stock` and `sample_datetime` come from `unique()` (order of first
    appearance). Reindexing to those arrays guarantees that row i / column j
    of every saved array corresponds to sample_stock[i] / sample_datetime[j].
    """
    wide = raw_data.pivot(index='instrument', columns='datetime', values=column)
    return wide.reindex(index=sample_stock, columns=sample_datetime)


# Feature tensor: axis 0 = instrument, axis 1 = datetime, axis 2 = feature.
X = np.zeros((len(sample_stock), len(sample_datetime), len(features)))
for i, f in enumerate(features):
    featurei = _to_panel(f)
    X[:, :, i] = featurei.to_numpy()
print(X.shape)
np.save('data/processed_data/X.npy', X)

# One (instrument x datetime) matrix per label, saved under the label's name.
for l in labels:
    label = _to_panel(l)
    print(label.shape)
    # Save the plain ndarray rather than relying on implicit DataFrame conversion.
    np.save(f'data/processed_data/{l}.npy', label.to_numpy())

(5272, 2498, 6)
(5272, 2498)
(5272, 2498)
