# Dataset Generation

In [14]:
import os
import qlib
import torch
import numpy as np
import pandas as pd
from qlib.config import REG_CN
from qlib.contrib.data.handler import Alpha158
from sklearn.preprocessing import MinMaxScaler

### Part 1 - Fetch features (alpha158) by the module `qlib`

In [2]:
qlib.init(provider_uri="../data/dataset_qlib", region=REG_CN)
data_handler_config = {
    "start_time": "2010-01-01",
    "end_time": "2022-12-30",
    "instruments": "all",
}
h = Alpha158(**data_handler_config)
data_df = h.fetch()

[16904:MainThread](2023-05-23 18:18:50,728) INFO - qlib.Initialization - [config.py:416] - default_conf: client.
[16904:MainThread](2023-05-23 18:18:51,328) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[16904:MainThread](2023-05-23 18:18:51,328) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': WindowsPath('D:/文件/研一下/深度学习/Quantitative Investment Based on Transformer/data/dataset_qlib')}
[16904:MainThread](2023-05-23 18:19:37,058) INFO - qlib.timer - [log.py:128] - Time cost: 45.727s | Loading data Done
[16904:MainThread](2023-05-23 18:19:37,298) INFO - qlib.timer - [log.py:128] - Time cost: 0.193s | DropnaLabel Done
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
[169

In [3]:
# Fetch the feature data
feature_df = data_df.reset_index().dropna(axis=1, how="all").rename(
    columns={"datetime": "date", "instrument": "tic"}
)
# Delete features that have different NaN data among different stocks.
drop_col = []
for feat_name, nan_nums in feature_df.isna().sum().items():
    if nan_nums % 100 != 0:
        drop_col.append(feat_name)
feature_df = feature_df.drop(columns=drop_col)
feature_df = feature_df.dropna(how = "any")
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 293300 entries, 6000 to 299299
Columns: 145 entries, date to LABEL0
dtypes: datetime64[ns](1), float32(143), object(1)
memory usage: 166.7+ MB


### Part 2 - Merge the features with basic price and volume  

In [4]:
# Hyperparameters for dataset
alpha158 = feature_df.columns[2:].to_list() # alpha158
basic_feature = ["open", "close", "high", "low", "volume"]
target_return_span = 1
target = f"return+{target_return_span}"
time_span = 20
CSI_date = ['20100101', '20171230', '20180101', '20191231',  '20200101', '20211231'] 

In [5]:
# Merge the basic price and volume data with features
tic_df_list = []
for tic_path in os.listdir("../data/dataset_per_stocks/"):
    tic_df = pd.read_csv(f"../data/dataset_per_stocks/{tic_path}", index_col=0)[["date", "open", "close", "high", "low", "volume"]]
    tic_df["tic"] = tic_path[:8]
    tic_df[target] = tic_df.close.pct_change(target_return_span).shift(-1 * target_return_span)
    tic_df_list.append(tic_df)

tic_target_df = pd.concat(tic_df_list).sort_values(by="date").dropna()
tic_target_df.date = pd.DatetimeIndex(tic_target_df.date)
dataset_df = feature_df.merge(tic_target_df, how='inner', on=["date", "tic"])

### Part 3 - Generate tensor for training, evaluating and testing

In [6]:
# Split the dataset into training, valuating and testing dataset
train_df = dataset_df[(dataset_df.date >= CSI_date[0]) & (dataset_df.date <= CSI_date[1])]
eval_df  = dataset_df[(dataset_df.date >= CSI_date[2]) & (dataset_df.date <= CSI_date[3])]
test_df  = dataset_df[(dataset_df.date >= CSI_date[4]) & (dataset_df.date <= CSI_date[5])]

In [36]:
# The function to generate feature and label for training
def df_2_array(dataset_df, feat_col, target, type):
    dataset_feat = []
    dataset_ret = []
    dataset_price = []
    for tic in dataset_df.tic.unique():
        df = dataset_df[dataset_df.tic == tic]
        feat = df[feat_col].to_numpy()
        ret = df[target].to_numpy()
        price = df['close'].to_numpy()
        stock_feat = []
        stock_ret = []
        stock_price = []
        for i in range(time_span, feat.shape[0]):
            feat_standard = MinMaxScaler().fit_transform(feat[i-time_span : i]) # Standardization to (0, 1)
            stock_feat.append(feat_standard)
            stock_ret.append(ret[i])
            stock_price.append(price[i])
        stock_feat = np.array(stock_feat)
        stock_ret = np.array(stock_ret)
        stock_price = np.array(stock_price)
    
        dataset_feat.append(stock_feat)
        dataset_ret.append(stock_ret)
        dataset_price.append(stock_price)

    dataset_feat = np.array(dataset_feat).transpose((1, 2, 0, 3))
    dataset_ret = np.array(dataset_ret).transpose((1, 0))
    dataset_price = np.array(dataset_price).transpose((1, 0))
    
    dataset_feat_tensor = torch.tensor(dataset_feat, dtype=torch.float)
    dataset_ret_tensor = torch.tensor(dataset_ret, dtype=torch.float)
    dataset_price_tensor = torch.tensor(dataset_price, dtype=torch.float)
    date_list = list(map(lambda x: str(x.date()), list(sorted(set(dataset_df.date)))[time_span:]))

    torch.save(dataset_feat_tensor, f"../data/dataset_tensor/{type}/feat.pt")
    torch.save(dataset_ret_tensor, f"../data/dataset_tensor/{type}/ret.pt")
    # torch.save(dataset_price_tensor, f"../data/alpha/{type}/price.pt") # The price data isn't needed yet
    with open(f"../data/dataset_tensor/{type}/date.txt", "w") as file:
        for date in date_list:
            file.write(date + "\n")
    with open(f"../data/dataset_tensor/{type}/stocks.txt", "w") as file:
        for tic in dataset_df.tic.unique():
            file.write(tic + "\n")
    return dataset_feat, dataset_ret

dataset_feat_train, dataset_ret_train = df_2_array(train_df, alpha158, target, "train")
dataset_feat_eval,  dataset_ret_eval  = df_2_array(eval_df,  alpha158, target, "eval")
dataset_feat_test,  dataset_ret_test  = df_2_array(test_df,  alpha158, target, "test")

In [37]:
dataset_feat_train.shape, dataset_feat_eval.shape, dataset_feat_test.shape

((1865, 20, 100, 143), (465, 20, 100, 143), (466, 20, 100, 143))