# Part 1
## Modify the raw data downloaded from `Choice` 
### In order to use `Qlib` to calculate factor `Alpha158` 

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

data_raw = pd.read_excel("../data/data_from_choice.xlsx", index_col = 0, header = [0, 1]).T

### The following code does not need to be run twice

In [None]:
stock_code, _ = zip(*list(data_raw.index))
stock_code = list(set(stock_code))
field_map = {"开盘价": "open", "收盘价": "close", "最高价": "high", "最低价": "low",
             "均价": "vwap", "涨跌幅": "change", "成交量": "volume", "成交金额": "money",
             "复权因子（后）": "factor"}
for tmp_stock_code in tqdm(stock_code):
    tmp_data_raw = data_raw.loc[tmp_stock_code]
    tmp_data_raw.rename(index = field_map, inplace = True) 
    tmp_data_raw = tmp_data_raw.T
    tmp_data_raw.insert(0, column = "stock_code", value = tmp_stock_code)
    tmp_data_raw.insert(1, column = "date", value = tmp_data_raw.index)
    tmp_data_raw.set_index(keys = "stock_code", inplace = True)
    tmp_data_raw.to_csv(os.path.join("../data/data_prepared_for_qlib/", tmp_stock_code + ".csv"))

### Runing the system command

```
python scripts/dump_bin.py dump_all --csv_path "D:\文件\研一下\深度学习\Quantitative Investment Based on Transformer\data\data_prepared_for_qlib" --qlib_dir ~/.qlib/qlib_data/hs100_data --symbol_field_name stock_code --date_field_name date --include_fields open,high,low,close,volume,money,factor,vwap,change
```

### in the following directory

```
D:\Download_app\anaconda3\Lib\site-packages\qlib
```

# Part 2
## Generate dataset for training

In [1]:
import os
import qlib
import torch
import numpy as np
import pandas as pd
from qlib.data import D
from qlib.contrib.data.handler import Alpha158

qlib.init(provider_uri = "~/.qlib/qlib_data/hs100_data")
# Set trade date and stock code
trade_date = D.calendar(start_time = "2016-01-04", end_time = "2022-12-30", freq = "day")
instruments = D.instruments(market = "all")
stock_list = D.list_instruments(instruments = instruments,
                                start_time = "2016-01-04",
                                end_time = "2022-12-30",
                                as_list = True)
# Set trade date and features
features_df = D.features(instruments = stock_list, fields = ["$close", "$volume"], start_time = "2016-01-04", end_time = "2022-12-30", freq = "day")
# Calculate alpha158
data_handler_config = {
    "start_time": "2016-01-04",
    "end_time": "2022-12-30",
    "instruments": "all",
}
h = Alpha158(**data_handler_config)
feature_alpha158 = h.fetch()

[8084:MainThread](2023-05-08 18:44:01,175) INFO - qlib.Initialization - [config.py:416] - default_conf: client.
[8084:MainThread](2023-05-08 18:44:01,427) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[8084:MainThread](2023-05-08 18:44:01,427) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': WindowsPath('C:/Users/29411/.qlib/qlib_data/hs100_data')}
[8084:MainThread](2023-05-08 18:44:39,863) INFO - qlib.timer - [log.py:128] - Time cost: 27.449s | Loading data Done
[8084:MainThread](2023-05-08 18:44:39,991) INFO - qlib.timer - [log.py:128] - Time cost: 0.105s | DropnaLabel Done
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
[8084:MainThread](2023-05-08 18:44:41,539) IN

In [18]:
feature_alpha158 = feature_alpha158.reset_index().dropna(axis=1, how="all").rename(
    columns={"datetime": "date", "instrument": "stock_code"}
)
# Delete nan data
feature_alpha158_dropna = feature_alpha158.dropna(how = "any")
print(f"The percentage of deleted sample is:{1 - len(feature_alpha158_dropna) / len(feature_alpha158): .2%}")

The percentage of deleted sample is: 12.29%


In [21]:
for stock_code in feature_alpha158_dropna['stock_code'].unique():
    print(stock_code, len(feature_alpha158_dropna[feature_alpha158_dropna['stock_code'] == stock_code]))

000100.SZ 1469
000301.SZ 1462
000333.SZ 1585
000338.SZ 1639
000661.SZ 1622
000708.SZ 1633
000725.SZ 1641
000792.SZ 1318
000938.SZ 1602
001979.SZ 1621
002027.SZ 1641
002049.SZ 1519
002129.SZ 1228
002241.SZ 1641
002271.SZ 1641
002371.SZ 1634
002410.SZ 1641
002415.SZ 1639
002460.SZ 1639
002466.SZ 1609
002475.SZ 1640
002493.SZ 1641
002594.SZ 1641
002601.SZ 1599
002648.SZ 1639
300014.SZ 1641
300015.SZ 1619
300059.SZ 1634
300122.SZ 1641
300124.SZ 1628
300274.SZ 1634
300347.SZ 1634
300413.SZ 1373
600010.SH 1628
600019.SH 1543
600028.SH 1641
600031.SH 1641
600036.SH 1641
600048.SH 1632
600050.SH 1543
600089.SH 1633
600111.SH 1641
600176.SH 1627
600219.SH 1608
600276.SH 1640
600309.SH 1512
600346.SH 1526
600406.SH 1528
600426.SH 1641
600436.SH 1641
600519.SH 1641
600570.SH 1639
600585.SH 1641
600588.SH 1639
600690.SH 1639
600893.SH 1606
600900.SH 1628
601012.SH 1632
601088.SH 1575
601225.SH 1641
601318.SH 1641
601390.SH 1565
601398.SH 1641
601600.SH 1530
601668.SH 1640
601766.SH 1629
601857.SH 

In [24]:
alpha158_list = feature_alpha158_dropna.columns[2:].to_list()  # alpha158
basic_feature = ["open", "close", "high", "low", "volume"]
target_return_span = 5
target = f"return+{target_return_span}"

In [57]:
# Merge the raw data and features(alpha 158)
stock_raw_data_list = []
for file_name in os.listdir("../data/data_prepared_for_qlib"):
    tmp_data = pd.read_csv(os.path.join("../data/data_prepared_for_qlib", file_name))[["date", "open", "close", "high", "low", "volume", "stock_code"]]
    tmp_data[target] = tmp_data.close.pct_change(target_return_span).shift(-1 * target_return_span) # the return of several future days 
    stock_raw_data_list.append(tmp_data)

stock_raw_data_df = pd.concat(stock_raw_data_list).sort_values(by="date").dropna()
stock_raw_data_df.date = pd.DatetimeIndex(stock_raw_data_df.date)
# The final DataFrame used for generating dataset
stock_alpha158 = feature_alpha158_dropna.merge(stock_raw_data_df, how = "inner", on = ["stock_code", "date"])

In [61]:
# Split the DataFrame for training, validating and testing
train_df = stock_alpha158[(stock_alpha158.date >= "2016-01-01") & (stock_alpha158.date < "2019-01-01")]
val_df   = stock_alpha158[(stock_alpha158.date >= "2019-01-01") & (stock_alpha158.date < "2020-01-01")]
test_df  = stock_alpha158[(stock_alpha158.date >= "2020-01-01") & (stock_alpha158.date < "2023-01-01")]

In [75]:
def df2array(dataset_df, feat_col, target, type, time_span):
    dataset_feat = []
    dataset_ret = []
    dataset_price = []
    stock_list = list(dataset_df.stock_code.unique())
    for stock in stock_list:
        df = dataset_df[dataset_df.stock_code == stock]
        feat = df[feat_col].to_numpy()
        ret = df[target].to_numpy()
        price = df['close'].to_numpy()
        stock_feat = []
        stock_ret = []
        stock_price = []
        for i in range(time_span, feat.shape[0]):
            stock_feat.append(feat[i-time_span : i])
            stock_ret.append(ret[i])
            stock_price.append(price[i])
        stock_feat = np.array(stock_feat)
        stock_ret = np.array(stock_ret)
        stock_price = np.array(stock_price)
    
        dataset_feat.append(stock_feat)
        dataset_ret.append(stock_ret)
        dataset_price.append(stock_price)

    dataset_feat = np.array(dataset_feat).transpose((1, 2, 0, 3))
    dataset_ret = np.array(dataset_ret).transpose((1, 0))
    dataset_price = np.array(dataset_price).transpose((1, 0))
    
    dataset_feat_tensor = torch.tensor(dataset_feat, dtype=torch.float)
    dataset_ret_tensor = torch.tensor(dataset_ret, dtype=torch.float)
    dataset_price_tensor = torch.tensor(dataset_price, dtype=torch.float)
    '''
    torch.save(dataset_feat_tensor, f"./dataset/alpha/{type}/feat.pt")
    torch.save(dataset_ret_tensor, f"./dataset/alpha/{type}/ret.pt")
    torch.save(dataset_price_tensor, f"./dataset/alpha/{type}/price.pt")    
    '''
    return dataset_feat, dataset_ret, dataset_price

dataset_feat, dataset_ret, dataset_price = df2array(train_df, alpha158_list, target, "train", time_span = 60)

  dataset_feat = np.array(dataset_feat).transpose((1, 2, 0, 3))


ValueError: axes don't match array

In [76]:
dataset_df, feat_col, target, type, time_span = train_df, alpha158_list, target, "train", 60
dataset_feat = []
dataset_ret = []
dataset_price = []
stock_list = list(dataset_df.stock_code.unique())
for stock in stock_list:
    df = dataset_df[dataset_df.stock_code == stock]
    feat = df[feat_col].to_numpy()
    ret = df[target].to_numpy()
    price = df['close'].to_numpy()
    stock_feat = []
    stock_ret = []
    stock_price = []
    for i in range(time_span, feat.shape[0]):
        stock_feat.append(feat[i-time_span : i])
        stock_ret.append(ret[i])
        stock_price.append(price[i])
    stock_feat = np.array(stock_feat)
    stock_ret = np.array(stock_ret)
    stock_price = np.array(stock_price)

    dataset_feat.append(stock_feat)
    dataset_ret.append(stock_ret)
    dataset_price.append(stock_price)

dataset_feat = np.array(dataset_feat).transpose((1, 2, 0, 3))
dataset_ret = np.array(dataset_ret).transpose((1, 0))
dataset_price = np.array(dataset_price).transpose((1, 0))

dataset_feat_tensor = torch.tensor(dataset_feat, dtype=torch.float)
dataset_ret_tensor = torch.tensor(dataset_ret, dtype=torch.float)
dataset_price_tensor = torch.tensor(dataset_price, dtype=torch.float)

  dataset_feat = np.array(dataset_feat).transpose((1, 2, 0, 3))


ValueError: axes don't match array

(18, 60, 159)