In [5]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
from my_model.linear_model import layer_embedding as MyModel
from my_dataset.linear_dataset import MyDataset

In [2]:
factor_concat = pd.read_pickle("./data/factor_concat_2018_2019.pkl")
stock_return = pd.read_pickle("./data/stock_return.pkl")
stock_return.head()

code,000001.SZ,000002.SZ,000004.SZ,000005.SZ,000006.SZ,000007.SZ,000008.SZ,000009.SZ,000010.SZ,000011.SZ,...,688786.SH,688787.SH,688788.SH,688789.SH,688793.SH,688798.SH,688799.SH,688800.SH,688819.SH,688981.SH
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-02,-0.030448,-0.069008,0.025657,-0.044194,-0.136403,0.005676,-0.099164,-0.079422,-0.115093,-0.092042,...,,,,,,,,,,
2014-01-03,-0.007562,-0.054992,-0.052687,-0.053074,-0.120228,0.002179,-0.090949,-0.054661,-0.069328,-0.094981,...,,,,,,,,,,
2014-01-06,0.003476,-0.033638,-0.057617,-0.033409,-0.058411,-0.017157,-0.012993,-0.033251,-0.032283,-0.063117,...,,,,,,,,,,
2014-01-07,0.00515,-0.032215,-0.061517,-0.03286,-0.035923,-0.048517,0.035773,-0.027421,-0.038713,-0.054584,...,,,,,,,,,,
2014-01-08,-0.002579,-0.033638,-0.051468,-0.029227,-0.022742,-0.039024,0.044866,0.006798,-0.074572,-0.032958,...,,,,,,,,,,


In [3]:
dataset = MyDataset(factor_concat, stock_return)
dataloader = DataLoader(dataset, batch_size=256, shuffle=False, drop_last=False)

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
feature_num = len(factor_concat.columns.levels[0])
stock_num = len(dataset.num_code_dict)
model = MyModel(975,3028,10)
model.load_state_dict(torch.load('./log/18_eb/model_30.pth'))
model.to(device)

layer_embedding(
  (embedding): Embedding(3028, 10)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=985, out_features=256, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): ReLU()
    (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=256, out_features=256, bias=True)
    (7): ReLU()
    (8): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): Linear(in_features=256, out_features=1, bias=True)
  )
)

In [10]:
score = stock_return.copy()
score.iloc[:,:] = np.nan
score_dict = {}
for date in dataset.date_num_dict.keys():
    score_dict[date] = {}
for date_num, code_num, x, y in tqdm(dataloader):
    date_list = date_num.numpy().flatten()
    code_list = code_num.numpy().flatten()
    stock_id = code_num.int().to(device)
    x = x.float().to(device)
    y_pred = model(stock_id, x)
    y_pred = torch.sigmoid(y_pred).to("cpu").detach().numpy().flatten()
    for i in range(len(date_list)):
        date = dataset.num_date_dict[date_list[i]]
        code = dataset.num_code_dict[code_list[i]]
        score_dict[date][code] = y_pred[i]

 24%|██▎       | 670/2824 [00:11<00:35, 60.23it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
score = pd.DataFrame(score_dict)
score.reindex(stock_return.columns)
score = score.sort_index().T
score.to_pickle("F:\Multifactor_Project\score_18.pkl")
score

In [3]:
import pandas as pd
import numpy as np
stock_price = pd.read_pickle("F:\Trade_data\\adjopen.pkl")
stock_return = stock_price.pct_change().shift(-6)
quantile = 2
return_stack = stock_return.stack().dropna()
quantile_return = return_stack.groupby("dt").apply(
    lambda x: pd.qcut(
        x, np.arange(quantile + 1) / quantile, np.arange(quantile)
    )
)
quantile_return[quantile_return < (quantile - 1)] = 0
quantile_return[quantile_return == (quantile - 1)] = 1
quantile_return.to_pickle("./data/quantile_return.pkl")
quantile_return

dt          code     
2014-01-02  000001.SZ    1
            000002.SZ    1
            000004.SZ    0
            000005.SZ    1
            000006.SZ    0
                        ..
2022-12-22  688798.SH    1
            688799.SH    1
            688800.SH    1
            688819.SH    0
            688981.SH    1
Length: 7740431, dtype: category
Categories (2, int64): [0 < 1]

In [4]:
from my_dataset.linear_dataset import make_idx_num_dict
date_num_dict, num_date_dict = make_idx_num_dict(stock_price.index)
code_num_dict, num_code_dict = make_idx_num_dict(stock_price.columns)
import pickle

with open('./data/processed_data/date_num_dict.pkl', 'wb') as f:
    pickle.dump(date_num_dict, f)
with open('./data/processed_data/num_date_dict.pkl', 'wb') as f:
    pickle.dump(num_date_dict, f)
with open('./data/processed_data/code_num_dict.pkl', 'wb') as f:
    pickle.dump(code_num_dict, f)
with open('./data/processed_data/num_code_dict.pkl', 'wb') as f:
    pickle.dump(num_code_dict, f)
print("finished")

finished


In [5]:
import pandas as pd
# code_num_dict = pd.read_pickle("./data/processed_data/date_num_dict.pkl")
code_num_dict['300136.SZ']

1681