In [3]:
import pandas as pd
import numpy as np
from sklearn import metrics
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Stock-level features are split across two CSV files; the second file is the
# left side of the join, so its rows decide which (trade_date, ts_code) pairs survive.
stock_part1 = pd.read_csv('feature_stock1.csv')
stock_part2 = pd.read_csv('feature_stock2.csv')
features_stock = pd.merge(stock_part2, stock_part1, how='left', on=['trade_date', 'ts_code'])
del stock_part1, stock_part2  # release the raw frames once merged

# Attach the index-level columns to every stock row of the same trade date.
index_columns = ['trade_date', 'rate_1', 'rate_2', 'rate_3', 'rate']
index_features = pd.read_csv('feature_index.csv')
features_stock = pd.merge(features_stock, index_features[index_columns], how='left', on='trade_date')
del index_features

In [None]:
# Model input columns. The order is significant: it fixes the feature axis of
# every window handed to the network, so do not reorder.
feature_col = [
    # same-day price transforms
    'open_transform', 'close_transform', 'high_transform', 'low_transform',
    # price transforms shifted by 1 and 2 rows
    'open_transform_shift_1', 'open_transform_shift_2',
    'close_transform_shift_1', 'close_transform_shift_2',
    'high_transform_shift_1', 'high_transform_shift_2',
    'low_transform_shift_1', 'low_transform_shift_2',
    # columns merged in from feature_index.csv
    'rate', 'rate_1', 'rate_2', 'rate_3',
    # remaining stock-level features (meaning defined by the upstream feature files)
    'open_transform_3', 'close_3', 'turnover_rate',
    'pingjun_3', 'turnover_rate_shift_1', 'turnover_rate_shift_2', 'weekday',
    'zhenfu', 'zhenfu_shift_1', 'zhenfu_shift_2',
    'high_10', 'low_10', 'high_20', 'low_20',
]

In [None]:
# Switch to the column names used by the rest of the notebook.
features_stock = features_stock.rename(
    columns={'ts_code': 'name', 'trade_date': 'day', 'close': 'close_price'}
)

# Work on a NaN-free copy with a clean 0..n-1 index.
features = features_stock.dropna().reset_index(drop=True)

# Relative change from this row's close to `next_close`; the label is True when it is positive.
next_day_return = features['next_close'] / features['close_price'] - 1
features['label'] = next_day_return > 0
features['rate_stock'] = next_day_return

In [None]:
# Turn the boolean up/down flag into 0/1 integers.
features['label'] = features['label'].astype(int)

In [None]:
# Inclusive bounds compared against the integer `day` column
# (values look like YYYYMMDD; the underscores are only visual separators).
train_date_min = 2017_01_01
train_date_max = 2019_01_01

# Validation range starts right after the training range ends.
val_date_min = 2019_01_02
val_date_max = 2020_01_01

In [None]:
# NOTE: `df` is an alias of `features`, so the scaling below is applied in place.
df = features

# Rows inside the training date range; every statistic below is taken from these rows only.
idx = df['day'].between(train_date_min, train_date_max)

for col in feature_col:
    # Clip the upper tail at the training 99.99th percentile ...
    upper = np.percentile(df.loc[idx, col], 99.99)
    df.loc[df[col] > upper, col] = upper
    # ... then the lower tail at the training 0.01th percentile.
    lower = np.percentile(df.loc[idx, col], 0.01)
    df.loc[df[col] < lower, col] = lower

    # Standardise with the mean/std of the (already clipped) training rows;
    # the small constant keeps the division finite for constant columns.
    train_values = df.loc[idx, col]
    df[col] = (df[col] - train_values.mean()) / (train_values.std() + 1e-6)

In [None]:

# Collect, per class (one class per stock name), its feature/label arrays and
# the start offsets of every sliding window of `data_len` rows.
train_data = []
train_label = []
test_data = []
test_label = []

trn_time_start = []  # window start offsets inside each stock's training rows
val_time_start = []  # window start offsets inside each stock's validation rows
classes = []
class_dict = dict()  # class id -> stock name
data_len = 10        # number of consecutive rows fed to the model per sample
num = -1

df_test_all = []

# Group by the plain column label (not a one-element list) so the group key is
# the stock name itself; with a list, pandas 2.x yields 1-tuples as keys.
for stock_name, g in tqdm(df.groupby('name')):
    num += 1
    class_dict[num] = stock_name
    classes.append(num)

    # Chronological order with a positional 0..n-1 index; the validation
    # slicing below relies on index label == row position.
    g = g.sort_values('day').reset_index(drop=True)

    g_trn = g[(g['day'] >= train_date_min) & (g['day'] <= train_date_max)]
    if len(g_trn) < data_len:
        # Not enough rows for a single window: keep empty placeholders so the
        # per-class lists stay aligned with `classes`.
        train_data.append([])
        train_label.append([])
        trn_time_start.append([])
    else:
        train_data.append(g_trn[feature_col].values)
        train_label.append(g_trn['label'].values)
        trn_time_start.append(list(range(len(g_trn) - data_len + 1)))

    val_idx = g.index[(g['day'] >= val_date_min) & (g['day'] <= val_date_max)]
    if len(val_idx) < data_len:
        val_time_start.append([])
        test_data.append([])
        test_label.append([])
    else:
        # Reach back up to data_len - 1 rows before the first validation day so
        # that day can be the last row of a full window (clamped at row 0).
        st = max(val_idx[0] - data_len + 1, 0)
        et = val_idx[-1]
        g_test = g.loc[st:et]  # label-based slice: `et` is included
        val_time_start.append(list(range(len(g_test) - data_len + 1)))
        test_data.append(g_test[feature_col].values)
        test_label.append(g_test['label'].values)
        # One row per validation window, taken at the window's last row, in the
        # same order the windows are enumerated above.
        df_test_all.append(
            g.loc[st + data_len - 1:et][['name', 'day', 'label', 'next_close', 'close_price', 'is_limit']]
        )

df_test_all = pd.concat(df_test_all)

In [None]:
def _sample_index_frame(class_ids, time_starts):
    """Return an integer (id, st) table with one row per window start.

    `class_ids[k]` is paired with every offset in `time_starts[k]`; classes
    whose offset list is empty contribute no rows.
    """
    blocks = []
    for class_id, starts in zip(class_ids, time_starts):
        if len(starts) == 0:
            continue
        id_column = np.full((len(starts), 1), class_id, dtype=float)
        start_column = np.reshape(starts, (-1, 1))
        blocks.append(np.hstack([id_column, start_column]))
    table = pd.DataFrame(np.vstack(blocks), columns=['id', 'st'])
    return table.astype({'id': int, 'st': int})


# Generate the training sample index.
df_sample_trn = _sample_index_frame(classes, trn_time_start)

# Every class with validation windows must have exactly one start offset per
# possible window of `data_len` rows.
for windows, starts in zip(test_data, val_time_start):
    if len(starts) > 0:
        assert len(windows) - data_len + 1 == len(starts)

# Generate the validation sample index.
df_sample_test = _sample_index_frame(classes, val_time_start)

In [None]:
class lstm(nn.Module):
    """Two stacked bidirectional LSTM layers followed by a single-logit linear head.

    Args:
        in_dim: number of features per time step.
    """

    def __init__(self, in_dim):
        super().__init__()
        # 64 hidden units per direction -> 128 output features per step.
        self.lstm1 = nn.LSTM(input_size=in_dim, hidden_size=64, num_layers=1,
                             batch_first=True, bidirectional=True)
        # 128 hidden units per direction -> 256 output features per step.
        self.lstm2 = nn.LSTM(input_size=128, hidden_size=128, num_layers=1,
                             batch_first=True, bidirectional=True)
        self.fc = nn.Linear(in_features=256, out_features=1)

    def forward(self, x):
        """Map a batch-first sequence tensor to one raw logit per sample (no sigmoid applied here)."""
        first_out, _ = self.lstm1(x)
        second_out, _ = self.lstm2(first_out)
        # Only the output at the final time step feeds the classifier.
        last_step = second_out[:, -1, :]
        return self.fc(last_step.reshape(second_out.size(0), -1))

In [None]:
def train_epoch(model, optimizer, criterion, train_dataloader, val_dataloader):
    """Train `model` for one pass over `train_dataloader`.

    Args:
        model: network returning raw logits; a sigmoid is applied here before the loss.
        optimizer: optimizer stepping `model`'s parameters.
        criterion: loss called as ``criterion(probabilities, target)``.
        train_dataloader: sized iterable of ``(inputs, target)`` batches.
        val_dataloader: not used; kept so existing call sites keep working.

    Returns:
        Mean of the per-batch loss values.

    Raises:
        ZeroDivisionError: if `train_dataloader` yields no batches.

    Batches are moved to the notebook-level `device`.
    """
    model.train()
    loss_meter, it_count = 0, 0
    # The context manager closes the progress bar even if a batch raises.
    with tqdm(total=len(train_dataloader)) as tq:
        for i, (inputs, target) in enumerate(train_dataloader):
            inputs = inputs.to(device)
            target = target.to(device)

            optimizer.zero_grad()
            output = torch.sigmoid(model(inputs))
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            loss_meter += loss.item()
            it_count += 1
            tq.set_description('batch: %d, loss: %.3f' % (i, loss.item()))
            tq.update(1)
    return loss_meter / it_count

def val_epoch(model, criterion, val_dataloade):
    """Run `model` over a validation loader and collect its predictions.

    Args:
        model: network returning raw logits; a sigmoid is applied here.
        criterion: accepted for signature compatibility; not used.
        val_dataloade: iterable of ``(inputs, target)`` batches. The misspelled
            parameter name is kept for backward compatibility; the loader that
            is passed in is the one iterated (no notebook global is read).

    Returns:
        ``(loss, target, output)`` where `target` and `output` are NumPy arrays
        of all labels and predicted probabilities in loader order, and `loss`
        is the mean binary cross-entropy as a non-negative float (``nan`` when
        the loader yields no batches).

    Batches are moved to the notebook-level `device`.
    """
    model.eval()
    batch_labels, batch_preds = [], []

    with torch.no_grad():
        for inputs, target in tqdm(val_dataloade):
            inputs = inputs.to(device)
            target = target.to(device)
            batch_preds.append(torch.sigmoid(model(inputs)))
            batch_labels.append(target)

    if not batch_preds:
        empty = np.empty(0, dtype=np.float32)
        return float('nan'), empty, empty.copy()

    # Concatenate once at the end instead of growing a tensor every batch.
    output = torch.cat(batch_preds, 0).cpu().numpy()
    target = torch.cat(batch_labels, 0).cpu().numpy()

    # Binary cross-entropy: the negated mean log-likelihood. Probabilities are
    # clipped away from 0 and 1 (in float64) so saturated predictions cannot
    # produce log(0) = -inf.
    eps = 1e-7
    prob = np.clip(output.astype(np.float64), eps, 1.0 - eps)
    loss = float(-np.mean(target * np.log(prob) + (1 - target) * np.log(1 - prob)))

    return loss, target, output

In [None]:
class MyDataset(Dataset):
    """Sliding-window dataset over per-class feature and label arrays.

    Args:
        df_sample: table whose first column is a class id and whose second
            column is a window start offset; one row per sample.
        data: sequence indexed by class id; each entry is that class's
            feature rows.
        label: sequence indexed by class id; each entry is that class's
            per-row labels.
        data_len: number of consecutive rows per window.
        train: not used; kept for signature compatibility.
    """

    def __init__(self, df_sample, data, label, data_len, train=True):
        super(MyDataset, self).__init__()
        self.df_sample = df_sample.values
        self.data_len = data_len
        self.label = label
        self.data = data

    def __getitem__(self, index):
        """Return ``(x, y)``: a float32 ``(data_len, n_features)`` window and
        the float32 label (shape ``(1,)`` for scalar labels) of its last row.

        Raises:
            ValueError: if fewer than `data_len` rows are available at the
                requested start offset.
        """
        class_id = int(self.df_sample[index][0])
        st = int(self.df_sample[index][1])

        window = np.asarray(self.data[class_id][st:st + self.data_len])
        # Check the row count explicitly: reshaping with -1 alone could silently
        # fold a short window into the wrong shape.
        if window.shape[0] != self.data_len:
            raise ValueError(
                'window for class %d at start %d has %d rows, expected %d'
                % (class_id, st, window.shape[0], self.data_len)
            )
        # Feature width comes from the data itself rather than a notebook global.
        x = window.reshape(self.data_len, -1)

        # The target is the label of the window's final row.
        y = np.reshape(self.label[class_id][st + self.data_len - 1], (-1))
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Number of samples (rows of the sample table)."""
        return len(self.df_sample)

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed the CPU and CUDA generators with the same value for repeatable runs.
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(2020)

label_col = 'label'

In [None]:
# One input channel per feature column.
inchannels = len(feature_col)
model = lstm(inchannels).to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-4)
# BCELoss expects probabilities; the train/val loops apply the sigmoid.
criterion = nn.BCELoss()

train_dataset = MyDataset(df_sample=df_sample_trn, data=train_data, label=train_label, data_len=data_len)
val_dataset = MyDataset(df_sample=df_sample_test, data=test_data, label=test_label, data_len=data_len)

# Only the training loader shuffles; the validation loader keeps sample order.
loader_options = dict(batch_size=1024, num_workers=4)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_dataloader = DataLoader(val_dataset, **loader_options)


In [None]:
max_epoch = 1
for epoch in range(1, max_epoch + 1):
    trn_loss = train_epoch(model, optimizer, criterion, train_dataloader, val_dataloader)
    # Despite its name, `model_save_dir` is the checkpoint file path for this epoch (1-based).
    model_save_dir = 'model_lstm_' + str(epoch) + '.pth'
    torch.save(model.state_dict(), model_save_dir)
    print(trn_loss)

In [None]:
# Run the trained model over the validation loader; `target` and `output`
# are consumed by the metric and backtest cells that follow.
loss, target, output = val_epoch(model, criterion, val_dataloader)

In [None]:
# Distinct validation days in ascending order (used later by the backtest).
day_list = sorted(df_test_all['day'].unique())

# Accuracy and confusion matrix at the 0.5 decision threshold.
pred_at_half = output > 0.5
print('acc: %.4f' % metrics.accuracy_score(target, pred_at_half))
print(metrics.confusion_matrix(target, pred_at_half))

# Precision at the stricter 0.6 threshold; `oof` keeps these predictions for
# the following cells.
oof = output > 0.6
acc2 = (oof * target).sum() / oof.sum()
print('precision: %.4f' % acc2)

In [None]:
# Attach the model's decision and probability to the per-window validation rows.
buy_df = df_test_all.copy()
buy_df['pred'] = oof
buy_df['prob'] = output

# Keep rows that are not flagged `is_limit` and that the model predicts as positive.
# (An extra filter on close_transform < 1.15 was left disabled by the author;
# that column is not carried in df_test_all.)
idx = (buy_df['is_limit'] == False) & (buy_df['pred'] == 1)

# Take an explicit copy of the filtered rows so the column assignment below
# targets an independent frame and does not emit SettingWithCopyWarning.
buy_df = buy_df[idx].copy()
# NOTE(review): `next_close` is used as the value of `next_open` — confirm
# this is the intended price for the backtest.
buy_df['next_open'] = buy_df['next_close']

In [None]:
# `imp` was deprecated in favour of `importlib` and removed in Python 3.12.
from importlib import reload
import Account
# Re-import so edits made to Account.py are picked up without restarting the kernel.
reload(Account)
money_init = 100000  # starting cash passed to the account
account = Account.Account(money_init)
account.BackTest(buy_df, sorted(day_list), buy_price='close_price')

In [None]:
# Return relative to the starting cash.
account_profit = (account.market_value - money_init) / money_init
# Share of winning trades among all trades counted as a win or a loss.
total_trades = account.victory + account.defeat
win_rate = account.victory / total_trades

# Printed labels: account profit, trade win rate, maximum drawdown rate.
for template, value in (
    ('账户盈利情况:%.4f', account_profit),
    ('交易胜率:%.4f', win_rate),
    ('最大回撤率:%.4f', account.max_retracement),
):
    print(template % value)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure()
plt.plot(account.market_value_all)
plt.show()