# 股票预测

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
import datetime
import random
import math
import torch.nn.utils.rnn as rnn_utils

In [2]:
seed = 10
torch.manual_seed(seed) #cpu
torch.cuda.manual_seed(seed) #gpu
 
np.random.seed(seed) #numpy
random.seed(seed) # random and transforms

## 1. LSTM 构建

In [3]:
class LSTMCell(nn.Module):
    """
    i = sigmoid(W_{ii}x + b_{ii} + W_{hi}h + b_{hi})
    f = sigmoid(W_{ff}x + b_{ff} + W_{hf}h + b_{hf})
    g = tanh(W_{ig}x + b_{ig} + W_{hg}h + b_{hg})
    o = sigmoid(W_{io}x + b_{io} + W_{ho}h + b_{h_o})
    c' = f*c + i*g
    h' = o*sigmoid(c')
    
    input_size – The number of expected features in the input x
    hidden_size – The number of features in the hidden state h
    """
    
    def __init__(self,input_size,hidden_size):
        super(LSTMCell,self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.wi = nn.Parameter(torch.Tensor(4*hidden_size,input_size))
        self.wh = nn.Parameter(torch.Tensor(4*hidden_size,hidden_size))
        self.bi = nn.Parameter(torch.Tensor(4*hidden_size))
        self.bh = nn.Parameter(torch.Tensor(4*hidden_size))
        self._initilization()
    
    def _initilization(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for p in self.parameters():
            p.data.uniform_(-std, std)
    
    def forward(self,X,h_0, c_0): 
        """
        Parameter:
          X of shape (batch, input_size): tensor containing input features
          h_0 of shape (batch, hidden_size): tensor containing the initial hidden state for each element in the batch.
          c_0 of shape (batch, hidden_size): tensor containing the initial cell state for each element in the batch.
        
        [Return]:
          h_1 of shape (batch, hidden_size): tensor containing the next hidden state for each element in the batch
          c_1 of shape (batch, hidden_size): tensor containing the next cell state for each element in the batch
        """
        
        gates = torch.mm(X, self.wi.T) + self.bi +  torch.mm(h_0, self.wh.T) + self.bh
        i_gate, f_gate, gate, o_gate = gates.chunk(4, 1)
        i = torch.sigmoid(i_gate)
        f = torch.sigmoid(f_gate)
        g = torch.tanh(gate)
        o = torch.sigmoid(o_gate)
        c_1 = f*c_0 + i*g
        h_1 = o*torch.tanh(c_1)
        return h_1,c_1

    
class LSTM(nn.Module):
    """
    i_{t} = sigmoid(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi})
    f_{t} = sigmoid(W_{ff}x_{t} + b_{ff} + W_{hf}h_{t-1} + b_{hf})
    g_{t} = tanh(W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg})
    o_{t} = sigmoid(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{h_o})
    c_{t} = f*c_{t-1} + i_{t}*g_{t}
    h_{t} = o_{t}*sigmoid(c_{t})
    """
    def __init__(self,input_size,hidden_size):
        """
        Parameters:
          input_size – The number of expected features in the input x
          hidden_size – The number of features in the hidden state h
        """
        super(LSTM,self).__init__()
        self.cell = LSTMCell(input_size, hidden_size)
        self.hidden_size = hidden_size
    
    def forward(self,X):
        """
        [Parameters]:
          X of shape (seq_len, batch, input_size)
          h_0 of shape (batch, hidden_size)
          c_0 of shape (batch, hidden_size)
          num_directions: seq_len
        [Return]:
          output of shape (seq_len, batch, hidden_size)
          h_n of shape (batch, hidden_size)
          c_n of shape (batch, hidden_size)
        """
        seq_len,batch,input_size = X.shape
         # Initialize hidden state with zeros
        h_0 = torch.zeros(batch, self.hidden_size)
        # Initialize cell state
        c_0 = torch.zeros(batch, self.hidden_size)
        hidden_seq = []
        for i in range(seq_len):
            x_t = X[i,:,:]
            h_0,c_0 = self.cell(x_t,h_0,c_0)
            hidden_seq.append(h_0.unsqueeze(0))
        hidden_seq = torch.cat(hidden_seq, dim=0)
        return hidden_seq,h_0,c_0
            

class Net(nn.Module):
    def __init__(self,input_size,hidden_size,output_size):
        super(Net,self).__init__()
        self.lstm = LSTM(input_size,hidden_size)
        self.fc = nn.Linear(hidden_size,output_size)
        
    
    def forward(self,X):
        hidden_seq,h_0,c_0 = self.lstm(X)
        return self.fc(hidden_seq[-1,:,:])
    

## 2. 数据处理

In [4]:
def feature_normalize(data, train = True, mean_ = None,std_ = None):
    if train:
        mean_ = data.mean()
        std_ = data.std()
    return (data - mean_)/std_, mean_,std_

# 铜、铝、铅、镍、锌以及锡
cats = ['Copper','Aluminium','Lead','Nickel','Zinc','Tin']

def get_indices_infos(train = True):
    path = './valid_data'
    if train:
        path = './train_data'
    files = os.listdir(path)
    indices_infos = []
    for file in files:
        if file.startswith('Indices_'):
            indices_info = pd.read_csv(f'{path}/{file}').iloc[:,1:]
            indices_info = indices_info.rename(columns = {'Unnamed: 0.1':'Index'})
            indices_infos.append(indices_info)
    return indices_infos

def get_label_infos(train = True):
    
    if not train:
        valid_label = pd.read_csv('./valid_data/validation_label_file.csv')
        names = valid_label['id'].str.split('-', expand = True)
        valid_label =pd.DataFrame( {
            'Index':valid_label['id'].str[-10:],
            'cat':names[0].str[3:],
            'day':names[2].str[:-1],
            'label':valid_label['label']
    })
        return valid_label
    
    # label 信息
    label_train_infos = {}
    for cat in cats:
        label_infos = []
        for day in [1,20,60]:
            train_d = pd.read_csv(f'./train_data/Label_LME{cat}_train_{day}d.csv').iloc[:,1:]
            train_d = train_d.rename(columns = {'Unnamed: 0.1':'Index',
                                                'LMAHDY':f'{cat}_{day}_LABEL',
                                                'LMCADY':f'{cat}_{day}_LABEL',
                                                'LMPBDY':f'{cat}_{day}_LABEL'})
            label_infos.append(train_d)
        label_train_infos[cat] = label_infos
    return label_train_infos

def get_cat_infos(train = True):
    path = './valid_data'
    if train:
        path = './train_data'
    

### 2.1 训练集数据

In [5]:
!ls ./train_data

 COMEX_Copper_train.csv		    Label_LMENickel_train_1d.csv
 COMEX_Gold_train.csv		    Label_LMENickel_train_20d.csv
 COMEX_Palladium_train.csv	    Label_LMENickel_train_60d.csv
 COMEX_Platinum_train.csv	    Label_LMETin_train_1d.csv
 COMEX_Silver_train.csv		    Label_LMETin_train_20d.csv
'Indices_DXY Curncy_train.csv'	    Label_LMETin_train_60d.csv
'Indices_NKY Index_train.csv'	    Label_LMEZinc_train_1d.csv
'Indices_SHSZ300 Index_train.csv'   Label_LMEZinc_train_20d.csv
'Indices_SPX Index_train.csv'	    Label_LMEZinc_train_60d.csv
'Indices_SX5E Index_train.csv'	    LMEAluminium3M_train.csv
'Indices_UKX Index_train.csv'	    LMEAluminium_OI_train.csv
'Indices_VIX Index_train.csv'	    LMECopper3M_train.csv
 Label_LMEAluminium_train_1d.csv    LMECopper_OI_train.csv
 Label_LMEAluminium_train_20d.csv   LMELead3M_train.csv
 Label_LMEAluminium_train_60d.csv   LMELead_OI_train.csv
 Label_LMECopper_train_1d.csv	    LMENickel3M_train.csv
 Label_LMECopper_train_20d.csv	    LMENicke

In [6]:
train_indices_infos = get_indices_infos()

In [7]:
for indices_info in train_indices_infos:
    print(indices_info.columns)

Index(['Index', 'NKY'], dtype='object')
Index(['Index', 'DXY'], dtype='object')
Index(['Index', 'VIX'], dtype='object')
Index(['Index', 'SX5E'], dtype='object')
Index(['Index', 'SHSZ300'], dtype='object')
Index(['Index', 'SPX'], dtype='object')
Index(['Index', 'UKX'], dtype='object')


In [8]:
label_train_infos = get_label_infos()

In [9]:
# load dataset

columns_map = {'LME_3M_Unnamed: 0.1':'Index',
               'COMEX_Unnamed: 0.1':'Index',
               'LME_OI_Index':'Index'
              }
infos = []
cats_std_mean = {}
for cat in tqdm(cats):
    LME_OI = pd.read_csv(f'./train_data/LME{cat}_OI_train.csv').iloc[:,1:]
    LME_OI = LME_OI.add_prefix('LME_OI_')
    LME_3M = pd.read_csv(f'./train_data/LME{cat}3M_train.csv').iloc[:,1:]
    LME_3M = LME_3M.add_prefix('LME_3M_')
    LME_OI = LME_OI.rename(columns =columns_map)
    LME_3M = LME_3M.rename(columns =columns_map)
    info = pd.merge(LME_OI,LME_3M, how='outer',on = 'Index')
#     for indices_info in train_indices_infos:
#         info = pd.merge(info,indices_info,how='outer',on = 'Index')
    if cat == 'Copper':
        COMEX = pd.read_csv(f'./train_data/COMEX_{cat}_train.csv').iloc[:,1:]
        COMEX = COMEX.add_prefix('COMEX_')
        COMEX = COMEX.rename(columns = columns_map)
        info = pd.merge(info,COMEX, how='outer',on = 'Index')
      # 缺失值处理，填充前后两天的均值
    info = info.fillna(method='ffill')
    info = info.fillna(method='bfill')
    data = info.drop('Index',axis = 1)
    data,mean_, std_ = feature_normalize(data)
    info[data.columns] = data
    cats_std_mean[cat] = (mean_, std_)
    for label_info in label_train_infos[cat]:
        # 剔除一些没有标签的数据
        info = pd.merge(info,label_info, how='right',on = 'Index')
    info = info.drop('Index',axis = 1)
    infos.append(info)

100%|██████████| 6/6 [00:00<00:00, 28.50it/s]


In [10]:
for cat,info in zip(cats,infos):
    print(f'{cat} shape {info.shape}')
copper_info,al_info, lead_info,nickel_info,zinc_info,tin_info = infos

Copper shape (3790, 15)
Aluminium shape (3790, 9)
Lead shape (3790, 9)
Nickel shape (3790, 9)
Zinc shape (3790, 9)
Tin shape (3790, 9)


In [11]:
copper_info.head()

Unnamed: 0,LME_OI_LMCADS03,LME_3M_Open.Price,LME_3M_High.Price,LME_3M_Low.Price,LME_3M_Close.Price,LME_3M_Volume,COMEX_Open,COMEX_High,COMEX_Low,COMEX_Close,COMEX_Volume,COMEX_Open.Interest,Copper_1_LABEL,Copper_20_LABEL,Copper_60_LABEL
0,-0.546339,-2.118704,-2.111271,-2.109964,-2.10429,0.012048,0.616517,0.615958,0.648059,0.643528,3.061131,2.741004,1.0,1.0,1.0
1,-0.546339,-2.095403,-2.099983,-2.08834,-2.094083,0.012048,-2.106642,-2.102181,-2.097899,-2.092263,-0.701535,-0.933671,1.0,1.0,0.0
2,-0.546339,-2.088121,-2.093498,-2.08146,-2.085577,0.012048,-2.090807,-2.087562,-2.080266,-2.078006,-0.795391,-0.925101,0.0,1.0,0.0
3,-0.546339,-2.091034,-2.094939,-2.081951,-2.089222,0.012048,-2.073916,-2.081296,-2.067976,-2.070086,-0.896738,-0.932554,1.0,1.0,0.0
4,-0.546339,-2.089092,-2.088214,-2.079985,-2.081446,0.012048,-2.068637,-2.081818,-2.06477,-2.073254,-0.810024,-0.947128,1.0,1.0,0.0


### 2.2 测试集数据

In [12]:
!ls ./valid_data

 COMEX_Copper_validation.csv		 LMEAluminium_OI_validation.csv
 COMEX_Gold_validation.csv		 LMECopper3M_validation.csv
 COMEX_Palladium_validation.csv		 LMECopper_OI_validation.csv
 COMEX_Platinum_validation.csv		 LMELead3M_validation.csv
 COMEX_Silver_validation.csv		 LMELead_OI_validation.csv
'Indices_NKY Index_validation.csv'	 LMENickel3M_validation.csv
'Indices_SHSZ300 Index_validation.csv'	 LMENickel_OI_validation.csv
'Indices_SPX Index_validation.csv'	 LMETin3M_validation.csv
'Indices_SX5E Index_validation.csv'	 LMETin_OI_validation.csv
'Indices_UKX Index_validation.csv'	 LMEZinc3M_validation.csv
'Indices_VIX Index_validation.csv'	 LMEZinc_OI_validation.csv
 LMEAluminium3M_validation.csv		 validation_label_file.csv


In [13]:
valid_labels = get_label_infos(train = False)

In [14]:
labels = valid_labels.groupby(['cat','day'])
labels_map = {}
for (cat, day), label in labels:
    if cat not in labels_map:
        labels_map[cat] = {}
    labels_map[cat][day] = label.reset_index()

In [48]:
labels_map['Copper']['60'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253 entries, 0 to 252
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   253 non-null    int64 
 1   Index   253 non-null    object
 2   cat     253 non-null    object
 3   day     253 non-null    object
 4   label   253 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 10.0+ KB


In [16]:
valid_infos = []
for cat in tqdm(cats):
    LME_OI = pd.read_csv(f'./valid_data/LME{cat}_OI_validation.csv').iloc[:,1:]
    LME_OI = LME_OI.add_prefix('LME_OI_')
    LME_3M = pd.read_csv(f'./valid_data/LME{cat}3M_validation.csv').iloc[:,1:]
    LME_3M = LME_3M.add_prefix('LME_3M_')
    LME_OI = LME_OI.rename(columns =columns_map)
    LME_3M = LME_3M.rename(columns =columns_map)
    info = pd.merge(LME_OI,LME_3M, how='outer',on = 'Index')
#     for indices_info in get_indices_infos(train = False):
#         info = pd.merge(info,indices_info,how='outer',on = 'Index')
    if cat == 'Copper':
        COMEX = pd.read_csv(f'./valid_data/COMEX_{cat}_validation.csv').iloc[:,1:]
        COMEX = COMEX.add_prefix('COMEX_')
        COMEX = COMEX.rename(columns = columns_map)
        info = pd.merge(info,COMEX, how='outer',on = 'Index')        
    info = info.fillna(method='ffill')
    info = info.fillna(method='bfill')
    valid_infos.append(info)
copper_valid_info,al_valid_info, lead_valid_info,nickel_valid_info,zinc_valid_info,tin_valid_info = valid_infos

100%|██████████| 6/6 [00:00<00:00, 107.31it/s]


In [17]:
copper_valid_info.head()

Unnamed: 0,Index,LME_OI_LMCADS03,LME_3M_Open.Price,LME_3M_High.Price,LME_3M_Low.Price,LME_3M_Close.Price,LME_3M_Volume,COMEX_Open,COMEX_High,COMEX_Low,COMEX_Close,COMEX_Volume,COMEX_Open.Interest
0,2018-01-02,2519.0,7250.0,7285.0,7196.0,7205.0,33751.0,329.65,330.95,326.75,330.05,107293.0,175744.0
1,2018-01-03,3079.0,7210.0,7212.5,7132.5,7147.0,39658.0,329.65,330.95,326.75,327.8,107293.0,175744.0
2,2018-01-04,3106.0,7166.0,7265.0,7150.0,7188.5,39800.0,327.75,328.0,323.65,325.75,104300.0,177263.0
3,2018-01-05,2635.0,7198.5,7217.5,7107.0,7121.0,36018.0,325.7,329.8,324.5,326.3,105443.0,176654.0
4,2018-01-08,2257.0,7133.5,7160.0,7105.0,7125.0,35165.0,325.85,327.6,322.25,322.95,84947.0,174869.0


## 3. pytorch lstm model

In [94]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

In [95]:
def generate_time_sequences(data,window_size = 120,label_name = 'Copper_1_LABEL',drop_name = ['Copper_1_LABEL','Copper_20_LABEL','Copper_60_LABEL']):
    train_data = []
    labels = []
    for i in tqdm(range(len(data) - window_size)):
        train_data.append(torch.from_numpy(data.iloc[i:i + window_size,:].drop(drop_name,axis = 1).values).float())
        labels.append(torch.Tensor([data.iloc[i + window_size,:][label_name]]).long())
    return train_data, labels

def generate_test_time_sequences(test_data, labels_map,window_size = 120,cat = 'Copper', day = '1'):
    
    labels = labels_map[cat][day][['Index','label']]
    data = pd.merge(test_data, labels,how = 'right', on= 'Index')
    data = data.drop('Index',axis = 1)
    train_data = []
    labels = []
    for i in tqdm(range(len(data) - window_size)):
        train_data.append(torch.from_numpy(data.iloc[i:i + window_size,:].drop('label',axis = 1).values).float())
        labels.append(torch.Tensor([data.iloc[i + window_size,:]['label']]).long())
    return train_data, labels

In [96]:
train_data,train_labels = generate_time_sequences(copper_info)

100%|██████████| 3670/3670 [00:02<00:00, 1374.68it/s]


In [97]:
train_labels = torch.cat(train_labels)

In [98]:
data = copper_valid_info.drop('Index',axis = 1)
data ,_,_ = feature_normalize(data ,train = False,mean_= cats_std_mean['Copper'][0],std_= cats_std_mean['Copper'][1])
copper_valid_info[data.columns] = data

In [99]:
test_data,test_labels = generate_test_time_sequences(copper_valid_info,labels_map)

100%|██████████| 133/133 [00:00<00:00, 1228.94it/s]


In [100]:
test_labels = torch.cat(test_labels)

In [101]:
class LstmDataset(Dataset):
    def __init__(self,train_data,train_labels):
        self.train_data = train_data
        self.train_labels = train_labels

    def __getitem__(self,idx):
        return self.train_data[idx],self.train_labels[idx]
    
    def __len__(self):
        return len(self.train_data)


In [102]:
data_loader = DataLoader(LstmDataset(train_data,train_labels),batch_size = 5,shuffle = True)
test_loader = DataLoader(LstmDataset(test_data,test_labels),batch_size = 5,shuffle = True)

In [103]:
net = Net(train_data[0].size()[1],10,2)
criterion = torch.nn.CrossEntropyLoss()
op = torch.optim.Adam(net.parameters(),lr = 0.5)
epochs = 10
batch_size = 5

In [104]:
%%time
for epoch in range(epochs):
    loss = 0.0
    acc = 0.0
    for i,(x, label) in enumerate(data_loader):
        op.zero_grad()
        x = x.permute(1,0,2)
        out = net(x)
        l = criterion(out,label)
        l.backward()
        op.step()
        acc += (1 - torch.argmax(out,dim =1)^label).sum().item()/batch_size
        loss += l.item()
    print(f'train [epoch]: {epoch} , [loss]: {loss/len(data_loader)} , [acc]: {acc/len(data_loader)}')
    with torch.no_grad():
        loss = 0.0
        acc = 0.0
        for i, (x, label) in enumerate(test_loader):
            x = x.permute(1,0,2)
            out = net(x)
            l = criterion(out,label)
            acc += (1 - torch.argmax(out,dim =1)^label).sum().item()/batch_size
            loss += l.item()
        print(f'valid [epoch]: {epoch} , [loss]: {loss/len(test_loader)} , [acc]: {acc/len(test_loader)}')
    
        
    

train [epoch]: 0 , [loss]: 0.9260147709896844 , [acc]: 0.5046321525885555
valid [epoch]: 0 , [loss]: 0.6945644881990221 , [acc]: 0.4148148148148148
train [epoch]: 1 , [loss]: 0.8435657898557933 , [acc]: 0.512806539509537
valid [epoch]: 1 , [loss]: 1.4836063578172967 , [acc]: 0.4148148148148148
train [epoch]: 2 , [loss]: 0.8434179985352693 , [acc]: 0.4980926430517715
valid [epoch]: 2 , [loss]: 2.035178021148399 , [acc]: 0.4148148148148148
train [epoch]: 3 , [loss]: 0.8659314645977046 , [acc]: 0.4899182561307898
valid [epoch]: 3 , [loss]: 0.6842971245447794 , [acc]: 0.5703703703703703
train [epoch]: 4 , [loss]: 0.7999284410346756 , [acc]: 0.5125340599455047
valid [epoch]: 4 , [loss]: 0.8930187975918805 , [acc]: 0.5703703703703704
train [epoch]: 5 , [loss]: 0.7833914375313297 , [acc]: 0.502724795640327
valid [epoch]: 5 , [loss]: 0.680080520885962 , [acc]: 0.5703703703703703
train [epoch]: 6 , [loss]: 0.7973075066908829 , [acc]: 0.4972752043596729
valid [epoch]: 6 , [loss]: 0.9455261837553