In [3]:
import torch
import torch.nn as nn
import h5py
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from torch.utils.data import Dataset,DataLoader,TensorDataset
import numpy as np
from torch.autograd import Variable
import torch.nn.functional as F
import os
import random
import pandas as pd

In [7]:
class BiLSTM_Attention(nn.Module):
    """Single-layer bidirectional LSTM with dot-product attention and a linear head.

    The final hidden state of the LSTM is used as the attention query over the
    per-step LSTM outputs; the attention-weighted sum of those outputs is fed
    to a linear layer.

    Args:
        embedding_dim: Feature size of each time step of the input.
        n_hidden: Hidden size of each LSTM direction.
        num_classes: Output size of the linear head.

    Any argument left as ``None`` is read from the module-level name of the
    same spelling, so the historical zero-argument call ``BiLSTM_Attention()``
    keeps working when those globals are defined before construction.
    """

    def __init__(self, embedding_dim=None, n_hidden=None, num_classes=None):
        super(BiLSTM_Attention, self).__init__()

        # Backward compatibility: fall back to the module-level configuration
        # that this class originally read unconditionally.
        module_globals = globals()
        if embedding_dim is None:
            embedding_dim = module_globals['embedding_dim']
        if n_hidden is None:
            n_hidden = module_globals['n_hidden']
        if num_classes is None:
            num_classes = module_globals['num_classes']

        self.lstm = nn.LSTM(embedding_dim, n_hidden, bidirectional=True)
        self.out = nn.Linear(n_hidden * 2, num_classes)

    def attention_net(self, lstm_output, final_state):
        """Attend over the LSTM outputs using the final hidden state as the query.

        Args:
            lstm_output: [batch_size, n_step, n_hidden * num_directions(=2)].
            final_state: [num_layers(=1) * num_directions(=2), batch_size, n_hidden].

        Returns:
            Context tensor of shape [batch_size, n_hidden * num_directions(=2)].
        """
        n_hidden = self.lstm.hidden_size
        # NOTE(review): final_state is laid out direction-first, and this view
        # reinterprets that memory as [batch_size, 2 * n_hidden, 1] without first
        # moving the batch axis to the front. For batch_size > 1 the query of a
        # sample therefore appears to be assembled from other samples' states.
        # It is deliberately left as-is: existing checkpoints were presumably
        # trained with this exact computation - confirm before changing.
        hidden = final_state.view(-1, n_hidden * 2, 1)
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)  # [batch_size, n_step]
        soft_attn_weights = F.softmax(attn_weights, 1)
        # [batch_size, 2 * n_hidden, n_step] x [batch_size, n_step, 1] -> [batch_size, 2 * n_hidden, 1]
        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        return context

    def forward(self, X):
        """Run the model on a batch.

        Args:
            X: [batch_size, len_seq, embedding_dim].

        Returns:
            Tensor of shape [batch_size, num_classes].
        """
        # Use the argument itself (the previous code read a global named ``x``).
        seq_first = X.permute(1, 0, 2)  # [len_seq, batch_size, embedding_dim]

        # One layer, two directions. nn.LSTM starts from zero hidden/cell states
        # on the input's device when none are passed, which is what the explicit
        # zero tensors used to provide.
        output, (final_hidden_state, final_cell_state) = self.lstm(seq_first)
        output = output.permute(1, 0, 2)  # [batch_size, len_seq, 2 * n_hidden]
        attn_output = self.attention_net(output, final_hidden_state)
        return self.out(attn_output)  # [batch_size, num_classes]

def get_file_list(folder, filetype='hdf5', prefixes=('2017',)):
    """Recursively collect data files under ``folder``.

    A file is selected when the text after its last ``.`` equals ``filetype``
    and the text before its first ``.`` starts with one of ``prefixes``.

    Args:
        folder: Root directory to walk.
        filetype: Required extension, without the leading dot.
        prefixes: A prefix string, or an iterable of prefix strings, that the
            file name must start with (the default selects names starting
            with ``2017``).

    Returns:
        List of matching paths (each is ``dirpath`` joined with the file name).
        Directories and files are visited in sorted order, so the result is
        deterministic across filesystems. A missing ``folder`` yields ``[]``.
    """
    if isinstance(prefixes, str):
        prefixes = (prefixes,)
    prefixes = tuple(prefixes)

    filelist = []
    for dirpath, dirnames, filenames in os.walk(folder):
        # Sorting in place makes os.walk descend into subdirectories in a stable order.
        dirnames.sort()
        for file in sorted(filenames):
            stem = file.split('.')[0]
            file_type = file.split('.')[-1]
            if file_type == filetype and stem.startswith(prefixes):
                # Full path of the file.
                filelist.append(os.path.join(dirpath, file))
    return filelist

def make_val_loader(file_path, batch_size=256, shuffle=True, drop_last=True,
                    pin_memory=True, num_workers=16):
    """Build a DataLoader over the ``vol`` / ``pct_change`` datasets of one HDF5 file.

    Both datasets are read fully into memory and the file is closed before the
    loader is returned. ``vol`` is converted to float32 and summed over its
    axis 3 (so it must have at least four dimensions); ``pct_change`` is
    converted to float32 unchanged.

    Args:
        file_path: Path of the HDF5 file to read.
        batch_size: Samples per batch.
        shuffle: Whether the loader shuffles samples.
        drop_last: Whether a trailing incomplete batch is discarded.
        pin_memory: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` yielding ``(x, y)`` batches.

    Note:
        The defaults reproduce the historical behaviour. With
        ``drop_last=True`` up to ``batch_size - 1`` samples per file are
        silently skipped; pass ``shuffle=False, drop_last=False`` when every
        sample must be scored.
    """
    # Context manager guarantees the HDF5 handle is released even on error;
    # ``[()]`` materialises each dataset as an in-memory array first.
    with h5py.File(file_path, "r") as h5_file:
        x_data = h5_file['vol'][()]
        y_data = h5_file['pct_change'][()]

    x_data = torch.from_numpy(x_data).float().sum(dim=3)
    y_data = torch.from_numpy(y_data).float()
    dataset = TensorDataset(x_data, y_data)

    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        pin_memory=pin_memory,
        num_workers=num_workers,
    )

    return loader

if __name__ == '__main__':
    ########
    # Module-level configuration. These names are read by the model definition,
    # so they must be assigned before the model is constructed.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    embedding_dim = 601
    n_hidden = 100
    num_classes = 1

    model = BiLSTM_Attention()
    model.to(device)

    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only host.
    state_dict = torch.load('./0.0920,0.0889.pkl', map_location=device)
    load_result = model.load_state_dict(state_dict, strict=False)
    # strict=False hides key mismatches; surface them instead of silently
    # evaluating a partially initialised model.
    missing_keys = getattr(load_result, 'missing_keys', [])
    unexpected_keys = getattr(load_result, 'unexpected_keys', [])
    if missing_keys or unexpected_keys:
        print('WARNING: checkpoint keys do not match the model; missing={}, unexpected={}'.format(
            missing_keys, unexpected_keys))
    model.eval()

    prefilelist = get_file_list(r'./')
    print(prefilelist)

    # Per-sample predictions, realised returns and the date each sample belongs to.
    predict = {'pre': [], 'pct': [], 'date': []}
    with torch.no_grad():  # inference only: no autograd graph needed
        for path in prefilelist:
            print(path)
            # File names start with the date, e.g. 20170120.hdf5.
            date = os.path.basename(path)[:8]
            print(date)

            val_loader = make_val_loader(path)
            for x, y in val_loader:
                x = x.to(device)
                output = model(x)

                # Flatten to 1-D so batches of any size (including 1) concatenate cleanly.
                # Assumes one target value per sample - TODO confirm against the data files.
                output = output.reshape(-1).cpu().numpy()
                y = y.reshape(-1).numpy()
                predict['pre'].append(output)
                predict['pct'].append(y)
                # Tag exactly as many rows as this batch produced, instead of
                # assuming a fixed batch size and a non-empty loader.
                predict['date'].extend([date] * len(output))

    empty = np.array([], dtype=np.float32)
    predict['pre'] = np.concatenate(predict['pre']) if predict['pre'] else empty
    predict['pct'] = np.concatenate(predict['pct']) if predict['pct'] else empty
    predict['date'] = np.array(predict['date'])
    print(len(predict['pre']))
    print(len(predict['pct']))
    print(len(predict['date']))

    # Rank predictions within each date, bucket the percentile ranks into
    # deciles (0 = lowest predictions, 9 = highest) and report the mean
    # realised value per decile.
    val_pre = pd.DataFrame(predict)
    val_pre["pre_rank"] = val_pre.groupby("date")["pre"].rank(pct=True)
    val_pre["pre_rank"] = pd.cut(
        val_pre["pre_rank"],
        bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
        labels=False,
    )
    print(val_pre.groupby("pre_rank")["pct"].mean())

['./20170120.hdf5', './20170123.hdf5', './20170124.hdf5', './20170125.hdf5', './20170224.hdf5', './20170227.hdf5', './20170228.hdf5', './20170330.hdf5', './20170331.hdf5']
./20170120.hdf5
20170120
./20170123.hdf5
20170123
./20170124.hdf5
20170124
./20170125.hdf5
20170125
./20170224.hdf5
20170224
./20170227.hdf5
20170227
./20170228.hdf5
20170228
./20170330.hdf5
20170330
./20170331.hdf5
20170331
23040
23040
23040
pre_rank
0    0.003706
1    0.004262
2    0.004552
3    0.003993
4    0.005244
5    0.004384
6    0.004826
7    0.005129
8    0.003952
9    0.003956
Name: pct, dtype: float32
