In [1]:
import pandas as pd
import os
from glob import glob
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pyupbit
import math

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# Model

In [2]:
class Transformer2FC(nn.Module) :
    def __init__(self, input_shape, d_model, n_head, num_layer, dropout, num_class=2):
        super(Transformer2FC, self).__init__()
        
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_head, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layer)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        
        self.Encoder = nn.Sequential(
            nn.Linear(input_shape[1], d_model//2),
            nn.ReLU(),
            nn.Linear(d_model//2, d_model)
        )
        
        self.linear = nn.Sequential(
            nn.Linear(d_model, d_model//2),
            nn.ReLU(),
            nn.Linear(d_model//2, 1)
        )

        self.linear2 = nn.Sequential(
            nn.Linear(input_shape[0], input_shape[0]//2),
            nn.ReLU(),
            nn.Linear(input_shape[0]//2, num_class)
        )
        
#         self.sigmoid = nn.Softmax()
    
    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
    
    def forward(self, x, masked_x) :
        # (batch, data, dim)
        x = self.Encoder(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x.transpose(0,1), masked_x).transpose(0, 1)
        x = self.linear(x)
        x = x.squeeze(2)
        x = self.linear2(x)
        x = x.squeeze(1)
        return x

class PositionalEncoding(nn.Module) :
    def __init__(self, d_model, dropout=0.1, max_len=5000) :
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        pe = torch.zeros(max_len, d_model)
        
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
        
    def forward(self, x) :
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

def get_attention_mask(x) :
    mask = torch.eq(x, 0)
    return mask

# Data Preprocessing

In [3]:
class Data_preprocess() :
    def __init__(self, ticker, interval, to, count, norm) :
        self.norm = {"minmax" : self.MinMax,
                    "stand" : self.standarization,
                    "diff" : self.diff}
        
        self.data, self.label, self.dataset = self.preprocess(pyupbit.get_ohlcv(ticker=ticker, interval=interval, to=to, count=count),
                                                             normalization=norm)
        
    def MinMax(self, df) :
        norm = MinMaxScaler()
        norm_dataset = norm.fit_transform(df)
        return pd.DataFrame(norm_dataset, columns=list(df.columns))
    
    def standarization(self, df) :
        for col in df:
            df[col] = (df[col] - df[col].mean()) / df[col].std()
        return df
    
    def diff(self, df) :
        for col in (df):
            log_y = np.log(df[col])
            df[col][1:] = np.diff(log_y)
        return df[1:]
    

    def add_label(self, dataset_df) :
        after10 = np.zeros_like(dataset_df['close'])
        for i in range(len(dataset_df['close']) - 1) :
            if dataset_df['close'][i + 1] > dataset_df['close'][i] :
                after10[i] = 1
            else : 
                after10[i] = 0
            
        return pd.DataFrame(after10,columns=['label'])
    
    
    def drop_feature(self, dataset_df) :
        # index(시간) 제거
        dataset_df = dataset_df.reset_index(drop=True)
        # value 제거
#         dataset_df = dataset_df.drop(columns=['value'])
        return dataset_df
    
    
    def add_avgPrice(self, dataset_df) :
        return (dataset_df['high'] + dataset_df['low'] + 
                dataset_df['open'] + dataset_df['close']) // 4
       
    
    def preprocess(self, dataset, normalization) :
        
        # drop feature
        dataset_df = self.drop_feature(dataset)
        
        # avg_price 추가
        dataset_df['avg_price'] = self.add_avgPrice(dataset_df)
        
        # label 추가
        if normalization == "diff" :
            label = self.add_label(dataset_df)[1:-1]
        else :
            label = self.add_label(dataset_df)[:-1]
        
        norm_df = self.norm[normalization](dataset_df.copy())[:-1]
        
        return norm_df, label, dataset_df[:-1]
        
        

In [81]:
ticker = 'KRW-BTC'
interval ='minute1'
to = f'2022-04-05 00:00'
count = 1440 # minimum data 개수는 145개

processed_data =  Data_preprocess(ticker, interval, to, count, "stand")
display(processed_data.data)
display(processed_data.label)
display(processed_data.dataset)

Unnamed: 0,open,high,low,close,volume,value,avg_price
0,0.915429,0.889563,0.923616,0.978195,1.135768,1.142924,0.929268
1,0.980266,0.908707,1.086308,1.001050,0.898145,0.905184,0.996495
2,1.003150,0.969965,1.086308,1.027715,0.347703,0.353493,1.024347
3,1.060359,1.134596,1.117297,1.225795,3.048927,3.064478,1.137672
4,1.228173,1.165225,1.307105,1.191512,0.605243,0.613281,1.226028
...,...,...,...,...,...,...,...
1434,-1.109776,-1.032414,-1.036440,-0.972132,0.645137,0.636519,-1.040484
1435,-0.968660,-1.040071,-1.156523,-1.185449,0.687296,0.677227,-1.090424
1436,-1.235636,-1.170245,-1.160397,-1.200686,-0.441703,-0.443929,-1.195106
1437,-1.197497,-1.300419,-1.183638,-1.257824,-0.445634,-0.447916,-1.238323


Unnamed: 0,label
0,1.0
1,1.0
2,1.0
3,0.0
4,0.0
...,...
1434,0.0
1435,0.0
1436,0.0
1437,0.0


Unnamed: 0,open,high,low,close,volume,value,avg_price
0,56777000.0,56795000.0,56751000.0,56794000.0,8.018144,4.552798e+08,56779250.0
1,56794000.0,56800000.0,56793000.0,56800000.0,7.133874,4.051642e+08,56796750.0
2,56800000.0,56816000.0,56793000.0,56807000.0,5.085501,2.888675e+08,56804000.0
3,56815000.0,56859000.0,56801000.0,56859000.0,15.137623,8.603438e+08,56833500.0
4,56859000.0,56867000.0,56850000.0,56850000.0,6.043890,3.436310e+08,56856500.0
...,...,...,...,...,...,...,...
1434,56246000.0,56293000.0,56245000.0,56282000.0,6.192351,3.485296e+08,56266500.0
1435,56283000.0,56291000.0,56214000.0,56226000.0,6.349237,3.571108e+08,56253500.0
1436,56213000.0,56257000.0,56213000.0,56222000.0,2.147869,1.207709e+08,56226250.0
1437,56223000.0,56223000.0,56207000.0,56207000.0,2.133240,1.199304e+08,56215000.0


# Custom dataset

In [82]:
class WindowDataset(Dataset) :
    def __init__(self, df_data, df_label, window_size=144, stride=6, norm="diff") :
        self.data, self.label = self.WindowDataGenerator(df_data, df_label, window_size, stride, norm)
        
    def __getitem__(self, i) :
        return self.data[i], self.label[i]
                
    def __len__(self) :
        assert len(self.data) == len(self.label), "data와 label의 길이가 다름"
        return len(self.data)
        
    
    def EachColumnMinMax(self, df) :
        norm = MinMaxScaler()
        df = norm.fit_transform(np.array(df).reshape(-1, 1)).squeeze(1)
        return df

    def EachColumnStand(self, df) :
        df = (df - df.mean()) / df.std()
        return df

    def EachColumnDiff(self, df) :
        log_y = np.log(df)
        df[1:] = np.diff(log_y)
        return df[1:]


    def WindowDataGenerator(self, df_data, df_label, window_size, stride, norm) :
        if norm == "diff" :
            num_sample = ((df_data.shape[0] - 1) - window_size) // stride + 1
            data = np.zeros([window_size - 1, df_data.shape[1], num_sample])
        else : 
            num_sample = (df_data.shape[0] - window_size) // stride + 1
            data = np.zeros([window_size, df_data.shape[1], num_sample])
            
        labels = np.zeros([num_sample])

        for i in tqdm(range(num_sample)) :
            tmp = {}
            data_start = stride * i
            data_end = data_start + window_size
            for col in df_data.columns :
                if norm == "diff" :
                    tmp[col] = self.EachColumnDiff(df_data[col][data_start : data_end].copy())
                elif norm == "stand" :
                    tmp[col] = self.EachColumnStand(df_data[col][data_start : data_end].copy())
                elif norm == "minmax" :
                    tmp[col] = self.EachColumnMinMax(df_data[col][data_start : data_end].copy())

#                 tmp[col] = self.EachColumnMinMax(df_data.loc[data_start : data_end - 1, col])

            data[:, :, i] = pd.DataFrame(tmp).values
            labels[i] = df_label.values[data_end - 1]

        data = data.transpose((2, 0, 1))
        print("dataset shape ==== ",data.shape)

        # data shape (80, 600, 6), label shape (80,)
        return torch.Tensor(data), torch.Tensor(labels)

In [87]:
device = torch.device("cpu")
lr = 1e-4
epochs = 500
window_size = 24 * 6
window_stride = 12
feature_len = 7
batch_size = 128
num_class = 1

path = "./model/300E_stand_96_model.pt"

model = Transformer2FC(input_shape=(window_size, feature_len), 
                       d_model=512, 
                       n_head=8, 
                       num_layer=4, 
                       dropout=0.3, 
                       num_class = num_class).to(device)

model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [88]:
dataset = WindowDataset(processed_data.dataset, 
                        processed_data.label,
                       window_size = window_size, 
                       stride = window_stride, norm="stand")

dataloader = DataLoader(dataset, batch_size=batch_size)

100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:00<00:00, 162.57it/s]

dataset shape ====  (108, 144, 7)





In [89]:
def cal_accuracy(pred, label, threshold=0) :
    acc = torch.zeros(pred.shape[0])
    acc[pred > threshold] = 1
    acc[pred < threshold] = 0
    score = [1 if acc[i] == label[i] else 0 for i in range(pred.shape[0])]
    return sum(score) / pred.shape[0]

In [90]:
model.eval()

tqdm_dataloader = tqdm(dataloader)
for batch, (data, label) in enumerate(tqdm_dataloader, start=1) :
    print(label)
    src_mask = model.generate_square_subsequent_mask(data.shape[1]).to(device)

    pred = model(data.to(device), src_mask)
    print(pred)
    print(label)
    score = cal_accuracy(pred.cpu(), label.cpu())
    print("점수 : ", score)

  0%|                                                                                             | 0/1 [00:00<?, ?it/s]

tensor([1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0.,
        1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
        1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0.,
        0., 1., 0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0.,
        1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1.,
        0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.])


100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.54s/it]

tensor([  3.6161,  19.0768,  11.6453,   1.5012,  -3.1272,  18.4049,  10.1671,
          8.6416,   9.3078,   3.9349,   3.4661, -18.4280,   1.6118,   6.8659,
          0.1182,   0.4287,   7.5241,  11.0669,   3.8079,  11.0782,  -5.1228,
        -16.3864,  -1.2728,   0.6672, -13.1793,  -3.1064,   5.6459,   5.0625,
          4.2497,   0.9527,  -1.8641,   2.4920,  -3.1149,   5.2728,   8.0343,
          1.9952,  22.6564,  -8.7613, -15.5015,  -1.8133,  -2.7000,  -9.5095,
          8.4664,  -6.2703,  12.2930,  11.0785,  -9.4418, -19.8750, -20.7593,
         36.3267, -31.0346,  -3.0148,  22.8914,  -5.7613,   0.8440,  16.2121,
          9.5381,   6.8240, -11.7274,   8.6610,  -8.8849,   1.4815,  17.3565,
          0.3514,  29.8024,  12.9046, -16.5930,   8.5348,  24.6688,  -3.9136,
          3.1287,   3.7528, -24.3480,  33.7406,  40.9719,  -3.0907,  17.5472,
         29.1703,   5.2272,  22.1687,  19.3203,  15.1213,   8.7299,   4.0942,
          5.7753,  -8.6199,   8.9615,   5.1897, -13.4408,  21.19


