# Data Preprocessing

In [9]:
import pandas as pd
import os
from glob import glob
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pyupbit

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [37]:
class Data_preprocess() :
    """Download OHLCV candles through pyupbit and derive features and labels.

    Attributes populated by the constructor (via ``preprocess``):
        data: min-max normalized feature frame (every fetched column except
            ``value``, plus ``avg_price``) without the final row.
        label: 0/1 series aligned with ``data``; 1 means the next row's close
            is strictly higher than the current row's close.
        dataset: un-normalized frame with ``avg_price`` and ``after10`` added.
        norm_dataset: normalized frame including ``after10`` (final row removed).
    """

    def __init__(self, ticker, interval, to, count) :
        """Fetch candles with ``pyupbit.get_ohlcv`` and preprocess them.

        Raises:
            ValueError: if pyupbit returns ``None`` instead of a DataFrame.
        """
        candles = pyupbit.get_ohlcv(ticker=ticker, interval=interval, to=to, count=count)
        # NOTE(review): pyupbit reports a failed request by returning None rather
        # than raising; fail here with a readable message instead of an
        # AttributeError deep inside preprocess().
        if candles is None :
            raise ValueError(
                f"pyupbit.get_ohlcv returned no data "
                f"(ticker={ticker!r}, interval={interval!r}, to={to!r}, count={count!r})"
            )
        self.data, self.label, self.dataset = self.preprocess(candles)

    def MinMax(self, dataset_df) :
        """Return a min-max scaled copy of ``dataset_df``.

        The result keeps the column names but gets a fresh 0..n-1 index. The
        scaler is fitted on the frame it transforms.
        """
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(dataset_df)
        return pd.DataFrame(scaled, columns=list(dataset_df.columns))

    def add_after10(self, dataset_df) :
        """Build the binary "next close is higher" label for every row.

        Args:
            dataset_df: frame with a ``close`` column, rows in time order.

        Returns:
            float64 ndarray with one entry per row: 1.0 where the following
            row's close is strictly greater than this row's close, else 0.0.
            The last row has no successor and is always 0.0.
        """
        close = dataset_df['close'].to_numpy()
        # Size the output from the argument itself (not from self.norm_dataset)
        # so the method works on any frame and before norm_dataset exists.
        after10 = np.zeros(len(close), dtype=np.float64)
        # Positional comparison of each close with its successor.
        after10[:-1] = close[1:] > close[:-1]
        return after10

    def drop_feature(self, dataset_df) :
        """Drop the time index and the ``value`` column; returns a new frame."""
        # Remove the index (timestamps).
        dataset_df = dataset_df.reset_index(drop=True)
        # Remove the 'value' column.
        dataset_df = dataset_df.drop(columns=['value'])
        return dataset_df

    def add_avgPrice(self, dataset_df) :
        """Return the per-row mean of high, low, open and close.

        True division is used on purpose: floor division would silently
        truncate the average whenever the four prices do not sum to a
        multiple of 4.
        """
        return (dataset_df['high'] + dataset_df['low'] +
                dataset_df['open'] + dataset_df['close']) / 4

    def preprocess(self, dataset, latest=False) :
        """Turn raw candles into (features, labels, raw_dataset).

        Args:
            dataset: candle DataFrame containing at least open/high/low/close
                and a ``value`` column.
            latest: when truthy, treat ``dataset`` as new candle(s) to append to
                the already stored ``self.dataset``: the single oldest stored
                row is dropped, the new rows are appended, and normalization
                and labels are recomputed over the result.

        Returns:
            Tuple of
              * normalized features without the final row,
              * the matching ``after10`` label series,
              * the full un-normalized frame including ``after10``. In
                ``latest`` mode this is the updated ``self.dataset``, so the
                value has the same meaning in both modes.
        """
        # Drop unused columns / the time index.
        dataset_df = self.drop_feature(dataset)

        # Add the average price feature.
        dataset_df['avg_price'] = self.add_avgPrice(dataset_df)

        if latest :
            # Drop the oldest raw row and the stale label column, which is
            # recomputed below.
            self.dataset = self.dataset.drop([self.dataset.index[0]]).drop(columns=['after10'])

            # Append the new rows to the raw dataset.
            self.dataset = pd.concat([self.dataset, dataset_df]).reset_index(drop=True)

            # Re-fit min-max normalization on the updated raw dataset.
            self.norm_dataset = self.MinMax(self.dataset)

            # Recompute the labels on the raw dataset.
            self.dataset['after10'] = self.add_after10(self.dataset)
            raw_dataset = self.dataset

        else :
            # Apply min-max normalization.
            self.norm_dataset = self.MinMax(dataset_df)

            # Add the labels to the raw dataset.
            dataset_df['after10'] = self.add_after10(dataset_df)
            raw_dataset = dataset_df

        # Label to predict: whether the next row's close is higher.
        self.norm_dataset['after10'] = self.add_after10(self.norm_dataset)

        # Drop the last row: its next price is unknown, so it cannot be labelled.
        self.norm_dataset = self.norm_dataset.iloc[:-1]

        return self.norm_dataset.drop(columns=['after10']), self.norm_dataset['after10'], raw_dataset
    


In [38]:
# Request parameters passed straight through to Data_preprocess / pyupbit.
ticker = 'KRW-BTC'
interval = 'minute10'
to = '2021-11-10 00:10'
count = 1000

# Download and preprocess, then show the raw frame, the normalized features
# and the labels in that order.
processed_data = Data_preprocess(ticker=ticker, interval=interval, to=to, count=count)
display(processed_data.dataset, processed_data.data, processed_data.label)

Unnamed: 0,open,high,low,close,volume,avg_price,after10
0,75025000.0,75110000.0,74808000.0,74827000.0,42.323616,74942500.0,1.0
1,74827000.0,74955000.0,74810000.0,74886000.0,26.774901,74869500.0,1.0
2,74886000.0,75023000.0,74885000.0,74913000.0,27.888634,74926750.0,1.0
3,74914000.0,75019000.0,74880000.0,74982000.0,29.500190,74948750.0,0.0
4,74981000.0,74981000.0,74699000.0,74733000.0,24.260177,74848500.0,0.0
...,...,...,...,...,...,...,...
995,81359000.0,81359000.0,81166000.0,81203000.0,54.985089,81271750.0,0.0
996,81203000.0,81265000.0,81080000.0,81118000.0,46.954699,81166500.0,0.0
997,81118000.0,81385000.0,80756000.0,80776000.0,60.260760,81008750.0,0.0
998,80776000.0,80999000.0,80550000.0,80650000.0,99.310089,80743750.0,0.0


Unnamed: 0,open,high,low,close,volume,avg_price
0,0.224190,0.218171,0.236572,0.203964,0.122389,0.214193
1,0.204150,0.202204,0.236777,0.209930,0.074569,0.206586
2,0.210121,0.209209,0.244465,0.212661,0.077994,0.212551
3,0.212955,0.208797,0.243952,0.219638,0.082951,0.214844
4,0.219737,0.204883,0.225400,0.194458,0.066835,0.204397
...,...,...,...,...,...,...
994,0.858300,0.873918,0.893706,0.864496,0.155024,0.880425
995,0.865283,0.861867,0.888274,0.848721,0.161329,0.873730
996,0.849494,0.852184,0.879459,0.840125,0.136632,0.862762
997,0.840891,0.864545,0.846248,0.805542,0.177554,0.846324


0      1.0
1      1.0
2      1.0
3      0.0
4      0.0
      ... 
994    0.0
995    0.0
996    0.0
997    0.0
998    0.0
Name: after10, Length: 999, dtype: float64

In [41]:
# Row counts of the feature frame and of the label series.
display(len(processed_data.data), len(processed_data.label))

# Shapes of the same two objects.
display(processed_data.data.shape, processed_data.label.shape)

999

999

(999, 6)

(999,)

In [40]:
# Number of rows in the feature frame.
display(processed_data.data.shape[0])

# Convert the feature frame to a torch tensor and inspect it.
tensor_data = torch.Tensor(processed_data.data.to_numpy())
display(tensor_data.shape)
print(tensor_data)

999

torch.Size([999, 6])

tensor([[0.2242, 0.2182, 0.2366, 0.2040, 0.1224, 0.2142],
        [0.2041, 0.2022, 0.2368, 0.2099, 0.0746, 0.2066],
        [0.2101, 0.2092, 0.2445, 0.2127, 0.0780, 0.2126],
        ...,
        [0.8495, 0.8522, 0.8795, 0.8401, 0.1366, 0.8628],
        [0.8409, 0.8645, 0.8462, 0.8055, 0.1776, 0.8463],
        [0.8063, 0.8248, 0.8251, 0.7928, 0.2976, 0.8187]])


Reference: https://doheon.github.io/%EC%BD%94%EB%93%9C%EA%B5%AC%ED%98%84/time-series/ci-4.transformer-post/

The `WindowDataset` below is built by following the reference above.

# Window Dataset

In [66]:
class WindowDataset(Dataset) :
    """Sliding-window dataset over a 2-D feature table and a per-row label vector.

    Each sample is ``window_size`` consecutive rows of the features together
    with the label of the window's last row. Consecutive windows start
    ``stride`` rows apart.

    Attributes:
        data: float32 tensor of shape (num_windows, window_size, num_features).
        label: float32 tensor of shape (num_windows,).
    """

    def __init__(self, df, label, window_size=600, stride=5) :
        """Build all windows eagerly.

        Args:
            df: 2-D features (rows x features); a DataFrame, ndarray or CPU
                tensor, i.e. anything ``np.asarray`` can convert.
            label: 1-D per-row labels (Series, ndarray or CPU tensor).
            window_size: number of rows per window.
            stride: row offset between the starts of consecutive windows.
        """
        self.data, self.label = self.WindowDataGenerator(df, label, window_size, stride)

    def __getitem__(self, i) :
        """Return the ``(window, label)`` pair at position ``i``."""
        return self.data[i], self.label[i]

    def __len__(self) :
        """Return the number of windows."""
        assert len(self.data) == len(self.label), "data와 label의 길이가 다름"
        return len(self.data)

    def WindowDataGenerator(self, df_data, df_label, window_size, stride) :
        """Cut the inputs into windows.

        Returns:
            ``(data, labels)`` as float32 tensors shaped
            (num_sample, window_size, num_features) and (num_sample,), where
            ``num_sample = (rows - window_size) // stride + 1``.

        Raises:
            ValueError: if the features are not 2-D, if ``window_size`` or
                ``stride`` is smaller than 1, or if there are fewer rows than
                ``window_size``.
        """
        # Convert up front so DataFrames/Series are accepted and every lookup
        # below is positional rather than index-label based.
        features = np.asarray(df_data, dtype=np.float64)
        targets = np.asarray(df_label, dtype=np.float64)

        if features.ndim != 2 :
            raise ValueError(f"expected 2-D features, got {features.ndim}-D input")
        if window_size < 1 or stride < 1 :
            raise ValueError("window_size and stride must both be >= 1")
        if features.shape[0] < window_size :
            raise ValueError(
                f"need at least window_size={window_size} rows, got {features.shape[0]}"
            )

        num_sample = (features.shape[0] - window_size) // stride + 1

        # First row of every window, then the full row-index grid
        # (num_sample, window_size) obtained by broadcasting.
        starts = stride * np.arange(num_sample)
        row_index = starts[:, None] + np.arange(window_size)

        data = features[row_index]                    # (num_sample, window_size, num_features)
        labels = targets[starts + window_size - 1]    # label of each window's last row

        return (torch.tensor(data, dtype=torch.float32),
                torch.tensor(labels, dtype=torch.float32))

# nn.Transformer

In [None]:
class Transformer2FC(nn.Module) :
    """Transformer encoder followed by fully connected heads for classification.

    Pipeline for an input ``x`` of shape (batch, input_size, input_dim):
    per-step embedding to ``d_model`` -> positional encoding -> transformer
    encoder -> per-step projection back to ``input_dim`` -> first projected
    feature of every step, giving (batch, input_size) -> MLP over the sequence
    axis producing ``num_class`` scores.
    """

    def __init__(self, input_dim, d_model, n_head, dropout, num_layer, num_class=2, input_size=600):
        """
        Args:
            input_dim: number of features per time step.
            d_model: embedding width used inside the transformer encoder.
            n_head: number of attention heads.
            dropout: dropout probability for the encoder layers and the
                positional encoder.
            num_layer: number of stacked encoder layers.
            num_class: number of output scores per sample.
            input_size: sequence length of the inputs. ``forward`` feeds the
                sequence axis into ``linear2``, so it must match the window
                length of the data. NOTE(review): the default of 600 mirrors
                WindowDataset's default ``window_size``; confirm this is the
                intended meaning of ``input_size``.
        """
        super().__init__()

        # nn.TransformerEncoderLayer spells the head-count keyword `nhead`.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_head, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layer)
        # The positional-encoding module defined in this file is `PositionalEncoder`.
        self.pos_encoder = PositionalEncoder(d_model, dropout)

        # Per-time-step embedding: input_dim -> d_model.
        self.Encoder = nn.Sequential(
            nn.Linear(input_dim, d_model//2),
            nn.ReLU(),
            nn.Linear(d_model//2, d_model)
        )

        # Per-time-step projection: d_model -> input_dim.
        self.linear = nn.Sequential(
            nn.Linear(d_model, d_model//2),
            nn.ReLU(),
            nn.Linear(d_model//2, input_dim)
        )
        # Classification head over the sequence axis: forward() passes a
        # (batch, input_size) tensor here, so in_features must be input_size.
        self.linear2 = nn.Sequential(
            nn.Linear(input_size, input_size//2),
            nn.ReLU(),
            nn.Linear(input_size//2, num_class)
        )

    def generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def forward(self, x, masked_x=None) :
        """Compute class scores.

        Args:
            x: tensor of shape (batch, input_size, input_dim).
            masked_x: optional (input_size, input_size) attention mask, e.g.
                from ``generate_square_subsequent_mask``; ``None`` disables
                masking.

        Returns:
            Tensor of shape (batch, num_class).
        """
        x = self.Encoder(x)
        x = self.pos_encoder(x)
        # The encoder is built with the default sequence-first layout, so swap
        # batch and sequence axes going in and coming out.
        x = self.transformer_encoder(x.transpose(0, 1), mask=masked_x).transpose(0, 1)
        # Keep the first projected feature of every step -> (batch, input_size).
        x = self.linear(x)[:, :, 0]
        x = self.linear2(x)
        return x

class PositionalEncoder(nn.Module) :
    def __init__(self, d_model, dropout=0.1, max_len=5000) :
        super(PositionalEncoder, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        pe