# Data Preprocessing

In [3]:
import pandas as pd
import os
from glob import glob
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pyupbit

In [4]:
class Data_preprocess() :
    """Download OHLCV candles with pyupbit and turn them into model inputs.

    Attributes populated by ``__init__`` / ``preprocess``:
        data: min-max normalised feature columns (no label column).
        label: normalised ``close`` of the following row (column ``after10``).
        dataset: un-normalised features plus ``avg_price`` and ``after10``.
        norm_dataset: normalised features together with the ``after10`` label.
    """

    def __init__(self, ticker, interval, to, count) :
        """Fetch candles via ``pyupbit.get_ohlcv`` and preprocess them.

        All four arguments are forwarded unchanged to ``pyupbit.get_ohlcv``.
        """
        self.data, self.label, self.dataset = self.preprocess(
            pyupbit.get_ohlcv(ticker=ticker, interval=interval, to=to, count=count))


    def MinMax(self, dataset_df) :
        """Min-max normalise every column of ``dataset_df``.

        A new ``MinMaxScaler`` is fitted on each call, so the scaling always
        reflects the frame passed in.

        Returns:
            A new DataFrame with the same column names and a fresh RangeIndex.
        """
        norm = MinMaxScaler()
        norm_dataset = norm.fit_transform(dataset_df)
        return pd.DataFrame(norm_dataset, columns=list(dataset_df.columns))


    def add_after10(self, dataset_df) :
        """Return the ``close`` column shifted up by one row.

        Element ``i`` holds ``close`` of row ``i + 1``. The last row has no
        successor, so its entry stays 0. The shift is always exactly one row;
        it corresponds to "10 minutes later" only when rows are 10-minute candles.

        Args:
            dataset_df: frame containing a numeric ``close`` column.

        Returns:
            A float numpy array with one entry per row of ``dataset_df``.
        """
        # Size the result from the argument itself rather than from instance
        # state, so the method works for any frame and needs no prior setup.
        closes = dataset_df['close'].to_numpy(dtype=float)
        shifted = np.zeros_like(closes)
        # Positional shift; an empty frame simply yields an empty array.
        shifted[:-1] = closes[1:]
        return shifted


    def drop_feature(self, dataset_df) :
        """Return a copy of ``dataset_df`` without its index and ``value`` column.

        The input frame is not modified.
        """
        # Discard the (timestamp) index in favour of a plain RangeIndex.
        dataset_df = dataset_df.reset_index(drop=True)
        # Remove the 'value' column.
        dataset_df = dataset_df.drop(columns=['value'])
        return dataset_df


    def add_avgPrice(self, dataset_df) :
        """Return the per-row mean of ``high``, ``low``, ``open`` and ``close``."""
        # True division: floor division would truncate the mean for prices
        # that are not whole numbers.
        return (dataset_df['high'] + dataset_df['low'] +
                dataset_df['open'] + dataset_df['close']) / 4


    def preprocess(self, dataset, latest=False) :
        """Build normalised features, labels and the raw feature frame.

        Args:
            dataset: OHLCV frame as returned by ``pyupbit.get_ohlcv``; it must
                contain ``open``, ``high``, ``low``, ``close`` and ``value``.
            latest: when true, ``dataset`` holds new rows to append to the
                frame already stored on the instance. The oldest stored row is
                dropped first, and ``self.data``, ``self.label`` and
                ``self.dataset`` are refreshed with the results.

        Returns:
            ``(data, label, dataset_df)`` where ``data`` is the normalised
            feature frame, ``label`` is the normalised next-row ``close`` and
            ``dataset_df`` is the complete un-normalised frame including
            ``avg_price`` and ``after10``.

        Raises:
            ValueError: if ``dataset`` is None.
        """
        if dataset is None :
            raise ValueError('dataset is None; no OHLCV data to preprocess')

        # Drop unused features.
        dataset_df = self.drop_feature(dataset)

        # Add the average price feature.
        dataset_df['avg_price'] = self.add_avgPrice(dataset_df)

        if latest :
            # Discard the oldest stored row and the stale label column, then
            # append the new rows to the remaining history.
            history = self.dataset.drop([self.dataset.index[0]]).drop(columns=['after10'])
            dataset_df = pd.concat([history, dataset_df]).reset_index(drop=True)

        # Normalise before the label column exists so it is not scaled as a feature.
        self.norm_dataset = self.MinMax(dataset_df)

        # Next-row close on the original price scale.
        dataset_df['after10'] = self.add_after10(dataset_df)

        # Label to predict: next-row close on the normalised scale.
        self.norm_dataset['after10'] = self.add_after10(self.norm_dataset)

        data = self.norm_dataset.drop(columns=['after10'])
        label = self.norm_dataset['after10']

        if latest :
            # Keep the instance consistent with what is returned. On the
            # initial call __init__ performs this assignment itself.
            self.data, self.label, self.dataset = data, label, dataset_df

        return data, label, dataset_df
    


In [11]:
# Request parameters, shared with the raw-fetch cell further down.
ticker = 'KRW-BTC'
interval = 'minute10'       # 10-minute candles
to = '2021-11-10 00:10'     # upper time bound handed to pyupbit
count = 1000                # number of candles requested

processed_data = Data_preprocess(ticker=ticker, interval=interval, to=to, count=count)

# Show, in order: the raw-scale frame, the normalised features, the normalised labels.
display(processed_data.dataset, processed_data.data, processed_data.label)

Unnamed: 0,open,high,low,close,volume,avg_price,after10
0,75025000.0,75110000.0,74808000.0,74827000.0,42.323616,74942500.0,74886000.0
1,74827000.0,74955000.0,74810000.0,74886000.0,26.774901,74869500.0,74913000.0
2,74886000.0,75023000.0,74885000.0,74913000.0,27.888634,74926750.0,74982000.0
3,74914000.0,75019000.0,74880000.0,74982000.0,29.500190,74948750.0,74733000.0
4,74981000.0,74981000.0,74699000.0,74733000.0,24.260177,74848500.0,74529000.0
...,...,...,...,...,...,...,...
995,81359000.0,81359000.0,81166000.0,81203000.0,54.985089,81271750.0,81118000.0
996,81203000.0,81265000.0,81080000.0,81118000.0,46.954699,81166500.0,80776000.0
997,81118000.0,81385000.0,80756000.0,80776000.0,60.260760,81008750.0,80650000.0
998,80776000.0,80999000.0,80550000.0,80650000.0,99.310089,80743750.0,80441000.0


Unnamed: 0,open,high,low,close,volume,avg_price
0,0.224190,0.218171,0.236572,0.203964,0.122389,0.214193
1,0.204150,0.202204,0.236777,0.209930,0.074569,0.206586
2,0.210121,0.209209,0.244465,0.212661,0.077994,0.212551
3,0.212955,0.208797,0.243952,0.219638,0.082951,0.214844
4,0.219737,0.204883,0.225400,0.194458,0.066835,0.204397
...,...,...,...,...,...,...
995,0.865283,0.861867,0.888274,0.848721,0.161329,0.873730
996,0.849494,0.852184,0.879459,0.840125,0.136632,0.862762
997,0.840891,0.864545,0.846248,0.805542,0.177554,0.846324
998,0.806275,0.824784,0.825133,0.792800,0.297649,0.818710


0      0.209930
1      0.212661
2      0.219638
3      0.194458
4      0.173830
         ...   
995    0.840125
996    0.805542
997    0.792800
998    0.771665
999    0.000000
Name: after10, Length: 1000, dtype: float64

In [12]:
# Fetch the same candles again without preprocessing, for comparison with the
# frames displayed by the previous cell.
data = pyupbit.get_ohlcv(
    ticker=ticker,
    interval=interval,
    to=to,
    count=count,
)
data

Unnamed: 0,open,high,low,close,volume,value
2021-11-03 01:30:00,75025000.0,75110000.0,74808000.0,74827000.0,42.323616,3.173048e+09
2021-11-03 01:40:00,74827000.0,74955000.0,74810000.0,74886000.0,26.774901,2.004494e+09
2021-11-03 01:50:00,74886000.0,75023000.0,74885000.0,74913000.0,27.888634,2.090929e+09
2021-11-03 02:00:00,74914000.0,75019000.0,74880000.0,74982000.0,29.500190,2.210531e+09
2021-11-03 02:10:00,74981000.0,74981000.0,74699000.0,74733000.0,24.260177,1.814633e+09
...,...,...,...,...,...,...
2021-11-09 23:20:00,81359000.0,81359000.0,81166000.0,81203000.0,54.985089,4.467551e+09
2021-11-09 23:30:00,81203000.0,81265000.0,81080000.0,81118000.0,46.954699,3.813998e+09
2021-11-09 23:40:00,81118000.0,81385000.0,80756000.0,80776000.0,60.260760,4.888984e+09
2021-11-09 23:50:00,80776000.0,80999000.0,80550000.0,80650000.0,99.310089,8.019597e+09


https://doheon.github.io/%EC%BD%94%EB%93%9C%EA%B5%AC%ED%98%84/time-series/ci-4.transformer-post/

참고해서 WindowDataset 구성하기