In [2]:
import pandas as pd
import os
from glob import glob
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pyupbit
import math
import schedule

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [19]:
class Data_preprocess:
    """Download OHLCV candles with pyupbit and turn them into windowed tensors.

    Attributes set by ``__init__``:
        norm:    dict mapping a normalization name ("minmax", "stand", "diff")
                 to the bound method that implements it.
        data:    torch.Tensor of shape (num_sample, window_size, n_features).
        label:   torch.Tensor of shape (num_sample,), one 0/1 label per window.
        dataset: the un-normalized frame (with ``avg_price`` added and the
                 last row removed) that the windows were built from.
    """

    def __init__(self, ticker, interval, to, count, norm):
        """Fetch candles and run the full preprocessing pipeline.

        Args:
            ticker, interval, to, count: forwarded unchanged to
                ``pyupbit.get_ohlcv``.
            norm: key into ``self.norm`` ("minmax", "stand" or "diff").

        Raises:
            KeyError: if ``norm`` is not a known normalization name.
            ValueError: if no candle data could be fetched.
        """
        self.norm = {"minmax": self.MinMax,
                     "stand": self.standarization,
                     "diff": self.diff}

        # Reject a bad key before doing any network I/O. KeyError is the
        # exception the later dictionary lookup in preprocess() would raise.
        if norm not in self.norm:
            raise KeyError(
                f"unknown normalization {norm!r}; expected one of {sorted(self.norm)}")

        ohlcv = pyupbit.get_ohlcv(ticker=ticker, interval=interval, to=to, count=count)
        # NOTE(review): pyupbit.get_ohlcv is understood to return None when the
        # request fails -- confirm against the installed version. Failing here
        # gives a clearer message than an AttributeError inside preprocess().
        if ohlcv is None:
            raise ValueError(
                f"pyupbit.get_ohlcv returned no data for ticker={ticker!r}, "
                f"interval={interval!r}, to={to!r}, count={count!r}")

        self.data, self.label, self.dataset = self.preprocess(ohlcv, normalization=norm)

    def MinMax(self, df):
        """Return a new frame with every column rescaled by ``MinMaxScaler``.

        The result keeps the column names of ``df`` but has a fresh default
        index; ``df`` itself is not modified.
        """
        scaled = MinMaxScaler().fit_transform(df)
        return pd.DataFrame(scaled, columns=list(df.columns))

    def standarization(self, df):
        """Return a new frame with each column z-scored.

        Every column becomes ``(col - col.mean()) / col.std()``. The input
        frame is left untouched, matching ``MinMax``.
        """
        return (df - df.mean()) / df.std()

    def diff(self, df):
        """Return the row-to-row difference of ``log(df)`` for every column.

        The first row has no predecessor and is dropped, so the result has one
        row fewer than ``df`` and keeps the index labels of rows 1..n-1.
        The input frame is not modified.

        Note: ``np.log`` yields ``-inf``/``NaN`` for zero or negative entries;
        no special handling is applied here.
        """
        # Built as a new frame rather than assigning through chained indexing
        # (``df[col][1:] = ...``), which warns and becomes a silent no-op
        # under pandas Copy-on-Write.
        return np.log(df).diff().iloc[1:]

    def add_label(self, dataset_df):
        """Build the up/down label column from ``dataset_df['close']``.

        Row ``i`` is labelled 1 when ``close[i + 1] > close[i]`` and 0
        otherwise. The last row has no successor and is always 0.

        Returns:
            A single-column DataFrame named ``label`` with a default index.
        """
        closes = dataset_df['close'].to_numpy()
        went_up = np.zeros_like(closes)
        # Vectorized comparison of every row with the row after it.
        went_up[:-1] = closes[1:] > closes[:-1]
        return pd.DataFrame(went_up, columns=['label'])

    def drop_feature(self, dataset_df):
        """Return a copy of ``dataset_df`` with its index replaced by 0..n-1.

        This removes the existing index (the candle timestamps, judging by the
        original comment). No columns are dropped.
        """
        return dataset_df.reset_index(drop=True)

    def WindowDataGenerator(self, df_data, df_label, window_size=144, stride=6, norm="stand"):
        """Cut ``df_data`` into sliding windows and pick one label per window.

        Window ``i`` covers rows ``[stride * i, stride * i + window_size)`` and
        is labelled with the entry of ``df_label`` at the window's last row.

        Args:
            df_data: DataFrame of numeric features, one row per time step.
            df_label: single-column DataFrame (or 1-D sequence) of labels with
                at least as many rows as ``df_data``.
            window_size: number of rows per window.
            stride: row offset between consecutive window starts.
            norm: accepted for backward compatibility and ignored. The frame
                passed in is already normalized, so every window has exactly
                ``window_size`` rows whatever normalization was used.

        Returns:
            ``(data, labels)`` as ``torch.Tensor`` objects with shapes
            ``(num_sample, window_size, n_features)`` and ``(num_sample,)``.
            When ``df_data`` has fewer than ``window_size`` rows, both tensors
            are empty (``num_sample == 0``).
        """
        values = df_data.to_numpy(dtype=float)
        # Flatten so that each label is read as a scalar rather than as a
        # 1-element row of a 2-D array.
        label_values = np.asarray(df_label, dtype=float).reshape(-1)

        n_rows, n_features = values.shape
        # Clamp at zero so short inputs give empty tensors instead of a
        # "negative dimensions" error from np.zeros.
        num_sample = max(0, (n_rows - window_size) // stride + 1)

        data = np.zeros([num_sample, window_size, n_features])
        labels = np.zeros([num_sample])

        for i in tqdm(range(num_sample)):
            data_start = stride * i
            data_end = data_start + window_size
            data[i] = values[data_start:data_end]
            labels[i] = label_values[data_end - 1]

        print("dataset shape ==== ", data.shape)

        return torch.Tensor(data), torch.Tensor(labels)

    def add_avgPrice(self, dataset_df):
        """Return the arithmetic mean of high, low, open and close per row."""
        # True division: floor division would truncate the fractional part.
        return (dataset_df['high'] + dataset_df['low'] +
                dataset_df['open'] + dataset_df['close']) / 4

    def preprocess(self, dataset, normalization):
        """Run the whole pipeline on a raw OHLCV frame.

        Steps: reset the index, add ``avg_price``, build labels, normalize,
        drop the last row (it has no following candle to label it) and cut
        the result into windows.

        Args:
            dataset: raw OHLCV DataFrame with open/high/low/close columns.
            normalization: key into ``self.norm``.

        Returns:
            ``(data, label, dataset_df)``: the window tensor, the label tensor
            and the un-normalized frame without its last row.
        """
        dataset_df = self.drop_feature(dataset)
        dataset_df['avg_price'] = self.add_avgPrice(dataset_df)

        all_labels = self.add_label(dataset_df)
        if normalization == "diff":
            # diff() drops the first row, so drop the first label as well to
            # keep features and labels aligned row for row.
            label = all_labels.iloc[1:-1]
        else:
            label = all_labels.iloc[:-1]

        # Normalize a copy so dataset_df keeps the raw values that are returned.
        norm_df = self.norm[normalization](dataset_df.copy()).iloc[:-1]
        data, label = self.WindowDataGenerator(norm_df, label)
        return data, label, dataset_df.iloc[:-1]
        
        

In [20]:
# Request parameters for the candle download.
ticker, interval = 'KRW-BTC', 'minute10'
to = '2022-04-05 00:00'
# At least 145 rows are needed: preprocessing drops the last row and the
# default window is 144 rows long.
count = 145

processed_data = Data_preprocess(ticker=ticker, interval=interval, to=to,
                                 count=count, norm="stand")
# Show the window tensor, the label tensor and the source frame, in that order.
display(processed_data.data, processed_data.label, processed_data.dataset)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 333.28it/s]

dataset shape ====  (1, 144, 7)





tensor([[[ 0.0094,  0.6336,  0.3110,  ..., -0.6124, -0.6098,  0.4803],
         [ 0.9131,  0.9763,  1.0402,  ...,  0.8519,  0.8620,  0.9513],
         [ 0.8704,  0.7212,  0.8498,  ..., -0.8783, -0.8754,  0.8692],
         ...,
         [-0.3746, -0.4706, -0.2359,  ..., -0.2462, -0.2478, -0.3439],
         [-0.2854, -0.4059, -0.3494,  ..., -0.4344, -0.4355, -0.3647],
         [-0.4134, -0.6686, -0.7707,  ..., -0.3303, -0.3340, -0.7130]]])

tensor([0.])

Unnamed: 0,open,high,low,close,volume,value,avg_price
0,56544000.0,56777000.0,56542000.0,56777000.0,21.738048,1.231732e+09,56660000.0
1,56777000.0,56867000.0,56722000.0,56750000.0,60.154918,3.417271e+09,56779000.0
2,56766000.0,56800000.0,56675000.0,56792000.0,14.761901,8.373187e+08,56758250.0
3,56791000.0,56792000.0,56595000.0,56595000.0,21.761339,1.233731e+09,56693250.0
4,56595000.0,56659000.0,56537000.0,56572000.0,18.596070,1.052715e+09,56590750.0
...,...,...,...,...,...,...,...
139,56455000.0,56500000.0,56406000.0,56473000.0,31.743649,1.792229e+09,56458500.0
140,56474000.0,56573000.0,56401000.0,56445000.0,41.485935,2.342977e+09,56473250.0
141,56445000.0,56487000.0,56407000.0,56468000.0,31.345217,1.769248e+09,56451750.0
142,56468000.0,56504000.0,56379000.0,56435000.0,26.408405,1.490466e+09,56446500.0


In [58]:
# Register job to run every 10 minutes. The previous `.at()` call had no time
# string, which `at` requires, so it raised TypeError before `.do(job)` ran.
schedule.every(10).minutes.do(job)

0.0