# Data Preprocessing

In [1]:
import pandas as pd
import os
from glob import glob
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pyupbit
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

In [2]:
class preprocessing() :
    def __init__(self, ticker, interval, to, count) :
        self.o_dataset = pyupbit.get_ohlcv(ticker=ticker, interval=interval, to=to, count=count)
        self.data, self.label = self.preprocess(self.o_dataset)
        
    def drop_feature(self, dataset_df) :
        # index(시간) 제거 
        dataset_df = dataset_df.reset_index(drop=True)
        return dataset_df
    
    def add_after10(self, dataset_df) :
        after10 = np.zeros_like(dataset_df['close'])
        for i in range(len(dataset_df['close']) - 1) :
            if dataset_df['close'][i + 1] > dataset_df['close'][i] :
                after10[i] = 1
            else : 
                after10[i] = 0
            
        return after10
    
    def add_avgPrice(self, dataset_df) :
        return (dataset_df['high'] + dataset_df['low'] + 
                dataset_df['open'] + dataset_df['close']) // 4    
    
    def preprocess(self, df) :
        dataset_df = self.drop_feature(df)
        dataset_df['avg_price'] = self.add_avgPrice(dataset_df)
        dataset_df['after10'] = self.add_after10(dataset_df)
        
        data = dataset_df.iloc[:-1]
        
        return data.drop(columns=['after10']), data['after10']
        
    

In [20]:
ticker = 'KRW-BTC'
interval ='minute10'
to = f'2022-02-24 00:00'
count = 144000

processed_data =  preprocessing(ticker, interval, to, count)
display(processed_data.data)
display(processed_data.label)

Unnamed: 0,open,high,low,close,volume,value,avg_price
0,10395000.0,10410000.0,10378000.0,10383000.0,102.368936,1.064276e+09,10391500.0
1,10383000.0,10418000.0,10382000.0,10408000.0,86.774279,9.025902e+08,10397750.0
2,10415000.0,10423000.0,10401000.0,10404000.0,47.036384,4.897349e+08,10410750.0
3,10404000.0,10411000.0,10399000.0,10400000.0,33.246942,3.458745e+08,10403500.0
4,10400000.0,10404000.0,10396000.0,10398000.0,23.638871,2.458447e+08,10399500.0
...,...,...,...,...,...,...,...
143994,47213000.0,47232000.0,47098000.0,47098000.0,20.211839,9.538009e+08,47160250.0
143995,47100000.0,47100000.0,46993000.0,47078000.0,47.030849,2.212001e+09,47067750.0
143996,47078000.0,47190000.0,47064000.0,47105000.0,18.898176,8.904184e+08,47109250.0
143997,47105000.0,47225000.0,47044000.0,47207000.0,31.848817,1.501038e+09,47145250.0


0         1.0
1         0.0
2         0.0
3         0.0
4         1.0
         ... 
143994    0.0
143995    1.0
143996    1.0
143997    0.0
143998    0.0
Name: after10, Length: 143999, dtype: float64

In [21]:
processed_data.data.to_csv('./144000_data.csv')
processed_data.label.to_csv('./144000_label.csv')

In [6]:
# csv_data = processed_data.data
# csv_label = processed_data.label

In [83]:
csv_data = pd.read_csv('./144000_data.csv').drop(columns=['Unnamed: 0'])
csv_label = pd.read_csv('./144000_label.csv').drop(columns=['Unnamed: 0'])

In [84]:
csv_data

Unnamed: 0,open,high,low,close,volume,value,avg_price
0,10395000.0,10410000.0,10378000.0,10383000.0,102.368936,1.064276e+09,10391500.0
1,10383000.0,10418000.0,10382000.0,10408000.0,86.774279,9.025902e+08,10397750.0
2,10415000.0,10423000.0,10401000.0,10404000.0,47.036384,4.897349e+08,10410750.0
3,10404000.0,10411000.0,10399000.0,10400000.0,33.246942,3.458745e+08,10403500.0
4,10400000.0,10404000.0,10396000.0,10398000.0,23.638871,2.458447e+08,10399500.0
...,...,...,...,...,...,...,...
143994,47213000.0,47232000.0,47098000.0,47098000.0,20.211839,9.538009e+08,47160250.0
143995,47100000.0,47100000.0,46993000.0,47078000.0,47.030849,2.212001e+09,47067750.0
143996,47078000.0,47190000.0,47064000.0,47105000.0,18.898176,8.904184e+08,47109250.0
143997,47105000.0,47225000.0,47044000.0,47207000.0,31.848817,1.501038e+09,47145250.0


https://doheon.github.io/%EC%BD%94%EB%93%9C%EA%B5%AC%ED%98%84/time-series/ci-4.transformer-post/

참고해서 WindowDataset 구성하기

In [76]:
def _EachColumnMinMax(df_data) :
    norm = MinMaxScaler()
    for col in df_data.columns :
        df_2d = np.array(df_data[col].values).reshape(-1, 1)
        df_data[col] = norm.fit_transform(df_2d).squeeze(1)
    print(type(df_data))
    return df_data

def EachColumnMinMax(df_data) :
    norm = MinMaxScaler()
    df_data = norm.fit_transform(np.array(df_data).reshape(-1, 1)).squeeze(1)
    return df_data

def WindowDataGenerator(df_data, df_label, window_size, stride) :
    num_sample = (df_data.shape[0] - window_size) // stride + 1     

    data = np.zeros([window_size, df_data.shape[1], num_sample])
    labels = np.zeros([num_sample])

    for i in range(num_sample) :
        data_start = stride * i
        data_end = data_start + window_size
        data[:, :, i] = df_data[data_start : data_end, :]
        labels[i] = df_label[data_end - 1]

    data = data.transpose((2, 0, 1))
    print("dataset shape ==== ",data.shape)
    
    # data shape (80, 600, 6), label shape (80,)
    return torch.Tensor(data), torch.Tensor(labels)

# data = np.zeros([80, 144, 299])
# display(csv_data[0:159])
# csv_data.loc[0:159,] = EachColumnMinMax(csv_data[0:159]).copy()
# display(csv_data[0:159])
tmp = {}
display(csv_data[0:144])
for col in csv_data.columns :
    tmp[col] = EachColumnMinMax(csv_data[col][0:159])
#     print(type(EachColumnMinMax(csv_data.loc[0:159, col])))
    display(pd.DataFrame(tmp))
    display(csv_label.values[159 - 1])

# a,b = WindowDataGenerator(csv_data.values, csv_label.values, 24 * 6, 5)
# a.shape

Unnamed: 0,open,high,low,close,volume,value,avg_price
0,10395000.0,10410000.0,10378000.0,10383000.0,102.368936,1.064276e+09,10391500.0
1,10383000.0,10418000.0,10382000.0,10408000.0,86.774279,9.025902e+08,10397750.0
2,10415000.0,10423000.0,10401000.0,10404000.0,47.036384,4.897349e+08,10410750.0
3,10404000.0,10411000.0,10399000.0,10400000.0,33.246942,3.458745e+08,10403500.0
4,10400000.0,10404000.0,10396000.0,10398000.0,23.638871,2.458447e+08,10399500.0
...,...,...,...,...,...,...,...
139,10285000.0,10307000.0,10277000.0,10307000.0,46.917195,4.825916e+08,10294000.0
140,10306000.0,10311000.0,10285000.0,10287000.0,45.156411,4.649165e+08,10297250.0
141,10287000.0,10300000.0,10282000.0,10300000.0,50.006902,5.146086e+08,10292250.0
142,10301000.0,10330000.0,10300000.0,10323000.0,53.084240,5.476554e+08,10313500.0


Unnamed: 0,open
0,0.828221
1,0.754601
2,0.950920
3,0.883436
4,0.858896
...,...
154,0.276074
155,0.251534
156,0.245399
157,0.233129


array([0.])

Unnamed: 0,open,high
0,0.828221,0.844311
1,0.754601,0.892216
2,0.950920,0.922156
3,0.883436,0.850299
4,0.858896,0.808383
...,...,...
154,0.276074,0.227545
155,0.251534,0.191617
156,0.245399,0.191617
157,0.233129,0.209581


array([0.])

Unnamed: 0,open,high,low
0,0.828221,0.844311,0.816514
1,0.754601,0.892216,0.834862
2,0.950920,0.922156,0.922018
3,0.883436,0.850299,0.912844
4,0.858896,0.808383,0.899083
...,...,...,...
154,0.276074,0.227545,0.362385
155,0.251534,0.191617,0.417431
156,0.245399,0.191617,0.422018
157,0.233129,0.209581,0.376147


array([0.])

Unnamed: 0,open,high,low,close
0,0.828221,0.844311,0.816514,0.754601
1,0.754601,0.892216,0.834862,0.907975
2,0.950920,0.922156,0.922018,0.883436
3,0.883436,0.850299,0.912844,0.858896
4,0.858896,0.808383,0.899083,0.846626
...,...,...,...,...
154,0.276074,0.227545,0.362385,0.220859
155,0.251534,0.191617,0.417431,0.251534
156,0.245399,0.191617,0.422018,0.233129
157,0.233129,0.209581,0.376147,0.257669


array([0.])

Unnamed: 0,open,high,low,close,volume
0,0.828221,0.844311,0.816514,0.754601,0.330371
1,0.754601,0.892216,0.834862,0.907975,0.277076
2,0.950920,0.922156,0.922018,0.883436,0.141269
3,0.883436,0.850299,0.912844,0.858896,0.094142
4,0.858896,0.808383,0.899083,0.846626,0.061306
...,...,...,...,...,...
154,0.276074,0.227545,0.362385,0.220859,0.145820
155,0.251534,0.191617,0.417431,0.251534,0.036902
156,0.245399,0.191617,0.422018,0.233129,0.048908
157,0.233129,0.209581,0.376147,0.257669,0.076815


array([0.])

Unnamed: 0,open,high,low,close,volume,value
0,0.828221,0.844311,0.816514,0.754601,0.330371,0.336094
1,0.754601,0.892216,0.834862,0.907975,0.277076,0.282043
2,0.950920,0.922156,0.922018,0.883436,0.141269,0.144026
3,0.883436,0.850299,0.912844,0.858896,0.094142,0.095934
4,0.858896,0.808383,0.899083,0.846626,0.061306,0.062494
...,...,...,...,...,...,...
154,0.276074,0.227545,0.362385,0.220859,0.145820,0.146745
155,0.251534,0.191617,0.417431,0.251534,0.036902,0.037105
156,0.245399,0.191617,0.422018,0.233129,0.048908,0.049195
157,0.233129,0.209581,0.376147,0.257669,0.076815,0.077277


array([0.])

Unnamed: 0,open,high,low,close,volume,value,avg_price
0,0.828221,0.844311,0.816514,0.754601,0.330371,0.336094,0.810000
1,0.754601,0.892216,0.834862,0.907975,0.277076,0.282043,0.845714
2,0.950920,0.922156,0.922018,0.883436,0.141269,0.144026,0.920000
3,0.883436,0.850299,0.912844,0.858896,0.094142,0.095934,0.878571
4,0.858896,0.808383,0.899083,0.846626,0.061306,0.062494,0.855714
...,...,...,...,...,...,...,...
154,0.276074,0.227545,0.362385,0.220859,0.145820,0.146745,0.268571
155,0.251534,0.191617,0.417431,0.251534,0.036902,0.037105,0.278571
156,0.245399,0.191617,0.422018,0.233129,0.048908,0.049195,0.274286
157,0.233129,0.209581,0.376147,0.257669,0.076815,0.077277,0.267143


array([0.])

In [101]:
pd.set_option('display.max_rows', None)
csv_label.iloc[167:190]

Unnamed: 0,after10
167,0.0
168,0.0
169,0.0
170,0.0
171,1.0
172,1.0
173,1.0
174,0.0
175,1.0
176,0.0


In [33]:
# def EachColumnMinMax(df_data) :
#     norm = MinMaxScaler()
#     for col in df_data.columns :
#         df_2d = np.array(df_data[col]).reshape(-1, 1)
#         print(df_2d.shape)
# #         norm_dataset = norm.fit_transform(df_data[col])
#         df_data.loc[:, col] = norm.fit_transform(df_2d).squeeze(1)
        
#     return df_data

def EachColumnMinMax(df_data) :
    norm = MinMaxScaler()
    df_data = norm.fit_transform(np.array(df_data).reshape(-1, 1)).squeeze(1)
    return df_data

def WindowDataGenerator(df_data, df_label, window_size, stride) :
    num_sample = (df_data.shape[0] - window_size) // stride + 1     

    data = np.zeros([window_size, df_data.shape[1], num_sample])
    labels = np.zeros([num_sample])

    for i in tqdm(range(num_sample)) :
        tmp = {}
        data_start = stride * i
        data_end = data_start + window_size
        for col in df_data.columns :
            tmp[col] = EachColumnMinMax(df_data[col][data_start : data_end])
#         display(pd.DataFrame(tmp))
#         display(df_data[data_start : data_end])
        data[:, :, i] = pd.DataFrame(tmp).values
        labels[i] = df_label.values[data_end - 1]
        
    data = data.transpose((2, 0, 1))
    print("dataset shape ==== ",data.shape)
    
    # data shape (80, 600, 6), label shape (80,)
    return torch.Tensor(data), torch.Tensor(labels)

display(csv_data.shape)

a,b = WindowDataGenerator(csv_data, csv_label, 24 * 6, 5)
a.shape

(143999, 7)

100%|███████████████████████████████████████████████████████████████████████████| 28772/28772 [01:02<00:00, 458.88it/s]

dataset shape ====  (28772, 144, 7)





torch.Size([28772, 144, 7])

In [56]:
for o in b :
    print(o)

tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)

tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)

tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)

tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)

tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)

tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)

tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)

tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)

tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)

tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(0.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(0.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)

KeyboardInterrupt: 

# Window Dataset

In [85]:
class WindowDataset(Dataset) :
    def __init__(self, df_data, df_label, window_size=144, stride=6) :
        self.data, self.label = self.WindowDataGenerator(df_data, df_label, window_size, stride)        
        
    def __getitem__(self, i) :
        return self.data[i], self.label[i]
                
    def __len__(self) :
        assert len(self.data) == len(self.label), "data와 label의 길이가 다름"
        return len(self.data)
        
    
    def WindowDataGenerator(self, df_data, df_label, window_size, stride) :
        num_sample = (df_data.shape[0] - window_size) // stride + 1     

        data = np.zeros([window_size, df_data.shape[1], num_sample])
        labels = np.zeros([num_sample])

        for i in tqdm(range(num_sample)) :
            tmp = {}
            data_start = stride * i
            data_end = data_start + window_size
            for col in df_data.columns :
                tmp[col] = EachColumnMinMax(df_data[col][data_start : data_end])

            data[:, :, i] = pd.DataFrame(tmp).values
            labels[i] = df_label.values[data_end - 1]

        data = data.transpose((2, 0, 1))
        # data shape (80, 600, 6), label shape (80,)
        return torch.Tensor(data), torch.Tensor(labels)
    
    def EachColumnMinMax(self, df_data) :
        norm = MinMaxScaler()
        df_data = norm.fit_transform(np.array(df_data).reshape(-1, 1)).squeeze(1)
        return df_data

    def WindowDataGenerator(self, df_data, df_label, window_size, stride) :
        num_sample = (df_data.shape[0] - window_size) // stride + 1     

        data = np.zeros([window_size, df_data.shape[1], num_sample])
        labels = np.zeros([num_sample])

        for i in tqdm(range(num_sample)) :
            tmp = {}
            data_start = stride * i
            data_end = data_start + window_size
            for col in df_data.columns :
                tmp[col] = self.EachColumnMinMax(df_data.loc[data_start : data_end - 1, col])

            data[:, :, i] = pd.DataFrame(tmp).values
            labels[i] = df_label.values[data_end - 1]

        data = data.transpose((2, 0, 1))
        print("dataset shape ==== ",data.shape)

        # data shape (80, 600, 6), label shape (80,)
        return torch.Tensor(data), torch.Tensor(labels)

# nn.Transformer

In [86]:
class Transformer2FC(nn.Module) :
    def __init__(self, input_shape, d_model, n_head, num_layer, dropout, num_class=2):
        super(Transformer2FC, self).__init__()
        
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_head, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layer)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        
        self.Encoder = nn.Sequential(
            nn.Linear(input_shape[1], d_model//2),
            nn.ReLU(),
            nn.Linear(d_model//2, d_model)
        )
        
        self.linear = nn.Sequential(
            nn.Linear(d_model, d_model//2),
            nn.ReLU(),
            nn.Linear(d_model//2, 1)
        )

        self.linear2 = nn.Sequential(
            nn.Linear(input_shape[0], input_shape[0]//2),
            nn.ReLU(),
            nn.Linear(input_shape[0]//2, num_class)
        )
        
#         self.sigmoid = nn.Softmax()
    
    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
    
    def forward(self, x, masked_x) :
        # (batch, data, dim)
#         print("input shape : ", x.shape)
        x = self.Encoder(x)
#         print("Encoder shape : ", x.shape)
        x = self.pos_encoder(x)
#         print("pos_encoder shape : ", x.shape)
#         print("masked_x shape : ", masked_x.shape)
        x = self.transformer_encoder(x.transpose(0,1), masked_x).transpose(0, 1)
#         print("transformer_encoder shape : ", x.shape)
        x = self.linear(x)
#         print("linear shape : ", x.shape)
        x = x.squeeze(2)
#         print("squeeze shape : ", x.shape)
        x = self.linear2(x)
#         print("linear2 shape : ", x.shape)
        x = x.squeeze(1)
#         print("squeeze shape : ", x.shape)
        return x

class PositionalEncoding(nn.Module) :
    def __init__(self, d_model, dropout=0.1, max_len=5000) :
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        pe = torch.zeros(max_len, d_model)
        
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
        
    def forward(self, x) :
        x = x + self.pe[:x.size(0), :]

        return self.dropout(x)

def get_attention_mask(x) :
    mask = torch.eq(x, 0)
    return mask

In [87]:
device = torch.device("cuda")
lr = 1e-4
epoch = 200
window_size = 24 * 6
window_stride = 6
feature_len = 7
batch_size = 64
num_class = 1

model = Transformer2FC(input_shape=(window_size, feature_len), 
                       d_model=512, 
                       n_head=8, 
                       num_layer=4, 
                       dropout=0.3, 
                       num_class = num_class).to(device)
# model = MLSTMfcn(max_seq_len=window_size, num_features=feature_len).to(device)
criterion = nn.BCEWithLogitsLoss()
# criterion = nn.CrossEntropyLoss()
# criterion = nn.HuberLoss()
optim = torch.optim.Adam(model.parameters(), lr=lr)

In [88]:
dataset = WindowDataset(csv_data, 
                        csv_label,
                       window_size = window_size, 
                       stride = window_stride)
# dataset = WindowDataset(csv_data.values, 
#                         csv_label.values, 
#                        window_size = window_size, 
#                        stride = window_stride)
dataloader = DataLoader(dataset, batch_size=batch_size)

100%|███████████████████████████████████████████████████████████████████████████| 23976/23976 [00:52<00:00, 453.46it/s]

dataset shape ====  (23976, 144, 7)





In [102]:
def cal_accuracy(pred, label, threshold=0) :
    acc = torch.zeros(pred.shape[0])
    acc[pred > threshold] = 0
    acc[pred < threshold] = 1
    score = [1 if acc[i] == label[i] else 0 for i in range(pred.shape[0])]
    return sum(score) / pred.shape[0]

In [103]:
# def cal_accuracy(pred, label, threshold=0.5) :
#     res = torch.argmax(pred, dim=1)
    
#     print(pred)
#     print(res)
#     print(label)

#     score = [1 if res[i] == label[i] else 0 for i in range(pred.shape[0])]
#     return sum(score) / pred.shape[0]

In [None]:
progress = tqdm(range(epoch))

model.train()
for i in progress :
    batchloss = 0.0
    batchacc= 0
    for (data, label) in dataloader :
        optim.zero_grad()
        src_mask = model.generate_square_subsequent_mask(data.shape[1]).to(device)      
        
        pred = model(data.to(device), src_mask)
        
        loss = criterion(pred, label.to(device, dtype=torch.float64))
#         print(data.shape)
#         print(pred.shape)
        loss.backward()
        optim.step()
        
        score = cal_accuracy(pred.cpu(), label.cpu())
        progress.set_description("loss : {:0.6f} acc : {:0.6f}".format(loss.cpu().item(), score))
#         progress.set_description("loss : {:0.6f}".format(loss.cpu().item()))
        
        batchloss += loss
        batchacc += score
    print(f"Epoch : [{i} / {epoch}] Loss : {round((batchloss/len(dataloader)).item(), 4)} Acc : {round((batchacc/len(dataloader)), 4)}")
#     print(pred)
#     print(label)
#     print(f"Epoch : [{i} / {epoch}] Loss : {round((batchloss/len(dataloader)).item(), 4)}")
    
torch.save(model.state_dict(), f'./model.pt')

loss : 0.689149 acc : 0.437500:   0%|                                                          | 0/200 [00:45<?, ?it/s]

In [15]:
torch.save(model.state_dict(), f'./model.pt')

In [None]:
__output = torch.full([10, 64], 1.5)
__output.type()

In [20]:
data = torch.rand(2, 144, 6).to(device)
data
src_mask = model.generate_square_subsequent_mask(data.shape[1]).to(device)  
re = model(data, src_mask)
re

tensor([-0.0590, -0.0577], device='cuda:0', grad_fn=<SqueezeBackward1>)

# MLSTM-fcn

In [None]:
class SELayer(nn.Module):
    def __init__(self, channel, reduction=16):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1)
        return x * y.expand_as(x)

class MLSTMfcn(nn.Module):
    def __init__(self, *, num_classes=1, max_seq_len=144, num_features=6,
                 num_lstm_out=128, num_lstm_layers=1, 
                 conv1_nf=128, conv2_nf=256, conv3_nf=128,
                 lstm_drop_p=0.8, fc_drop_p=0.3):
        
        super(MLSTMfcn, self).__init__()
        self.num_classes = num_classes
        self.max_seq_len = max_seq_len
        self.num_features = num_features

        self.num_lstm_out = num_lstm_out
        self.num_lstm_layers = num_lstm_layers

        self.conv1_nf = conv1_nf
        self.conv2_nf = conv2_nf
        self.conv3_nf = conv3_nf

        self.lstm_drop_p = lstm_drop_p
        self.fc_drop_p = fc_drop_p

        self.lstm = nn.LSTM(input_size=self.num_features, 
                            hidden_size=self.num_lstm_out,
                            num_layers=self.num_lstm_layers,
                            batch_first=True)
        
        self.conv1 = nn.Conv1d(self.num_features, self.conv1_nf, 8)
        self.conv2 = nn.Conv1d(self.conv1_nf, self.conv2_nf, 5)
        self.conv3 = nn.Conv1d(self.conv2_nf, self.conv3_nf, 3)

        self.bn1 = nn.BatchNorm1d(self.conv1_nf)
        self.bn2 = nn.BatchNorm1d(self.conv2_nf)
        self.bn3 = nn.BatchNorm1d(self.conv3_nf)

        self.se1 = SELayer(self.conv1_nf)  # ex 128
        self.se2 = SELayer(self.conv2_nf)  # ex 256

        self.relu = nn.ReLU()
        self.lstmDrop = nn.Dropout(self.lstm_drop_p)
        self.convDrop = nn.Dropout(self.fc_drop_p)

        self.fc = nn.Sequential(
            nn.Linear(self.conv3_nf+self.num_lstm_out, 128),
            nn.ReLU(),
            nn.Linear(128, self.num_classes)
        )

#         self.out_layer = nn.Linear(1000+128, self.num_classes)
#         self.dropout = nn.Dropout(0.1)
    
    def forward(self, x):
        ''' input x should be in size [B,T,F], where 
            B = Batch size
            T = Time samples
            F = features
        '''
#         x1 = nn.utils.rnn.pack_padded_sequence(x, seq_lens.cpu(), 
#                                                batch_first=True, 
#                                                enforce_sorted=False)
#         x1 = nn.utils.rnn.pack_padded_sequence(x, seq_lens, 
#                                                batch_first=True, 
#                                                enforce_sorted=False)
        
        x1, (ht,ct) = self.lstm(x)
#         x1, _ = nn.utils.rnn.pad_packed_sequence(x1, batch_first=True, 
#                                                  padding_value=0.0)
        x1 = x1[:,-1,:]
        
        x2 = x.transpose(2,1)
        x2 = self.convDrop(self.relu(self.bn1(self.conv1(x2))))
        x2 = self.se1(x2)
        x2 = self.convDrop(self.relu(self.bn2(self.conv2(x2))))
        x2 = self.se2(x2)
        x2 = self.convDrop(self.relu(self.bn3(self.conv3(x2))))
        x2 = torch.mean(x2,2)
        
        x_all = torch.cat((x1,x2),dim=1)
        x_out = self.fc(x_all)
#         concat = torch.cat([enc_out, x_out], dim=1)  # enc_out + hidden 
#         output = self.dropout(concat)
#         x_output = self.out_layer(output)
#         x_out = F.log_softmax(x_out, dim=1)

        return x_out

In [19]:
class Data_preprocess() :
    def __init__(self, ticker=None, interval=None, to=None, count=None, df=None, csv_list=None) :
        if ticker and interval and to and count :
            self.data, self.label, self.dataset = self.preprocess(pyupbit.get_ohlcv(ticker=ticker, interval=interval, to=to, count=count))
        elif  df :
            self.data, self.label, self.dataset = self.preprocess(df)
            
        elif csv_list :
            for csv_path in csv_list :
                try :
                    df = pd.read_csv(csv_path)
                    csv_data, csv_label, _ = self.preprocess(df)

                    fold_name = csv_path.split('\\')[-2]
                    file_name = csv_path.split('\\')[-1]
                    new_path = csv_path.replace(fold_name, 'preprocess_'+fold_name).replace(csv_path.split('\\')[-1],'')
                    os.makedirs(new_path, exist_ok=True)
                    
                    csv_dataset = pd.concat([csv_data, csv_label], axis=1)
                    csv_dataset.to_csv(os.path.join(new_path, file_name))
                    
                except :
                    print(f"ERROR from {csv_path}")
            
            
            
    def MinMax(self, dataset_df) :
        norm = MinMaxScaler()
        norm_dataset = norm.fit_transform(dataset_df)
        return pd.DataFrame(norm_dataset, columns=list(dataset_df.columns))
    
    
    def add_after10(self, dataset_df) :
        after10 = np.zeros_like(self.norm_dataset['close'])
        for i in range(len(dataset_df['close']) - 1) :
            if dataset_df['close'][i + 1] > dataset_df['close'][i] :
                after10[i] = 1
            else : 
                after10[i] = 0
            
        return after10
    
    
    def drop_feature(self, dataset_df) :
        # index(시간) 제거 
        dataset_df = dataset_df.drop(columns=['Unnamed: 0'])
        # value 제거
        dataset_df = dataset_df.drop(columns=['value'])
        return dataset_df
    
    
    def add_avgPrice(self, dataset_df) :
        return (dataset_df['high'] + dataset_df['low'] + 
                dataset_df['open'] + dataset_df['close']) // 4
       
    
    def preprocess(self, dataset, latest=False) :
        
        # drop feature
        dataset_df = self.drop_feature(dataset)
#         display(dataset_df)
        # avg_price 추가
        dataset_df['avg_price'] = self.add_avgPrice(dataset_df)
        
        if latest == True :
            # 가장 예전 데이터 삭제 - norm이랑 original 둘 다 적용
            self.dataset = self.dataset.drop([self.dataset.index[0]]).drop(columns=['after10'])
            self.norm_dataset = self.norm_dataset.drop([self.norm_dataset.index[0]])

            # ori dataset에 추가
            self.dataset = pd.concat([self.dataset, dataset_df])
            self.dataset = self.dataset.reset_index(drop=True)
            
            # min max 정규화 (MinMaxScaler) 적용
            self.norm_dataset = self.MinMax(self.dataset)
            
            # after10 추가
            self.dataset['after10'] = self.add_after10(self.dataset)
            
        
        else :
            # min max 정규화 (MinMaxScaler) 적용
            self.norm_dataset = self.MinMax(dataset_df)
            
            # after10 추가
            dataset_df['after10'] = self.add_after10(dataset_df)
        
        # 예측될 값(label)인 10분 후 가격
        self.norm_dataset['after10'] = self.add_after10(self.norm_dataset)
        
        # 마지막 행 삭제 (다음 가격을 모르므로 라벨을 붙일 수 없음)
        self.norm_dataset = self.norm_dataset.iloc[:-1]
        
        return self.norm_dataset.drop(columns=['after10']), self.norm_dataset['after10'], dataset_df
    
