In [1]:
import torch 
from torch import nn
import torch.nn.functional as F 
from torch.optim import Adam

import numpy as np
import os
import random
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import pandas as pd
from sklearn.preprocessing import StandardScaler
# import util
from datetime import date
import argparse
from progressbar import *
import datetime

## Load Data

In [64]:
data = pd.read_csv('../dataset/upbit_ohlcv_1700.csv', parse_dates=['index'])

In [65]:
data.head()

Unnamed: 0,index,open,high,low,close,volume,value
0,2017-09-29 09:00:00+09:00,4586000.0,4709000.0,4476000.0,4657000.0,59.352373,272455800.0
1,2017-09-30 09:00:00+09:00,4657000.0,4896000.0,4651000.0,4895000.0,19.998483,95614760.0
2,2017-10-01 09:00:00+09:00,4889000.0,4978000.0,4682000.0,4962000.0,27.323332,133132500.0
3,2017-10-02 09:00:00+09:00,4962000.0,5095000.0,4956000.0,5025000.0,31.728004,159931500.0
4,2017-10-03 09:00:00+09:00,5021000.0,5079000.0,4811000.0,4964000.0,11.899307,58901300.0


In [35]:
# standard scler 구현
class StandardScaler():
    def __init__(self):
        self.mean = 0.
        self.std = 1.
    
    def fit(self, data):
        self.mean = data.mean(0)
        self.std = data.std(0)

    def transform(self, data):
        mean = torch.from_numpy(self.mean).type_as(data).to(data.device) if torch.is_tensor(data) else self.mean
        std = torch.from_numpy(self.std).type_as(data).to(data.device) if torch.is_tensor(data) else self.std
        return (data - mean) / std

    def inverse_transform(self, data):
        mean = torch.from_numpy(self.mean).type_as(data).to(data.device) if torch.is_tensor(data) else self.mean
        std = torch.from_numpy(self.std).type_as(data).to(data.device) if torch.is_tensor(data) else self.std
        return (data * std) + mean

In [53]:
# 시간 특징을 freq에 따라 추출
def time_features(dates, freq='h'):
    dates['month'] = dates.date.apply(lambda row:row.month,1)
    dates['day'] = dates.date.apply(lambda row:row.day,1)
    dates['weekday'] = dates.date.apply(lambda row:row.weekday(),1)
    dates['hour'] = dates.date.apply(lambda row:row.hour,1)
    dates['minute'] = dates.date.apply(lambda row:row.minute,1)
    dates['minute'] = dates.minute.map(lambda x:x//15)
    freq_map = {
        'y':[],'m':['month'],'w':['month'],'d':['month','day','weekday'],
        'b':['month','day','weekday'],'h':['month','day','weekday','hour'],
        't':['month','day','weekday','hour','minute'],
    }
    return dates[freq_map[freq.lower()]].values

In [66]:
dataframe = data[['index', 'open']]
dataframe.head()

Unnamed: 0,index,open
0,2017-09-29 09:00:00+09:00,4586000.0
1,2017-09-30 09:00:00+09:00,4657000.0
2,2017-10-01 09:00:00+09:00,4889000.0
3,2017-10-02 09:00:00+09:00,4962000.0
4,2017-10-03 09:00:00+09:00,5021000.0


In [67]:
dataframe.rename(columns={'index':'date'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.rename(columns={'index':'date'}, inplace=True)


In [70]:
from datetime import timedelta
pred_len=24*7


scaler = StandardScaler()
df_raw = dataframe.copy()
df_raw["date"] = pd.to_datetime(df_raw["date"])

delta = df_raw["date"].iloc[1] - df_raw["date"].iloc[0]
if delta>=timedelta(hours=1):
    freq='h'
else:
    freq='t'

border1 = 0
border2 = len(df_raw)
cols_data = df_raw.columns[1:]
df_data = df_raw[cols_data]

data = df_data.values
    
tmp_stamp = df_raw[['date']][border1:border2]
tmp_stamp['date'] = pd.to_datetime(tmp_stamp.date)
pred_dates = pd.date_range(tmp_stamp.date.values[-1], periods=pred_len+1, freq=freq)

df_stamp = pd.DataFrame(columns = ['date'])
df_stamp.date = list(tmp_stamp.date.values) + list(pred_dates[1:])
data_stamp = time_features(df_stamp, freq=freq)

data_x = data[border1:border2]
data_y = data[border1:border2]
data_stamp = data_stamp

In [71]:
data_x

array([[ 4586000.],
       [ 4657000.],
       [ 4889000.],
       ...,
       [38933000.],
       [37436000.],
       [37827000.]])

In [72]:
data_y

array([[ 4586000.],
       [ 4657000.],
       [ 4889000.],
       ...,
       [38933000.],
       [37436000.],
       [37827000.]])

In [74]:
data_stamp

array([[ 9, 29,  4,  0],
       [ 9, 30,  5,  0],
       [10,  1,  6,  0],
       ...,
       [ 5, 31,  1, 22],
       [ 5, 31,  1, 23],
       [ 6,  1,  2,  0]], dtype=int64)

In [76]:
index = 0
seq_len = pred_len
label_len = pred_len


In [77]:
s_begin = index
s_end = s_begin + seq_len
r_begin = s_end - label_len
r_end = r_begin + label_len + pred_len 



In [78]:
s_begin

0

In [79]:
s_end

168

In [80]:
r_begin

0

In [81]:
r_end

336

In [83]:
len(data_x) - seq_len - pred_len + 1


1365

In [93]:
from torch.utils.data import Dataset
import torch
from torch.utils import data
from torch.utils.data import DataLoader

class Dataset_Pred(Dataset):
    def __init__(self, dataframe, size=None, scale=True):
        self.seq_len = size[0]
        self.label_len = size[1]
        self.pred_len = size[2]
        self.dataframe = dataframe
        
        self.scale = scale
        self.__read_data__()

    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = self.dataframe
        df_raw["date"] = pd.to_datetime(df_raw["date"])

        delta = df_raw["date"].iloc[1] - df_raw["date"].iloc[0]
        if delta>=timedelta(hours=1):
            self.freq='d'
        else:
            self.freq='t'

        border1 = 0
        border2 = len(df_raw)
        cols_data = df_raw.columns[1:]
        df_data = df_raw[cols_data]


        if self.scale:
            self.scaler.fit(df_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values
            
        tmp_stamp = df_raw[['date']][border1:border2]
        tmp_stamp['date'] = pd.to_datetime(tmp_stamp.date)
        pred_dates = pd.date_range(tmp_stamp.date.values[-1], periods=self.pred_len+1, freq=self.freq)
        
        df_stamp = pd.DataFrame(columns = ['date'])
        df_stamp.date = list(tmp_stamp.date.values) + list(pred_dates[1:])
        data_stamp = time_features(df_stamp, freq=self.freq)

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp
    
    def __getitem__(self, index):
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]
        return seq_x, seq_y, seq_x_mark, seq_y_mark

    def __len__(self):
        return len(self.data_x) - self.seq_len- self.pred_len + 1

In [94]:
custom_dataset = Dataset_Pred(dataframe, (96, 48, 24), scale=True)

In [95]:
dataloader = DataLoader(custom_dataset, batch_size=64)

seq_x, seq_y, seq_x_mark, seq_y_mark = next(iter(dataloader))

In [96]:
seq_x.shape

torch.Size([64, 96, 1])

In [97]:
seq_y.shape

torch.Size([64, 72, 1])

In [98]:
seq_x_mark.shape

torch.Size([64, 96, 3])

In [99]:
seq_y_mark.shape

torch.Size([64, 72, 3])

In [101]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
# 한번의 batch를 실행하는 코드
def _process_one_batch(batch_x, batch_y, batch_x_mark, batch_y_mark):
    batch_x = batch_x.float().to(device)
    batch_y = batch_y.float()
    batch_x_mark = batch_x_mark.float().to(device)
    batch_y_mark = batch_y_mark.float().to(device)
    dec_inp = torch.zeros([batch_y.shape[0], pred_len, batch_y.shape[-1]]).float()
    dec_inp = torch.cat([batch_y[:,:label_len,:], dec_inp], dim=1).float().to(device)
    outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
    batch_y = batch_y[:,-pred_len:,0:].to(device)
    return outputs, batch_y