# 프로젝트 개요

Kaggle의 Bitstamp 비트코인(BTC/USD) 시세 데이터를 활용해 아래 3가지 분석을 진행한다.

1. 시계열 데이터 분석
2. 변수 간 스케일 차이(데이터 불균형) -> RobustScaler로 scaling
3. 이전에 같은 데이터로 학습시킨 Machine Learning 모델과 성능 비교
> https://github.com/juhwano/Machine_Learning/blob/main/time_series/%EB%B9%84%ED%8A%B8%EC%BD%94%EC%9D%B8_%EC%8B%9C%EA%B3%84%EC%97%B4_%EC%98%88%EC%B8%A12.ipynb

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem so files under it can be read by path.
GDRIVE_MOUNT_POINT = '/content/gdrive/'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


# Library & Data Import

In [None]:
import pandas as pd
import numpy as np
import os
import random
import warnings
warnings.filterwarnings('ignore')

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model, layers, optimizers

In [None]:
# Pin every random number generator used in this notebook to the same seed
# so that shuffling and weight initialisation are repeatable.
SEED = 2021
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here may not affect the already-running kernel — confirm.
os.environ['PYTHONHASHSEED'] = '0'
for set_seed in (np.random.seed, random.seed, tf.random.set_seed):
    set_seed(SEED)

In [None]:
# Load the Bitstamp price history CSV from the mounted Drive folder
CSV_PATH = '/content/gdrive/MyDrive/big_data/time_series/data/bitstampUSD.csv'
data = pd.read_csv(CSV_PATH)

In [None]:
# Peek at the first five raw rows
data.head(n=5)

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
0,1325317920,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
1,1325317980,,,,,,,
2,1325318040,,,,,,,
3,1325318100,,,,,,,
4,1325318160,,,,,,,


# 데이터 전처리

In [None]:
# The Timestamp column holds Unix epoch values in seconds (unit='s').
# Turn it into datetimes and use them as the row index.
epoch_seconds = data['Timestamp']
data.index = pd.to_datetime(epoch_seconds, unit='s')
# The column is redundant now that the index carries the same information
data = data.drop(columns=['Timestamp'])
data.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-12-31 07:52:00,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
2011-12-31 07:53:00,,,,,,,
2011-12-31 07:54:00,,,,,,,
2011-12-31 07:55:00,,,,,,,
2011-12-31 07:56:00,,,,,,,


In [None]:
# Downsample to one row per calendar day; every column becomes that day's mean
daily_bins = data.resample('D')
data = daily_bins.mean()
data.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-12-31,4.465,4.4825,4.465,4.4825,23.82947,106.330084,4.471603
2012-01-01,4.806667,4.806667,4.806667,4.806667,7.200667,35.25972,4.806667
2012-01-02,5.0,5.0,5.0,5.0,19.048,95.24,5.0
2012-01-03,5.2525,5.2525,5.2525,5.2525,11.00466,58.100651,5.2525
2012-01-04,5.2,5.223333,5.2,5.223333,11.914807,63.119577,5.208159


In [None]:
# Discard every day that still has a missing value in any column
data = data.dropna(axis=0, how='any')

## scaling

In [None]:
from sklearn.preprocessing import RobustScaler

# Number of most-recent rows that the train/test split cell holds out as the
# test set (data.iloc[:-120] / data.iloc[-120:]). Keep the two in sync.
N_TEST_ROWS = 120

# Raw column name -> name of the scaled column added to `data`.
SCALED_COLUMN_NAMES = {
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Close': 'close',
    'Volume_(BTC)': 'volume_(BTC)',
    'Volume_(Currency)': 'volume_(Currency)',
    'Weighted_Price': 'weighted_Price',
}

# One fitted scaler per scaled column, so any column (in particular the
# target) can later be mapped back to its original units with
# scalers[name].inverse_transform(...).
scalers = {}

# Fit on the training rows only: computing the median/IQR over the full
# frame would leak statistics of the held-out test rows into preprocessing.
train_rows = data.iloc[:-N_TEST_ROWS]

for raw_col, scaled_col in SCALED_COLUMN_NAMES.items():
    scaler = RobustScaler()
    scaler.fit(train_rows[[raw_col]].values)
    # transform() returns shape (n, 1); flatten it to assign a single column
    data[scaled_col] = scaler.transform(data[[raw_col]].values).ravel()
    scalers[scaled_col] = scaler

# Kept for backward compatibility: as before, `rob_scaler` ends up being the
# scaler associated with Weighted_Price.
rob_scaler = scalers['weighted_Price']

In [None]:
# The unscaled originals are no longer needed once their scaled copies exist
raw_columns = ['Open','High','Low','Close','Volume_(BTC)','Volume_(Currency)','Weighted_Price']
data.drop(columns=raw_columns, inplace=True)

In [None]:
# Peek at the first five rows of the scaled frame
data.head(n=5)

Unnamed: 0_level_0,open,high,low,close,volume_(BTC),volume_(Currency),weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-12-31,-0.099101,-0.09906,-0.099124,-0.099099,1.919561,-0.196897,-0.099097
2012-01-01,-0.099053,-0.099014,-0.099075,-0.099052,-0.049931,-0.198978,-0.099049
2012-01-02,-0.099025,-0.098987,-0.099047,-0.099025,1.35325,-0.197222,-0.099022
2012-01-03,-0.098989,-0.09895,-0.099011,-0.098989,0.400609,-0.198309,-0.098986
2012-01-04,-0.098996,-0.098955,-0.099019,-0.098993,0.508405,-0.198162,-0.098992


## train, test 데이터 나누기



In [None]:
# Chronological split: the final 120 rows form the test set, everything
# before them is used for training.
HOLDOUT_ROWS = 120
train, test = data.iloc[:-HOLDOUT_ROWS], data.iloc[-HOLDOUT_ROWS:]

## sampling

In [None]:
# Shuffle the row order of each split (frac=1 keeps every row); train is
# drawn first, then test, so the random draws happen in the same order.
train, test = (split.sample(frac=1) for split in (train, test))

In [None]:
# Separate the regression target from the remaining feature columns
TARGET = 'weighted_Price'

X_train, y_train = train.drop(columns=[TARGET]), train[TARGET]
X_test, y_test = test.drop(columns=[TARGET]), test[TARGET]

In [None]:
# Peek at the first five (shuffled) training feature rows
X_train.head(n=5)

Unnamed: 0_level_0,open,high,low,close,volume_(BTC),volume_(Currency)
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-08-15,-0.06189,-0.061876,-0.061894,-0.06189,0.0162,-0.139943
2012-08-19,-0.098415,-0.098377,-0.098439,-0.098416,2.263715,-0.192983
2020-04-14,0.884085,0.883818,0.884211,0.884126,-0.292051,0.838624
2016-12-08,0.009961,0.009936,0.00999,0.009969,-0.336284,-0.09245
2013-01-22,-0.097292,-0.097254,-0.097314,-0.097292,1.352725,-0.19041


## DataFrame -> tensor 변환

In [None]:
# Strip the pandas wrappers and keep only the underlying NumPy arrays
X_train, y_train = X_train.to_numpy(), y_train.to_numpy()
X_test, y_test = X_test.to_numpy(), y_test.to_numpy()

# 모델

In [None]:
# Insert a length-1 time axis so the arrays become
# [sample, timestep, features].
X_train = np.expand_dims(X_train, axis=1)
X_test = np.expand_dims(X_test, axis=1)

# Model definition: LSTM(10, relu) -> BatchNormalization -> Dense(1)
model_lstm = keras.Sequential([
    layers.LSTM(units=10, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(1),
])

# 학습

In [None]:
# 학습 방법 설정
model_lstm.compile(loss='mse', optimizer=optimizers.Adam(learning_rate=0.001))

In [None]:
# 학습
model_lstm.fit(X_train, y_train, validation_split=0.2, batch_size=32, epochs=10, shuffle=True, verbose=2)

Epoch 1/10
82/82 - 3s - loss: 0.2388 - val_loss: 0.2399 - 3s/epoch - 38ms/step
Epoch 2/10
82/82 - 0s - loss: 0.0493 - val_loss: 0.2040 - 196ms/epoch - 2ms/step
Epoch 3/10
82/82 - 0s - loss: 0.0246 - val_loss: 0.1612 - 225ms/epoch - 3ms/step
Epoch 4/10
82/82 - 0s - loss: 0.0202 - val_loss: 0.1159 - 245ms/epoch - 3ms/step
Epoch 5/10
82/82 - 0s - loss: 0.0196 - val_loss: 0.0735 - 243ms/epoch - 3ms/step
Epoch 6/10
82/82 - 0s - loss: 0.0164 - val_loss: 0.0436 - 233ms/epoch - 3ms/step
Epoch 7/10
82/82 - 0s - loss: 0.0168 - val_loss: 0.0228 - 227ms/epoch - 3ms/step
Epoch 8/10
82/82 - 0s - loss: 0.0145 - val_loss: 0.0203 - 208ms/epoch - 3ms/step
Epoch 9/10
82/82 - 0s - loss: 0.0168 - val_loss: 0.0045 - 192ms/epoch - 2ms/step
Epoch 10/10
82/82 - 0s - loss: 0.0157 - val_loss: 0.0113 - 236ms/epoch - 3ms/step


<keras.callbacks.History at 0x7f72233dccd0>

# 예측

In [None]:
# Run the trained network on the held-out test features
y_pred = model_lstm.predict(x=X_test)

# 평가

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Root-mean-squared error between the test targets and the predictions.
# Both are in RobustScaler-scaled units, so the figure is not in USD.
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)

4.188858426764705

# 결론

- 무조건 딥러닝이 좋다거나, 무조건 머신러닝이 좋다고 생각하는 것보다 여러 모델을 비교하고, 논문을 자주 찾아보는 게 좋은 자세이다.