# 프로젝트 개요

kaggle의 bitstamp 데이터를 활용해 3가지 분석을 진행

1. 시계열 데이터 분석
2. 컬럼 간 값 범위(scale) 차이 -> scaling
3. RandomForest vs KNN

In [None]:
# Mount Google Drive at /content/gdrive/ so the dataset stored there can be read.
# NOTE(review): google.colab is presumably only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


# Library & Data Import

In [None]:
import pandas as pd
import numpy as np
# MinMaxScaler: rescales data into the 0-1 range based on its min and max values.
from sklearn.preprocessing import MinMaxScaler
# train_test_split: helper that splits data into train and test sets.
from sklearn.model_selection import train_test_split
import os
import random

import warnings
# Suppress all warnings for the rest of the session to keep cell output readable.
warnings.filterwarnings('ignore')

### seed값 고정과정
- seed를 고정하지 않으면 난수 생성기가 실행할 때마다 다른 난수를 만들어 결과가 매번 달라짐
- seed값을 고정시켜줌으로써 다른 사람과 내가 같은 결과를 낼 수 있게 함

In [None]:
# Pin the Python and NumPy random generators to a single seed so that
# rerunning the notebook from the top reproduces the same numbers.
SEED = 2021
random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here presumably affects only child processes, not this running kernel.
os.environ['PYTHONHASHSEED'] = '0'

In [None]:
# Load the Bitstamp USD price CSV from the mounted Drive into a DataFrame.
data = pd.read_csv('/content/gdrive/MyDrive/big_data/time_series/data/bitstampUSD.csv')

In [None]:
# Preview the first five rows to see the columns and spot missing values.
data.head()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
0,1325317920,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
1,1325317980,,,,,,,
2,1325318040,,,,,,,
3,1325318100,,,,,,,
4,1325318160,,,,,,,


- 8개의 column
- NaN값이 많이 보인다.
- Open, Close, High, Low 컬럼은 각각 시가, 종가, 고가, 저가로 추측
- Weighted_Price값이 예측해야 하는 값

# 데이터셋 살펴보기

In [None]:
# Preview only the first three rows.
data.head(3)

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
0,1325317920,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
1,1325317980,,,,,,,
2,1325318040,,,,,,,


In [None]:
# Preview the last five rows (the most recent timestamps).
data.tail()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
4857372,1617148560,58714.31,58714.31,58686.0,58686.0,1.384487,81259.372187,58692.753339
4857373,1617148620,58683.97,58693.43,58683.97,58685.81,7.294848,428158.14664,58693.226508
4857374,1617148680,58693.43,58723.84,58693.43,58723.84,1.705682,100117.07037,58696.198496
4857375,1617148740,58742.18,58770.38,58742.18,58760.59,0.720415,42332.958633,58761.866202
4857376,1617148800,58767.75,58778.18,58755.97,58778.18,2.712831,159417.751,58764.349363


- Timestamp가 커지면서 다른 값들도 커지는 경향이 있다

In [None]:
# Dimensions of the DataFrame as (rows, columns)
data.shape

(4857377, 8)

In [None]:
# Per-column dtypes plus overall memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4857377 entries, 0 to 4857376
Data columns (total 8 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Timestamp          int64  
 1   Open               float64
 2   High               float64
 3   Low                float64
 4   Close              float64
 5   Volume_(BTC)       float64
 6   Volume_(Currency)  float64
 7   Weighted_Price     float64
dtypes: float64(7), int64(1)
memory usage: 296.5 MB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
data.describe()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
count,4857377.0,3613769.0,3613769.0,3613769.0,3613769.0,3613769.0,3613769.0,3613769.0
mean,1471301000.0,6009.024,6013.357,6004.488,6009.014,9.323249,41762.84,6008.935
std,84280190.0,8996.247,9003.521,8988.778,8996.36,30.54989,151824.8,8995.992
min,1325318000.0,3.8,3.8,1.5,1.5,0.0,0.0,3.8
25%,1398179000.0,443.86,444.0,443.52,443.86,0.4097759,452.1422,443.8306
50%,1471428000.0,3596.97,3598.19,3595.62,3597.0,1.979811,3810.124,3596.804
75%,1544288000.0,8627.27,8632.98,8621.09,8627.16,7.278216,25698.21,8627.637
max,1617149000.0,61763.56,61781.83,61673.55,61781.8,5853.852,13900670.0,61716.21


통계치
- count : 결측치를 제외한 데이터의 개수
- mean : 전체 데이터의 평균
- std : 표준편차
- min : 최솟값
- 25%, 50%, 75% : 데이터를 오름차순으로 정렬했을 때 각 백분위 지점의 값(사분위수)
- max : 최댓값

25%, 50%, 75%, max를 비교해서 이상치가 있는지 없는지 추측할 수 있다. (예: 75% 지점의 값에 비해 max가 지나치게 크면 이상치를 의심해볼 수 있다.)

In [None]:
# Missing values: count the NaNs in each column
# (data.isna().sum() is an equivalent spelling)
data.isnull().sum()

Timestamp                  0
Open                 1243608
High                 1243608
Low                  1243608
Close                1243608
Volume_(BTC)         1243608
Volume_(Currency)    1243608
Weighted_Price       1243608
dtype: int64

# 데이터 전처리

In [None]:
# Convert the Timestamp column to datetimes
# unit='s': the raw integers are interpreted as seconds since the epoch
# and install the result as the DataFrame index
data.index = pd.to_datetime(data['Timestamp'], unit='s')
data.head()

Unnamed: 0_level_0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-12-31 07:52:00,1325317920,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
2011-12-31 07:53:00,1325317980,,,,,,,
2011-12-31 07:54:00,1325318040,,,,,,,
2011-12-31 07:55:00,1325318100,,,,,,,
2011-12-31 07:56:00,1325318160,,,,,,,


- 데이터가 분단위로 구성

In [None]:
# The timestamps now live in the index, so the original Timestamp column is redundant.
data = data.drop(columns=['Timestamp'])
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-12-31 07:52:00,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
2011-12-31 07:53:00,,,,,,,
2011-12-31 07:54:00,,,,,,,
2011-12-31 07:55:00,,,,,,,
2011-12-31 07:56:00,,,,,,,


- 시간을 분단위가 아닌 일단위로 바꿔준다.
> 같은 날짜를 가진 row의 평균을 내서 날짜 단위로 묶어준다

In [None]:
# Downsample to one row per calendar day: every column becomes the mean of
# that day's rows (missing values are skipped when averaging).
data = data.resample('D').mean()
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-12-31,4.465,4.4825,4.465,4.4825,23.82947,106.330084,4.471603
2012-01-01,4.806667,4.806667,4.806667,4.806667,7.200667,35.25972,4.806667
2012-01-02,5.0,5.0,5.0,5.0,19.048,95.24,5.0
2012-01-03,5.2525,5.2525,5.2525,5.2525,11.00466,58.100651,5.2525
2012-01-04,5.2,5.223333,5.2,5.223333,11.914807,63.119577,5.208159


In [None]:
# Drop rows that still contain missing values, then confirm that
# no column has any NaN left.
data = data.dropna()
data.isnull().sum()

Open                 0
High                 0
Low                  0
Close                0
Volume_(BTC)         0
Volume_(Currency)    0
Weighted_Price       0
dtype: int64

## scaling

컬럼마다 값의 범위(scale)가 크게 다르면 학습에 악영향을 줄 수 있으므로, scaling으로 범위를 맞춰준다.

In [None]:
from sklearn.preprocessing import RobustScaler

rob_scaler = RobustScaler()

# Scale each raw column on its own and store the result under the same name
# with a lower-cased first letter (e.g. 'Open' -> 'open').
# The single scaler object is re-fitted on every pass, so after the loop it
# only remembers the statistics of the last column ('Weighted_Price').
# NOTE(review): the scaler is fitted on every row, including the rows that
# later become the test set, so test-period statistics leak into training.
for raw_name in [
    'Open',
    'High',
    'Low',
    'Close',
    'Volume_(BTC)',
    'Volume_(Currency)',
    'Weighted_Price',
]:
    # fit_transform expects a 2-D array, hence the (-1, 1) reshape.
    column_2d = data[raw_name].values.reshape(-1, 1)
    data[raw_name[0].lower() + raw_name[1:]] = rob_scaler.fit_transform(column_2d)
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price,open,high,low,close,volume_(BTC),volume_(Currency),weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2011-12-31,4.465,4.4825,4.465,4.4825,23.82947,106.330084,4.471603,-0.099101,-0.09906,-0.099124,-0.099099,1.919561,-0.196897,-0.099097
2012-01-01,4.806667,4.806667,4.806667,4.806667,7.200667,35.25972,4.806667,-0.099053,-0.099014,-0.099075,-0.099052,-0.049931,-0.198978,-0.099049
2012-01-02,5.0,5.0,5.0,5.0,19.048,95.24,5.0,-0.099025,-0.098987,-0.099047,-0.099025,1.35325,-0.197222,-0.099022
2012-01-03,5.2525,5.2525,5.2525,5.2525,11.00466,58.100651,5.2525,-0.098989,-0.09895,-0.099011,-0.098989,0.400609,-0.198309,-0.098986
2012-01-04,5.2,5.223333,5.2,5.223333,11.914807,63.119577,5.208159,-0.098996,-0.098955,-0.099019,-0.098993,0.508405,-0.198162,-0.098992


In [None]:
# Keep only the scaled columns: remove the unscaled originals in place.
raw_columns = [
    'Open',
    'High',
    'Low',
    'Close',
    'Volume_(BTC)',
    'Volume_(Currency)',
    'Weighted_Price',
]
data.drop(columns=raw_columns, inplace=True)

In [None]:
# Preview the first five rows after the raw columns were dropped.
data.head()

Unnamed: 0_level_0,open,high,low,close,volume_(BTC),volume_(Currency),weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-12-31,-0.099101,-0.09906,-0.099124,-0.099099,1.919561,-0.196897,-0.099097
2012-01-01,-0.099053,-0.099014,-0.099075,-0.099052,-0.049931,-0.198978,-0.099049
2012-01-02,-0.099025,-0.098987,-0.099047,-0.099025,1.35325,-0.197222,-0.099022
2012-01-03,-0.098989,-0.09895,-0.099011,-0.098989,0.400609,-0.198309,-0.098986
2012-01-04,-0.098996,-0.098955,-0.099019,-0.098993,0.508405,-0.198162,-0.098992


## train, test 데이터 분리

**시간순으로 이루어진 데이터들을 이용해 test가 미래의 값을 예측하길 원함**
- train과 test를 단순히 섞고, split하는 개념이 아니다.
> overfitting, cheating 문제가 생길 수 있다.
- train은 이전 시점, test는 미래 시점이 돼야 한다.

In [None]:
# Chronological hold-out: the final 120 rows (one row per day after the daily
# resampling) form the test set; every earlier row goes to the training set.
holdout_rows = 120
train = data.iloc[:-holdout_rows]
test = data.iloc[-holdout_rows:]

## 샘플링(섞기)

In [None]:
# frac=1 samples every row, i.e. returns the same rows in a random order.
# Each split is shuffled separately, so no test row moves into train.
# NOTE(review): no random_state is passed; reproducibility presumably relies on
# the np.random.seed call made earlier in the notebook — confirm.
train = train.sample(frac=1)
test = test.sample(frac=1)

In [None]:
# Separate the prediction target from the feature columns in both splits.
target_col = 'weighted_Price'

X_train, y_train = train.drop(columns=[target_col]), train[target_col]
X_test, y_test = test.drop(columns=[target_col]), test[target_col]

## DataFrame -> Numpy 변환

In [None]:
# Replace the pandas containers with plain NumPy arrays before model fitting.
X_train, y_train = X_train.to_numpy(), y_train.to_numpy()
X_test, y_test = X_test.to_numpy(), y_test.to_numpy()

# 모델 학습(Random forest)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a random forest of 100 trees, each limited to depth 10, on the training rows.
# NOTE(review): random_state is not passed, so the fitted forest presumably
# depends on the global NumPy RNG state when this cell runs — consider pinning it.
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10)
random_forest.fit(X_train, y_train)

RandomForestRegressor(max_depth=10)

# 예측

In [None]:
# Predict the (scaled) target for the held-out test features.
y_pred = random_forest.predict(X_test)

# 평가

MSE (Mean Squared Error)
-> 회귀 모델의 성능 확인을 위한 평가 지표 (sklearn.metrics에서 제공)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# RMSE is the square root of the mean squared error. Because y_test holds the
# scaled weighted_Price, the value is in scaled units rather than USD.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmse

3.422446034113787

---

# 모델 학습(KNN)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Fit a k-nearest-neighbours regressor that uses the 2 closest training rows.
knn = KNeighborsRegressor(n_neighbors=2)
knn.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=2)

# 예측

In [None]:
# Predict the (scaled) target for the test features with the KNN model.
# This rebinds y_pred, replacing whatever predictions it held before.
y_pred = knn.predict(X_test)

# 평가

In [None]:
# RMSE of the KNN predictions, computed as the square root of the mean squared
# error; the value is in scaled target units, not USD.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmse

3.5168008051237116

---

# 결론
- RMSE는 값이 작을수록 예측 오차가 작으므로 성능이 좋다고 말할 수 있다.
- 이번 실험에서는 RandomForest(RMSE 약 3.42)가 KNN(RMSE 약 3.52)보다 RMSE가 낮아 성능이 좋다.