# 암호화폐(BitCoin)의 가격 예측 모델

### 필요 모듈 설치
> pyupbit 모듈은 업비트 API를 파이썬에서 쉽게 사용할 수 있도록 개발된 래퍼(wrapper) 라이브러리이다. REST API를 직접 호출하는 코드를 작성할 필요 없이 함수 호출만으로 시세 데이터를 가져올 수 있어 간편하다.

[Github 문서](https://github.com/sharebook-kr/pyupbit)

In [None]:
!pip install pyupbit
!pip3 install xgboost
!pip3 install bayesian-optimization

In [3]:
import pyupbit

### 업비트에서 일봉(day)차트의 데이터 가져오기
* `open`: 시가
* `high`: 고가
* `low`: 저가
* `close`: 종가
* `volume`: 거래량
* `value`: 거래대금

In [4]:
# Raw daily candles for the KRW-BTC market. `count` is an upper bound: the
# exchange only returns as many candles as it actually has.
# NOTE(review): `period` is presumably the pause in seconds between the paged
# requests pyupbit issues for large counts - confirm against the pyupbit docs.
df_origin = pyupbit.get_ohlcv("KRW-BTC", interval="day", count=2000, period=1)

# get_ohlcv returns None instead of raising when the request fails, so fail
# loudly here rather than with an opaque AttributeError on the next line.
if df_origin is None:
    raise RuntimeError("pyupbit.get_ohlcv returned no data for KRW-BTC")

print(df_origin.shape)
df_origin.head()

(1832, 6)


Unnamed: 0,open,high,low,close,volume,value
2017-09-25 09:00:00,4201000.0,4333000.0,4175000.0,4322000.0,132.484755,560214600.0
2017-09-26 09:00:00,4317000.0,4418000.0,4311000.0,4321000.0,22.78834,99507240.0
2017-09-27 09:00:00,4322000.0,4677000.0,4318000.0,4657000.0,32.269662,144827600.0
2017-09-28 09:00:00,4657000.0,4772000.0,4519000.0,4586000.0,80.588243,372186000.0
2017-09-29 09:00:00,4586000.0,4709000.0,4476000.0,4657000.0,59.352373,272455800.0


### EDA, Feature engineering
* 새로운 컬럼을 생성
  * `candle`: 당일 캔들 몸통(종가 − 시가)을 전날 캔들 몸통의 크기로 나눈 비율(음봉은 음수, 양봉은 양수)
  * `volume_ratio`: 전날 거래량 대비 당일 거래량의 비율
  * `profit_loss`: 다음 날이 양봉(종가 > 시가)이면 이득(1), 그렇지 않으면 손실(0)을 나타내는 타깃 컬럼

In [40]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Work on a copy so the raw download stays untouched.
df = df_origin.copy()

# Signed candle body: positive on a bullish day, negative on a bearish one.
body = df['close'] - df['open']

# candle: today's signed body relative to the size of yesterday's body.
# The +1 only guards against a zero denominator after a flat day, so it is
# added outside abs(); inside abs() a previous body of exactly -1 would still
# divide by zero and the guard would be asymmetric for bearish days.
# The first row has no previous day and becomes NaN.
df['candle'] = (body / (body.shift(1).abs() + 1)).round(2)

# volume_ratio: today's traded volume divided by yesterday's (first row NaN).
df['volume_ratio'] = (df['volume'] / df['volume'].shift(1)).round(2)

# profit_loss (target): 1 if the NEXT day closes above its open, else 0.
# The last row has no next day, so it is left as NaN.
next_body = body.shift(-1)
df['profit_loss'] = (next_body > 0).astype(float).where(next_body.notna())

# Drop the rows with undefined features/target (the first and the last row).
df = df.dropna()

# Prices and the label are whole numbers; store them as integers.
df = df.astype({'open':int, 'high':int, 'low':int, 'close':int, 'profit_loss':int})
df

Unnamed: 0,open,high,low,close,volume,value,candle,volume_ratio,profit_loss
2017-09-26 09:00:00,4317000,4418000,4311000,4321000,22.788340,9.950724e+07,0.03,0.17,1
2017-09-27 09:00:00,4322000,4677000,4318000,4657000,32.269662,1.448276e+08,83.73,1.42,0
2017-09-28 09:00:00,4657000,4772000,4519000,4586000,80.588243,3.721860e+08,-0.21,2.50,1
2017-09-29 09:00:00,4586000,4709000,4476000,4657000,59.352373,2.724558e+08,1.00,0.74,1
2017-09-30 09:00:00,4657000,4896000,4651000,4895000,19.998483,9.561476e+07,3.35,0.34,1
...,...,...,...,...,...,...,...,...,...
2022-09-25 09:00:00,27270000,27499000,26910000,27051000,2488.111300,6.783023e+10,-0.68,0.86,1
2022-09-26 09:00:00,27051000,27749000,27000000,27597000,4639.446161,1.270504e+11,2.49,1.86,0
2022-09-27 09:00:00,27604000,29112000,27130000,27501000,11326.095177,3.191365e+11,-0.19,2.44,1
2022-09-28 09:00:00,27495000,28421000,26800000,27952000,7772.212439,2.141910e+11,4.44,0.69,1


### 특성과 타겟을 분리, 학습과 테스트 데이터를 분리

In [6]:
from sklearn.model_selection import train_test_split

# Label column and the two engineered input columns.
target = 'profit_loss'
features = ['candle', 'volume_ratio']

y = df[target]
X = df[features]

# 80/20 hold-out split, stratified on the label so both parts keep the same
# class balance; the seed makes the split repeatable.
# NOTE(review): train_test_split shuffles rows by default, so test days are
# interleaved with training days - confirm this is intended for daily candles.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### 데이터 표준화 및 기준모델의 정확도 계산

In [37]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler_std = StandardScaler()

X_train_std = scaler_std.fit_transform(X_train)
X_test_std = scaler_std.transform(X_test)


# Baseline model: always predict the most frequent class of the training set.
base = y_train.mode()[0]
baseline = len(y_train) * [base]
y_pred_base = len(y_test) * [base]

# Baseline accuracy. The built-in round() is used instead of the `.round()`
# method because accuracy_score may return a plain Python float, which has no
# such method; round() works for both NumPy and built-in floats.
print("Train accuracy:", round(accuracy_score(y_train, baseline), 3))
print("Test accuracy:", round(accuracy_score(y_test, y_pred_base), 3))

Train accuracy: 0.526
Test accuracy: 0.525


### 기준모델의 Confusion Matrix

In [36]:
# Number of true samples per class in the test split. reindex() guarantees
# both labels are present (count 0) even if one never occurs in y_test.
true_counts = y_test.value_counts().reindex([0, 1], fill_value=0)
y_true0 = true_counts[0]
y_true1 = true_counts[1]

# The baseline predicts the single class `base` for every sample, so all true
# samples land in that one "Predicted" column and the other column stays zero.
# The column is derived from `base` rather than hard-coded to class 1.
confusion_base = pd.DataFrame(
    0,
    index=['True 0', 'True 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
confusion_base[f'Predicted {base}'] = [y_true0, y_true1]
confusion_base

Unnamed: 0,Predicted 0,Predicted 1
True 0,0,174
True 1,0,192


## 로지스틱 모델

### 모델의 정확도

In [41]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score

# Logistic regression with 5-fold cross-validation, fitted on the
# standardised training features.
logCV_model = LogisticRegressionCV(cv=5)
logCV_model.fit(X_train_std, y_train)

# Training accuracy. The built-in round() is used because score() may return
# a plain Python float, which has no `.round()` method.
print("Train accuracy:", round(logCV_model.score(X_train_std, y_train), 3))

Train accuracy: 0.527


### 평가지표

In [39]:
from sklearn.metrics import classification_report

# Predict the held-out split with the fitted logistic model.
y_pred_test_log = logCV_model.predict(X_test_std)

# Per-class precision / recall / F1. When a class is never predicted its
# precision is undefined; zero_division=0 reports it as 0 explicitly instead
# of emitting an UndefinedMetricWarning for every averaged metric.
print(classification_report(y_test, y_pred_test_log, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       174
           1       0.52      1.00      0.69       192

    accuracy                           0.52       366
   macro avg       0.26      0.50      0.34       366
weighted avg       0.28      0.52      0.36       366



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 랜덤포레스트 모델

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the unscaled features. oob_score=True additionally
# evaluates the forest on out-of-bag samples. random_state is fixed, as in
# the other cells of this notebook, so the run is reproducible.
rf_model = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42)
rf_model.fit(X_train, y_train)

# The built-in round() is used because these scores may be plain Python
# floats, which have no `.round()` method.
print("Train accuracy:", round(rf_model.score(X_train, y_train), 3))
print("Out-of-bag 샘플의 정확도:", round(rf_model.oob_score_, 3))

Train accuracy: 0.997
Out-of-bag 샘플의 정확도: 0.51


In [16]:
# Evaluate the random forest on the held-out split and show the per-class
# precision / recall / F1 table.
y_pred_test_rf = rf_model.predict(X_test)
rf_report = classification_report(y_test, y_pred_test_rf)
print(rf_report)

              precision    recall  f1-score   support

           0       0.48      0.41      0.44       174
           1       0.53      0.60      0.56       192

    accuracy                           0.51       366
   macro avg       0.51      0.51      0.50       366
weighted avg       0.51      0.51      0.51       366



## XGBoost

In [31]:
from xgboost import XGBClassifier

# Gradient-boosted trees on the unscaled features, evaluated with the
# classification error metric.
xgb_model = XGBClassifier(
    eval_metric="error",
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
    learning_rate=0.1,
    use_label_encoder=False
)

xgb_model.fit(X_train, y_train)

# Training accuracy. The label matches the other model cells (the original
# read "Traing accuracy"), and the built-in round() is used because score()
# may return a plain Python float, which has no `.round()` method.
print("Train accuracy:", round(xgb_model.score(X_train, y_train), 3))

# Per-class metrics on the held-out split.
y_pred_xgb = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

Traing accuracy 0.727
              precision    recall  f1-score   support

           0       0.46      0.41      0.43       174
           1       0.51      0.55      0.53       192

    accuracy                           0.49       366
   macro avg       0.48      0.48      0.48       366
weighted avg       0.48      0.49      0.48       366

