In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib

In [71]:
data = pd.read_csv("./data/ETFs_main.csv")
data

Unnamed: 0,Dates,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO
0,2007-02-20,146.04,145.56,146.200,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055
1,2007-02-21,145.98,145.61,146.070,145.0,63971500.0,67.28,82.90,2.3653,0.32,49.86,25.12,10.20,39.975
2,2007-02-22,145.87,146.05,146.420,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.220
3,2007-02-23,145.30,145.74,145.790,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035
4,2007-02-26,145.17,145.83,145.950,145.0,69320062.0,68.10,83.08,2.3795,0.31,50.90,25.04,11.15,39.960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2766,2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.87,1.7807,0.48,9.72,25.77,28.38,38.180
2767,2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.87,1.7651,0.48,9.57,25.94,30.11,37.870
2768,2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.55,1.7505,0.40,9.29,25.55,36.07,37.320
2769,2018-12-27,248.07,242.57,248.290,239.0,186267297.0,120.57,86.00,1.7581,0.44,9.62,25.57,29.96,37.900


* CLOSE_SPY: SPY(S&P 500 ETF)의 종가. SPY는 S&P 500 지수를 추종하는 ETF로, 미국 주식시장의 전반적인 성과를 반영합니다.
* OPEN: SPY의 시가. 거래가 시작될 때의 가격을 의미합니다.
* HIGH: SPY의 고가. 해당 거래일 동안 기록된 가장 높은 가격입니다.
* LOW: SPY의 저가. 해당 거래일 동안 기록된 가장 낮은 가격입니다.
* VOLUME: SPY의 거래량. 해당 거래일 동안 주식이 얼마나 많이 거래되었는지를 나타냅니다.
* CLOSE_GLD: GLD(Gold ETF)의 종가. 금 가격을 추종하는 ETF로, 금의 시장 가격을 반영합니다.
* CLOSE_FXY: FXY(Japanese Yen ETF)의 종가. 일본 엔화의 성과를 추종하는 ETF입니다.
* CLOSE_T10Y2Y: 10년 만기 미국 국채와 2년 만기 국채 간의 금리 차이를 나타내는 지표입니다. 일반적으로 경기 예측에 사용됩니다.
* CLOSE_TED: TED 스프레드(TED Spread)로, 미국 3개월 만기 재무부채권 금리와 3개월 만기 유로달러 금리 차이를 나타냅니다. 금융시장의 위험을 측정하는 지표로 자주 사용됩니다.
* CLOSE_USO: USO(Crude Oil ETF)의 종가. 원유 가격을 추종하는 ETF입니다.
* CLOSE_UUP: UUP(U.S. Dollar ETF)의 종가. 미국 달러 지수를 추종하는 ETF로, 달러의 가치 변동을 반영합니다.
* CLOSE_VIX: VIX(Volatility Index) 종가. 흔히 '공포 지수'라고도 불리며, 시장의 변동성(주로 S&P 500의 향후 30일간의 예상 변동성)을 나타냅니다. 
* CLOSE_VWO: VWO(Emerging Markets ETF)의 종가. 신흥 시장 주식에 투자하는 ETF입니다.

In [72]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2771 entries, 0 to 2770
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Dates         2771 non-null   object 
 1   CLOSE_SPY     2771 non-null   float64
 2   OPEN          2771 non-null   float64
 3   HIGH          2771 non-null   float64
 4   LOW           2771 non-null   float64
 5   VOLUME        2771 non-null   float64
 6   CLOSE_GLD     2771 non-null   float64
 7   CLOSE_FXY     2771 non-null   float64
 8   CLOSE_T10Y2Y  2771 non-null   float64
 9   CLOSE_TED     2771 non-null   float64
 10  CLOSE_USO     2771 non-null   float64
 11  CLOSE_UUP     2771 non-null   float64
 12  CLOSE_VIX     2771 non-null   float64
 13  CLOSE_VWO     2771 non-null   float64
dtypes: float64(13), object(1)
memory usage: 303.2+ KB


In [73]:
data['Dates'] = pd.to_datetime(data['Dates'])
data['Dates'].dtype

dtype('<M8[ns]')

In [74]:
data.head(2)

Unnamed: 0,Dates,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO
0,2007-02-20,146.04,145.56,146.2,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055
1,2007-02-21,145.98,145.61,146.07,145.0,63971500.0,67.28,82.9,2.3653,0.32,49.86,25.12,10.2,39.975


In [75]:
data = data.set_index('Dates')
data

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2007-02-20,146.04,145.56,146.200,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055
2007-02-21,145.98,145.61,146.070,145.0,63971500.0,67.28,82.90,2.3653,0.32,49.86,25.12,10.20,39.975
2007-02-22,145.87,146.05,146.420,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.220
2007-02-23,145.30,145.74,145.790,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035
2007-02-26,145.17,145.83,145.950,145.0,69320062.0,68.10,83.08,2.3795,0.31,50.90,25.04,11.15,39.960
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.87,1.7807,0.48,9.72,25.77,28.38,38.180
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.87,1.7651,0.48,9.57,25.94,30.11,37.870
2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.55,1.7505,0.40,9.29,25.55,36.07,37.320
2018-12-27,248.07,242.57,248.290,239.0,186267297.0,120.57,86.00,1.7581,0.44,9.62,25.57,29.96,37.900


# 분석에 사용할 기술적 지표 만들기
* MA_45: 45일 단순 이동평균(Simple Moving Average): 특정 자산의 45일간 평균 종가
* VMA_45: 45일 거래량 (Volume Moving Average): 지난 45일간의 평균 거래량
* RSI_14: 14일 상대강도지수(Relative Strength Index): 14일 동안의 자산 가격 변동을 바탕으로 과매수 또는 과매도 상태를 평가하는 지표

## RSI(Reletive Strength Index) 공식
* RSI는 자산의 가격 변동 강도를 측정해 과매수, 또는 과매도 상태를 평가하는 기술적 분석 지표/
* 주로 14일 동안의 가격 변동 기준으로 계산, 값은 0 - 100사이
* RSI 값이 70이상: 과매수 상태(매도 시점을 고려)
* RSI 값이 30이하: 과매도 상태(매수 시점을 고려)
* RSI는 주가가 너무 빠르게 상승하거나 하락했는지 확인하는데 사용
* 투자자들의 매수/매도 결정을 내릴 때 중요한 참고 지표로 활용

### RSI 계산 공식
$$ RSI = {100 - {100 \over 1 + RS}} $$
### RS(Relative Strength)
$$ RS = {Average Gain \over Average Loss} $$
* Average Gain: 일정 기간(14일)동안 가격 상승분의 평균 
* Average Loss: 일정 기간(14일)동안 가격 하락분의 평균
* 상승 Gain = 오늘의 종가 - 어제의 종가
* 하락 Loss = 어제의 종가 - 오늘의 종가


In [76]:
days = 45

단순이동평균 구해서 데이터 프레임에 합치기

In [77]:
data.columns

Index(['CLOSE_SPY', 'OPEN', 'HIGH', 'LOW', 'VOLUME', 'CLOSE_GLD', 'CLOSE_FXY',
       'CLOSE_T10Y2Y', 'CLOSE_TED', 'CLOSE_USO', 'CLOSE_UUP', 'CLOSE_VIX',
       'CLOSE_VWO'],
      dtype='object')

In [78]:
ma = pd.Series(data['CLOSE_SPY'].rolling(window=days).mean(), name='MA_' + str(days))
ma

Dates
2007-02-20           NaN
2007-02-21           NaN
2007-02-22           NaN
2007-02-23           NaN
2007-02-26           NaN
                 ...    
2018-12-20    269.767778
2018-12-21    269.018889
2018-12-24    267.995333
2018-12-27    267.275778
2018-12-28    266.639111
Name: MA_45, Length: 2771, dtype: float64

In [79]:
data = data.join(ma)
data

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2007-02-20,146.04,145.56,146.200,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055,
2007-02-21,145.98,145.61,146.070,145.0,63971500.0,67.28,82.90,2.3653,0.32,49.86,25.12,10.20,39.975,
2007-02-22,145.87,146.05,146.420,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.220,
2007-02-23,145.30,145.74,145.790,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035,
2007-02-26,145.17,145.83,145.950,145.0,69320062.0,68.10,83.08,2.3795,0.31,50.90,25.04,11.15,39.960,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.87,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.87,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889
2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.55,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333
2018-12-27,248.07,242.57,248.290,239.0,186267297.0,120.57,86.00,1.7581,0.44,9.62,25.57,29.96,37.900,267.275778


In [80]:
# 거래량 이동평균 vma
vma = pd.Series(data['VOLUME'].rolling(days).mean(), name="VMA_" + str(days))
vma

Dates
2007-02-20             NaN
2007-02-21             NaN
2007-02-22             NaN
2007-02-23             NaN
2007-02-26             NaN
                  ...     
2018-12-20    1.240592e+08
2018-12-21    1.274610e+08
2018-12-24    1.281067e+08
2018-12-27    1.297876e+08
2018-12-28    1.301996e+08
Name: VMA_45, Length: 2771, dtype: float64

In [81]:
data = data.join(vma)
data

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2007-02-20,146.04,145.56,146.200,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055,,
2007-02-21,145.98,145.61,146.070,145.0,63971500.0,67.28,82.90,2.3653,0.32,49.86,25.12,10.20,39.975,,
2007-02-22,145.87,146.05,146.420,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.220,,
2007-02-23,145.30,145.74,145.790,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035,,
2007-02-26,145.17,145.83,145.950,145.0,69320062.0,68.10,83.08,2.3795,0.31,50.90,25.04,11.15,39.960,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.87,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.87,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08
2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.55,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333,1.281067e+08
2018-12-27,248.07,242.57,248.290,239.0,186267297.0,120.57,86.00,1.7581,0.44,9.62,25.57,29.96,37.900,267.275778,1.297876e+08


RSI지수 구하기

In [82]:
# diff 를 통해 변화량 구하기
delta = data['CLOSE_SPY'].diff()
delta

Dates
2007-02-20      NaN
2007-02-21    -0.06
2007-02-22    -0.11
2007-02-23    -0.57
2007-02-26    -0.13
              ...  
2018-12-20    -4.09
2018-12-21    -6.47
2018-12-24    -6.36
2018-12-27    13.73
2018-12-28    -0.32
Name: CLOSE_SPY, Length: 2771, dtype: float64

In [83]:
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
gain

Dates
2007-02-20         NaN
2007-02-21         NaN
2007-02-22         NaN
2007-02-23         NaN
2007-02-26         NaN
                ...   
2018-12-20    0.515000
2018-12-21    0.515000
2018-12-24    0.395714
2018-12-27    1.115714
2018-12-28    1.115714
Name: CLOSE_SPY, Length: 2771, dtype: float64

In [84]:
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
loss

Dates
2007-02-20         NaN
2007-02-21         NaN
2007-02-22         NaN
2007-02-23         NaN
2007-02-26         NaN
                ...   
2018-12-20    2.472857
2018-12-21    2.892143
2018-12-24    3.346429
2018-12-27    3.346429
2018-12-28    2.722857
Name: CLOSE_SPY, Length: 2771, dtype: float64

상대강도 RS 계산

In [85]:
RS = gain / loss
RS

Dates
2007-02-20         NaN
2007-02-21         NaN
2007-02-22         NaN
2007-02-23         NaN
2007-02-26         NaN
                ...   
2018-12-20    0.208261
2018-12-21    0.178069
2018-12-24    0.118250
2018-12-27    0.333404
2018-12-28    0.409759
Name: CLOSE_SPY, Length: 2771, dtype: float64

RSI 공식에 따라서 RSI 값 계산
* 70 이상이면 과매수, 30 이하면 과매도

In [86]:
RSI = 100 - ( 100 / (1 + RS))
RSI

Dates
2007-02-20          NaN
2007-02-21          NaN
2007-02-22          NaN
2007-02-23          NaN
2007-02-26          NaN
                ...    
2018-12-20    17.236433
2018-12-21    15.115304
2018-12-24    10.574537
2018-12-27    25.004002
2018-12-28    29.065873
Name: CLOSE_SPY, Length: 2771, dtype: float64

In [87]:
RSI.name = 'RSI_14' 
RSI

Dates
2007-02-20          NaN
2007-02-21          NaN
2007-02-22          NaN
2007-02-23          NaN
2007-02-26          NaN
                ...    
2018-12-20    17.236433
2018-12-21    15.115304
2018-12-24    10.574537
2018-12-27    25.004002
2018-12-28    29.065873
Name: RSI_14, Length: 2771, dtype: float64

In [88]:
data = data.join(RSI)
data

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2007-02-20,146.04,145.56,146.200,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055,,,
2007-02-21,145.98,145.61,146.070,145.0,63971500.0,67.28,82.90,2.3653,0.32,49.86,25.12,10.20,39.975,,,
2007-02-22,145.87,146.05,146.420,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.220,,,
2007-02-23,145.30,145.74,145.790,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035,,,
2007-02-26,145.17,145.83,145.950,145.0,69320062.0,68.10,83.08,2.3795,0.31,50.90,25.04,11.15,39.960,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.87,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08,17.236433
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.87,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08,15.115304
2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.55,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333,1.281067e+08,10.574537
2018-12-27,248.07,242.57,248.290,239.0,186267297.0,120.57,86.00,1.7581,0.44,9.62,25.57,29.96,37.900,267.275778,1.297876e+08,25.004002


In [89]:
data = data.dropna()
data

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2007-04-27,149.53,149.09,149.740,149.0,106984094.0,67.56,83.7300,2.4474,0.55,51.84,24.54,12.45,41.750,143.551556,1.106696e+08,83.438685
2007-04-30,148.29,149.64,149.740,148.0,100874203.0,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,1.116466e+08,70.956720
2007-05-02,149.54,148.90,149.950,149.0,87129805.0,66.66,83.3800,2.4366,0.59,49.59,24.66,13.08,42.020,143.680667,1.121613e+08,79.237288
2007-05-03,150.35,149.97,150.400,149.0,87204945.0,67.49,83.1100,2.4346,0.60,49.28,24.69,13.09,42.435,143.780222,1.123421e+08,79.604579
2007-05-04,150.92,150.75,151.120,150.0,96408930.0,68.19,83.2300,2.4006,0.60,48.30,24.60,12.91,42.595,143.905111,1.128853e+08,79.411765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.8700,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08,17.236433
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.8700,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08,15.115304
2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.5500,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333,1.281067e+08,10.574537
2018-12-27,248.07,242.57,248.290,239.0,186267297.0,120.57,86.0000,1.7581,0.44,9.62,25.57,29.96,37.900,267.275778,1.297876e+08,25.004002


타겟 변수 생성 pct_change 변동률을 구해주는 함수

In [90]:
data.loc[:, 'pct_change'] = data['CLOSE_SPY'].pct_change()
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:, 'pct_change'] = data['CLOSE_SPY'].pct_change()


Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14,pct_change
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2007-04-27,149.53,149.09,149.740,149.0,106984094.0,67.56,83.7300,2.4474,0.55,51.84,24.54,12.45,41.750,143.551556,1.106696e+08,83.438685,
2007-04-30,148.29,149.64,149.740,148.0,100874203.0,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,1.116466e+08,70.956720,-0.008293
2007-05-02,149.54,148.90,149.950,149.0,87129805.0,66.66,83.3800,2.4366,0.59,49.59,24.66,13.08,42.020,143.680667,1.121613e+08,79.237288,0.008429
2007-05-03,150.35,149.97,150.400,149.0,87204945.0,67.49,83.1100,2.4346,0.60,49.28,24.69,13.09,42.435,143.780222,1.123421e+08,79.604579,0.005417
2007-05-04,150.92,150.75,151.120,150.0,96408930.0,68.19,83.2300,2.4006,0.60,48.30,24.60,12.91,42.595,143.905111,1.128853e+08,79.411765,0.003791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.8700,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08,17.236433,-0.016278
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.8700,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08,15.115304,-0.026176
2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.5500,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333,1.281067e+08,10.574537,-0.026423
2018-12-27,248.07,242.57,248.290,239.0,186267297.0,120.57,86.0000,1.7581,0.44,9.62,25.57,29.96,37.900,267.275778,1.297876e+08,25.004002,0.058590


수익이 났으면 1, 손해가 나면 0

In [91]:
data.loc[:,'target'] = np.where(data['pct_change'] > 0, 1, 0)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:,'target'] = np.where(data['pct_change'] > 0, 1, 0)


Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14,pct_change,target
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2007-04-27,149.53,149.09,149.740,149.0,106984094.0,67.56,83.7300,2.4474,0.55,51.84,24.54,12.45,41.750,143.551556,1.106696e+08,83.438685,,0
2007-04-30,148.29,149.64,149.740,148.0,100874203.0,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,1.116466e+08,70.956720,-0.008293,0
2007-05-02,149.54,148.90,149.950,149.0,87129805.0,66.66,83.3800,2.4366,0.59,49.59,24.66,13.08,42.020,143.680667,1.121613e+08,79.237288,0.008429,1
2007-05-03,150.35,149.97,150.400,149.0,87204945.0,67.49,83.1100,2.4346,0.60,49.28,24.69,13.09,42.435,143.780222,1.123421e+08,79.604579,0.005417,1
2007-05-04,150.92,150.75,151.120,150.0,96408930.0,68.19,83.2300,2.4006,0.60,48.30,24.60,12.91,42.595,143.905111,1.128853e+08,79.411765,0.003791,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.8700,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08,17.236433,-0.016278,0
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.8700,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08,15.115304,-0.026176,0
2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.5500,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333,1.281067e+08,10.574537,-0.026423,0
2018-12-27,248.07,242.57,248.290,239.0,186267297.0,120.57,86.0000,1.7581,0.44,9.62,25.57,29.96,37.900,267.275778,1.297876e+08,25.004002,0.058590,1


In [92]:
data = data.dropna()
data

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14,pct_change,target
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2007-04-30,148.29,149.64,149.740,148.0,100874203.0,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,1.116466e+08,70.956720,-0.008293,0
2007-05-02,149.54,148.90,149.950,149.0,87129805.0,66.66,83.3800,2.4366,0.59,49.59,24.66,13.08,42.020,143.680667,1.121613e+08,79.237288,0.008429,1
2007-05-03,150.35,149.97,150.400,149.0,87204945.0,67.49,83.1100,2.4346,0.60,49.28,24.69,13.09,42.435,143.780222,1.123421e+08,79.604579,0.005417,1
2007-05-04,150.92,150.75,151.120,150.0,96408930.0,68.19,83.2300,2.4006,0.60,48.30,24.60,12.91,42.595,143.905111,1.128853e+08,79.411765,0.003791,1
2007-05-08,150.75,150.58,150.920,150.0,80583938.0,67.88,83.3700,2.3913,0.60,48.64,24.73,13.21,42.360,144.029111,1.131357e+08,74.368231,-0.001126,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.8700,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08,17.236433,-0.016278,0
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.8700,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08,15.115304,-0.026176,0
2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.5500,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333,1.281067e+08,10.574537,-0.026423,0
2018-12-27,248.07,242.57,248.290,239.0,186267297.0,120.57,86.0000,1.7581,0.44,9.62,25.57,29.96,37.900,267.275778,1.297876e+08,25.004002,0.058590,1


미래 예측을 위해서 타겟 변수를 shift(-1) 이동.<br>
현재 행의 주가 데이터를 다음 행의 주가 데이터로 미리 이동시켜서 오늘 데이터를 기반으로 내일 주가를 예측하는 구조를 만듬

In [93]:
# 다음날 예측을 위한 타겟 변수 shift
data['target'] = data['target'].shift(-1)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['target'] = data['target'].shift(-1)


Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14,pct_change,target
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2007-04-30,148.29,149.64,149.740,148.0,100874203.0,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,1.116466e+08,70.956720,-0.008293,1.0
2007-05-02,149.54,148.90,149.950,149.0,87129805.0,66.66,83.3800,2.4366,0.59,49.59,24.66,13.08,42.020,143.680667,1.121613e+08,79.237288,0.008429,1.0
2007-05-03,150.35,149.97,150.400,149.0,87204945.0,67.49,83.1100,2.4346,0.60,49.28,24.69,13.09,42.435,143.780222,1.123421e+08,79.604579,0.005417,1.0
2007-05-04,150.92,150.75,151.120,150.0,96408930.0,68.19,83.2300,2.4006,0.60,48.30,24.60,12.91,42.595,143.905111,1.128853e+08,79.411765,0.003791,0.0
2007-05-08,150.75,150.58,150.920,150.0,80583938.0,67.88,83.3700,2.3913,0.60,48.64,24.73,13.21,42.360,144.029111,1.131357e+08,74.368231,-0.001126,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.8700,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08,17.236433,-0.016278,0.0
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.8700,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08,15.115304,-0.026176,0.0
2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.5500,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333,1.281067e+08,10.574537,-0.026423,1.0
2018-12-27,248.07,242.57,248.290,239.0,186267297.0,120.57,86.0000,1.7581,0.44,9.62,25.57,29.96,37.900,267.275778,1.297876e+08,25.004002,0.058590,0.0


In [94]:
data = data.dropna()
data

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14,pct_change,target
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2007-04-30,148.29,149.64,149.740,148.0,100874203.0,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,1.116466e+08,70.956720,-0.008293,1.0
2007-05-02,149.54,148.90,149.950,149.0,87129805.0,66.66,83.3800,2.4366,0.59,49.59,24.66,13.08,42.020,143.680667,1.121613e+08,79.237288,0.008429,1.0
2007-05-03,150.35,149.97,150.400,149.0,87204945.0,67.49,83.1100,2.4346,0.60,49.28,24.69,13.09,42.435,143.780222,1.123421e+08,79.604579,0.005417,1.0
2007-05-04,150.92,150.75,151.120,150.0,96408930.0,68.19,83.2300,2.4006,0.60,48.30,24.60,12.91,42.595,143.905111,1.128853e+08,79.411765,0.003791,0.0
2007-05-08,150.75,150.58,150.920,150.0,80583938.0,67.88,83.3700,2.3913,0.60,48.64,24.73,13.21,42.360,144.029111,1.131357e+08,74.368231,-0.001126,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-19,251.26,255.17,259.400,249.0,214992797.0,117.43,84.8300,1.7824,0.44,10.02,25.97,25.58,37.890,270.407333,1.225288e+08,30.487250,-0.016056,0.0
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.8700,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08,17.236433,-0.016278,0.0
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.8700,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08,15.115304,-0.026176,0.0
2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.5500,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333,1.281067e+08,10.574537,-0.026423,1.0


In [95]:
data.columns

Index(['CLOSE_SPY', 'OPEN', 'HIGH', 'LOW', 'VOLUME', 'CLOSE_GLD', 'CLOSE_FXY',
       'CLOSE_T10Y2Y', 'CLOSE_TED', 'CLOSE_USO', 'CLOSE_UUP', 'CLOSE_VIX',
       'CLOSE_VWO', 'MA_45', 'VMA_45', 'RSI_14', 'pct_change', 'target'],
      dtype='object')

In [96]:
X = data.drop(['CLOSE_SPY', 'OPEN', 'HIGH', 'LOW', 'VOLUME','pct_change','target'], axis=1)
y = data['target']

In [97]:
X

Unnamed: 0_level_0,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2007-04-30,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,1.116466e+08,70.956720
2007-05-02,66.66,83.3800,2.4366,0.59,49.59,24.66,13.08,42.020,143.680667,1.121613e+08,79.237288
2007-05-03,67.49,83.1100,2.4346,0.60,49.28,24.69,13.09,42.435,143.780222,1.123421e+08,79.604579
2007-05-04,68.19,83.2300,2.4006,0.60,48.30,24.60,12.91,42.595,143.905111,1.128853e+08,79.411765
2007-05-08,67.88,83.3700,2.3913,0.60,48.64,24.73,13.21,42.360,144.029111,1.131357e+08,74.368231
...,...,...,...,...,...,...,...,...,...,...,...
2018-12-19,117.43,84.8300,1.7824,0.44,10.02,25.97,25.58,37.890,270.407333,1.225288e+08,30.487250
2018-12-20,119.24,85.8700,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08,17.236433
2018-12-21,118.72,85.8700,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08,15.115304
2018-12-24,120.02,86.5500,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333,1.281067e+08,10.574537


In [98]:
y

Dates
2007-04-30    1.0
2007-05-02    1.0
2007-05-03    1.0
2007-05-04    0.0
2007-05-08    1.0
             ... 
2018-12-19    0.0
2018-12-20    0.0
2018-12-21    0.0
2018-12-24    1.0
2018-12-27    0.0
Name: target, Length: 2725, dtype: float64

In [99]:
y.value_counts()

target
1.0    1471
0.0    1254
Name: count, dtype: int64

# 시계열 데이터의 홀드아웃
* 시계열 데이터이기 때문에 홀드아웃시 날짜가 섞이면 안됨
* train_test_split 옵션에서 shuffle 옵션을 반드시 False

In [100]:
from sklearn.model_selection import train_test_split

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False, random_state=3)

In [102]:
X_train

Unnamed: 0_level_0,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2007-04-30,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,1.116466e+08,70.956720
2007-05-02,66.66,83.3800,2.4366,0.59,49.59,24.66,13.08,42.020,143.680667,1.121613e+08,79.237288
2007-05-03,67.49,83.1100,2.4346,0.60,49.28,24.69,13.09,42.435,143.780222,1.123421e+08,79.604579
2007-05-04,68.19,83.2300,2.4006,0.60,48.30,24.60,12.91,42.595,143.905111,1.128853e+08,79.411765
2007-05-08,67.88,83.3700,2.3913,0.60,48.64,24.73,13.21,42.360,144.029111,1.131357e+08,74.368231
...,...,...,...,...,...,...,...,...,...,...,...
2015-06-15,113.73,78.7000,1.8717,0.26,20.15,24.87,15.39,41.130,210.623089,9.845141e+07,44.292893
2015-06-16,113.32,78.7000,1.9052,0.28,20.26,24.92,14.81,41.220,210.673644,9.836155e+07,40.627391
2015-06-17,113.85,78.6800,1.9232,0.28,20.21,24.70,14.50,41.480,210.711200,9.927621e+07,42.900532
2015-06-18,115.32,79.0300,1.9083,0.27,20.36,24.62,13.19,41.850,210.772089,1.013461e+08,55.840456


In [103]:
X_test

Unnamed: 0_level_0,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-06-22,113.64,78.7100,1.9294,0.27,20.16,24.73,12.74,42.01,210.863644,1.024936e+08,51.610942
2015-06-23,112.89,78.3900,1.9415,0.27,20.47,25.02,12.11,42.39,210.899422,1.018036e+08,50.374065
2015-06-24,112.59,78.4199,1.9277,0.27,20.21,24.97,13.26,42.17,210.902311,1.023230e+08,51.174869
2015-06-25,112.44,78.5800,1.9315,0.27,19.98,24.93,14.01,42.02,210.944756,1.002339e+08,50.280025
2015-06-26,112.56,78.4200,1.9364,0.27,19.98,25.00,14.02,41.25,210.944089,1.005003e+08,54.520918
...,...,...,...,...,...,...,...,...,...,...,...
2018-12-19,117.43,84.8300,1.7824,0.44,10.02,25.97,25.58,37.89,270.407333,1.225288e+08,30.487250
2018-12-20,119.24,85.8700,1.7807,0.48,9.72,25.77,28.38,38.18,269.767778,1.240592e+08,17.236433
2018-12-21,118.72,85.8700,1.7651,0.48,9.57,25.94,30.11,37.87,269.018889,1.274610e+08,15.115304
2018-12-24,120.02,86.5500,1.7505,0.40,9.29,25.55,36.07,37.32,267.995333,1.281067e+08,10.574537


In [104]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [105]:
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3, random_state=3)
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.48      0.87      0.62       384
         1.0       0.60      0.18      0.27       434

    accuracy                           0.50       818
   macro avg       0.54      0.52      0.45       818
weighted avg       0.54      0.50      0.43       818



# 금융 데이터 분석에서의 부스팅 모델 VS 배깅 모델
* 금융 데이터에서는 부스팅 모델보다 배깅 모델을 사용하는 것이 더 안정적임
* 부스팅 모델은 잘 못 분석한 데이터터를 다시 분석하기 때문에 과적합 문제가 발생
* 이에 반해 배깅모델은 독립적인 데이터로 독립적인 분류기가 분석하기 때문에 독립성 높음
* 따라서 과적합의 위험이 상대적으로 적어 배깅모델이 금융데이터에 더 적합함

In [106]:
from sklearn.ensemble import RandomForestClassifier

In [107]:
rfc = RandomForestClassifier(n_estimators=1000, max_depth=3, n_jobs=-1, bootstrap=False)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.49      0.38      0.43       384
         1.0       0.54      0.65      0.59       434

    accuracy                           0.52       818
   macro avg       0.52      0.52      0.51       818
weighted avg       0.52      0.52      0.51       818



# 일반화 성능 향상 및 하이퍼파라미터 튜딩

시계열 데이터의 교차검증은 랜덤하게 하면 안되기 때문에<br>
TimeSeriesSplit 함수를 사용해야 함

In [108]:
from sklearn.model_selection import TimeSeriesSplit

In [109]:
ts_splited = TimeSeriesSplit(n_splits=5).split(X_train)

하이퍼파라미터 튜닝을 위한 GridSearch 분석

In [110]:
from sklearn.model_selection import GridSearchCV

In [111]:
params = dict(bootstrap=[False], n_estimators=range(10, 200, 10), max_depth=[3, 5, 7, 9],
             min_samples_leaf= [2,3,4,5], min_samples_split=[2,4,6,8,10], max_features=[4])

In [112]:
params

{'bootstrap': [False],
 'n_estimators': range(10, 200, 10),
 'max_depth': [3, 5, 7, 9],
 'min_samples_leaf': [2, 3, 4, 5],
 'min_samples_split': [2, 4, 6, 8, 10],
 'max_features': [4]}

In [113]:
grid_cv = GridSearchCV(RandomForestClassifier(), param_grid=params, cv=ts_splited, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print("best_params_", grid_cv.best_params_)
print("best_score_", grid_cv.best_score_)

best_params_ {'bootstrap': False, 'max_depth': 3, 'max_features': 4, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 10}
best_score_ 0.5444794952681388


In [114]:
rfc = RandomForestClassifier(bootstrap=False, max_depth=3, max_features=4, min_samples_leaf=2, min_samples_split=4, n_estimators=10)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.48      0.41      0.44       384
         1.0       0.54      0.61      0.57       434

    accuracy                           0.51       818
   macro avg       0.51      0.51      0.51       818
weighted avg       0.51      0.51      0.51       818



# 상승 하강 판단 기준 변경 후 재분석

In [115]:
data['pct_change'].describe()

count    2725.000000
mean        0.000271
std         0.013029
min        -0.098448
25%        -0.004321
50%         0.000545
75%         0.005791
max         0.128249
Name: pct_change, dtype: float64

In [116]:
data2 = data.copy()

In [117]:
data2

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14,pct_change,target
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2007-04-30,148.29,149.64,149.740,148.0,100874203.0,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,1.116466e+08,70.956720,-0.008293,1.0
2007-05-02,149.54,148.90,149.950,149.0,87129805.0,66.66,83.3800,2.4366,0.59,49.59,24.66,13.08,42.020,143.680667,1.121613e+08,79.237288,0.008429,1.0
2007-05-03,150.35,149.97,150.400,149.0,87204945.0,67.49,83.1100,2.4346,0.60,49.28,24.69,13.09,42.435,143.780222,1.123421e+08,79.604579,0.005417,1.0
2007-05-04,150.92,150.75,151.120,150.0,96408930.0,68.19,83.2300,2.4006,0.60,48.30,24.60,12.91,42.595,143.905111,1.128853e+08,79.411765,0.003791,0.0
2007-05-08,150.75,150.58,150.920,150.0,80583938.0,67.88,83.3700,2.3913,0.60,48.64,24.73,13.21,42.360,144.029111,1.131357e+08,74.368231,-0.001126,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-19,251.26,255.17,259.400,249.0,214992797.0,117.43,84.8300,1.7824,0.44,10.02,25.97,25.58,37.890,270.407333,1.225288e+08,30.487250,-0.016056,0.0
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.8700,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08,17.236433,-0.016278,0.0
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.8700,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08,15.115304,-0.026176,0.0
2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.5500,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333,1.281067e+08,10.574537,-0.026423,1.0


In [122]:
data2['target'] = np.where(data['pct_change'] > 0.0005, 1, 0)
data2['target'].value_counts()

target
1    1375
0    1350
Name: count, dtype: int64

In [123]:
data2['target'] = data2['target'].shift(-1)
data2 = data2.dropna()

In [124]:
data2

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14,pct_change,target
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2007-04-30,148.29,149.64,149.740,148.0,100874203.0,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,1.116466e+08,70.956720,-0.008293,1.0
2007-05-02,149.54,148.90,149.950,149.0,87129805.0,66.66,83.3800,2.4366,0.59,49.59,24.66,13.08,42.020,143.680667,1.121613e+08,79.237288,0.008429,1.0
2007-05-03,150.35,149.97,150.400,149.0,87204945.0,67.49,83.1100,2.4346,0.60,49.28,24.69,13.09,42.435,143.780222,1.123421e+08,79.604579,0.005417,1.0
2007-05-04,150.92,150.75,151.120,150.0,96408930.0,68.19,83.2300,2.4006,0.60,48.30,24.60,12.91,42.595,143.905111,1.128853e+08,79.411765,0.003791,0.0
2007-05-08,150.75,150.58,150.920,150.0,80583938.0,67.88,83.3700,2.3913,0.60,48.64,24.73,13.21,42.360,144.029111,1.131357e+08,74.368231,-0.001126,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-17,255.36,259.40,260.650,254.0,165492297.0,117.87,84.7300,1.8149,0.44,10.45,25.97,24.52,38.290,270.872000,1.238587e+08,35.093320,-0.019618,0.0
2018-12-19,251.26,255.17,259.400,249.0,214992797.0,117.43,84.8300,1.7824,0.44,10.02,25.97,25.58,37.890,270.407333,1.225288e+08,30.487250,-0.016056,0.0
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.8700,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08,17.236433,-0.016278,0.0
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.8700,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08,15.115304,-0.026176,0.0


In [126]:
X2 = data2.drop(['CLOSE_SPY', 'OPEN', 'HIGH', 'LOW', 'VOLUME','pct_change','target'], axis=1)
y2 = data2['target']

In [127]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, shuffle=False, random_state=3)

In [128]:
ts_splited2 = TimeSeriesSplit(n_splits=5).split(X2_train)

하이퍼파라미터 튜닝을 위한 GridSearch 분석

In [129]:
from sklearn.model_selection import GridSearchCV

In [130]:
params = dict(bootstrap=[False], n_estimators=range(10, 200, 10), max_depth=[3, 5, 7, 9],
             min_samples_leaf= [2,3,4,5], min_samples_split=[2,4,6,8,10], max_features=[4],
             random_state=[10])

In [131]:
params

{'bootstrap': [False],
 'n_estimators': range(10, 200, 10),
 'max_depth': [3, 5, 7, 9],
 'min_samples_leaf': [2, 3, 4, 5],
 'min_samples_split': [2, 4, 6, 8, 10],
 'max_features': [4],
 'random_state': [10]}

In [132]:
grid_cv = GridSearchCV(RandomForestClassifier(), param_grid=params, cv=ts_splited2, n_jobs=-1)
grid_cv.fit(X2_train, y2_train)
print("best_params_", grid_cv.best_params_)
print("best_score_", grid_cv.best_score_)

best_params_ {'bootstrap': False, 'max_depth': 3, 'max_features': 4, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 30, 'random_state': 10}
best_score_ 0.5293375394321768


In [134]:
rfc = RandomForestClassifier(bootstrap=False, max_depth=3, max_features=4, min_samples_leaf=3, min_samples_split=2, n_estimators=30, random_state=10)
rfc.fit(X2_train, y2_train)
pred = rfc.predict(X2_test)
print(classification_report(y2_test, pred))

              precision    recall  f1-score   support

         0.0       0.53      0.67      0.59       415
         1.0       0.53      0.38      0.44       403

    accuracy                           0.53       818
   macro avg       0.53      0.53      0.52       818
weighted avg       0.53      0.53      0.52       818

