#### 머신러닝 코드 구조 <br>

![이미지](https://github.com/DA4BAM/dataset/blob/master/new_code.png?raw=true "code step1")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# SK stock prices; the AdjClose column is dropped right after loading.
stock = pd.read_csv(
    'https://raw.githubusercontent.com/DA4BAM/dataset/master/SK.csv'
).drop(columns='AdjClose')

# USD/KRW exchange rate: drop open/high/low and rename the remaining columns
# so that they share the 'Date' key with `stock` and carry an 'exch_' prefix.
exch_rate = (
    pd.read_csv('https://raw.githubusercontent.com/DA4BAM/dataset/master/USD_KRW.csv')
    .drop(columns=['open', 'high', 'low'])
    .rename(columns={'date': 'Date', 'close': 'exch_Close', 'diff': 'exch_Diff'})
)

In [4]:
exch_rate.head()

Unnamed: 0,Date,exch_Close,exch_Diff
0,2019-12-31,1155.1,-0.0025
1,2019-12-30,1158.0,-0.0015
2,2019-12-27,1159.7,-0.0023
3,2019-12-26,1162.3,0.0013
4,2019-12-25,1160.8,-0.0024


In [5]:
# Combine both sources into a single DataFrame.
# Left join on 'Date': every row of `stock` is kept and the exchange-rate
# columns with a matching Date are attached to it.
data = stock.merge(exch_rate, how='left', on='Date')

data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089


## 1. 데이터 이해

### 1) 둘러보기

In [6]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089


In [7]:
# Inspect the column dtypes. Date is loaded as a string (object); nothing is
# converted here -- the datetime conversion is done later on the NaN-filled copy.
data.dtypes

Date           object
Open          float64
High          float64
Low           float64
Close         float64
Volume        float64
exch_Close    float64
exch_Diff     float64
dtype: object

In [8]:
data.shape

(977, 8)

### 2) 기초통계량

In [9]:
data.describe()

Unnamed: 0,Open,High,Low,Close,Volume,exch_Close,exch_Diff
count,972.0,972.0,972.0,972.0,972.0,977.0,977.0
mean,252080.761317,255116.255144,248993.312757,252241.769547,117131.1,1139.488025,3e-05
std,29305.098277,29550.396185,28885.988224,29268.540457,85008.66,39.095668,0.005343
min,193000.0,194500.0,189000.0,192500.0,0.0,1054.9,-0.0204
25%,227500.0,230375.0,224875.0,227875.0,76687.25,1117.9,-0.0032
50%,256250.0,259500.0,253500.0,256750.0,96966.0,1133.9,0.0001
75%,274500.0,277125.0,270625.0,274000.0,134895.0,1169.1,0.0034
max,331000.0,331000.0,321500.0,328500.0,1473645.0,1243.1,0.0265


## 2. 데이터 준비

### 1) 변수 정리

In [10]:
# NaN 확인
data.isnull().sum()

Date          0
Open          5
High          5
Low           5
Close         5
Volume        5
exch_Close    0
exch_Diff     0
dtype: int64

In [11]:
data['Open'].isnull()

0      False
1      False
2      False
3      False
4      False
       ...  
972    False
973    False
974    False
975    False
976    False
Name: Open, Length: 977, dtype: bool

In [12]:
# NaN 행들을 확인
data.loc[data['Open'].isnull()]

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
458,2017-11-16,,,,,,1097.2,-0.0077
463,2017-11-23,,,,,,1084.0,-0.0019
487,2018-01-02,,,,,,1063.2,-0.0032
700,2018-11-15,,,,,,1127.9,-0.0042
945,2019-11-14,,,,,,1169.1,-0.0021


In [13]:
# 2019-11-14는 목요일이나 NaN으로 채워져 있음
data.loc[(data['Date']>='2019-11-12')&(data['Date']<='2019-11-20')]

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
943,2019-11-12,271000.0,271000.0,268500.0,269000.0,168608.0,1166.5,0.0009
944,2019-11-13,268000.0,269500.0,261500.0,263000.0,257602.0,1171.5,0.0043
945,2019-11-14,,,,,,1169.1,-0.0021
946,2019-11-15,265000.0,270000.0,265000.0,270000.0,198430.0,1163.7,-0.0046
947,2019-11-18,270000.0,275000.0,269500.0,275000.0,185370.0,1167.2,0.0031
948,2019-11-19,274500.0,276500.0,272500.0,272500.0,222226.0,1168.4,0.001
949,2019-11-20,271500.0,274500.0,268500.0,272000.0,210122.0,1170.8,0.0021


### 2) NaN 처리

In [14]:
# Option 1: drop every row that contains a NaN.

data1 = data.dropna(axis=0)
data1.isnull().sum()

Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
exch_Close    0
exch_Diff     0
dtype: int64

In [15]:
data1.loc[(data1['Date']>='2019-11-12')&(data1['Date']<='2019-11-20')]

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
943,2019-11-12,271000.0,271000.0,268500.0,269000.0,168608.0,1166.5,0.0009
944,2019-11-13,268000.0,269500.0,261500.0,263000.0,257602.0,1171.5,0.0043
946,2019-11-15,265000.0,270000.0,265000.0,270000.0,198430.0,1163.7,-0.0046
947,2019-11-18,270000.0,275000.0,269500.0,275000.0,185370.0,1167.2,0.0031
948,2019-11-19,274500.0,276500.0,272500.0,272500.0,222226.0,1168.4,0.001
949,2019-11-20,271500.0,274500.0,268500.0,272000.0,210122.0,1170.8,0.0021


In [16]:
# Option 2: forward-fill, i.e. replace each NaN with the last valid value above it.
# DataFrame.ffill() is used instead of fillna(method='ffill'), which is
# deprecated in recent pandas releases; the result is the same.

data2 = data.ffill()
data2.isnull().sum()

Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
exch_Close    0
exch_Diff     0
dtype: int64

In [17]:
data2.loc[(data2['Date']>='2019-11-12')&(data2['Date']<='2019-11-20')]

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
943,2019-11-12,271000.0,271000.0,268500.0,269000.0,168608.0,1166.5,0.0009
944,2019-11-13,268000.0,269500.0,261500.0,263000.0,257602.0,1171.5,0.0043
945,2019-11-14,268000.0,269500.0,261500.0,263000.0,257602.0,1169.1,-0.0021
946,2019-11-15,265000.0,270000.0,265000.0,270000.0,198430.0,1163.7,-0.0046
947,2019-11-18,270000.0,275000.0,269500.0,275000.0,185370.0,1167.2,0.0031
948,2019-11-19,274500.0,276500.0,272500.0,272500.0,222226.0,1168.4,0.001
949,2019-11-20,271500.0,274500.0,268500.0,272000.0,210122.0,1170.8,0.0021


In [18]:
# Option 3: fill each NaN by linear interpolation between its neighbours.
# Only the numeric columns are interpolated: Date is a string (object) column,
# and recent pandas releases deprecate interpolating object-dtype columns.

data3 = data.copy()
num_cols = data3.select_dtypes(include='number').columns
data3[num_cols] = data3[num_cols].interpolate(method='linear')  # 'linear' is the default
data3.isnull().sum()

Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
exch_Close    0
exch_Diff     0
dtype: int64

In [19]:
data3.loc[(data3['Date']>='2019-11-12')&(data3['Date']<='2019-11-20')]

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
943,2019-11-12,271000.0,271000.0,268500.0,269000.0,168608.0,1166.5,0.0009
944,2019-11-13,268000.0,269500.0,261500.0,263000.0,257602.0,1171.5,0.0043
945,2019-11-14,266500.0,269750.0,263250.0,266500.0,228016.0,1169.1,-0.0021
946,2019-11-15,265000.0,270000.0,265000.0,270000.0,198430.0,1163.7,-0.0046
947,2019-11-18,270000.0,275000.0,269500.0,275000.0,185370.0,1167.2,0.0031
948,2019-11-19,274500.0,276500.0,272500.0,272500.0,222226.0,1168.4,0.001
949,2019-11-20,271500.0,274500.0,268500.0,272000.0,210122.0,1170.8,0.0021


### 3) Feature Engineering

- 내일의 주가에 영향을 주는 요인은?

In [20]:
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089


1) 날짜 데이터 다루기

- 날짜로부터 추가 변수를 도출해내기 위해서 날짜 변수를 생성.
- 추가 변수 생성이 끝난 후 날짜 변수 제거.

In [21]:
data2.dtypes

Date           object
Open          float64
High          float64
Low           float64
Close         float64
Volume        float64
exch_Close    float64
exch_Diff     float64
dtype: object

In [22]:
# Parse the Date strings into datetime values so that the .dt accessor
# can be used in the following cells.
data2['Date'] = pd.to_datetime(data2['Date'])
print(data2.dtypes)

Date          datetime64[ns]
Open                 float64
High                 float64
Low                  float64
Close                float64
Volume               float64
exch_Close           float64
exch_Diff            float64
dtype: object


In [23]:
# Add the day of the week as a number (Monday is 0, as the output below shows).
data2['WeekDay'] = data2['Date'].dt.dayofweek # weekday number
data2.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,0
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,1
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,2
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,3
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,4
5,2016-01-11,238000.0,241500.0,236000.0,239000.0,84152.0,1204.8,-0.0026,0
6,2016-01-12,240000.0,246000.0,237000.0,237500.0,86196.0,1210.3,0.0046,1
7,2016-01-13,239000.0,245000.0,238000.0,242500.0,90207.0,1210.2,-0.0001,2
8,2016-01-14,239000.0,240500.0,235000.0,240000.0,96090.0,1207.7,-0.0021,3
9,2016-01-15,243500.0,243500.0,234500.0,234500.0,99523.0,1214.8,0.0059,4


In [24]:
# Overwrite WeekDay with the weekday name instead of the number.
data2['WeekDay'] = data2['Date'].dt.day_name() # weekday name
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday


In [25]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 977 entries, 0 to 976
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        977 non-null    datetime64[ns]
 1   Open        977 non-null    float64       
 2   High        977 non-null    float64       
 3   Low         977 non-null    float64       
 4   Close       977 non-null    float64       
 5   Volume      977 non-null    float64       
 6   exch_Close  977 non-null    float64       
 7   exch_Diff   977 non-null    float64       
 8   WeekDay     977 non-null    object        
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 76.3+ KB


2) 이전 데이터 붙이기

- .shift() : 예) 전날 주가, 전날 환율
- .rolling() : 예) 7일 이동평균 주가

In [26]:
# Add the previous row's (previous trading day's) closing price.
# shift() moves the existing values down by one row into a new column,
# so the first row becomes NaN.
data2['Close_lag1'] = data2['Close'].shift() # default periods=1
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0


In [27]:
# 7-row rolling maximum of Close.
# With min_periods=1 the result is filled in from the very first row.
# NOTE: the value is stored under the name Close_MA7_lag1 only as a demonstration;
# the column is overwritten by the rolling mean in the cells that follow.
data2['Close_MA7_lag1'] = data2['Close'].rolling(7, min_periods=1).max()
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,234500.0
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,241000.0
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,241000.0
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,241000.0
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,241500.0


In [28]:
# 7-row rolling mean (moving average) of Close.
# The window still includes the current row's own Close at this point.
data2['Close_MA7_lag1'] = data2['Close'].rolling(7, min_periods=1).mean()
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,234500.0
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,237750.0
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,238166.666667
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238750.0
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,239300.0


In [29]:
# Same rolling mean, shifted down one row so that each row only carries the
# average of closes from earlier rows (its own Close is no longer included).
data2['Close_MA7_lag1'] = data2['Close'].rolling(7, min_periods=1).mean().shift()
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,234500.0
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,237750.0
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238166.666667
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,238750.0


3) 월 데이터 추가

In [30]:
# Month number extracted from the Date column.
data2['Month'] = data2['Date'].dt.month
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1,Month
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,,1
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,234500.0,1
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,237750.0,1
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238166.666667,1
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,238750.0,1


4) 전날 거래량 컬럼, 전날 환율 증감 컬럼

In [31]:
# Previous row's trading volume and previous row's exchange-rate change.
data2['Vol_lag1'] = data2['Volume'].shift(1)
data2['exch_lag1'] = data2['exch_Diff'].shift(1)

data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1,Month,Vol_lag1,exch_lag1
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,,1,,
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,234500.0,1,173905.0,0.0127
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,237750.0,1,182985.0,0.0004
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238166.666667,1,108574.0,0.0082
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,238750.0,1,113376.0,-0.0027


5) 전날 주가 - 전전날 주가 컬럼

In [32]:
# Previous close minus the close two rows back (Close shifted by 2).
data2['Close_diff'] = data2['Close_lag1'] - data2['Close'].shift(2)

data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1,Month,Vol_lag1,exch_lag1,Close_diff
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,,1,,,
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,234500.0,1,173905.0,0.0127,
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,237750.0,1,182985.0,0.0004,6500.0
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238166.666667,1,108574.0,0.0082,-2000.0
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,238750.0,1,113376.0,-0.0027,1500.0


In [33]:
# Alternative form of the same feature, built from Close_lag1 alone:
# shifting Close_lag1 once more gives the close from two rows back.
data2['Close_diff'] = data2['Close_lag1'] - data2['Close_lag1'].shift()

data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1,Month,Vol_lag1,exch_lag1,Close_diff
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,,1,,,
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,234500.0,1,173905.0,0.0127,
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,237750.0,1,182985.0,0.0004,6500.0
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238166.666667,1,108574.0,0.0082,-2000.0
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,238750.0,1,113376.0,-0.0027,1500.0


6) 전날 종가 - 전날 시가 컬럼

In [34]:
# Previous row's intraday move: (Close - Open) computed per row,
# then shifted down one row.
data2['Close-Open'] = (data2['Close'] - data2['Open']).shift()

data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1,Month,Vol_lag1,exch_lag1,Close_diff,Close-Open
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,,1,,,,
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,234500.0,1,173905.0,0.0127,,-8500.0
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,237750.0,1,182985.0,0.0004,6500.0,5000.0
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238166.666667,1,108574.0,0.0082,-2000.0,-2000.0
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,238750.0,1,113376.0,-0.0027,1500.0,3500.0


### 4) Dummy Variable

- 범주형 변수를 숫자로 만드는 방법
- pd.get_dummies, pd.concat, drop
- 불필요한 컬럼들 제거

In [35]:
# One-hot encode WeekDay, prefixing the new columns with 'day'.
# drop_first=True drops the first category (Friday here, see the output below),
# so a row whose dummy columns are all zero stands for Friday.
dumm_weekday = pd.get_dummies(data2['WeekDay'], drop_first=True, prefix='day')
dumm_weekday.head()

Unnamed: 0,day_Monday,day_Thursday,day_Tuesday,day_Wednesday
0,1,0,0,0
1,0,0,1,0
2,0,0,0,1
3,0,1,0,0
4,0,0,0,0


In [36]:
# Same encoding without the column prefix; this version replaces the
# prefixed one and is what gets concatenated onto the data afterwards.
dumm_weekday = pd.get_dummies(data2['WeekDay'], drop_first=True)
dumm_weekday.head()

Unnamed: 0,Monday,Thursday,Tuesday,Wednesday
0,1,0,0,0
1,0,0,1,0
2,0,0,0,1
3,0,1,0,0
4,0,0,0,0


In [37]:
# Append the weekday dummy columns to the engineered frame (column-wise).
# Note: this rebinds data3, replacing the interpolated frame created in the
# NaN-handling section.
data3 = pd.concat([data2, dumm_weekday], axis=1)
data3.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1,Month,Vol_lag1,exch_lag1,Close_diff,Close-Open,Monday,Thursday,Tuesday,Wednesday
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,,1,,,,,1,0,0,0
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,234500.0,1,173905.0,0.0127,,-8500.0,0,0,1,0
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,237750.0,1,182985.0,0.0004,6500.0,5000.0,0,0,0,1
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238166.666667,1,108574.0,0.0082,-2000.0,-2000.0,0,1,0,0
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,238750.0,1,113376.0,-0.0027,1500.0,3500.0,0,0,0,0


In [38]:
# Remove the raw same-day columns and the WeekDay text column, leaving
# Close plus the lagged / engineered features and the weekday dummies.
drop_x = ['Date','Open','High','Low','Volume','exch_Close','exch_Diff','WeekDay']
data3 = data3.drop(columns=drop_x)
data3.head()

Unnamed: 0,Close,Close_lag1,Close_MA7_lag1,Month,Vol_lag1,exch_lag1,Close_diff,Close-Open,Monday,Thursday,Tuesday,Wednesday
0,234500.0,,,1,,,,,1,0,0,0
1,241000.0,234500.0,234500.0,1,173905.0,0.0127,,-8500.0,0,0,1,0
2,239000.0,241000.0,237750.0,1,182985.0,0.0004,6500.0,5000.0,0,0,0,1
3,240500.0,239000.0,238166.666667,1,108574.0,0.0082,-2000.0,-2000.0,0,1,0,0
4,241500.0,240500.0,238750.0,1,113376.0,-0.0027,1500.0,3500.0,0,0,0,0


In [39]:
# The shift() calls introduced NaNs in the first rows again; drop those rows.
data3 = data3.dropna(axis = 0)
data3.head()

Unnamed: 0,Close,Close_lag1,Close_MA7_lag1,Month,Vol_lag1,exch_lag1,Close_diff,Close-Open,Monday,Thursday,Tuesday,Wednesday
2,239000.0,241000.0,237750.0,1,182985.0,0.0004,6500.0,5000.0,0,0,0,1
3,240500.0,239000.0,238166.666667,1,108574.0,0.0082,-2000.0,-2000.0,0,1,0,0
4,241500.0,240500.0,238750.0,1,113376.0,-0.0027,1500.0,3500.0,0,0,0,0
5,239000.0,241500.0,239300.0,1,81557.0,0.0089,1000.0,1000.0,1,0,0,0
6,237500.0,239000.0,239250.0,1,84152.0,-0.0026,-2500.0,1000.0,0,0,1,0


### 5) Data Split

- sklearn의 데이터 분할 함수 사용
    - 요인, x, feature, 조작변수, 통제변수, 리스크벡터, Input, 독립변수
    - 결과, y, target, label, Output, 종속변수

In [40]:
from sklearn.model_selection import train_test_split

In [41]:
# features와 target 분리
X = data3.drop('Close', axis=1)
y = data3.iloc[:, 0]

In [42]:
# Hold out 30% of the rows for testing (train : test = 7 : 3).
# NOTE(review): train_test_split shuffles rows by default, so test rows are
# interleaved in time with training rows. For lagged time-series features a
# chronological split (shuffle=False) may be more appropriate -- confirm intent.
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=1)

In [43]:
train_x.shape, train_y.shape

((682, 11), (682,))

### 6) Scaling features

In [44]:
from sklearn.preprocessing import MinMaxScaler

In [45]:
# Create the scaler object that will rescale each feature to the [0, 1] range.
scaler = MinMaxScaler()

In [46]:
# Two-step equivalent, kept for reference: first fit the scaler on the
# training features ...
#scaler.fit(train_x)

# ... then transform them.
#train_x = scaler.transform(train_x)

# Fit on the training set only and transform it in one call, then apply the
# same (training-derived) scaling to the test set.
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [47]:
pd.DataFrame(train_x).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0
mean,0.438411,0.45517,0.495734,0.103815,0.43403,0.479434,0.488208,0.195015,0.196481,0.186217,0.217009
std,0.214321,0.232798,0.310865,0.06985,0.115929,0.10944,0.108574,0.396503,0.397628,0.389567,0.412511
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.258272,0.252746,0.181818,0.068378,0.365139,0.421053,0.422535,0.0,0.0,0.0,0.0
50%,0.474265,0.490173,0.454545,0.086258,0.4371,0.473684,0.478873,0.0,0.0,0.0,0.0
75%,0.595588,0.628613,0.727273,0.118671,0.507463,0.539474,0.549296,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
