In [4]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import train_test_split

# 감성사전 데이터 가져와서 점수만 뽑기

In [5]:
temp = pd.read_csv('news_with_sent_score.csv')
temp.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [6]:
sent_score = temp['sent_score']

In [7]:
sent_score

0     -6.268386e+07
1     -6.255832e+07
2     -6.957441e+07
3     -7.305733e+07
4     -9.356633e+07
           ...     
115   -5.824726e+07
116   -6.441351e+07
117   -3.751069e+07
118   -3.147741e+07
119   -3.986893e+07
Name: sent_score, Length: 120, dtype: float64

In [8]:
del temp

# 기존의 데이터 가져와서 합치기

In [9]:
df = pd.read_csv('분석용데이터_utf.csv')

In [10]:
#Date를 index로 넣어줌
df.index = df['Date']
df.drop(['Date'], axis = 1, inplace = True)

In [11]:
df

Unnamed: 0_level_0,M2,comp basemoney rate,base rate,consumer price index,kospi close,ex-rate close,spi
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
201101월,1676.4,4.52,2.75,3.4,2069.73,1119.25,81.2
201102월,1674.4,4.72,2.75,3.9,1939.30,1124.65,82.5
201103월,1677.5,4.54,3.00,4.1,2106.70,1095.50,83.4
201104월,1684.8,4.54,3.00,3.8,2192.36,1068.00,83.9
201105월,1690.5,4.44,3.00,3.9,2142.47,1078.00,84.1
...,...,...,...,...,...,...,...
202008월,3100.4,2.19,0.50,0.7,2326.17,1187.94,113.2
202009월,3115.2,2.24,0.50,1.0,2327.89,1164.65,114.2
202010월,3152.8,2.24,0.50,0.1,2267.15,1136.49,115.8
202011월,3183.5,2.25,0.50,0.6,2591.34,1109.32,118.6


In [13]:
df.spi.head(6)

Date
201101월    81.2
201102월    82.5
201103월    83.4
201104월    83.9
201105월    84.1
201106월    84.3
Name: spi, dtype: float64

In [32]:
sent_score

Date
201101월   -6.268386e+07
201102월   -6.255832e+07
201103월   -6.957441e+07
201104월   -7.305733e+07
201105월   -9.356633e+07
               ...     
202008월   -5.824726e+07
202009월   -6.441351e+07
202010월   -3.751069e+07
202011월   -3.147741e+07
202012월   -3.986893e+07
Name: sent_score, Length: 120, dtype: float64

In [25]:
sent_score.index = df.index

In [33]:
df = pd.concat([sent_score, df], axis = 1)

In [34]:
df

Unnamed: 0_level_0,sent_score,M2,comp basemoney rate,base rate,consumer price index,kospi close,ex-rate close,spi
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
201101월,-6.268386e+07,1676.4,4.52,2.75,3.4,2069.73,1119.25,81.2
201102월,-6.255832e+07,1674.4,4.72,2.75,3.9,1939.30,1124.65,82.5
201103월,-6.957441e+07,1677.5,4.54,3.00,4.1,2106.70,1095.50,83.4
201104월,-7.305733e+07,1684.8,4.54,3.00,3.8,2192.36,1068.00,83.9
201105월,-9.356633e+07,1690.5,4.44,3.00,3.9,2142.47,1078.00,84.1
...,...,...,...,...,...,...,...,...
202008월,-5.824726e+07,3100.4,2.19,0.50,0.7,2326.17,1187.94,113.2
202009월,-6.441351e+07,3115.2,2.24,0.50,1.0,2327.89,1164.65,114.2
202010월,-3.751069e+07,3152.8,2.24,0.50,0.1,2267.15,1136.49,115.8
202011월,-3.147741e+07,3183.5,2.25,0.50,0.6,2591.34,1109.32,118.6


# Preprocessing

+ spi를 일단 하나씩 sliding 해야 됨. 타겟에 들어갈 것은 다음 달 spi니까.
    - 예를 들어 2011년 1월 index에 들어가야 하는 next_spi 값은 2011년 2월의 spi가 들어가야 함.


+ 전달의 spi와 차가 음수 -> 0, 차가 0 이상 -> 1 (차이없는 것은 상방으로 넣기로 했으므로)

## spi sliding

In [35]:
df['next_spi'] = df['spi'].shift(-1)
df['next_spi'][-1] = 125.4

In [36]:
df

Unnamed: 0_level_0,sent_score,M2,comp basemoney rate,base rate,consumer price index,kospi close,ex-rate close,spi,next_spi
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
201101월,-6.268386e+07,1676.4,4.52,2.75,3.4,2069.73,1119.25,81.2,82.5
201102월,-6.255832e+07,1674.4,4.72,2.75,3.9,1939.30,1124.65,82.5,83.4
201103월,-6.957441e+07,1677.5,4.54,3.00,4.1,2106.70,1095.50,83.4,83.9
201104월,-7.305733e+07,1684.8,4.54,3.00,3.8,2192.36,1068.00,83.9,84.1
201105월,-9.356633e+07,1690.5,4.44,3.00,3.9,2142.47,1078.00,84.1,84.3
...,...,...,...,...,...,...,...,...,...
202008월,-5.824726e+07,3100.4,2.19,0.50,0.7,2326.17,1187.94,113.2,114.2
202009월,-6.441351e+07,3115.2,2.24,0.50,1.0,2327.89,1164.65,114.2,115.8
202010월,-3.751069e+07,3152.8,2.24,0.50,0.1,2267.15,1136.49,115.8,118.6
202011월,-3.147741e+07,3183.5,2.25,0.50,0.6,2591.34,1109.32,118.6,121.8


## Adjust Next_spi

In [37]:
#전달의 spi와 차가 음수 -> 0
#전달의 spi와 차가 0 이상 -> 1
df['subtract_spi'] = df['next_spi'] - df['next_spi'].shift()
df['subtract_spi'][0] = 1.3 #첫 달은 nan인데 그 전 달 데이터가 0.5라는거 구해와서 차이 뺸 것이 0.3
df['spi_target'] = df['subtract_spi'].apply(lambda x : 0 if x < 0 else 1) #음수면0 아니면 1

In [38]:
# 감성점수 정규화 안한 데이터 먼저 저장
# df.to_csv('not_scaled_data.csv')

In [39]:
#전처리에 사용되었던 더미 변수들 drop
df.drop(['spi', 'subtract_spi', 'next_spi'], axis = 1, inplace = True)

In [40]:
df

Unnamed: 0_level_0,sent_score,M2,comp basemoney rate,base rate,consumer price index,kospi close,ex-rate close,spi_target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
201101월,-6.268386e+07,1676.4,4.52,2.75,3.4,2069.73,1119.25,1
201102월,-6.255832e+07,1674.4,4.72,2.75,3.9,1939.30,1124.65,1
201103월,-6.957441e+07,1677.5,4.54,3.00,4.1,2106.70,1095.50,1
201104월,-7.305733e+07,1684.8,4.54,3.00,3.8,2192.36,1068.00,1
201105월,-9.356633e+07,1690.5,4.44,3.00,3.9,2142.47,1078.00,1
...,...,...,...,...,...,...,...,...
202008월,-5.824726e+07,3100.4,2.19,0.50,0.7,2326.17,1187.94,1
202009월,-6.441351e+07,3115.2,2.24,0.50,1.0,2327.89,1164.65,1
202010월,-3.751069e+07,3152.8,2.24,0.50,0.1,2267.15,1136.49,1
202011월,-3.147741e+07,3183.5,2.25,0.50,0.6,2591.34,1109.32,1


In [55]:
df.spi_target.value_counts()

1    87
0    33
Name: spi_target, dtype: int64

# RF Classifier

In [56]:
X = df.copy()
y = X.pop('spi_target')

In [57]:
#90개월이 아닌 다른 경우의 수에도 적용할 수 있도록...
#X, y, 몇 개월을 검토할지 입력
#X_train, y_train은 그전것 전부, X_test은 그 다음 한 개월치만...  y_test은 굳이 안함. 어차피 나중에 실제 y에서 인덱싱하면되자나

def split_train_test(X, y , num_of_train):
    """
    ex) split_train_test(X, y, 90) 
    
    => X_train 은 90개의 앞의 월별 데이터, X_test는 다음 91개째 하나의 월별 데이터를 반환
    """
    X_train = X.iloc[:num_of_train]
    X_test = pd.DataFrame(X.iloc[num_of_train]).T
    y_train = y.iloc[:num_of_train]
    
    return X_train, X_test, y_train

def train_predict(X, y, num_of_train):
    """
    y_pred = [90개월치 훈련 91개월차 예측값, 91개월치 훈련 92개월차 예측값, ...119개월 훈련 120개월차 예측값] 을 만들기 위해
    먼저 y_pred = [] 초기화
    
    y_test는 결과(confusion matrix) 비교 위해 list형태로 바꿔줌.
    num_of_train 입력해주면 그 시점부터 마지막 하나 전 월까지 훈련시키고 테스트하고를 반복해서 y_pred에 그때마다 넣어주고
    한 train data 사용하면 메모리 정리를 위해 지워주고 다음 for 문에서 재할당해줌.
    
    예를 들어, i가 90이면
    i = 90 ->train : 0~89 데이터 => 훈련해서 90개차 test data 한 줄 넣고 예측 결과 뽑아서 그걸 y_pred에 추가 -> data delete
    i = 91 ->train : 0~90 데이터 => 훈련해서 91개차 test data 한 줄 넣고 예측 결과 뽑아서 그걸 y_pred에 추가 -> data delete
    ... 반복...
    
    """
    y_pred = []
    y_test = y[num_of_train:].tolist()
    for i in range(num_of_train, X.shape[0]):
        # 1 ~ i, i+1 의 데이터 생성
        X_train, X_test, y_train = split_train_test(X, y, i)
        # default parameter 
        rf_clf = RandomForestClassifier(random_state = 0, n_jobs = -1, verbose = 1)
        rf_clf.fit(X_train, y_train)
        pred = rf_clf.predict(X_test)
        y_pred.append(pred[0])
        del X_train, X_test, y_train #메모리 삭제

    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print("auc: ", roc_auc_score(y_test,y_pred))

In [58]:
#성능까지...다..뽑아버렷..!
train_predict(X, y, 90)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parall

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=

[[ 6  1]
 [ 1 22]]
              precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       0.96      0.96      0.96        23

    accuracy                           0.93        30
   macro avg       0.91      0.91      0.91        30
weighted avg       0.93      0.93      0.93        30

auc:  0.906832298136646


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
