In [11]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import train_test_split

# Import Data

In [52]:
df = pd.read_csv('분석용데이터_utf.csv')

In [53]:
df.index = df['Date']
df.drop(['Date'], axis = 1, inplace = True)

In [54]:
df

Unnamed: 0_level_0,spi,M2,comp basemoney rate,base rate,consumer price index,kospi close,ex-rate close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
201101월,0.5,1676.4,4.52,2.75,3.4,2069.73,1119.25
201102월,0.8,1674.4,4.72,2.75,3.9,1939.30,1124.65
201103월,1.0,1677.5,4.54,3.00,4.1,2106.70,1095.50
201104월,0.8,1684.8,4.54,3.00,3.8,2192.36,1068.00
201105월,0.7,1690.5,4.44,3.00,3.9,2142.47,1078.00
...,...,...,...,...,...,...,...
202008월,0.5,3100.4,2.19,0.50,0.7,2326.17,1187.94
202009월,0.4,3115.2,2.24,0.50,1.0,2327.89,1164.65
202010월,0.3,3152.8,2.24,0.50,0.1,2267.15,1136.49
202011월,0.5,3183.5,2.25,0.50,0.6,2591.34,1109.32


# Preprocessing

+ spi를 일단 하나씩 sliding 해야 됨. 타겟에 들어갈 것은 다음 달 spi니까.

+ 전달의 spi와 차가 음수 -> 0, 차가 0 이상 -> 1 (차이없는 것은 상방으로 넣기로 했으므로)

## spi sliding

In [55]:
df['next_spi'] = df['spi'].shift(-1)
df['next_spi'][-1] = 0.8

In [56]:
df

Unnamed: 0_level_0,spi,M2,comp basemoney rate,base rate,consumer price index,kospi close,ex-rate close,next_spi
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
201101월,0.5,1676.4,4.52,2.75,3.4,2069.73,1119.25,0.8
201102월,0.8,1674.4,4.72,2.75,3.9,1939.30,1124.65,1.0
201103월,1.0,1677.5,4.54,3.00,4.1,2106.70,1095.50,0.8
201104월,0.8,1684.8,4.54,3.00,3.8,2192.36,1068.00,0.7
201105월,0.7,1690.5,4.44,3.00,3.9,2142.47,1078.00,0.5
...,...,...,...,...,...,...,...,...
202008월,0.5,3100.4,2.19,0.50,0.7,2326.17,1187.94,0.4
202009월,0.4,3115.2,2.24,0.50,1.0,2327.89,1164.65,0.3
202010월,0.3,3152.8,2.24,0.50,0.1,2267.15,1136.49,0.5
202011월,0.5,3183.5,2.25,0.50,0.6,2591.34,1109.32,0.9


## Adjust Next_spi

In [57]:
#전달의 spi와 차가 음수 -> 0
#전달의 spi와 차가 0 이상 -> 1
df['subtract_spi'] = df['next_spi'] - df['next_spi'].shift()
df['subtract_spi'][0] = 0.3 #그 전달이 0.5 라서 차이 뺸 것
df['spi_target'] = df['subtract_spi'].apply(lambda x : 0 if x < 0 else 1)

In [59]:
# df.to_csv('preprocessed_data_RF.csv')

In [60]:
df.drop(['spi', 'subtract_spi', 'next_spi'], axis = 1, inplace = True)


In [61]:
df

Unnamed: 0_level_0,M2,comp basemoney rate,base rate,consumer price index,kospi close,ex-rate close,spi_target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
201101월,1676.4,4.52,2.75,3.4,2069.73,1119.25,1
201102월,1674.4,4.72,2.75,3.9,1939.30,1124.65,1
201103월,1677.5,4.54,3.00,4.1,2106.70,1095.50,0
201104월,1684.8,4.54,3.00,3.8,2192.36,1068.00,0
201105월,1690.5,4.44,3.00,3.9,2142.47,1078.00,0
...,...,...,...,...,...,...,...
202008월,3100.4,2.19,0.50,0.7,2326.17,1187.94,0
202009월,3115.2,2.24,0.50,1.0,2327.89,1164.65,0
202010월,3152.8,2.24,0.50,0.1,2267.15,1136.49,1
202011월,3183.5,2.25,0.50,0.6,2591.34,1109.32,1


# RF Classifier

In [65]:
X = df.copy()
y = X.pop('spi_target')

In [131]:
#90개월이 아닌 다른 경우의 수에도 적용할 수 있도록...
#X, y, 몇 개월을 검토할지 입력
#X_train, y_train은 그전것 전부, X_test은 그 다음 한 개월치만...  y_test은 굳이 안함. 어차피 나중에 실제 y에서 인덱싱하면되자나
def split_train_test(X, y , num_of_train):
    X_train = X.iloc[:num_of_train]
    X_test = pd.DataFrame(X.iloc[num_of_train]).T
    y_train = y.iloc[:num_of_train]
    
    return X_train, X_test, y_train

def model_evaluation(label, predict):
    cf_matrix = confusion_matrix(label, predict)
    Accuracy = (cf_matrix[0][0] + cf_matrix[1][1]) / sum(sum(cf_matrix))
    Precision = cf_matrix[1][1] / (cf_matrix[1][1] + cf_matrix[0][1])
    Recall = cf_matrix[1][1] / (cf_matrix[1][1] + cf_matrix[1][0])
    Specificity = cf_matrix[0][0] / (cf_matrix[0][0] + cf_matrix[0][1])
    F1_Score = (2 * Recall * Precision) / (Recall + Precision)
    F2_Score = (5 * Recall * Precision) / (Recall + 4*Precision) # Recall을 Precision보다 2배 중요하게 생각하여 F2 Score 사용
    print("Accuracy: ", Accuracy) 
    print("Precision: ", Precision)
    print("Recall: ", Recall)
    print("Specificity: ", Specificity)
    print("F1_Score: ", F1_Score)
    print("F2_Score: ", F2_Score)
    
def train_predict(X, y, num_of_train):
    y_pred = []
    y_test = y[num_of_train:].tolist()
    for i in range(num_of_train, X.shape[0]):
        X_train, X_test, y_train = split_train_test(X, y, num_of_train)
        rf_clf = RandomForestClassifier(random_state = 0, n_jobs = -1, verbose = 1)
        rf_clf.fit(X_train, y_train)
        pred = rf_clf.predict(X_test)
        y_pred.append(pred[0])
        del X_train, X_test, y_train

    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print("auc: ", roc_auc_score(y_test,y_pred))

In [132]:
train_predict(X, y, 90)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parall

[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.

[[ 0 12]
 [ 0 18]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.60      1.00      0.75        18

    accuracy                           0.60        30
   macro avg       0.30      0.50      0.37        30
weighted avg       0.36      0.60      0.45        30

auc:  0.5


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start