# Random Forest + Grid Search

## 데이터 불러오기 및 전처리

In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
pre_train = pd.read_csv('good_pretrain.csv')
pre_test = pd.read_csv('final_test.csv')

In [4]:
pre_train = pre_train.drop(columns = ['Unnamed: 0'])
pre_test = pre_test.drop(columns = ['Unnamed: 0'])

In [5]:
pre_train

Unnamed: 0,M_SAD,SitTime,PA_TOT,I_SB_EX,I_SB_FR,M_SUI_CON,i_SB_BK,BMI,M_STR,F_SODA,PA_MSC
0,M_SAD_2,SitTime_3,PA_TOT_5,I_SB_EX_5,I_SB_FR_2,M_SUI_CON_2,i_SB_BK_4,BMI_1,M_STR_1,F_SODA_4,PA_MSC_4
1,M_SAD_1,SitTime_0,PA_TOT_4,I_SB_EX_2,I_SB_FR_2,M_SUI_CON_1,i_SB_BK_2,BMI_1,M_STR_4,F_SODA_2,PA_MSC_6
2,M_SAD_1,SitTime_3,PA_TOT_2,I_SB_EX_5,I_SB_FR_5,M_SUI_CON_1,i_SB_BK_5,BMI_2,M_STR_3,F_SODA_1,PA_MSC_1
3,M_SAD_1,SitTime_0,PA_TOT_1,I_SB_EX_1,I_SB_FR_1,M_SUI_CON_1,i_SB_BK_1,BMI_1,M_STR_3,F_SODA_1,PA_MSC_1
4,M_SAD_1,SitTime_1,PA_TOT_1,I_SB_EX_2,I_SB_FR_2,M_SUI_CON_1,i_SB_BK_2,BMI_1,M_STR_4,F_SODA_3,PA_MSC_1
...,...,...,...,...,...,...,...,...,...,...,...
59347,M_SAD_2,SitTime_2,PA_TOT_2,I_SB_EX_2,I_SB_FR_2,M_SUI_CON_1,i_SB_BK_2,BMI_2,M_STR_2,F_SODA_3,PA_MSC_3
59348,M_SAD_1,SitTime_4,PA_TOT_1,I_SB_EX_5,I_SB_FR_5,M_SUI_CON_1,i_SB_BK_5,BMI_1,M_STR_4,F_SODA_1,PA_MSC_1
59349,M_SAD_2,SitTime_0,PA_TOT_1,I_SB_EX_2,I_SB_FR_2,M_SUI_CON_2,i_SB_BK_4,BMI_2,M_STR_2,F_SODA_2,PA_MSC_3
59350,M_SAD_1,SitTime_1,PA_TOT_1,I_SB_EX_2,I_SB_FR_2,M_SUI_CON_2,i_SB_BK_2,BMI_1,M_STR_3,F_SODA_5,PA_MSC_6


In [6]:
pre_train= pre_train.applymap(lambda x: x[-1])
pre_test= pre_test.applymap(lambda x: x[-1])

In [7]:
pre_train.loc[(pre_train.M_SUI_CON == 1),'M_SUI_CON']= 0
pre_train.loc[(pre_train.M_SUI_CON == 2),'M_SUI_CON']= 1
pre_test.loc[(pre_test.M_SUI_CON == 1),'M_SUI_CON']= 0
pre_test.loc[(pre_test.M_SUI_CON == 2),'M_SUI_CON']= 1

In [8]:
#랜포라 팩터화 시켜주기
for i in range(len(pre_train.columns)):
    pre_train[pre_train.columns[i]] = pre_train[pre_train.columns[i]].astype('category')
    
for i in range(len(pre_test.columns)):
    pre_test[pre_test.columns[i]] = pre_test[pre_test.columns[i]].astype('category')

In [9]:
train_features = pre_train.drop(['M_SUI_CON'], axis = 1)
test_features = pre_test.drop(['M_SUI_CON'], axis = 1)
train_label = pre_train['M_SUI_CON']
test_label = pre_test['M_SUI_CON']

## 모델 정의 및 훈련

In [12]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 0, stop = 100, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2,4,6,8,10,12,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8,10]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4, 6, 8, 10],
 'min_samples_split': [2, 4, 6, 8, 10, 12, 14],
 'n_estimators': [0, 11, 22, 33, 44, 55, 66, 77, 88, 100]}


In [13]:
#GRID SEARCH 시행
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_label)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8, 10],
                                        'min_samples_split': [2, 4, 6, 8, 10,
                                                              12, 14],
                                        'n_estimators': [0, 11, 22, 33, 44, 55,
                                                         66, 77, 88, 100]},
                   random_state=42, verbose=2)

In [14]:
#최고의 파라미터
rf_random.best_params_

{'n_estimators': 100,
 'min_samples_split': 6,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [15]:
#위에 나온 파라미터로 훈련
Upgrade_model = RandomForestClassifier(n_estimators = 100, min_samples_split= 6,min_samples_leaf = 1,
max_features = 'sqrt',max_depth = None, bootstrap = False)
Upgrade_model.fit(train_features, train_label)

RandomForestClassifier(bootstrap=False, max_features='sqrt',
                       min_samples_split=6)

### 모델 저장(필요시)

In [17]:
import pickle
# save the Base model to disk
Random_Grid_RF_filename = 'Random_Grid_RF.sav'
pickle.dump(Upgrade_model, open(Random_Grid_RF_filename, 'wb'))
# load the model from disk
#Upgrade_model = pickle.load(open(Random_Grid_RF_filename, 'rb'))

## 예측 

In [18]:
# Performance Measure 출력 함수 만들기
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

def model_evaluation(label, predict):
    cf_matrix = confusion_matrix(label, predict)
    Accuracy = (cf_matrix[0][0] + cf_matrix[1][1]) / sum(sum(cf_matrix))
    Precision = cf_matrix[1][1] / (cf_matrix[1][1] + cf_matrix[0][1])
    Recall = cf_matrix[1][1] / (cf_matrix[1][1] + cf_matrix[1][0])
    Specificity = cf_matrix[0][0] / (cf_matrix[0][0] + cf_matrix[0][1])
    F1_Score = (2 * Recall * Precision) / (Recall + Precision)
    F2_Score = (5 * Recall * Precision) / (Recall + 4*Precision) # Recall을 Precision보다 2배 중요하게 생각하여 F2 Score 사용
    
    print("Accuracy: ", Accuracy) 
    print("Precision: ", Precision)
    print("Recall: ", Recall)
    print("Specificity: ", Specificity)
    print("F1_Score: ", F1_Score)
    print("F2_Score: ", F2_Score)

In [19]:
y_pred_Upgrade = Upgrade_model.predict(test_features)

## 성능 결과

In [20]:
model_evaluation(test_label, y_pred_Upgrade)

Accuracy:  0.8127709351131907
Precision:  0.3304484657749803
Recall:  0.4518558364712211
Specificity:  0.8657093261795803
F1_Score:  0.3817314246762099
F2_Score:  0.4209260372820205


In [21]:
roc_auc_score(test_label, y_pred_Upgrade)

0.6587825813254007