# 데이터셋 출처
* https://www.kaggle.com/uciml/pima-indians-diabetes-database

## 데이터 구성
* Pregnancies : 임신횟수
* Glucose : 2시간 동안의경구 포도당 내성 검사에서 혈장 포도당 농도
* BloodPressure : 이완기 혈압 (mm Hg)
* SkinThickness : 삼두근 피부 주름 두께 (mm), 체지방을 추정하는데 사용되는 값
* Insulin : 2시간 혈청 인슐린(mu U / ml)
* BMI : 체질량 지수(체중kg / 키(m)^2)
* DiabetesPedigreeFunction : 당뇨병 혈통 기능
* Age : 나이
* Outcome : 768개중에 268개의 결과 클래스 변수(0 또는 1)는 1이고 나머지는 0입니다.

# 필요한 라이브러리 로드

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# 데이터셋 로드

In [2]:
df = pd.read_csv("data/diabetes_feature.csv")
df.shape

(768, 16)

In [3]:
# 데이터셋을 미리보기

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Pregnancies_high,Age_low,Age_middle,Age_high,Insulin_nan,Insulin_log,low_glu_insulin
0,6,0.848324,72,35,0,33.6,0.468492,50,1,False,False,True,False,169.5,5.138735,False
1,1,-1.123396,66,29,0,26.6,-0.365061,31,0,False,False,True,False,102.5,4.639572,True
2,8,1.943724,64,0,0,23.3,0.604397,32,1,True,False,True,False,169.5,5.138735,False
3,1,-0.998208,66,23,94,28.1,-0.920763,21,0,False,True,False,False,94.0,4.553877,True
4,0,0.504055,40,35,168,43.1,5.484909,33,1,False,False,True,False,168.0,5.129899,False


# 학습과 예측에 사용할 데이터셋 만들기

In [4]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Pregnancies_high',
       'Age_low', 'Age_middle', 'Age_high', 'Insulin_nan', 'Insulin_log',
       'low_glu_insulin'],
      dtype='object')

In [5]:
X = df[['Glucose', 'BloodPressure', 'SkinThickness',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Pregnancies_high',
    'Insulin_nan', 'low_glu_insulin']]
X.shape

(768, 9)

In [6]:
y = df["Outcome"]
y.shape

(768,)

In [7]:
# 사이킷런에서 제공하는 model_selection 의 train_test_split 으로 만들기

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.2, random_state=42)

In [8]:
# train 세트의 문제와 정답의 데이터 수 확인

X_train.shape, y_train.shape

((614, 9), (614,))

In [9]:
# test 세트의 문제와 정답의 데이터 수 확인

X_test.shape, y_test.shape

((154, 9), (154,))

# 여러개의 알고리즘을 사용해서 비교하기

In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


estimators = [DecisionTreeClassifier(random_state=42), 
            RandomForestClassifier(random_state=42),
            GradientBoostingClassifier(random_state=42)]
estimators

[DecisionTreeClassifier(random_state=42),
 RandomForestClassifier(random_state=42),
 GradientBoostingClassifier(random_state=42)]

In [17]:
max_depth = np.random.randint(2, 20, 10)
max_depth

array([10,  7,  6, 14,  3, 15, 17, 17, 14,  9])

In [19]:
max_features = np.random.uniform(0.3, 1.0, 10)
max_features

array([0.43120078, 0.5696988 , 0.96626425, 0.49701398, 0.79847271,
       0.78113764, 0.76324754, 0.53615632, 0.88578556, 0.93483355])

In [32]:
results = []
for estimator in estimators:
    result = []
    result.append(estimator.__class__.__name__)
    results.append(result)
results

[['DecisionTreeClassifier'],
 ['RandomForestClassifier'],
 ['GradientBoostingClassifier']]

In [42]:
from sklearn.model_selection import RandomizedSearchCV

max_depth = np.random.randint(2, 20, 10)
max_features = np.random.uniform(0.3, 1.0, 10)

param_distributions = {"max_depth": max_depth, 
                       "max_features": max_features}

results = []
for estimator in estimators:
    result = []
    if estimator.__class__.__name__ != 'DecisionTreeClassifier':
        param_distributions["n_estimators"] = np.random.randint(100, 200, 10)
        
    clf = RandomizedSearchCV(estimator, 
                       param_distributions, 
                       n_iter=100, 
                       scoring="accuracy", 
                       n_jobs=-1, 
                       cv=5, 
                       verbose=2
                      )
    clf.fit(X_train, y_train)
    result.append(estimator.__class__.__name__)
    result.append(clf.best_params_)
    result.append(clf.best_score_)
    result.append(clf.score(X_test, y_test))
    result.append(clf.cv_results_)
    results.append(result)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    6.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   47.7s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.6min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.4min finished


In [44]:
df = pd.DataFrame(results, columns=["estimator", "best_params", "train_score", "test_score", "cv_result"])

In [51]:
pd.DataFrame(df.loc[1, "cv_result"]).sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
57,1.618671,0.060197,0.094347,0.013148,198,0.768472,18,"{'n_estimators': 198, 'max_features': 0.768472...",0.869919,0.934959,0.861789,0.910569,0.950820,0.905611,0.034997,1
15,1.530722,0.101524,0.088749,0.012679,198,0.815675,7,"{'n_estimators': 198, 'max_features': 0.815674...",0.878049,0.943089,0.869919,0.902439,0.934426,0.905584,0.029253,2
85,1.145743,0.027873,0.056568,0.002726,198,0.814095,7,"{'n_estimators': 198, 'max_features': 0.814094...",0.878049,0.943089,0.869919,0.902439,0.934426,0.905584,0.029253,2
39,1.348626,0.043268,0.082353,0.023264,172,0.790221,7,"{'n_estimators': 172, 'max_features': 0.790220...",0.886179,0.951220,0.869919,0.886179,0.934426,0.905584,0.031431,2
48,1.168529,0.050778,0.068761,0.016033,175,0.580672,14,"{'n_estimators': 175, 'max_features': 0.580671...",0.878049,0.934959,0.853659,0.902439,0.950820,0.903985,0.035667,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,0.903882,0.057285,0.050971,0.004145,168,0.768472,2,"{'n_estimators': 168, 'max_features': 0.768472...",0.796748,0.869919,0.845528,0.804878,0.934426,0.850300,0.049831,96
34,1.252480,0.036154,0.073158,0.009902,172,0.814095,2,"{'n_estimators': 172, 'max_features': 0.814094...",0.788618,0.869919,0.837398,0.804878,0.942623,0.848687,0.054644,97
37,0.945658,0.064966,0.074357,0.008517,172,0.815675,2,"{'n_estimators': 172, 'max_features': 0.815674...",0.788618,0.869919,0.837398,0.804878,0.942623,0.848687,0.054644,97
17,1.176325,0.115161,0.062964,0.020560,168,0.790221,2,"{'n_estimators': 168, 'max_features': 0.790220...",0.788618,0.869919,0.837398,0.804878,0.942623,0.848687,0.054644,97
