In [1]:
# 데이터 분석을 위한 pandas, 수치계산을 위한 numpy
# 시각화를 위한 seaborn, matplotlib.pyplot 을 로드합니다. 

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df = pd.read_csv("diabetes_feature.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Pregnancies_high,Age_low,Age_middle,Age_high,Insulin_nan,Insulin_log,low_glu_insulin
0,6,148,72,35,0,33.6,0.627,50,1,False,False,True,False,169.5,5.138735,False
1,1,85,66,29,0,26.6,0.351,31,0,False,False,True,False,102.5,4.639572,False
2,8,183,64,0,0,23.3,0.672,32,1,True,False,True,False,169.5,5.138735,False
3,1,89,66,23,94,28.1,0.167,21,0,False,True,False,False,94.0,4.553877,True
4,0,137,40,35,168,43.1,2.288,33,1,False,False,True,False,168.0,5.129899,False


In [3]:
df_insulin = pd.read_csv("diabetes_fill_insulin.csv")
df['Insulin'] = df_insulin['Insulin']
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Pregnancies_high,Age_low,Age_middle,Age_high,Insulin_nan,Insulin_log,low_glu_insulin
0,6,148,72,35,181.916836,33.6,0.627,50,1,False,False,True,False,169.5,5.138735,False
1,1,85,66,29,61.675,26.6,0.351,31,0,False,False,True,False,102.5,4.639572,False
2,8,183,64,0,180.096667,23.3,0.672,32,1,True,False,True,False,169.5,5.138735,False
3,1,89,66,23,94.0,28.1,0.167,21,0,False,True,False,False,94.0,4.553877,True
4,0,137,40,35,168.0,43.1,2.288,33,1,False,False,True,False,168.0,5.129899,False


## 학습,예측 데이터셋 만들기

In [4]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Pregnancies_high',
       'Age_low', 'Age_middle', 'Age_high', 'Insulin_nan', 'Insulin_log',
       'low_glu_insulin'],
      dtype='object')

In [6]:
X = df[['Glucose', 'BloodPressure', 'SkinThickness',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Pregnancies_high', 'Insulin']]
X.shape

(768, 8)

In [7]:
y = df['Outcome']
y.shape

(768,)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
X_train.shape, y_train.shape

((614, 8), (614,))

In [10]:
X_test.shape, y_test.shape

((154, 8), (154,))

## 여러 개의 알고리즘을 사용해서 비교하기

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

estimators = [DecisionTreeClassifier(random_state=42),
             RandomForestClassifier(random_state=42),
             GradientBoostingClassifier(random_state=42)]
estimators

[DecisionTreeClassifier(random_state=42),
 RandomForestClassifier(random_state=42),
 GradientBoostingClassifier(random_state=42)]

In [12]:
max_depth = np.random.randint(2, 20, 10)
max_depth

array([12,  5, 16,  7, 15, 12,  3,  8, 15, 17])

In [13]:
max_features = np.random.uniform(0.3, 1.0, 10)
max_features

array([0.87649337, 0.41476965, 0.83756549, 0.53864939, 0.51346386,
       0.86665874, 0.90283766, 0.84032345, 0.89944982, 0.84502208])

In [None]:
from sklearn.model_selection import RandomizedSearchCV

max_depth = np.random.randint(2, 20, 10)
max_features = np.random.uniform(0.3, 1.0, 10)

param_distributions = {'max_depth':max_depth, 
                       'max_features':max_features}

results=[]
for estimator in estimators:
  result=[]
  if estimator.__class__.__name__ != 'DecisionTreeClassifier':
    param_distributions['n_estimators'] = np.random.randint(100, 200, 10)
  clf = RandomizedSearchCV(estimator, 
                    param_distributions, 
                    n_iter=100, 
                    scoring='accuracy', 
                    n_jobs=-1,
                    cv=5,
                    verbose=2
                    )
  clf.fit(X_train, y_train)
  result.append(estimator.__class__.__name__)
  result.append(clf.best_params_)
  result.append(clf.best_score_)
  result.append(clf.score(X_test, y_test))
  result.append(clf.cv_results_)
  results.append(result)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
pd.DataFrame(results, columns=['estimator', 'best_params', 'train_score', 'test_score', 'cv_result'])

Unnamed: 0,estimator,best_params,train_score,test_score,cv_result
0,DecisionTreeClassifier,"{'max_features': 0.8097817650047376, 'max_dept...",0.856737,0.824675,"{'mean_fit_time': [0.008034992218017577, 0.007..."
1,RandomForestClassifier,"{'n_estimators': 126, 'max_features': 0.639416...",0.903985,0.857143,"{'mean_fit_time': [1.8769474029541016, 0.42539..."
2,GradientBoostingClassifier,"{'n_estimators': 352, 'max_features': 0.515722...",0.902332,0.850649,"{'mean_fit_time': [1.005486822128296, 1.087396..."


In [None]:
pd.DataFrame(df.iloc[1, 'cv_result']).sort_values(by='rank_test_score')

In [None]:
clf.best_score_