In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('diabetes_feature.csv')
df.shape

(768, 16)

## 학습과 예측에 사용할 데이터셋 만들기

In [3]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Pregnancies_high',
       'Age_low', 'Age_middle', 'Age_high', 'Insulin_nan', 'Insulin_log',
       'low_glu_insulin'],
      dtype='object')

In [4]:
X = df[['Glucose', 'BloodPressure', 'SkinThickness',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Pregnancies_high',
        'Insulin_nan','low_glu_insulin']]
X.shape

(768, 9)

In [5]:
y=df['Outcome']
y.shape

(768,)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
X_train.shape

(614, 9)

In [8]:
y_train.shape

(614,)

In [9]:
X_test.shape, y_test.shape

((154, 9), (154,))

## 여러 개의 알고리즘을 사용해서 비교하기

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

estimators = [DecisionTreeClassifier(random_state=42),
             RandomForestClassifier(random_state=42),
             GradientBoostingClassifier(random_state=42)]
estimators

[DecisionTreeClassifier(random_state=42),
 RandomForestClassifier(random_state=42),
 GradientBoostingClassifier(random_state=42)]

In [12]:
max_depth = np.random.randint(2, 20, 10)
max_depth

array([14,  2, 17, 11, 17, 14,  4,  2,  8,  6])

In [20]:
max_features = np.random.uniform(0.3, 1.0, 10)
max_features

array([0.75357865, 0.3406784 , 0.33956062, 0.51619315, 0.87029812,
       0.30565866, 0.87351371, 0.93691513, 0.44679979, 0.92204886])

In [23]:
from sklearn.model_selection import RandomizedSearchCV

max_depth = np.random.randint(2, 20, 10)
max_features = np.random.uniform(0.3, 1.0, 10)

param_distributions = {'max_depth':max_depth, 
                       'max_features':max_features}

results=[]
for estimator in estimators:
  result=[]
  if estimator.__class__.__name__ != 'DecisionTreeClassifier':
    param_distributions['n_estimators'] = np.random.randint(100, 200, 10)
  clf = RandomizedSearchCV(estimator, 
                    param_distributions, 
                    n_iter=100, 
                    scoring='accuracy', 
                    n_jobs=-1,
                    cv=5,
                    verbose=2
                    )
  clf.fit(X_train, y_train)
  result.append(estimator.__class__.__name__)
  result.append(clf.best_params_)
  result.append(clf.best_score_)
  result.append(clf.score(X_test, y_test))
  result.append(clf.cv_results_)
  results.append(result)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits


KeyboardInterrupt: ignored

In [22]:
pd.DataFrame(results, columns=['estimator', 'best_params', 'train_score', 'test_score', 'cv_result'])

Unnamed: 0,estimator,best_params,train_score,test_score,cv_result
0,DecisionTreeClassifier,"{'max_features': 0.8097817650047376, 'max_dept...",0.856737,0.824675,"{'mean_fit_time': [0.008034992218017577, 0.007..."
1,RandomForestClassifier,"{'n_estimators': 126, 'max_features': 0.639416...",0.903985,0.857143,"{'mean_fit_time': [1.8769474029541016, 0.42539..."
2,GradientBoostingClassifier,"{'n_estimators': 352, 'max_features': 0.515722...",0.902332,0.850649,"{'mean_fit_time': [1.005486822128296, 1.087396..."


In [None]:
pd.DataFrame(df.iloc[1, 'cv_result']).sort_values(by='rank_test_score')

In [None]:
clf.best_score_