# (Boostcourse) 프로젝트로 배우는 scikit-learn - Decision Tree

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('diabetes_feature.csv')
df.shape

(768, 13)

In [4]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Pregnancies_high,Insulin_nan,Insulin_log,low_glu_insulin
0,6,148,72,35,0,33.6,0.627,50,1,0,169.5,5.138735,False
1,1,85,66,29,0,26.6,0.351,31,0,0,102.5,4.639572,True
2,8,183,64,0,0,23.3,0.672,32,1,1,169.5,5.138735,False
3,1,89,66,23,94,28.1,0.167,21,0,0,94.0,4.553877,True
4,0,137,40,35,168,43.1,2.288,33,1,0,168.0,5.129899,False


## Split

In [6]:
from sklearn.model_selection import train_test_split

X = df[['Glucose', 'BloodPressure', 'SkinThickness',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Pregnancies_high',
        'Insulin_log', 'low_glu_insulin']]
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

print('train 개수: ', X_train.shape, y_train.shape)
print('test 개수: ', X_test.shape, y_test.shape)

train 개수:  (537, 9) (537,)
test 개수:  (231, 9) (231,)


In [7]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state = 42)

model.fit(X_train, y_train)
y_predict = model.predict(X_test)

### 최적의 hyper parameter 찾기 : for문

In [131]:
for max_depth in range(3,12):
    model = DecisionTreeClassifier(max_depth = max_depth, random_state = 42)
    y_predict = model.fit(X_train, y_train).predict(X_test)
    score = accuracy_score(y_test, y_predict)
    print(max_depth, score)

3 0.8658008658008658
4 0.8571428571428571
5 0.8614718614718615
6 0.8528138528138528
7 0.8311688311688312
8 0.8398268398268398
9 0.8311688311688312
10 0.8398268398268398
11 0.8398268398268398


### 최적의 hyper parameter 찾기 : Grid Search

In [8]:
from sklearn.model_selection import GridSearchCV

model = DecisionTreeClassifier(random_state = 42)

param_grid = {'max_depth' : range(3,12), 
              'max_features' : [0.3, 0.5, 0.7, 0.9, 1]}

clf = GridSearchCV(model, 
                   param_grid=param_grid, 
                   n_jobs = -1, 
                   cv = 5, 
                   verbose = 1)

clf.fit(X_train, y_train)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed:    3.4s finished


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(3, 12),
                         'max_features': [0.3, 0.5, 0.7, 0.9, 1]},
             verbose=1)

n_job = -1을 지정하면 장비에서 사용가능한 모든 프로세서를 사용한다.

cv (cross validation)의 약자로 5개의 fold로 나눈다.

verbose = 1 옵션을 통해 로그 출력창을 확인 할 수 있다.

In [9]:
print(clf.best_params_)
print(clf.best_score_)

{'max_depth': 10, 'max_features': 0.9}
0.8714953271028036


In [10]:
clf.score(X_test, y_test)

0.8441558441558441

In [11]:
pd.DataFrame(clf.cv_results_).sort_values(by = 'rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
43,0.004662,0.00092,0.002182,0.000595,11,0.9,"{'max_depth': 11, 'max_features': 0.9}",0.898148,0.851852,0.906542,0.859813,0.841121,0.871495,0.026014,1
38,0.005952,0.001131,0.003076,0.000371,10,0.9,"{'max_depth': 10, 'max_features': 0.9}",0.898148,0.851852,0.906542,0.859813,0.841121,0.871495,0.026014,1
13,0.003868,0.000729,0.002084,0.000371,5,0.9,"{'max_depth': 5, 'max_features': 0.9}",0.907407,0.842593,0.813084,0.869159,0.878505,0.86215,0.032109,3
33,0.005852,0.001059,0.003671,0.001278,9,0.9,"{'max_depth': 9, 'max_features': 0.9}",0.898148,0.851852,0.859813,0.850467,0.841121,0.86028,0.019842,4
12,0.003373,0.000486,0.001786,0.000397,5,0.7,"{'max_depth': 5, 'max_features': 0.7}",0.907407,0.851852,0.82243,0.878505,0.841121,0.860263,0.029745,5


### 최적의 hyper parameter 찾기 : Random Search

In [12]:
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {'max_depth' : np.random.randint(1,20,100), 
                       'max_features' : np.random.uniform(0.7, 1.0, 100),
                      'min_samples_split' : list(range(2,7))}

clf = RandomizedSearchCV(model, 
                   param_distributions = param_distributions, 
                   n_iter=100, 
                   scoring='accuracy', 
                   n_jobs=-1, 
                   cv=5, 
                   random_state=42)

clf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': array([ 8,  9, 18, 11,  4, 17,  6,  6,  7,  3,  2,  7,  3, 16, 13, 14,  2,
        8, 17,  9,  2, 19, 15,  8, 10,  5, 13,  1, 13, 15, 17, 19, 19,  9,
       10, 17,  7, 11,  1, 17, 17, 15, 13,  3,  5, 19,  5, 11,  9, 14,  5,
       11,  9,  9, 10,  7, 13, 17,  3, 18,  6,  8, 17,  3,  5, 16,  1,  2,
       12,  2, 18, 10,  9,  7,  3, 14, 16,  6, 19, 15, 16, 15, 14, 19, 12,...
       0.9253087 , 0.78856492, 0.76196586, 0.96923634, 0.82056714,
       0.80230002, 0.9174423 , 0.88393938, 0.9121063 , 0.77882843,
       0.92289928, 0.79672683, 0.7242198 , 0.99431252, 0.8681459 ,
       0.79681609, 0.8591139 , 0.84303083, 0.95204023, 0.89358613,
       0.97401949, 0.86257058, 0.78839424, 0.74879182, 0.96754518,
       0.97719039, 0.74549312, 0.87953716, 0.82731468, 0.77285801]),
                                        'min_sa

In [13]:
print(clf.best_params_)
print(clf.best_score_)

{'min_samples_split': 4, 'max_features': 0.8653774444284837, 'max_depth': 5}
0.8715126341294566


In [14]:
clf.score(X_test, y_test)

0.8701298701298701

In [15]:
pd.DataFrame(clf.cv_results_).sort_values(by = 'rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
84,0.00625,0.002834,0.002777,0.000243,4,0.865377,5,"{'min_samples_split': 4, 'max_features': 0.865...",0.87037,0.87037,0.869159,0.841121,0.906542,0.871513,0.020779,1
12,0.00496,0.000314,0.002778,0.000242,2,0.959508,14,"{'min_samples_split': 2, 'max_features': 0.959...",0.898148,0.851852,0.906542,0.859813,0.841121,0.871495,0.026014,2
60,0.005456,0.000314,0.004167,0.002885,5,0.95204,6,"{'min_samples_split': 5, 'max_features': 0.952...",0.898148,0.87963,0.841121,0.859813,0.878505,0.871443,0.019415,3
82,0.005158,0.000243,0.002678,0.000243,5,0.800126,9,"{'min_samples_split': 5, 'max_features': 0.800...",0.842593,0.851852,0.88785,0.878505,0.869159,0.865992,0.016685,4
64,0.005157,0.000243,0.002581,0.000371,5,0.926528,8,"{'min_samples_split': 5, 'max_features': 0.926...",0.87963,0.861111,0.878505,0.878505,0.831776,0.865905,0.018405,5
