In [46]:
import pandas as pd
import csv
import numpy as np

In [54]:
# Names for the nine columns, in file order; the last one ('target') is the
# class label used for training further down.
column_names = [
    'pregnant', 'plasma', 'BP', 'skin', 'insulin',
    'body', 'diabetes', 'age', 'target',
]
# skiprows=9 drops the first nine lines of the file before parsing
# (presumably a descriptive header block -- confirm against the CSV).
pima = pd.read_csv(
    'pima-indians-diabetes.csv',
    skiprows=9,
    names=column_names,
)
pima.head()

Unnamed: 0,pregnant,plasma,BP,skin,insulin,body,diabetes,age,target
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [48]:
# Class balance of the label column.
pima['target'].value_counts()

0    500
1    268
Name: target, dtype: int64

In [56]:
# Feature frame: every column except the last one (the label).
feature_columns = pima.columns[:-1]
pima_data = pima[feature_columns]
pima_data.head()

Unnamed: 0,pregnant,plasma,BP,skin,insulin,body,diabetes,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [58]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
labels = pima.target
X_train, X_test, y_train, y_test = train_test_split(
    pima_data,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=2021,
)

In [59]:
# Shapes of the four split pieces, in the order X_train, X_test, y_train, y_test.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((614, 8), (154, 8), (614,), (154,))

In [60]:
from sklearn.tree import DecisionTreeClassifier
pima = DecisionTreeClassifier(random_state=2021)
pima.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2021,
 'splitter': 'best'}

In [61]:
pima.fit(X_train, y_train)

DecisionTreeClassifier(random_state=2021)

- 예측

In [62]:
pred = pima.predict(X_test)

- 평가

In [65]:
pima.score(X_test, y_test)

0.7077922077922078

- 튜닝과 교차검증

In [66]:
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [2,3,4], 
    'min_samples_split': [2,3,4]
}

In [67]:
grid_dt = GridSearchCV(pima, param_grid=params, scoring='accuracy', cv=3)
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [68]:
# Mean cross-validated accuracy of the best parameter combination found by the search.
grid_dt.best_score_

0.7443328550932568

In [69]:
# Parameter combination from the grid that achieved the best cross-validated score.
grid_dt.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [70]:
# Estimator built with the best parameters; with GridSearchCV's default
# refit=True it has been refit on the whole training split.
best_dt = grid_dt.best_estimator_

In [71]:
# Accuracy of the tuned tree on the held-out test split.
best_dt.score(X_test, y_test)

0.7142857142857143

In [72]:
# Actual labels ('y') next to predicted labels ('PD'); rows keep y_test's index.
# NOTE(review): `pred` comes from the first decision tree (the one fitted
# before the grid search), not from `best_dt` -- confirm that this is the
# comparison intended here.
actual_vs_predicted = {'y': y_test, 'PD': pred}
df = pd.DataFrame(actual_vs_predicted)
df.head()

Unnamed: 0,y,PD
152,1,1
197,1,0
219,1,0
369,1,1
383,0,0
