# 피마 인디언 당뇨병 예측

In [55]:
import numpy as np
import pandas as pd

- 1. 데이터 전처리

In [56]:
# Load a CSV whose column names are spread over several leading header lines.
DATA_PATH = 'pima-indians-diabetes.csv'
HEADER_LINES = 9      # number of leading lines that hold the column names
NAME_PREFIX_LEN = 5   # leading characters dropped from each header line

# Parse only the header lines instead of the whole file (nrows stops early).
# NOTE(review): delimiter='\t' is presumably chosen so that each header line
# lands in a single column -- confirm the file contains no tab characters.
cols = pd.read_csv(DATA_PATH, header=None, delimiter='\t', nrows=HEADER_LINES)
# Remove the unneeded prefix in front of each column name.
colnames = cols[0].map(lambda line: line[NAME_PREFIX_LEN:]).values
# Read the data rows, skipping the header lines handled above.
df = pd.read_csv(DATA_PATH, skiprows=HEADER_LINES, header=None)
df.columns = colnames
df.head()

Unnamed: 0,Number_of_times_pregnant,Plasma_glucose_concentration_a_2_hours_in_an_oral_glucose_tolerance_test,Diastolic_blood_pressure_(mm_Hg),Triceps_skin_fold_thickness_(mm),2-Hour_serum_insulin_(mu_U/ml),Body_mass_index_(weight_in_kg/(height_in_m)^2),Diabetes_pedigree_function,Age_(years),Class_variable_(0_or_1)
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [57]:
# scikit-learn accepts both NumPy arrays and pandas Series as input.
# Features are every column except the last; the target is the class column.
feature_frame = df.iloc[:, :-1]
target_series = df['Class_variable_(0_or_1)']
X, y = feature_frame.values, target_series.values
X.shape, y.shape

((768, 8), (768,))

In [58]:
# Count how many samples carry each distinct label in y.
np.unique(y, return_counts=True)

(array([0, 1], dtype=int64), array([500, 268], dtype=int64))

- Train/test Dataset 분리

In [59]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split, stratified on y, with a fixed
# random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2021,
)

In [60]:
# Count how many samples carry each distinct label in the training split.
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([400, 214], dtype=int64))

- Model 생성 및 학습

In [61]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with library-default hyperparameters; random_state is fixed at 2021.
dtc = DecisionTreeClassifier(random_state=2021)

In [62]:
# Fit the decision tree on the training split.
dtc.fit(X_train, y_train)

DecisionTreeClassifier(random_state=2021)

- 예측 및 평가

In [63]:
# Predict labels for the held-out test split.
pred = dtc.predict(X_test)

In [64]:
from sklearn.metrics import accuracy_score

# Compare the predicted labels against the true test labels.
accuracy_score(y_true=y_test, y_pred=pred)

0.7077922077922078

In [65]:
# Evaluate the fitted tree on the test split via the estimator's own score().
dtc.score(X_test, y_test)

0.7077922077922078

- 최적의 하이퍼파라미터 도출 및 교차 검증

In [66]:
# Coarse search grid: candidate values for tree depth and minimum split size.
params = dict(
    max_depth=[2, 4, 6],
    min_samples_split=[2, 4, 6],
)

In [67]:
from sklearn.model_selection import GridSearchCV

# Search every combination in `params`, scored by accuracy with cv=3.
grid_dt = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    cv=3,
    scoring='accuracy',
)
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 4, 6],
                         'min_samples_split': [2, 4, 6]},
             scoring='accuracy')

In [68]:
# Parameter combination selected by the grid search.
grid_dt.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [69]:
# Finer search grid: consecutive candidate values 2, 3 and 4 for both settings.
params = dict(
    max_depth=[2, 3, 4],
    min_samples_split=[2, 3, 4],
)

In [70]:
# Repeat the grid search with the current `params`, scored by accuracy with cv=3.
grid_dt = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    cv=3,
    scoring='accuracy',
)
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [71]:
# Parameter combination selected by the grid search.
grid_dt.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [72]:
# Take the estimator chosen by the grid search and score it on the test split.
best_dt = grid_dt.best_estimator_
best_dt.score(X_test, y_test)

0.7142857142857143

- 실제 값 하나가 주어졌을 때 당뇨병 여부를 확인하는 방법

In [73]:
# True label of the test sample at index 33.
y_test[33]

0

In [74]:
# Feature values of the test sample at index 33.
X_test[33]

array([  0.   , 126.   ,  86.   ,  27.   , 120.   ,  27.4  ,   0.515,
        21.   ])

In [75]:
# Keep the test sample at index 33 to use as a single input.
test_Data = X_test[33]

In [76]:
# predict() processes a batch of samples, so passing a single 1-D sample
# raises an error; the sample must first be reshaped into a 2-D array.
# reshape(1, -1) makes it one row and infers the feature count, so the number
# of features is not hard-coded.
result = best_dt.predict(test_Data.reshape(1, -1))[0]
print('음성' if result == 0 else '양성')

음성
