## 과제

- 피마 인디언 당뇨병 예측

In [1]:
import pandas as pd
import numpy as np

### 1. 데이터 로드 및 확인

In [2]:
# Column labels supplied via `names=`: eight feature columns followed by
# the target column 'class'.
column_names = [
    'pregnant', 'plasma', 'pressure', 'thickness',
    'insulin', 'BMI', 'pedigree', 'age', 'class',
]
pima = pd.read_csv('pima.csv', names=column_names)

In [3]:
# NOTE(review): non-null counts cannot reveal placeholder values; the sample
# rows shown by head() contain 0 for thickness/insulin — confirm whether zeros
# encode missing measurements before trusting these columns.
pima.info()     # after loading the data, check whether any column has null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pregnant   768 non-null    int64  
 1   plasma     768 non-null    int64  
 2   pressure   768 non-null    int64  
 3   thickness  768 non-null    int64  
 4   insulin    768 non-null    int64  
 5   BMI        768 non-null    float64
 6   pedigree   768 non-null    float64
 7   age        768 non-null    int64  
 8   class      768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
# Print how many rows fall into each target class, then preview the first rows.
class_counts = pima['class'].value_counts()
print(class_counts)
pima.head()

0    500
1    268
Name: class, dtype: int64


Unnamed: 0,pregnant,plasma,pressure,thickness,insulin,BMI,pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### 2. Train/Test 분리

In [5]:
# Features are every column except the target; the target is the 'class'
# column (the last column of the frame).
target_col = 'class'
X = pima.drop(columns=[target_col])
y = pima[target_col]

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits; the fixed random_state makes the split repeatable.
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2021)
X_train, X_test, y_train, y_test = split

In [7]:
# Confirm the sizes of the train/test feature frames and label series.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 8), (154, 8), (614,), (154,))

In [8]:
# Class distribution of the test labels (the split was stratified on y).
y_test.value_counts()

0    100
1     54
Name: class, dtype: int64

### 3. 모델 생성 및 학습

In [9]:
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Baseline model: a decision tree where only random_state is set, fitted on
# the training split. fit() returns the estimator itself, so it can be chained.
dtc = DecisionTreeClassifier(random_state=2021).fit(X_train, y_train)
dtc

DecisionTreeClassifier(random_state=2021)

### 4. 학습된 모델로 예측

In [12]:
# Predict class labels for the held-out test features with the baseline tree.
pred = dtc.predict(X_test)

### 5. 실제값과 예측값을 비교하여 모델 성능 평가

In [13]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted class equals the true class.
accuracy_score(y_true=y_test, y_pred=pred)

0.7077922077922078

### 6. GridSearchCV를 사용한 하이퍼파라미터 튜닝

In [14]:
from sklearn.model_selection import GridSearchCV

In [16]:
# Hyperparameter grid handed to GridSearchCV: candidate values for the tree's
# max_depth and min_samples_split.
params = dict(
    max_depth=[2, 4, 6],
    min_samples_split=[2, 4, 6],
)

In [17]:
# Grid search over `params` with 3-fold cross-validation on the training split.
# Note: this rebinds `dtc` to a new tree, replacing the baseline model that was
# previously stored under that name.
dtc = DecisionTreeClassifier(random_state=2021)
grid_dtc = GridSearchCV(estimator=dtc, param_grid=params, cv=3).fit(X_train, y_train)
grid_dtc

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 4, 6],
                         'min_samples_split': [2, 4, 6]})

In [18]:
# Cross-validated score of the best parameter combination found by the search.
grid_dtc.best_score_

0.7443248844253149

In [19]:
# Parameter combination selected by the grid search.
grid_dtc.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [20]:
# Evaluate the tree selected by the grid search on the held-out test split.
best_tree = grid_dtc.best_estimator_
pred = best_tree.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.7337662337662337

In [21]:
# Shape of the prediction array for the test split.
pred.shape

(154,)

- 샘플 하나를 입력하여 당뇨병 여부를 예측

In [22]:
# Predict the class of a single hand-written sample with the tree selected by
# the grid search.
estimator = grid_dtc.best_estimator_

# One sample, values listed in the training column order:
# pregnant, plasma, pressure, thickness, insulin, BMI, pedigree, age.
test_data = [4,110,92,0,0,37.6,0.191,30]

# The model was fitted on a DataFrame, so pass the sample as a one-row
# DataFrame carrying the same column labels. This keeps the features aligned
# by name and avoids scikit-learn's "X does not have valid feature names"
# warning that a bare ndarray triggers on versions that track feature names.
# A wrong number of values raises immediately because the column count
# would not match.
sample = pd.DataFrame([test_data], columns=X_train.columns)
predict = estimator.predict(sample)

In [23]:
# Show the prediction array returned for the single sample.
predict

array([0], dtype=int64)

In [24]:
# Take the first (only) element to get the predicted class label itself.
predict[0]

0

In [25]:
# Same prediction, passing the sample as a nested list (one row) instead of
# reshaping an ndarray.
estimator.predict([test_data])[0]

0