피마인디언 당뇨병 예측

In [1]:
import numpy as np
import pandas as pd

- 데이터 전처리

In [2]:
df = pd.read_csv('pima-indians-diabetes.csv', skiprows=9, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df[1].head()

0    148
1     85
2    183
3     89
4    137
Name: 1, dtype: int64

In [4]:
# 1. Number of times pregnant
# 2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
# 3. Diastolic blood pressure (mm Hg)
# 4. Triceps skin fold thickness (mm)
# 5. 2-Hour serum insulin (mu U/ml)
# 6. Body mass index (weight in kg/(height in m)^2)
# 7. Diabetes pedigree function
# 8. Age (years)
# 9. Class variable (0 or 1)
df.columns = ['P','G','BP','S','I','BMI','D','Age','Class']
df.head()

Unnamed: 0,P,G,BP,S,I,BMI,D,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
df.index[:5]

RangeIndex(start=0, stop=5, step=1)

In [6]:
# 결측치 데이터가 있는지 확인
df.isna().sum()                 # isna() 또는 isnull() 사용하여 확인할 수 있음

P        0
G        0
BP       0
S        0
I        0
BMI      0
D        0
Age      0
Class    0
dtype: int64

In [7]:
X = df.iloc[:, :-1]
X.head()

Unnamed: 0,P,G,BP,S,I,BMI,D,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [34]:
X = df.iloc[:, :-1].values
type(X)

numpy.ndarray

In [35]:
X[:5]

array([[6.000e+00, 1.480e+02, 7.200e+01, 3.500e+01, 0.000e+00, 3.360e+01,
        6.270e-01, 5.000e+01],
       [1.000e+00, 8.500e+01, 6.600e+01, 2.900e+01, 0.000e+00, 2.660e+01,
        3.510e-01, 3.100e+01],
       [8.000e+00, 1.830e+02, 6.400e+01, 0.000e+00, 0.000e+00, 2.330e+01,
        6.720e-01, 3.200e+01],
       [1.000e+00, 8.900e+01, 6.600e+01, 2.300e+01, 9.400e+01, 2.810e+01,
        1.670e-01, 2.100e+01],
       [0.000e+00, 1.370e+02, 4.000e+01, 3.500e+01, 1.680e+02, 4.310e+01,
        2.288e+00, 3.300e+01]])

In [36]:
# 여러가지 방법으로 y 값을 취할 수 있음
y = df.Class                    # Series
y = df['Class'].values          # Numpy array
y = df.Class.values             # Numpy array
y = df.iloc[:, -1].values       # Numpy array

In [37]:
X.shape, y.shape

((768, 8), (768,))

- Train/Test dataset으로 분리

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2021
)

In [39]:
np.unique(y, return_counts=True)

(array([0, 1], dtype=int64), array([500, 268], dtype=int64))

In [40]:
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([400, 214], dtype=int64))

- GridSearchCV, DecisionTreeClassifier

In [41]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=2021)
params = {
    'max_depth': [2,3,4,5,6],
    'min_samples_split': [2,3,4]
}

In [42]:
from sklearn.model_selection import GridSearchCV
grid_dt = GridSearchCV(dtc, param_grid=params, scoring='accuracy', cv=5)
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [43]:
grid_dt.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [44]:
grid_dt.best_estimator_.score(X_test, y_test)

0.7337662337662337

In [45]:
0.7337662337662337
best_dt = grid_dt.best_estimator_
best_dt.score(X_test, y_test)

0.7337662337662337

- 실제 적용

In [46]:
X_test[10], y_test[10]

(array([ 12.   , 100.   ,  84.   ,  33.   , 105.   ,  30.   ,   0.488,
         46.   ]),
 0)

In [47]:
test_data = X_test[10].reshape(1,-1)

In [48]:
pred = best_dt.predict(test_data)
pred

array([0], dtype=int64)

In [49]:
print('양성' if pred[0] == 1 else '음성')       # 3항 연산자

음성


In [50]:
if pred[0] == 1:
    print('양성')
else:
    print('음성')

음성


- 참고 사항 : 결측치

In [51]:
df2 = pd.DataFrame(np.arange(25).reshape(5,5), index=list('abcde'), columns=list('vwxyz'))
df2

Unnamed: 0,v,w,x,y,z
a,0,1,2,3,4
b,5,6,7,8,9
c,10,11,12,13,14
d,15,16,17,18,19
e,20,21,22,23,24


In [52]:
df2['v']['c'] = np.nan
df2.z['d'] = np.nan
df2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.z['d'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,v,w,x,y,z
a,0.0,1,2,3,4.0
b,5.0,6,7,8,9.0
c,,11,12,13,14.0
d,15.0,16,17,18,
e,20.0,21,22,23,24.0


In [53]:
df2.isna()

Unnamed: 0,v,w,x,y,z
a,False,False,False,False,False
b,False,False,False,False,False
c,True,False,False,False,False
d,False,False,False,False,True
e,False,False,False,False,False


In [54]:
df2.isna().sum()

v    1
w    0
x    0
y    0
z    1
dtype: int64

In [55]:
df2.isna().sum().sum()

2