- 학습의 목표
- 머신러닝의 분류 모델을 이용하여, 여러 가지 평가 지표를 적용하여 확인
- 의학(당뇨병 여부 판단): 재현율지표를 확인


In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler,MinMaxScaler,Binarizer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score,roc_auc_score
from sklearn.metrics import confusion_matrix,precision_recall_curve,roc_curve

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as ms

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [9]:
# Load the diabetes dataset from the local CSV file, then preview it with
# head() (left as the cell's last expression so the notebook displays it).
diabetes_df = pd.read_csv('./data/diabetes.csv')
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
# Print a summary of the DataFrame: index range, columns, non-null counts,
# dtypes and memory usage.
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [16]:
# Check the distribution of the target column ('Outcome') by counting how
# many rows fall into each class.
diabetes_df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [17]:
# Show the column labels of the DataFrame.
diabetes_df.keys()

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [None]:
#분류를 위한 예측모델 생성
#모델셀렉션, 교차검증
#학습, 예측 및 평가

In [22]:
# Create the prediction model used for classification.
# NOTE(review): rf_model is not referenced again in the cells visible here;
# confirm whether a later cell relies on it.
rf_model = RandomForestClassifier()

# Separate the dataset into the target and the predictors:
# 'Outcome' is the label, every remaining column is a feature.
label = diabetes_df['Outcome']
features = diabetes_df.drop(columns=['Outcome'])

# Print both parts to verify the separation.
print('label\n', label)
print('features\n', features)

label
 0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64
features
      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93    

In [23]:
#모델셀렉션, 교차검증

In [24]:
# NOTE(review): cross_validate, KFold, make_scorer and f1_score are imported
# here but not used within this cell.
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import make_scorer, f1_score

# Split into train and test sets: X_* hold the features, y_* hold the labels.
# 20% of the rows go to the test set; random_state is fixed at 20.
# NOTE(review): the split is not stratified; consider stratify=label so both
# splits keep the class ratio of 'Outcome' -- confirm before changing, since
# it alters every downstream result.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=20,
)

# Print both splits, separated by a line of asterisks.
print('train data\n', X_train)
print('train label\n', y_train)
print("*" * 50)
print('test data\n', X_test)
print('test label\n', y_test)


train data
      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
446            1      100             72             12       70  25.3   
260            3      191             68             15      130  30.9   
570            3       78             70              0        0  32.5   
590           11      111             84             40        0  46.8   
30             5      109             75             26        0  36.0   
..           ...      ...            ...            ...      ...   ...   
218            5       85             74             22        0  29.0   
223            7      142             60             33      190  28.8   
271            2      108             62             32       56  25.2   
474            4      114             64              0        0  28.9   
355            9      165             88              0        0  30.4   

     DiabetesPedigreeFunction  Age  
446                     0.658   28  
260                     0

In [32]:
# Create the learner and train it on the training split only.
# NOTE(review): despite the "dtc" suffix in the name, this object is a
# RandomForestClassifier (random_state fixed at 20).
diabetes_dtc = RandomForestClassifier(random_state=20)
diabetes_dtc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=20, verbose=0,
                       warm_start=False)

In [31]:
# Run prediction with the trained model and print the true test labels next
# to the predicted ones.
prediction = diabetes_dtc.predict(X_test)    # prediction is done on the test split
print('y_test\n', y_test)
print('prediction\n', prediction)   # the labels predicted by the model

y_test
 143    1
14     1
504    0
557    0
736    0
      ..
57     0
140    0
713    0
673    0
559    0
Name: Outcome, Length: 154, dtype: int64
prediction
 [1 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0
 0 0 1 1 0 1 0 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0
 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 0 0 0
 0 0 0 0 0 0]


In [30]:
# Training, prediction and evaluation: report the accuracy of the predictions
# against the true test labels, formatted to two decimal places.
# NOTE(review): the stated goal of this notebook is to check recall, and
# recall_score is imported in the first cell, but this cell reports accuracy
# only.


from sklearn.metrics import accuracy_score


print('예측정확도:{0:.2f}'.format(accuracy_score(y_test, prediction)))  # do not blindly trust accuracy

예측정확도:0.75


In [None]:
#임계값별 정밀도-재현율 확인 및 시각화
#당수치, 혈압, 피하지방, 인슐린, 체질량지수의 분포를 히스토그램으로 확인
#표준화,정규화