In [1]:
# 피마 인디언 당뇨병 예측

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [43]:
# 로지스틱 회귀
# 의사결정 나무
# 랜덤포레스트

# train_test_split
# 정확도, 재현율, 정밀도

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score


In [44]:
diabetes_df = pd.read_csv('D:/pythonTest/diabetes.csv')

In [45]:
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [46]:
diabetes_df.Outcome.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [53]:
def get_metrics(y_test,pred):
    confusion = confusion_matrix(y_test,pred)
    accuracy = accuracy_score(y_test,pred)
    p_score = precision_score(y_test,pred)
    r_score = recall_score(y_test,pred)
    roc_auc = roc_auc_score(y_test,pred)
    f1 = f1_score(y_test,pred)
    
    # 혼동행렬부터 출력하기
    print('혼동행렬')
    print(confusion)
    
    # 결과지표 print문
    print('accuracy:', np.round(accuracy,4),
         'precision:', np.round(p_score,4),
         'recall:', np.round(r_score,4),
         'roc_auc:', np.round(roc_auc,4),
         'f1_score:', np.round(f1,4))

In [54]:
X = diabetes_df.drop('Outcome', axis=1)
# y = diabetes_df['Outcom']
# y = diabetes_df.Outcom
y = diabetes_df.iloc[:,-1]

In [60]:
# train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, stratify=y)

# fitting 시작
# lr_clf
print('### 로지스틱 회귀\n')
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred_lr = lr_clf.predict(X_test)
get_metrics(y_test,pred_lr)

# dt_clf
print('\n### 의사결정나무\n')
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
pred_dt = dt_clf.predict(X_test)
get_metrics(y_test,pred_dt)

# rf_clf
print('\n### 랜덤포레스트\n')
rf_clf = DecisionTreeClassifier()
rf_clf.fit(X_train, y_train)
pred_rf = rf_clf.predict(X_test)
get_metrics(y_test,pred_rf)

### 로지스틱 회귀

혼동행렬
[[90 10]
 [22 32]]
accuracy: 0.7922 precision: 0.7619 recall: 0.5926 roc_auc: 0.7463 f1_score: 0.6667

### 의사결정나무

혼동행렬
[[73 27]
 [20 34]]
accuracy: 0.6948 precision: 0.5574 recall: 0.6296 roc_auc: 0.6798 f1_score: 0.5913

### 랜덤포레스트

혼동행렬
[[74 26]
 [22 32]]
accuracy: 0.6883 precision: 0.5517 recall: 0.5926 roc_auc: 0.6663 f1_score: 0.5714


In [50]:
def p_r_curve_plot(y_test, pred_proba):
    # threshold ndarray로 가져와보고 
    # 위의 threshold에 따른 정밀도, 재현율의 ndarray를 추출
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba)
    
    # X축 = threshold
    # y축은 precision(점선) 및 recall(주황선) 설정
    # 각 곡선을 중첩되게 graph화
    plt.figure(figsize=(8,6))
    threshold_boudary = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:threshold_boudary], linestyle='--', label='precison')
    plt.plot(thresholds, recalls[0:threshold_boudary], label='recall')
    
    # threshold 값 x 축의 scale을 0,1으로 scaling
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start,end,0.10),2))
    
    # X축, y축 label 및 legend 추가
    # grid도 추가
    plt.xlabel('Threshold_value')
    plt.ylabel('Precision and Recall')
    plt.legend()
    plt.grid()
    plt.show() 

In [None]:
from sklearn.metrics