## Heart Disease Dataset

https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset

1. age
2. sex
3. chest pain type (4 values)
4. resting blood pressure
5. serum cholestoral in mg/dl
6. fasting blood sugar > 120 mg/dl
7. resting electrocardiographic results (values 0,1,2)
8. maximum heart rate achieved
9. exercise induced angina
10. oldpeak = ST depression induced by exercise relative to rest
11. the slope of the peak exercise ST segment
12. number of major vessels (0-3) colored by flourosopy
13. thal: 0 = normal; 1 = fixed defect; 2 = reversable defect
14. target: the presence of heart disease in the patient. 0 = no disease; 1 = disease

## Load Dataset 데이터셋 로드

In [2]:
import pandas as pd


df = pd.read_csv('heart.csv')

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


### 데이터프레임 정보


In [4]:
df.info(), df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


(None,
                age          sex           cp     trestbps        chol  \
 count  1025.000000  1025.000000  1025.000000  1025.000000  1025.00000   
 mean     54.434146     0.695610     0.942439   131.611707   246.00000   
 std       9.072290     0.460373     1.029641    17.516718    51.59251   
 min      29.000000     0.000000     0.000000    94.000000   126.00000   
 25%      48.000000     0.000000     0.000000   120.000000   211.00000   
 50%      56.000000     1.000000     1.000000   130.000000   240.00000   
 75%      61.000000     1.000000     2.000000   140.000000   275.00000   
 max      77.000000     1.000000     3.000000   200.000000   564.00000   
 
                fbs      restecg      thalach        exang      oldpeak  \
 count  1025.000000  1025.000000  1025.000000  1025.000000  1025.000000   
 mean      0.149268     0.529756   149.114146     0.336585     1.071512   
 std       0.356527     0.527878    23.005724     0.472772     1.175053   
 min       0.000000     0

### 유니크한 값

In [5]:
df.nunique()

age          41
sex           2
cp            4
trestbps     49
chol        152
fbs           2
restecg       3
thalach      91
exang         2
oldpeak      40
slope         3
ca            5
thal          4
target        2
dtype: int64

# Automatically Visualize Dataset 자동 시각화

AutoViz 설치

https://github.com/AutoViML/AutoViz

In [7]:
#!pip install -q autoviz

^C


In [None]:
%matplotlib inline

from autoviz.AutoViz_Class import AutoViz_Class
import matplotlib.pyplot as plt
plt.style.use('dark_background')

AV = AutoViz_Class()

AV.AutoViz(
    filename='',
    dfte=df,
    depVar='target',
    verbose=2, # 0: 간단히 표시; 1: 자세히 표시; 2: 파일로 저장
    max_rows_analyzed=df.shape[0],
    max_cols_analyzed=df.shape[1])

# LazyPredict

AutoML with scikit-learn

https://lazypredict.readthedocs.io/en/latest/

자동으로 베스트 모델 찾아주는 패키지

### 데이터 준비

In [11]:
y_data = df.pop('target')
x_data = df

print(x_data.shape)
print(y_data.shape)

(1025, 13)
(1025,)


In [12]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=2022,
    stratify=y_data) # 클래스 비율을 동일하게 분할한다

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(820, 13) (820,)
(205, 13) (205,)


### 모델 성능 자동 비교

In [13]:
from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0, predictions=True)

models, predictions = clf.fit(x_train, x_test, y_train, y_test)

models

100%|██████████| 29/29 [00:01<00:00, 21.14it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,1.0,1.0,1.0,1.0,0.15
LabelPropagation,1.0,1.0,1.0,1.0,0.04
XGBClassifier,1.0,1.0,1.0,1.0,0.14
DecisionTreeClassifier,1.0,1.0,1.0,1.0,0.01
RandomForestClassifier,1.0,1.0,1.0,1.0,0.2
ExtraTreeClassifier,1.0,1.0,1.0,1.0,0.02
ExtraTreesClassifier,1.0,1.0,1.0,1.0,0.2
BaggingClassifier,1.0,1.0,1.0,1.0,0.04
LabelSpreading,1.0,1.0,1.0,1.0,0.04
SVC,0.94,0.94,0.94,0.94,0.03


### 모델별 테스트 데이터 예측값

In [14]:
predictions.head()

Unnamed: 0,AdaBoostClassifier,BaggingClassifier,BernoulliNB,CalibratedClassifierCV,DecisionTreeClassifier,DummyClassifier,ExtraTreeClassifier,ExtraTreesClassifier,GaussianNB,KNeighborsClassifier,...,PassiveAggressiveClassifier,Perceptron,QuadraticDiscriminantAnalysis,RandomForestClassifier,RidgeClassifier,RidgeClassifierCV,SGDClassifier,SVC,XGBClassifier,LGBMClassifier
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,1,1,1,1,1,1,0,...,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,0,1,1,0,1,0,0,1,1,...,1,1,0,0,1,1,1,0,0,0


### 모델별 분류 리포트

In [16]:
from sklearn.metrics import classification_report

for model_name in predictions.columns.tolist():
    print(f'{model_name}')
    print(classification_report(y_test, predictions[model_name]))

AdaBoostClassifier
              precision    recall  f1-score   support

           0       0.95      0.89      0.92       100
           1       0.90      0.95      0.93       105

    accuracy                           0.92       205
   macro avg       0.92      0.92      0.92       205
weighted avg       0.92      0.92      0.92       205

BaggingClassifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       100
           1       1.00      1.00      1.00       105

    accuracy                           1.00       205
   macro avg       1.00      1.00      1.00       205
weighted avg       1.00      1.00      1.00       205

BernoulliNB
              precision    recall  f1-score   support

           0       0.88      0.80      0.84       100
           1       0.82      0.90      0.86       105

    accuracy                           0.85       205
   macro avg       0.85      0.85      0.85       205
weighted avg       0.85  

### 성능 좋은 LightGBM 사용

In [17]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

lgbm = LGBMClassifier()

lgbm.fit(x_train, y_train)

y_pred = lgbm.predict(x_test)

accuracy_score(y_pred, y_test)

1.0

# 파라미터 자동 튜닝

### SVM 튜닝없이 돌렸을때

In [18]:
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X_train = pd.DataFrame(x_train)
X_test = pd.DataFrame(x_test)

pipe = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", SVC())
])

pipe.fit(x_train, y_train)

y_pred = pipe.predict(x_test)

accuracy_score(y_pred, y_test)

0.9365853658536586

In [19]:
pipe['classifier'].get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

## Scikit-Optimize

Scikit-learn 의 머신러닝 모델들의 파라미터 자동 튜닝 패키지

https://scikit-optimize.github.io/stable/

In [22]:
from skopt import BayesSearchCV

pipe = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", SVC())
])

opt = BayesSearchCV(
    pipe,
    {
        'classifier__C': (1e-1, 1e+1, 'log-uniform'),
        'classifier__gamma': (1e-6, 1e+1, 'log-uniform'),
        'classifier__degree': (1, 8),  # integer valued parameter
        'classifier__kernel': ['linear', 'poly', 'rbf'],  # categorical parameter
    },
    n_iter=8,
    cv=3
)

opt.fit(x_train, y_train)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(x_test, y_test))

val. score: 0.9670686167036533
test score: 1.0


In [23]:
opt.best_params_

OrderedDict([('classifier__C', 0.3190547286265487),
             ('classifier__degree', 4),
             ('classifier__gamma', 0.632736824293571),
             ('classifier__kernel', 'poly')])

In [25]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X_train = pd.DataFrame(x_train)
X_test = pd.DataFrame(x_test)

pipe = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", XGBClassifier())
])

pipe.fit(x_train, y_train)

y_pred = pipe.predict(x_test)

accuracy_score(y_pred, y_test)

1.0

In [26]:
pipe['classifier'].get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': 0.5,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': 0,
 'gpu_id': -1,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.300000012,
 'max_bin': 256,
 'max_cat_to_onehot': 4,
 'max_delta_step': 0,
 'max_depth': 6,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 100,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'sampling_method': 'uniform',
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}