# 심장 질환 예측 (heart disease prediction)

In [21]:
import warnings

warnings.filterwarnings('ignore')

In [22]:
import pandas as pd

# Load the data set from the working directory and preview its first five rows.
df_heart = pd.read_csv(filepath_or_buffer='heart_disease.csv')
df_heart.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
from sklearn.model_selection import train_test_split

# The last column is used as the label; every column before it is a feature.
X, y = df_heart.iloc[:, :-1], df_heart.iloc[:, -1]
# Fixed seed so the train/test partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Baseline: 5-fold cross-validation of a decision tree with default
# hyperparameters, scored on the full data set.
model = DecisionTreeClassifier(random_state=2)
scores = cross_val_score(estimator=model, X=X, y=y, cv=5)
print('정확도:', scores.round(2))
print('정확도 평균: %0.2f' % scores.mean())

정확도: [0.74 0.85 0.77 0.73 0.7 ]
정확도 평균: 0.76


In [8]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

def randomized_search_clf(params, runs=20, clf=DecisionTreeClassifier(random_state=2)):
  """Run a randomized hyperparameter search and report the winner's scores.

  Fits a 5-fold ``RandomizedSearchCV`` on the module-level ``X_train`` /
  ``y_train``, prints the best cross-validation score (labelled as the
  training score), then prints the accuracy of the best estimator on the
  module-level ``X_test`` / ``y_test``.

  Args:
    params: Parameter grid or distributions handed to ``RandomizedSearchCV``.
    runs: Number of parameter settings to sample (``n_iter``).
    clf: Estimator to tune.

  Returns:
    The best estimator found by the search.
  """
  search = RandomizedSearchCV(
      estimator=clf,
      param_distributions=params,
      n_iter=runs,
      cv=5,
      n_jobs=-1,
      random_state=2,
  )
  search.fit(X_train, y_train)

  # best_score_ comes from the search's cross-validation folds.
  print("훈련 점수: {:.3f}".format(search.best_score_))

  tuned = search.best_estimator_
  test_accuracy = accuracy_score(y_test, tuned.predict(X_test))
  print('테스트 점수: {:.3f}'.format(test_accuracy))
  return tuned

In [11]:
# First, broad random search over the main tree hyperparameters, using the
# helper's default number of sampled candidates.
randomized_search_clf(
    params={
        'criterion':['entropy', 'gini'],
        'splitter':['random', 'best'],
        'min_samples_split':[2, 3, 4, 5, 6, 8, 10],
        # Floats are interpreted as a fraction of the training samples.
        'min_samples_leaf':[1, 0.01, 0.02, 0.03, 0.04],
        'min_impurity_decrease':[0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],
        'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None],
        # 'sqrt' replaces 'auto': for DecisionTreeClassifier 'auto' meant
        # sqrt(n_features); it was deprecated in scikit-learn 1.1 and removed
        # in 1.3, where such candidates fail to fit.
        'max_features':['sqrt', 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],
        'max_depth':[None, 2,4,6,8],
        'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]
    })

훈련 점수: 0.798
테스트 점수: 0.855


DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=0.8,
                       max_leaf_nodes=45, min_samples_leaf=0.04,
                       min_samples_split=10, min_weight_fraction_leaf=0.05,
                       random_state=2)

In [15]:
# Second, narrower search around the values picked by the broad search,
# sampling 100 candidates; the winner is kept as `best_model`.
best_model = randomized_search_clf(
    params = {'max_depth': [None, 6, 7],
              # 'sqrt' replaces 'auto': for DecisionTreeClassifier 'auto'
              # meant sqrt(n_features); it was deprecated in scikit-learn 1.1
              # and removed in 1.3, where such candidates fail to fit.
              'max_features': ['sqrt', 0.78],
              'max_leaf_nodes': [45, None],
              'min_samples_leaf': [1, 0.035, 0.04, 0.045, 0.05],
              'min_samples_split': [2, 9, 10],
              'min_weight_fraction_leaf': [0.0, 0.05, 0.06, 0.07],
              },
    runs=100
    )

훈련 점수: 0.802
테스트 점수: 0.868


In [16]:
# 5-fold cross-validation of the tuned tree on the full data set.
scores = cross_val_score(estimator=best_model, X=X, y=y, cv=5)
print('정확도:', scores.round(2))
print('정확도 평균: %0.2f' % scores.mean())

정확도: [0.82 0.9  0.8  0.8  0.78]
정확도 평균: 0.82


In [23]:
# loguniform is imported from SciPy directly: sklearn.utils.fixes.loguniform
# was only a compatibility shim for old SciPy and is no longer available in
# recent scikit-learn releases. The distribution itself is the same.
from scipy.stats import loguniform, randint

# Sample hyperparameters from distributions instead of fixed candidate lists.
# NOTE(review): these ranges include values DecisionTreeClassifier rejects
# (max_leaf_nodes == 1, min_weight_fraction_leaf > 0.5). Such candidates fail
# to fit and, presumably via the default error_score, are scored as NaN; the
# warning about it is hidden by the blanket warning filter. The ranges are
# left as-is so the recorded results stay reproducible. Consider
# randint(2, 100) and loguniform(1e-5, 0.5) respectively.
params = {'max_depth': randint(1, 100),
          'max_leaf_nodes': randint(1, 100),
          'max_features': loguniform(1e-5, 1),
          'min_samples_split': loguniform(1e-5, 1),
          'min_samples_leaf': loguniform(1e-5, 1),
          'min_impurity_decrease': loguniform(1e-5, 1),
          'min_weight_fraction_leaf': loguniform(1e-5, 1)}

dtc = DecisionTreeClassifier(random_state=0)
# 100 sampled candidates, evaluated in parallel on the training split.
rs = RandomizedSearchCV(dtc, params, n_iter=100, n_jobs=-1, random_state=0)
rs.fit(X_train, y_train)

print('최상의 교차 검증 점수:', rs.best_score_)
print('최상의 매개변수:', rs.best_params_)

최상의 교차 검증 점수: 0.7932367149758455
최상의 매개변수: {'max_depth': 48, 'max_features': 0.43091880545542754, 'max_leaf_nodes': 68, 'min_impurity_decrease': 0.0013196080073784372, 'min_samples_leaf': 0.005935250363740932, 'min_samples_split': 0.00027243167437771866, 'min_weight_fraction_leaf': 0.03410935690756259}


In [24]:
# Refit the tuned tree on the complete data set (train and test rows together)
# before inspecting its feature importances.
best_model.fit(X=X, y=y)

DecisionTreeClassifier(max_depth=7, max_features=0.78, max_leaf_nodes=45,
                       min_samples_leaf=0.045, min_samples_split=9,
                       min_weight_fraction_leaf=0.06, random_state=2)

In [25]:
# Show the fitted tree's importance value for each feature, in column order of X.
best_model.feature_importances_ 

array([0.04826754, 0.04081653, 0.48409586, 0.00568635, 0.        ,
       0.        , 0.        , 0.00859483, 0.        , 0.02690379,
       0.        , 0.18069065, 0.20494446])

In [26]:
import operator

# Pair each feature name with its importance, then list the three largest.
feature_dict = {
    name: weight
    for name, weight in zip(X.columns, best_model.feature_importances_)
}
sorted(feature_dict.items(), key=lambda item: item[1], reverse=True)[:3]

[('cp', 0.4840958610240171),
 ('thal', 0.20494445570568706),
 ('ca', 0.18069065321397942)]

In [29]:
from sklearn.inspection import permutation_importance
# Permutation importance of the refit tree, computed on the same data it was
# fitted on; the three features with the largest mean importance are shown.
result = permutation_importance(best_model, X, y, n_jobs=-1, random_state=0)
feature_dict = {
    name: mean_drop
    for name, mean_drop in zip(X.columns, result.importances_mean)
}
sorted(feature_dict.items(), key=lambda item: item[1], reverse=True)[:3]

[('cp', 0.08976897689768981),
 ('thal', 0.08382838283828387),
 ('ca', 0.05940594059405944)]