In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
data = pd.read_csv('heart_2020_cleaned.csv')
data.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
data.shape

(303, 14)

In [6]:
data.duplicated().sum()

1

In [7]:
data = data.drop_duplicates().reset_index(drop=True)

In [8]:
data.isna().sum()

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64

In [9]:
for feature in data.columns:  
    print(feature)
    print(data[feature].unique(),"\n") 

age
[63 37 41 56 57 44 52 54 48 49 64 58 50 66 43 69 59 42 61 40 71 51 65 53
 46 45 39 47 62 34 35 29 55 60 67 68 74 76 70 38 77] 

sex
[1 0] 

cp
[3 2 1 0] 

trtbps
[145 130 120 140 172 150 110 135 160 105 125 142 155 104 138 128 108 134
 122 115 118 100 124  94 112 102 152 101 132 148 178 129 180 136 126 106
 156 170 146 117 200 165 174 192 144 123 154 114 164] 

chol
[233 250 204 236 354 192 294 263 199 168 239 275 266 211 283 219 340 226
 247 234 243 302 212 175 417 197 198 177 273 213 304 232 269 360 308 245
 208 264 321 325 235 257 216 256 231 141 252 201 222 260 182 303 265 309
 186 203 183 220 209 258 227 261 221 205 240 318 298 564 277 214 248 255
 207 223 288 160 394 315 246 244 270 195 196 254 126 313 262 215 193 271
 268 267 210 295 306 178 242 180 228 149 278 253 342 157 286 229 284 224
 206 167 230 335 276 353 225 330 290 172 305 188 282 185 326 274 164 307
 249 341 407 217 174 281 289 322 299 300 293 184 409 259 200 327 237 218
 319 166 311 169 187 176 241 131] 

fbs
[1 

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       302 non-null    int64  
 1   sex       302 non-null    int64  
 2   cp        302 non-null    int64  
 3   trtbps    302 non-null    int64  
 4   chol      302 non-null    int64  
 5   fbs       302 non-null    int64  
 6   restecg   302 non-null    int64  
 7   thalachh  302 non-null    int64  
 8   exng      302 non-null    int64  
 9   oldpeak   302 non-null    float64
 10  slp       302 non-null    int64  
 11  caa       302 non-null    int64  
 12  thall     302 non-null    int64  
 13  output    302 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.2 KB


In [11]:
binary_columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 
          'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']
for column in binary_columns:
    data[column] = data[column].replace({'No': 0, 'Yes': 1})

KeyError: 'HeartDisease'

In [None]:
def convert_range_to_mean(range_str):
    if 'or older' in range_str:
        return int(range_str.split()[0])
    else:
        range_values = range_str.split('-')
        return (int(range_values[0]) + int(range_values[1])) / 2

data['AgeCategory'] = data['AgeCategory'].apply(convert_range_to_mean)

In [None]:
from sklearn.preprocessing import MinMaxScaler
data_numeric_columns = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime','AgeCategory']
scaler = MinMaxScaler()
data[data_numeric_columns] = scaler.fit_transform(data[data_numeric_columns])

In [None]:
GenHealth_mapping = {'Poor': 1, 'Fair': 2, 'Good': 3, 'Very good': 4, 'Excellent': 5}
data['GenHealth'] = data['GenHealth'].map(GenHealth_mapping)

In [None]:
cols = ['Diabetic','Sex', 'Race']
data[cols] = data[cols].apply(LabelEncoder().fit_transform)

In [None]:
data.head(10)

In [None]:
x = data.drop('HeartDisease', axis=1).values
y = data['HeartDisease'].values

In [None]:
labels = ['0=Non Heart Disease', '1=Heart Disease']
colors = ['royalblue','red']
values = np.bincount(y.astype(int)).tolist()
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=0.5)])
fig.update_traces(hoverinfo='label+value', textfont_size=15, marker=dict(colors=colors))
fig.update_layout(annotations=[dict(text='HeartDisease', x=0.5, y=0.5, font_size=15, showarrow=False)])
fig.show()

In [None]:
print("採樣前資料分布:")
print(data['HeartDisease'].value_counts())

In [None]:
from imblearn.over_sampling import RandomOverSampler
sm = RandomOverSampler(random_state=42)
x_resampled, y_resampled = sm.fit_resample(x, y)

In [None]:
labels = ['0=Non Heart Disease', '1=Heart Disease']
values = np.bincount(y_resampled.astype(int)).tolist()
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=0.5)])
fig.update_traces(hoverinfo='label+value', textfont_size=15, marker=dict(colors=colors))
fig.update_layout(annotations=[dict(text='HeartDisease', x=0.5, y=0.5, font_size=15, showarrow=False)])
fig.show()

In [None]:
print("採樣後資料分布:")
print(pd.Series(y_resampled).value_counts())

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.3, random_state=42)

combined_train_df = pd.concat([pd.DataFrame(y_train), pd.DataFrame(x_train)], axis=1)
combined_test_df = pd.concat([pd.DataFrame(y_test), pd.DataFrame(x_test)], axis=1)



columns = ['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke', 
           'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory', 
           'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime', 
           'Asthma', 'KidneyDisease', 'SkinCancer']

combined_train_df.columns = columns
combined_test_df.columns = columns

combined_train_df.to_csv('combined_train_data.csv', index=False, columns=columns)
combined_test_df.to_csv('combined_test_data.csv', index=False, columns=columns)

In [None]:
from sklearn.metrics import matthews_corrcoef
cols = data.columns
corr = []
for col in cols:
    if col in binary_columns:
        corr.append(matthews_corrcoef(data['HeartDisease'], data[col]))
    else:
        corr.append(data['HeartDisease'].corr(data[col]))
correlation = pd.DataFrame(list(zip(cols, corr)), columns=['Variables','Corr_with_HeartDisease'])
correlation.drop(index=correlation.index[:1], axis=0, inplace=True)
correlation

In [None]:
correlation['colors'] = ['#F0073B' if float(x) <= 0 else '#CB3579' for x in correlation['Corr_with_HeartDisease']]
correlation = correlation.sort_values(ascending=True, by=['Corr_with_HeartDisease']) 
plt.figure(figsize=(14,10), dpi=80)
plt.hlines(y=correlation.Variables, xmin=0, xmax=correlation.Corr_with_HeartDisease, color=correlation.colors,  linewidth=5)
plt.grid()
plt.show()

In [None]:
accuracy = []
precision = []
recall = []
f1 = []
auc = []
feature_selection_count = Counter()

def evaluate_model(model, y_true, y_pred):
    accuracy.append(accuracy_score(y_true, y_pred))
    precision.append(precision_score(y_true, y_pred))
    recall.append(recall_score(y_true, y_pred))
    f1.append(f1_score(y_true, y_pred))
    auc.append(roc_auc_score(y_true, y_pred))

In [None]:
xgb = XGBClassifier(n_estimators= 50 , max_depth= 3 , learning_rate = 0.1)

max_features_range = range(1, 18)
best_max_features = None
best_score = 0


for max_features in max_features_range:
    selector = SelectFromModel(estimator=xgb, threshold=-np.inf, max_features=max_features)
    selector.fit(x_train, y_train)
    x_train_selected = selector.transform(x_train)
    
    scores = cross_val_score(XGBClassifier(n_estimators= 50 , max_depth= 3 , learning_rate = 0.1), 
                             x_train_selected, y_train, cv=5, scoring='balanced_accuracy')
    mean_score = scores.mean()
    print(max_features,'features',' = ',mean_score)
    if mean_score > best_score:
        best_score = mean_score
        best_max_features = max_features

print("最佳的 max_features 值:", best_max_features)
print("對應的交叉驗證分數:", best_score)


selector = SelectFromModel(estimator=xgb, threshold=-np.inf, max_features=best_max_features)
selector.fit(x_train, y_train)


print()

selected_features = np.array(data.drop('HeartDisease', axis=1).columns)[selector.get_support()]
feature_selection_count.update(selected_features)
print("選擇的特徵:", selected_features)

print()

x_train_selected = selector.transform(x_train)
x_test_selected = selector.transform(x_test)

xgb.fit(x_train_selected, y_train)


y_pred_prob = xgb.predict_proba(x_test_selected)[:, 1]
threshold = 0.45
y_pred = (y_pred_prob >= threshold).astype(int)

evaluate_model(xgb, y_test, y_pred)
print(f'Accuracy_score: {accuracy_score(y_test, y_pred)}')
print(f'Precision_score: {precision_score(y_test, y_pred)}')
print(f'Recall_score: {recall_score(y_test, y_pred)}')
print(f'F1-score: {f1_score(y_test, y_pred)}')

print()

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(7,5))
plt.title('XGBClassifier')
ax = sns.heatmap(cm/np.sum(cm),fmt='.2%', annot=True, cmap='Blues')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');
ax.xaxis.set_ticklabels(['No HeartDisease','HeartDisease'],fontsize = 15)
ax.yaxis.set_ticklabels(['No HeartDisease','HeartDisease'],fontsize = 15)
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver = "liblinear")

best_max_features = None
best_score = 0

for max_features in max_features_range:
    selector = SelectFromModel(estimator=logreg, threshold=-np.inf, max_features=max_features)
    selector.fit(x_train, y_train)
    x_train_selected = selector.transform(x_train)
    
    scores = cross_val_score(LogisticRegression(solver = "liblinear"), 
                             x_train_selected, y_train, cv=5, scoring='balanced_accuracy')
    mean_score = scores.mean()
    print(max_features,'features',' = ',mean_score)
    if mean_score > best_score:
        best_score = mean_score
        best_max_features = max_features

print("最佳的 max_features 值:", best_max_features)
print("對應的交叉驗證分數:", best_score)


selector = SelectFromModel(estimator=logreg, threshold=-np.inf, max_features=best_max_features)
selector.fit(x_train, y_train)

print()

selected_features = np.array(data.drop('HeartDisease', axis=1).columns)[selector.get_support()]
feature_selection_count.update(selected_features)
print("選擇的特徵:", selected_features)

print()

x_train_selected = selector.transform(x_train)
x_test_selected = selector.transform(x_test)

logreg.fit(x_train_selected, y_train)


y_pred_prob = logreg.predict_proba(x_test_selected)[:, 1]
threshold = 0.4
y_pred = (y_pred_prob >= threshold).astype(int)

evaluate_model(logreg, y_test, y_pred)
print(f'Accuracy_score: {accuracy_score(y_test, y_pred)}')
print(f'Precision_score: {precision_score(y_test, y_pred)}')
print(f'Recall_score: {recall_score(y_test, y_pred)}')
print(f'F1-score: {f1_score(y_test, y_pred)}')

print()

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(7,5))
plt.title('LogisticRegression')
ax = sns.heatmap(cm/np.sum(cm),fmt='.2%', annot=True, cmap='Blues')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');
ax.xaxis.set_ticklabels(['No HeartDisease','HeartDisease'],fontsize = 15)
ax.yaxis.set_ticklabels(['No HeartDisease','HeartDisease'],fontsize = 15)
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=50,max_depth = 3)

best_max_features = None
best_score = 0

for max_features in max_features_range:
    selector = SelectFromModel(estimator=gbc, threshold=-np.inf, max_features=max_features)
    selector.fit(x_train, y_train)
    x_train_selected = selector.transform(x_train)

    scores = cross_val_score(GradientBoostingClassifier(n_estimators=50,max_depth = 3), 
                             x_train_selected, y_train, cv=5, scoring='balanced_accuracy')
    mean_score = scores.mean()
    print(max_features,'features',' = ',mean_score)
    if mean_score > best_score:
        best_score = mean_score
        best_max_features = max_features

print("最佳的 max_features 值:", best_max_features)
print("對應的交叉驗證分數:", best_score)


selector = SelectFromModel(estimator=gbc, threshold=-np.inf, max_features=best_max_features)
selector.fit(x_train, y_train)

print()

selected_features = np.array(data.drop('HeartDisease', axis=1).columns)[selector.get_support()]
feature_selection_count.update(selected_features)
print("選擇的特徵:", selected_features)

print()

x_train_selected = selector.transform(x_train)
x_test_selected = selector.transform(x_test)

gbc.fit(x_train_selected, y_train)


y_pred_prob = gbc.predict_proba(x_test_selected)[:, 1]
threshold = 0.4
y_pred = (y_pred_prob >= threshold).astype(int)

evaluate_model(gbc, y_test, y_pred)
print(f'Accuracy_score: {accuracy_score(y_test, y_pred)}')
print(f'Precision_score: {precision_score(y_test, y_pred)}')
print(f'Recall_score: {recall_score(y_test, y_pred)}')
print(f'F1-score: {f1_score(y_test, y_pred)}')

print()

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(7,5))
plt.title('GradientBoostingClassifier')
ax = sns.heatmap(cm/np.sum(cm),fmt='.2%', annot=True, cmap='Blues')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');
ax.xaxis.set_ticklabels(['No HeartDisease','HeartDisease'],fontsize = 15)
ax.yaxis.set_ticklabels(['No HeartDisease','HeartDisease'],fontsize = 15)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=50,max_depth = 3)

best_max_features = None
best_score = 0

for max_features in max_features_range:
    selector = SelectFromModel(estimator=rfc, threshold=-np.inf, max_features=max_features)
    selector.fit(x_train, y_train)
    x_train_selected = selector.transform(x_train)
    
    scores = cross_val_score(RandomForestClassifier(n_estimators=50,max_depth = 3), 
                             x_train_selected, y_train, cv=5, scoring='balanced_accuracy')
    mean_score = scores.mean()
    print(max_features,'features',' = ',mean_score)
    if mean_score > best_score:
        best_score = mean_score
        best_max_features = max_features

print("最佳的 max_features 值:", best_max_features)
print("對應的交叉驗證分數:", best_score)


selector = SelectFromModel(estimator=rfc, threshold=-np.inf, max_features=best_max_features)
selector.fit(x_train, y_train)

print()

selected_features = np.array(data.drop('HeartDisease', axis=1).columns)[selector.get_support()]
feature_selection_count.update(selected_features)
print("選擇的特徵:", selected_features)

print()

x_train_selected = selector.transform(x_train)
x_test_selected = selector.transform(x_test)

rfc.fit(x_train_selected, y_train)


y_pred_prob = rfc.predict_proba(x_test_selected)[:, 1]
threshold = 0.47
y_pred = (y_pred_prob >= threshold).astype(int)

evaluate_model(rfc, y_test, y_pred)
print(f'Accuracy_score: {accuracy_score(y_test, y_pred)}')
print(f'Precision_score: {precision_score(y_test, y_pred)}')
print(f'Recall_score: {recall_score(y_test, y_pred)}')
print(f'F1-score: {f1_score(y_test, y_pred)}')

print()

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(7,5))
plt.title('RandomForestClassifier')
ax = sns.heatmap(cm/np.sum(cm),fmt='.2%', annot=True, cmap='Blues')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');
ax.xaxis.set_ticklabels(['No HeartDisease','HeartDisease'],fontsize = 15)
ax.yaxis.set_ticklabels(['No HeartDisease','HeartDisease'],fontsize = 15)
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier

dtc =  DecisionTreeClassifier(max_depth= 8)

best_max_features = None
best_score = 0

for max_features in max_features_range:
    selector = SelectFromModel(estimator=dtc, threshold=-np.inf, max_features=max_features)
    selector.fit(x_train, y_train)
    x_train_selected = selector.transform(x_train)
    
    scores = cross_val_score(DecisionTreeClassifier(max_depth= 8), 
                             x_train_selected, y_train, cv=5, scoring='balanced_accuracy')
    mean_score = scores.mean()
    print(max_features,'features',' = ',mean_score)
    if mean_score > best_score:
        best_score = mean_score
        best_max_features = max_features

print("最佳的 max_features 值:", best_max_features)
print("對應的交叉驗證分數:", best_score)


selector = SelectFromModel(estimator=dtc, threshold=-np.inf, max_features=best_max_features)
selector.fit(x_train, y_train)

print()

selected_features = np.array(data.drop('HeartDisease', axis=1).columns)[selector.get_support()]
feature_selection_count.update(selected_features)
print("選擇的特徵:", selected_features)

print()

x_train_selected = selector.transform(x_train)
x_test_selected = selector.transform(x_test)

dtc.fit(x_train_selected, y_train)


y_pred = dtc.predict(x_test_selected)
evaluate_model(dtc, y_test, y_pred)
print(f'Accuracy_score: {accuracy_score(y_test, y_pred)}')
print(f'Precision_score: {precision_score(y_test, y_pred)}')
print(f'Recall_score: {recall_score(y_test, y_pred)}')
print(f'F1-score: {f1_score(y_test, y_pred)}')

print()

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(7,5))
plt.title('DecisionTreeClassifier')
ax = sns.heatmap(cm/np.sum(cm),fmt='.2%', annot=True, cmap='Blues')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');
ax.xaxis.set_ticklabels(['No HeartDisease','HeartDisease'],fontsize = 15)
ax.yaxis.set_ticklabels(['No HeartDisease','HeartDisease'],fontsize = 15)
plt.show()

In [None]:
from sklearn.ensemble import AdaBoostClassifier

Ada = AdaBoostClassifier(n_estimators = 25)

best_max_features = None
best_score = 0

for max_features in max_features_range:
    selector = SelectFromModel(estimator=Ada, threshold=-np.inf, max_features=max_features)
    selector.fit(x_train, y_train)
    x_train_selected = selector.transform(x_train)
    
    scores = cross_val_score(AdaBoostClassifier(n_estimators = 25), 
                             x_train_selected, y_train, cv=5, scoring='balanced_accuracy')
    mean_score = scores.mean()
    print(max_features,'features',' = ',mean_score)
    if mean_score > best_score:
        best_score = mean_score
        best_max_features = max_features

print("最佳的 max_features 值:", best_max_features)
print("對應的交叉驗證分數:", best_score)


selector = SelectFromModel(estimator=Ada, threshold=-np.inf, max_features=best_max_features)
selector.fit(x_train, y_train)

print()

selected_features = np.array(data.drop('HeartDisease', axis=1).columns)[selector.get_support()]
feature_selection_count.update(selected_features)
print("選擇的特徵:", selected_features)

print()

x_train_selected = selector.transform(x_train)
x_test_selected = selector.transform(x_test)

Ada.fit(x_train_selected, y_train)


y_pred_prob = Ada.predict_proba(x_test_selected)[:, 1]
threshold = 0.498
y_pred = (y_pred_prob >= threshold).astype(int)

evaluate_model(Ada, y_test, y_pred)
print(f'Accuracy_score: {accuracy_score(y_test, y_pred)}')
print(f'Precision_score: {precision_score(y_test, y_pred)}')
print(f'Recall_score: {recall_score(y_test, y_pred)}')
print(f'F1-score: {f1_score(y_test, y_pred)}')

print()

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(7,5))
plt.title('AdaBoostClassifier')
ax = sns.heatmap(cm/np.sum(cm),fmt='.2%', annot=True, cmap='Blues')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');
ax.xaxis.set_ticklabels(['No HeartDisease','HeartDisease'],fontsize = 15)
ax.yaxis.set_ticklabels(['No HeartDisease','HeartDisease'],fontsize = 15)
plt.show()

In [None]:
results_df = pd.DataFrame({
    'Model': ['XGBClassifier', 'LogisticRegression', 'GradientBoostingClassifier', 'RandomForestClassifier', 'DecisionTreeClassifier', 'AdaBoostClassifier'],
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-score': f1,
    'AUC': auc
})

styled_df = results_df
styled_df

In [None]:
sorted_feature_selection = feature_selection_count.most_common()
print("每個特徵被選擇的次數:")
for feature, count in sorted_feature_selection:
    print(f"{feature}: {count}")