In [None]:
# === 'Secret Sauce' CSS & Config File ===
# --- Libraries for 'Secret Sauce' ---
import sys
from IPython.display import display, HTML, Javascript

# ---'Secret Sauce' Function ---
def apply_config_file(kaggle_path, local_path, project_year, project_name):
    """Register the project's config directories on ``sys.path``.

    Two candidate directories are registered, one under ``kaggle_path`` and
    one under ``local_path``, so that a later ``import config_file`` can be
    resolved from whichever of them exists.

    Args:
        kaggle_path: Root directory used for the first candidate.
        local_path: Root directory used for the second candidate.
        project_year: Sub-directory name below ``config_file``.
        project_name: Sub-directory name below ``project_year``.

    Returns:
        None. ``sys.path`` is modified in place.
    """
    candidate_dirs = (
        f"{kaggle_path}/config_file/{project_year}/{project_name}/",
        f"{local_path}/config_file/{project_year}/{project_name}",
    )
    for config_dir in candidate_dirs:
        # Re-running the notebook cell must not keep growing sys.path
        # with duplicate entries.
        if config_dir not in sys.path:
            sys.path.append(config_dir)

def apply_css_file(kaggle_path, local_path, project_year, css_file_name):
    """Load the project's stylesheet and wrap it in a ``<style>`` element.

    The file under ``kaggle_path`` is tried first; when it cannot be opened
    the file under ``local_path`` is used instead.

    Args:
        kaggle_path: Root directory of the first candidate stylesheet.
        local_path: Root directory of the fallback stylesheet.
        project_year: Sub-directory name below ``css``.
        css_file_name: Stylesheet file name without the ``.css`` extension.

    Returns:
        An ``HTML`` display object containing the stylesheet text.

    Raises:
        OSError: If neither candidate file can be read (the error of the
            fallback location is raised).
    """
    css_paths = (
        f"{kaggle_path}/css/{project_year}/{css_file_name}.css",
        f"{local_path}/css/{project_year}/{css_file_name}.css",
    )
    last_error = None
    for css_path in css_paths:
        try:
            # The context manager closes the handle; explicit UTF-8 avoids
            # depending on the platform's default encoding.
            with open(css_path, "r", encoding="utf-8") as css_file:
                css_text = css_file.read()
        except OSError as error:
            # Only I/O failures trigger the fallback; other errors surface.
            last_error = error
            continue
        return HTML("<style>" + css_text + "</style>")
    raise last_error

# --- 'Secret Sauce' Variables ---
# Change Variables ~! HERE !~
kaggle_path = "/kaggle/input/caesarmario"
local_path = "config_kaggle"
project_year = "2023"
project_name = "heart_disease"
css_file_name = f"css_{project_name}"

# --- Apply Function & Import Config File ---
# The config directories are appended to sys.path first so that the star
# import below can be resolved.
apply_config_file(kaggle_path, local_path, project_year, project_name)
from config_file import *
# Left as the cell's final expression so the notebook renders the returned HTML.
apply_css_file(kaggle_path, local_path, project_year, css_file_name)

<!--- Project Title --->
<span class="dates">IV.XX.MMXXIII | @caesarmario</span><br>
<span class="title-normal">Listen to Your Heart: </span>
<span class="title-highlight"> A Disease Prediction</span><br>
<span class="subtitle">using Various Machine Learning Models</span>
<hr>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

# <div class="header1">1. | Introduction 👋</div>
<center>
    <img src="https://images.unsplash.com/photo-1628348070889-cb656235b4eb?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=1170&q=80" alt="Heart Disease" width="80%">
</center>

## <div class="header2">🤔 Dataset Problems</div>
<div class="explain-box">
    This dataset is taken from the <a style="color: #3D5A80" href="https://archive.ics.uci.edu/ml/datasets/heart+disease"><b>UCI machine learning website</b></a>. This dataset contains <b>medical information on patients and the diagnosis results</b> of whether the patient has heart disease. <mark>Machine learning models are necessary to determine whether a patient has heart disease and speed up the diagnostic process</mark> based on the medical information provided about that patient. The <b>variables that most influence</b> a patient to have heart disease will also be <b>explored more deeply</b> in this notebook.
</div>

## <div class="header2">📌 Notebook Objectives</div>
<div class="explain-box">
    This notebook <b>aims</b> to:
    <ul>
        <li><mark>Perform dataset exploration</mark> using various types of data visualization.</li>
        <li><mark>Build machine learning models</mark> that can predict patient status.</li>
        <li><mark>Export prediction result on test data</mark> into files.</li>
        <li><mark>Save/dump the complete machine learning pipeline</mark> for later usage.</li>
        <li><mark>Perform prediction on new example data</mark> given and <mark>export the prediction result</mark>.</li>
    </ul>
</div>

## <div class="header2">👨‍💻 Machine Learning Model</div>
<div class="explain-box">
    The <b>models</b> used in this notebook:
    <ol start="1">
        <li><b>Logistic Regression</b>,</li>
        <li><b>K-Nearest Neighbour (KNN)</b>,</li>
        <li><b>Support Vector Machine (SVM)</b>,</li>
        <li><b>Gaussian Naive Bayes</b>,</li>
        <li><b>Decision Tree</b>,</li>
        <li><b>Random Forest</b>,</li>
        <li><b>Extra Tree Classifier</b>,</li>
        <li><b>Gradient Boosting</b>, and</li>
        <li><b>AdaBoost</b>.</li>
    </ol>
</div>

## <div class="header2">🧾 Dataset Description</div>
<div class="explain-box">
    The following is the <b>structure of the dataset</b>.<br>
    
<table style="font-family: Open Sans; font-weight: 300; font-size: 12px; text-align: left; padding: 8px; border-collapse: collapse; width: 100%;">
  <thead>
    <tr>
      <th style="font-family: Open Sans; font-weight: 900; text-align: center; font-size: 14px; background-color: #FF5C8A">Variable Name</th>
      <th style="font-family: Open Sans; font-weight: 900; text-align: center; font-size: 14px; background-color: #FF5C8A">Description</th>
      <th style="font-family: Open Sans; font-weight: 900; text-align: center; font-size: 14px; background-color: #FF5C8A">Sample Data</th>
    </tr>
  </thead>
  <tbody>
    <tr>
        <td><b>Age</b></td>
        <td>Patient age <br> (in years)</td>
        <td>63; 37; ...</td>
    </tr>
    <tr>
        <td><b>Sex</b></td>
        <td>Gender of patient<br><br>0 = male<br>1 = female</td>
        <td>1; 0; ...</td>
    </tr>
    <tr>
        <td><b>cp</b></td>
        <td>Chest pain type<br><br>0 = typical angina<br>1 = atypical angina<br>2 = non-anginal pain<br>3 = asymptomatic</td>
        <td>3; 1; 2; ...</td>
    </tr>
    <tr>
        <td><b>trestbps</b></td>
        <td>Resting blood pressure <br> (in mm Hg)</td>
        <td>145; 130; ...</td>
    </tr>
    <tr>
        <td><b>chol</b></td>
        <td>Serum cholesterol <br> (in mg/dl)</td>
        <td>233; 250; ...</td>
    </tr>
    <tr>
        <td><b>fbs</b></td>
        <td>Fasting blood sugar > 120 mg/dl<br><br>0 = false<br>1 = true</td>
        <td>1; 0; ...</td>
    </tr>
    <tr>
        <td><b>restecg</b></td>
        <td>Resting electrocardiographic results<br><br>0 = normal<br>1 = having ST-T wave abnormality<br>2 = showing probable or definite left ventricular hypertrophy by Estes' criteria</td>
        <td>0; 1; ...</td>
    </tr>
    <tr>
        <td><b>thalach</b></td>
        <td>Maximum heart rate achieved </td>
        <td>150; 187; ...</td>
    </tr>
    <tr>
        <td><b>exang</b></td>
        <td>Exercise induced angina<br><br>0 = no<br>1 = yes</td>
        <td>1; 0; ...</td>
    </tr>
    <tr>
        <td><b>oldpeak</b></td>
        <td>ST depression induced by exercise relative to rest</td>
        <td>2.3; 3.5; ...</td>
    </tr>
    <tr>
        <td><b>slope</b></td>
        <td>The slope of the peak exercise ST segment<br><br>0 = upsloping<br>1 = flat<br>2 = downsloping</td>
        <td>0; 2; ...</td>
    </tr>
    <tr>
        <td><b>ca</b></td>
        <td>Number of major vessels (0-4) colored by fluoroscopy </td>
        <td>0; 3; ...</td>
    </tr>
    <tr>
        <td><b>thal</b></td>
        <td>Thalassemia<br><br>3 = normal<br>6 = fixed defect<br>7 = reversible defect</td>
        <td>1; 3; ...</td>
    </tr>
    <tr>
        <td><b>Target</b></td>
        <td>Target column<br><br>0 = not have heart disease<br>1 = have heart disease</td>
        <td>1; 0; ...</td>
    </tr>
    </tbody>
</table>
<hr>
<center>
    <span class="thanks-explain">📌 Like this notebook? You can support me by giving <mark><b>upvote</b></mark> 😆👍🔼</span><br>
    <span class="thanks-watermark">Follow me on other platforms: <a href="https://linktr.ee/caesarmario_">linktr.ee/caesarmario_</a></span><br>
    <span class="three-dots2">...</span><br>
    <span class="thanks-watermark"><u>Support me!</u></span><br>
    <span class="ko-fi">
        <a href='https://ko-fi.com/D1D3JU963' target='_blank'><img src='https://ko-fi.com/img/githubbutton_sm.svg' alt='Support me on Ko-fi Button'/></a><br>
    </span>
</center>
<hr>
</div>

# <div class="header1">2. | Installing and Importing Libraries 📚</div>
<div class="explain-box">
    <b>Installing and Importing libraries</b> that will be used in this notebook.
</div>

In [None]:
# --- Installing Libraries ---
!pip install ydata-profiling
!pip install pywaffle
!pip install highlight-text
!pip install Pillow

In [None]:
# --- Importing Libraries ---
import numpy as np
import pandas as pd
import ydata_profiling
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import warnings
import os
import yellowbrick
import joblib

from ydata_profiling import ProfileReport
from pywaffle import Waffle
from statsmodels.graphics.gofplots import qqplot
from PIL import Image
from highlight_text import fig_text
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score
from yellowbrick.classifier import PrecisionRecallCurve, ROCAUC, ConfusionMatrix
from yellowbrick.model_selection import LearningCurve, FeatureImportances
from yellowbrick.contrib.wrapper import wrap
from yellowbrick.style import set_palette

# <div class="header1">3. | Reading Dataset 👓</div>
<div class="explain-box">
    After importing libraries, <b>the dataset that will be used will be imported</b>.
</div>

In [None]:
# --- Importing Dataset ---
df = pd.read_csv("../input/heart-disease/heart.csv")

# --- Reading Train Dataset ---
print(clr.start+'.: Imported Dataset :.'+clr.end)
print(clr.color+'*' * 23)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0.
# Prefer Styler.hide(axis='index') and fall back only on older pandas.
# Both calls return the Styler, so the final expression is still displayed.
preview = df.head().style.background_gradient(cmap='Reds')
preview.hide(axis='index') if hasattr(preview, 'hide') else preview.hide_index()

# <div class="header1">4. | Initial Dataset Exploration 🔍</div>
<div class="explain-box">
    This section will focus on <b>initial data exploration of the dataset</b> with <u>Pandas Profiling</u> before pre-processing is performed. In addition, <b>the correlation between variables</b> will be examined as well.
</div>

In [None]:
# --- Dataset Report ---
# Report options are collected in one mapping so each setting sits on its own line.
profile_options = dict(
    title='Heart Disease Dataset Report',
    minimal=True,
    progress_bar=False,
    samples=None,
    correlations=None,
    interactions=None,
    explorative=True,
    dark_mode=True,
    notebook={'iframe':{'height': '600px'}},
    html={'style':{'primary_color': color_line}},
    missing_diagrams={'heatmap': False, 'dendrogram': False},
)
ProfileReport(df, **profile_options).to_notebook_iframe()

In [None]:
# --- Correlation Map Variables ---
suptitle = dict(x=0.1, y=1.01, fontsize=13, weight='heavy', ha='left', va='bottom', fontname=font_main)
title = dict(x=0.1, y=0.98, fontsize=8, weight='normal', ha='left', va='bottom', fontname=font_alt)
xy_label = dict(size=6)
highlight_textprops = [{'weight':'bold', 'color': colors[0]}, {'weight':'bold', 'color': colors[2]}]

# --- Correlation Map (Heatmap) ---
# The correlation matrix is computed once and reused for the mask and the plot.
corr_matrix = df.corr()
# Mask the upper triangle (diagonal included) so each pair is shown only once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap=color_map, linewidths=0.2, cbar=False, annot_kws={"size": 7}, rasterized=True, ax=ax)
# Re-apply the tick labels generated by the heatmap with the smaller font size.
xlabels = ax.get_xticklabels()
ylabels = ax.get_yticklabels()
ax.set_xticklabels(xlabels, rotation=0, **xy_label)
ax.set_yticklabels(ylabels, **xy_label)
ax.grid(False)
fig_text(s='Numerical Variables Correlation Map', **suptitle)
fig_text(s='<Chest pain type, max heart rate, and slope> positively correlate with <target> variables.', highlight_textprops=highlight_textprops, **title)
plt.tight_layout(rect=[0, 0.04, 1, 1.01])
plt.gcf().text(0.85, 0.03, 'kaggle.com/caesarmario', style='italic', fontsize=5)
plt.show()

<div class="explain-box">
    From <b>dataset report</b> and <b>correlation matrix</b>, it can be <mark><b>concluded</b></mark> that:
    <blockquote style="color: #000000;">
        <ul>
            <li>There are <mark>no missing values</mark> detected in the dataset. In addition, it also can be seen that <b>the number of categorical columns is more than the numerical columns</b>.</li>
            <li>As can be seen from the profiling report, <mark>the number of male patients is greater than the number of female patients</mark>. In addition, <b>chest pain type 0 (typical angina) is higher than other types</b>. <mark>Most of the patients in the dataset had fasting blood sugar that was less than 120 mg/dl</mark>. <b>The number of resting electrocardiographic types 1 (having ST-T wave abnormality) and 0 (normal) is more than type 2 (definite left ventricular hypertrophy)</b>. Moreover, <mark>patients who don't have exercise-induced angina have a higher number</mark>. <b>The number of patients with flat and downsloping slopes is higher</b> than the number with an upsloping slope.</li>
            <li>Furthermore, <mark>patients with 0 major vessels are more numerous than those with major vessels</mark>. <b>Patients with fixed defect thalassemia have the highest distribution compared to others</b>. <mark>The total number of patients with heart disease is higher</mark> than those without heart disease.</li>
            <li><mark>The age, resting blood pressure, cholesterol, and max. heart rate columns lack variation</mark> since they have a <b>low standard deviation</b>.</li>
            <li><mark>The age column has a normal distribution</mark> based on the histogram and skewness value. However, <b>the resting blood pressure column has a moderately right-skewed distribution and the serum cholesterol and oldpeak columns have a highly right-skewed distribution</b>. On the other hand, <mark>the max. heart rate column has a moderately left-skewed distribution</mark>. Since some columns are moderately to highly left or right-skewed, <b>some outliers are detected at the distribution tail</b>.</li>
            <li><mark>The age, resting blood pressure, max. heart rate and oldpeak columns</mark> have a kurtosis value of less than 3, which indicates that the column is <mark>platykurtic</mark>. Meanwhile, <b>the serum cholesterol column</b> has a kurtosis value of more than 3, which indicates that the column is <b>leptokurtic</b>.<br>
                <blockquote><span style="font-size: 11px;">📌 <mark>Low standard deviation</mark> means data are <mark>clustered around the mean</mark> (lack of variation), and high standard deviation indicates data are more spread out (more variation).</span></blockquote>
                <blockquote><span style="font-size: 11px;">📌 If skewness is <b>less than -1 or greater than 1</b>, the distribution is <mark>highly skewed</mark>. If skewness is <b>between -1 and -0.5 or between 0.5 and 1</b>, the distribution is <mark>moderately skewed</mark>. If skewness is <b>between -0.5 and 0.5</b>, the distribution is <mark>approximately symmetric</mark>.</span></blockquote>
                <blockquote><span style="font-size: 11px;">📌 <mark>Kurtosis</mark> values are used to show the <mark>tailedness of a column</mark>. The value for a normal distribution (mesokurtic) should be equal to 3. If the kurtosis value is more than 3, it is called leptokurtic. Meanwhile, if the kurtosis value is less than 3, then it is called platykurtic.</span></blockquote>
            </li>
            <li><b>The mean age of the patients in the dataset was 54.36 years old</b>, with the most senior patient being 77 years old and the youngest being 29 years old. <b>The average resting blood pressure in the dataset is 131.62</b>, where the highest resting blood pressure is 200, and the minimum is 94 (generally, <b>the ideal blood pressure ranges from 90 to 120</b>).</li>
            <li><b>The mean serum cholesterol was 246.26</b>, with a maximum of 564 and a minimum of 126. In addition, <b>the patient's average max. heart rate in the dataset was 149.64</b>, with a minimum of 71 and a maximum of 202. <b>The patient's mean oldpeak was 1.03</b>, with a minimum of 0 and a maximum of 6.2.</li>
            <li>According to the correlation between variables, it can be seen that <b>chest pain type, max. heart rate, and slope have a high positive correlation with the target variable</b>. However, <b>exang, oldpeak, and thalassemia negatively correlate with the target variable</b>.</li>
        </ul>
    </blockquote>
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

# <div class="header1">5. | EDA 📈</div>
<div class="explain-box">
    This section will perform some <b>EDA</b> to get more insights about dataset.<br>
</div>

## <div class="header2">5.1 | Disease Distribution based on Chest Pain Type in Each Gender</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- EDA 1 Dataframes ---
# Count patients for every (sex, target, cp) combination. Reindexing over the
# full product of observed values keeps one row per chest pain type in
# ascending `cp` order and fills combinations that are absent from the data
# with 0. Counting with groupby(...).cp.value_counts() instead would order each
# group by frequency, so bar positions would not line up with the
# 'Type 0..3' axis labels.
# NOTE(review): the UCI documentation encodes `sex` as 1 = male, 0 = female,
# while the subsets below treat 0 as male (following this notebook's dataset
# table) - confirm the intended encoding.
eda1_keys = ['sex', 'target', 'cp']
eda1_levels = [sorted(df[key].unique()) for key in eda1_keys]
eda1_index = pd.MultiIndex.from_product(eda1_levels, names=eda1_keys)
df_eda1 = df.groupby(eda1_keys).size().reindex(eda1_index, fill_value=0).reset_index(name='total')
df_eda1_mns = df_eda1.query('sex == 0 & target == 0')
df_eda1_ms = df_eda1.query('sex == 0 & target == 1')
df_eda1_fns = df_eda1.query('sex == 1 & target == 0')
df_eda1_fs = df_eda1.query('sex == 1 & target == 1')

# --- EDA 1: Variables ---
y = np.arange(len(eda1_levels[2]))
x_ticks = list(np.arange(-80, 60, 20))
# Negative ticks belong to the mirrored (left) side; show them unsigned.
x_labels = [str(tick).strip("-") for tick in x_ticks]
y_ticks = list(np.arange(0, 4, 1))
labels_pain_type = [f'Type {cp}' for cp in eda1_levels[2]]
labels_legend = ['Not Sick', 'Sick']
bar_height = 0.35
bar_style = dict(zorder=3, edgecolor='black', linewidth=0.5, alpha=0.85)
cnt_label = dict(fontsize=7, horizontalalignment='center', verticalalignment='center')
axvspan = dict(alpha=0.2, zorder=2)
tick_params = dict(length=3, width=1, color=color_line)
xy_label = dict(fontweight='bold', fontsize=8)
suptitle = dict(x=0.16, y=0.96, fontsize=13, weight='heavy', ha='left', va='bottom', fontname=font_main)
title = dict(x=0.16, y=0.93, fontsize=8, weight='normal', ha='left', va='bottom', fontname=font_alt)
highlight_textprops = [{'weight':'bold', 'color': colors[0]}, {'weight':'bold', 'color': colors[5]}]

# --- Display EDA 1 ---
fig, ax = plt.subplots(figsize=(9, 5))
# sex == 0 bars extend to the right; sex == 1 bars are negated to extend left.
bar_mns = plt.barh(y+bar_height, df_eda1_mns['total'], color=colors[3], height=bar_height, **bar_style) # hatch='//'
bar_ms = plt.barh(y, df_eda1_ms['total'], color=colors[4], height=bar_height, **bar_style)
bar_fns = plt.barh(y+bar_height, df_eda1_fns['total']*-1, color=colors[3], height=bar_height, **bar_style)
bar_fs = plt.barh(y, df_eda1_fs['total']*-1, color=colors[1], height=bar_height, **bar_style)
ax.set_yticks(y + bar_height / 2)
ax.set_yticklabels(labels_pain_type, fontsize=7)
# Annotate each bar with its (unsigned) count: centred inside long bars,
# just beyond the tip of short ones. Dedicated names are used for the
# rectangle origin so the bar positions in `y` are not overwritten.
for rect in ax.patches:
    width, height = rect.get_width(), rect.get_height()
    rect_x, rect_y = rect.get_xy()
    count = abs(width)
    if count > 10:
        label_x = rect_x + width / 2
    else:
        label_x = rect_x + width + (1.5 if width >= 0 else -1.5)
    ax.text(label_x, rect_y + height / 2, '{:.0f}'.format(count), **cnt_label)
plt.xticks(fontsize=7, ticks=x_ticks, labels=x_labels)
plt.xlabel('\nTotal', **xy_label)
plt.ylabel('Chest Pain Type\n', **xy_label)
plt.grid(axis='y', alpha=0, zorder=2)
plt.grid(axis='x', which='major', alpha=0.3, color=color_grid, linestyle='dotted', zorder=1)
plt.axvspan(-85, 0, color=colors[1], **axvspan)
plt.axvspan(40, 0, color=colors[4], **axvspan)
leg_fsick = mpatches.Patch(color=colors[1], label='Sick Female')
leg_msick = mpatches.Patch(color=colors[4], label='Sick Male')
leg_notsick = mpatches.Patch(color=colors[3], label='Not Sick')
plt.legend(handles=[leg_fsick, leg_msick, leg_notsick], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3, borderpad=3, frameon=False, fontsize=7, columnspacing=3)
plt.tick_params(bottom='on', **tick_params)
# Hide every spine, then bring back only the bottom one.
for spine in ax.spines.values():
    spine.set_color('None')
ax.spines['bottom'].set_visible(True)
ax.spines['bottom'].set_color(color_line)
fig_text(s='Disease Distribution based on Chest Pain Type in Each Gender', **suptitle)
fig_text(s="Chest pain types 1, 2, and 3 <have more sick patients> than those <who don't>.", highlight_textprops=highlight_textprops, **title)
plt.gcf().text(0.77, -0.09, 'kaggle.com/caesarmario', style='italic', fontsize=6)
plt.show()

<div class="explain-box">
    From the butterfly chart above and as previously mentioned, typical angina chest pain and female patients have a greater number in the dataset. When viewed in more detail, <mark>atypical angina, non-anginal pain, and asymptomatic chest pain have more sick patients than healthy male and female patients</mark>. In addition, <b>for patients with typical angina chest pain</b>, <mark>the ratio of male and female patients with heart disease is almost the same</mark>. However, <mark>the number of healthy female patients in that chest pain category is higher</mark> than healthy male patients.
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

## <div class="header2">5.2 | Maximum Heart Rate vs. Age based on Patients Sickness</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- EDA 2 Variables ---
scatter_style = dict(linewidth=0.65, edgecolor=scatter_color_edge, alpha=0.8)
sub_scatter_style_color = dict(s=5, alpha=0.65, linewidth=0.15, zorder=10, edgecolor=scatter_color_edge)
sub_scatter_style_grey = dict(s=5, alpha=0.3, linewidth=0.7, zorder=5, color=colors[4])
grid_style = dict(alpha=0.3, color=color_grid, linestyle='dotted', zorder=1)
# Defined here as well (same values as in the EDA 1 cell) so this cell does
# not depend on EDA 1 having been executed first.
tick_params = dict(length=3, width=1, color=color_line)
xy_label = dict(fontweight='bold', fontsize=9)
suptitle = dict(x=0.12, y=0.62, fontsize=16, weight='heavy', ha='left', va='bottom', fontname=font_main)
title = dict(x=0.12, y=0.605, fontsize=10, weight='normal', ha='left', va='bottom', fontname=font_alt)
color_pallete = [colors[5], colors[1]]
target_labels = [[0, 1], ['Not Sick', 'Sick']]
highlight_textprops = [{'weight':'bold', 'color': colors[5]}, {'weight':'bold', 'color': colors[1]}, {'weight':'bold', 'color': colors[1]}]
highlight_mean = [{'fontsize':7, 'color': 'black'}, {'fontsize':8, 'weight':'bold', 'color': colors[5]}]
sub_axes = [None] * 2

# --- EDA 2 Dataframe & Figure Settings ---
df_eda2 = df[['target', 'age', 'thalach']]
age_mean = df_eda2.age.mean()
thalach_mean = df_eda2.thalach.mean()
fig = plt.figure(figsize=(10, 16))
gs = fig.add_gridspec(2, 2)
ax = fig.add_subplot(gs[:2, :])
ax.set_aspect(1)

# --- EDA 2: Main Scatter Plot ---
# Dashed reference lines at the mean of each plotted variable.
ax.axvline(x=thalach_mean, linewidth=0.8, linestyle='--', color=colors[5], alpha=0.5)
ax.axhline(y=age_mean, linewidth=0.8, linestyle='--', color=colors[5], alpha=0.5)
# One scatter layer per target class, each in its own colour.
for target_value, target_color in zip(target_labels[0], color_pallete):
    df_eda2_temp = df_eda2[df_eda2['target'] == target_value]
    ax.scatter(df_eda2_temp['thalach'], df_eda2_temp['age'], s=65, color=target_color, **scatter_style)
# Axis styling does not depend on the target class, so it is applied once
# after both layers are drawn (the main axes is still the current axes here).
ax.set_xlabel('\nMaximum Heart Rate', **xy_label)
ax.set_ylabel('Age\n', **xy_label)
ax.grid(axis='y', which='major', **grid_style)
ax.grid(axis='x', which='major', **grid_style)
for spine in ax.spines.values():
    spine.set_color('None')
for spine in ['bottom', 'left']:
    ax.spines[spine].set_visible(True)
    ax.spines[spine].set_color(color_line)
plt.tick_params(bottom='on', left='on', **tick_params)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
fig_text(x=0.13, y=0.495, ha='left', s="<Age Mean:>\n<{:.2f}>".format(age_mean), highlight_textprops=highlight_mean)
fig_text(x=0.59, y=0.426, ha='left', s="<Max. Heart Rate Mean:>\n<{:.2f}>".format(thalach_mean), highlight_textprops=highlight_mean)

# --- EDA 2: Sub Plots ---
# One small panel per target class: the selected class in colour on top of
# the remaining patients in grey, plus a degree-1 polynomial fit line.
for idx, trgt in enumerate(target_labels[0]):
    is_target = df_eda2['target'] == trgt
    gs_thalach = df_eda2.loc[~is_target, 'thalach']
    gs_age = df_eda2.loc[~is_target, 'age']
    cs_thalach = df_eda2.loc[is_target, 'thalach']
    cs_age = df_eda2.loc[is_target, 'age']

    sub_ax = fig.add_subplot(gs[1, idx], aspect=1)
    sub_axes[idx] = sub_ax
    sub_ax.scatter(gs_thalach, gs_age, label=trgt, **sub_scatter_style_grey)
    sub_ax.scatter(cs_thalach, cs_age, color=color_pallete[idx], label=trgt, **sub_scatter_style_color)
    m, b = np.polyfit(cs_thalach, cs_age, deg=1)
    sub_ax.plot(cs_thalach, m*cs_thalach+b, linewidth=0.5, color=color_pallete[idx], linestyle='dotted')

    cnt = is_target.sum()
    sub_ax.set_title(f'{target_labels[1][trgt]} Patients - ({cnt})', fontsize=7, style='italic', weight='bold', ha='center')
    sub_ax.set_xticks([])
    sub_ax.set_yticks([])
    for spine in sub_ax.spines.values():
        spine.set_color('None')

# --- EDA 2 Titles & WM ---
fig_text(s='Maximum Heart Rate vs. Age based on Patients Sickness', **suptitle)
fig_text(s="Patients who tend to get <heart disease> are <less than 54 years old> and have <max. heart rate over 149>.", highlight_textprops=highlight_textprops, **title)
plt.gcf().text(0.77, 0.23, 'kaggle.com/caesarmario', style='italic', fontsize=7)
plt.show()

<div class="explain-box">
    The scatter plot above shows that <b>patients with and without heart disease are aged between 40 and 70 years old</b>. In addition, the spread of <b>patients' max. heart rate in the dataset ranges from 140 to 180</b>. When viewed in more detail, <mark>patients who tend to get heart disease have a max. heart rate over 149 and are under 54 years of age</mark>. In the scatter plot above, it can also be seen that <mark>age and max. heart rate have a negative correlation</mark>, especially in patients with heart disease. In addition, heart disease patients outnumber healthy ones.
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

## <div class="header2">5.3 | Fasting Blood Sugar Distribution by Resting Electrocardiographic Results</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- EDA 3 Dataframes ---
df_eda3 = df[['fbs', 'restecg']]
df_eda3 = pd.DataFrame(df_eda3.groupby(['fbs', 'restecg']).size().reset_index(name='total'))
df_eda3.loc[len(df_eda3.index)] = [1, 2, 0]
df_eda3_0 = df_eda3.query(f'restecg == 0').drop('restecg', axis=1)
df_eda3_1 = df_eda3.query(f'restecg == 1').drop('restecg', axis=1)
df_eda3_2 = df_eda3.query(f'restecg == 2').drop('restecg', axis=1)

# --- EDA 3 Variables ---
total_list = [df_eda3_0['total'], df_eda3_1['total'], df_eda3_2['total']]
suptitle = dict(x=0.5, y=0.94, fontsize=14, weight='heavy', ha='center', va='center', fontname=font_main)
exp_text = dict(x=0.5, y=0.17, fontsize=6, weight='normal', ha='center', va='center', textalign='center', fontname=font_alt)
highlight_explanation = [{'weight':'bold', 'color': colors[5]}, {'weight':'bold', 'color': colors[5]}, {'weight':'bold', 'color': colors[1]}]
l_120mg = mpatches.Patch(color=colors[5], label='< 120 mg/dl')
m_120mg = mpatches.Patch(color=colors[1], label='> 120 mg/dl')

# --- EDA 3 Functions ---
def display_eda3(subplot_num, restecg_type, total, colors, start_angle):
    """Draw one donut chart in slot `subplot_num` of a 1 x 3 subplot row.

    Args:
        subplot_num: 1-based position of the subplot within the row.
        restecg_type: Value shown in the "Type ..." label at the donut centre.
        total: Wedge sizes; `total.sum()` is printed as the patient count.
        colors: Wedge colors handed to `plt.pie`.
        start_angle: Start angle handed to `plt.pie`.
    """
    wedge_style = dict(alpha=0.85, edgecolor='black', linewidth=0.5)
    pct_style = {'fontsize': 7, 'fontname': font_alt}

    plt.subplot(1, 3, subplot_num)
    plt.tight_layout(rect=[0, 0, 1, 1.01])
    plt.pie(
        total,
        colors=colors,
        autopct='%.2f%%',
        pctdistance=0.65,
        startangle=start_angle,
        wedgeprops=wedge_style,
        textprops=pct_style,
    )

    # Centre labels: ECG type on top, patient count underneath.
    plt.text(0, 0.08, f"Type {restecg_type}", weight='bold', ha='center', fontsize=10, fontname=font_main)
    plt.text(0, -0.08, f"{total.sum()} patients", ha='center', fontsize=8, fontname=font_alt)

    # A white disc over the pie's middle turns it into a donut.
    hole = plt.Circle((0, 0), 0.85, fc='white', edgecolor='black', linewidth=0.5)
    plt.gcf().gca().add_artist(hole)

# --- Display EDA 3 ---
# One donut per resting ECG type; the shared legend is attached to the middle chart.
plt.figure(figsize=(9, 4))
for idx, total in enumerate(total_list):
    display_eda3(idx+1, idx, total, [colors[5], colors[1]], sample_num[idx])
    if idx == 1:
        plt.legend(
            handles=[l_120mg, m_120mg],
            loc='upper center',
            bbox_to_anchor=(0.5, 1.2),
            ncol=2,
            borderpad=3,
            frameon=False,
            fontsize=7,
            columnspacing=3,
        )

# Figure title, highlighted explanation line and watermark.
fig_text(s="Fasting Blood Sugar Distribution by Resting Electrocardiographic Results", **suptitle)
fig_text(
    s="<Resting electrocardiograph type 0 and 1 have higher distribution> compared to type 2.\n<Only type 0 and 1 have patients with fasting blood sugar over 120 mg/dl>, while <type 2 does not>",
    highlight_textprops=highlight_explanation,
    **exp_text,
)
plt.gcf().text(0.83, 0.07, 'kaggle.com/caesarmario', style='italic', fontsize=7)
plt.show();

<div class="explain-box">
    The donut chart above shows <b>resting electrocardiograph types 0 and 1 have almost the same number of patients</b>. In contrast, resting electrocardiograph type 2 has only four patients. In addition, <mark>resting electrocardiograph types 0 and 1 have patients with fasting blood sugar over 120 mg/dl</mark>, whereas resting electrocardiograph type 2 has no patients with fasting blood sugar over 120 mg/dl.
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

## <div class="header2">5.4 | Number of Major Vessels Distribution based on Exercise Induced Angina</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- EDA 4 Dataframes ---
# Patient counts per (exang, ca) pair, split into one frame per exang value.
# 'ca' is kept so the legend labels use the real vessel count rather than the
# row position, which would be wrong whenever a 'ca' value is absent for a group.
df_eda4 = df.groupby(['exang', 'ca']).size().reset_index(name='total')
df_eda4_0 = df_eda4.query('exang == 0').drop('exang', axis=1).reset_index(drop=True)
df_eda4_1 = df_eda4.query('exang == 1').drop('exang', axis=1).reset_index(drop=True)

# --- EDA 4 Variables ---
# Text styles for the figure title/subtitle and for each waffle's title and legend.
suptitle = dict(x=0.3, y=1.07, fontsize=48, weight='heavy', ha='center', va='center', fontname=font_main)
title = dict(x=0.3, y=1.01, fontsize=30, weight='normal', ha='center', va='bottom', fontname=font_alt)
title_pywaffle = dict(loc='left', fontsize=30, weight='bold', fontname=font_main)
legend_pywaffle = dict(loc='upper center', fontsize=22, ncol=5, borderpad=3, frameon=False, columnspacing=3)

# --- Display EDA 4 ---
# Two stacked waffle charts: top (211) for exang == 0, bottom (212) for exang == 1.
fig = plt.figure(FigureClass=Waffle,
    plots={211: {'values': df_eda4_0['total'],
                 'labels': [f"{n_vessels} Major Vessels - ({n_patients})" for n_vessels, n_patients in zip(df_eda4_0['ca'], df_eda4_0['total'])],
                 'legend': {'bbox_to_anchor': (0.5, 0.05), **legend_pywaffle},
                 'title': {'label': "Don't Have Exercise Induced Angina\n", **title_pywaffle}}
           , 212: {'values': df_eda4_1['total'],
                   'labels': [f"{n_vessels} Major Vessels - ({n_patients})" for n_vessels, n_patients in zip(df_eda4_1['ca'], df_eda4_1['total'])],
                   'legend': {'bbox_to_anchor': (1, 0.05), **legend_pywaffle},
                   'title': {'label': "Have Exercise Induced Angina\n", **title_pywaffle}}
          }, figsize=(50, 20), rows=7, colors=color_pywaffle, rounding_rule='ceil')
fig.suptitle('\nNumber of Major Vessels Distribution based on Exercise Induced Angina', **suptitle)
plt.gcf().text(s='The major vessel distribution proportion in patients with and without exercise-induced angina is almost the same.', **title)
plt.gcf().text(0.48, 0.01, 'kaggle.com/caesarmario', style='italic', fontsize=20)
fig.tight_layout()
plt.show();

<div class="explain-box">
    The waffle charts above show that <mark>the distribution of major vessels among patients with and without exercise-induced angina is almost the same</mark>. This can be seen by <b>comparing the number of patients for each major vessel count between the two groups</b>.
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

## <div class="header2">5.5 | Resting Blood Pressure Distribution based on Slope</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- EDA 5 Dataframes ---
# Take an explicit copy before casting: assigning a column on a bare slice of
# `df` is a chained assignment and triggers pandas' SettingWithCopyWarning.
df_eda5 = df[['slope', 'trestbps']].copy()
df_eda5['slope'] = df_eda5['slope'].astype(str)

# --- EDA 5 Variables ---
tick_params=dict(length=3, width=1, color=color_line)
xy_label=dict(fontweight='bold', fontsize=7)
# Sorted slope labels fix the hue order of the KDE plot and the order of the Q-Q subplots.
slope_list = sorted(df_eda5['slope'].unique())
# One color per slope, matched by position with `slope_list`.
color_pallete = [colors[5], colors[4], colors[0]]
# Placeholders for the three Q-Q subplot axes.
sub_axes=[None] * 3
suptitle = dict(x=0.125, y=0.925, fontsize=14, weight='heavy', ha='left', va='bottom', fontname=font_main)
title = dict(x=0.125, y=0.9, fontsize=8, weight='normal', ha='left', va='bottom', fontname=font_alt)
# Keyword arguments shared by every `qqplot` call.
qq_plot = dict(fit=True, line='45', markeredgecolor=scatter_color_edge)
# One style per <...> span in the subtitle text, in order of appearance.
highlight_textprops = [{'weight':'bold', 'color': colors[5]}, {'weight':'bold', 'color': colors[1]}]

# --- EDA 5 Settings ---
# 6 x 3 grid; the KDE plot takes the top three rows across all columns.
fig = plt.figure(figsize=(10, 7))
gs = fig.add_gridspec(6, 3)
ax = fig.add_subplot(gs[:3, :])

# --- EDA 5: Main KDE Plot ---
sns.kdeplot(
    data=df_eda5,
    x='trestbps',
    hue='slope',
    hue_order=slope_list,
    palette=color_pallete,
    bw_adjust=0.4,
    fill=True,
    ax=ax,
)
plt.legend([], [], frameon=False)

# Dotted vertical grid lines only; the horizontal grid is fully transparent.
plt.grid(axis='x', which='major', alpha=0.75, color=color_line, linestyle='dotted', zorder=1)
plt.grid(axis='y', alpha=0, zorder=2)

# Ticks and axis labels.
plt.xticks(fontsize=6)
plt.yticks(fontsize=6)
plt.xlabel('\nResting Blood Pressure (in mm Hg)', **xy_label)
plt.ylabel('Density\n', **xy_label)
plt.tick_params(left='on', bottom='on', **tick_params)

# Only the bottom and left spines stay visible, drawn in `color_line`.
for side, spine in ax.spines.items():
    if side in ('bottom', 'left'):
        spine.set_visible(True)
        spine.set_color(color_line)
    else:
        spine.set_color('None')

# Title, highlighted subtitle and watermark.
fig_text(s='Resting Blood Pressure Distribution based on Slope', **suptitle)
fig_text(s='Each <slope type distribution> is <moderately right-skewed>.', highlight_textprops=highlight_textprops, **title)
plt.gcf().text(0.79, 0.16, 'kaggle.com/caesarmario', style='italic', fontsize=7)

# --- EDA 5: Sub Q-Q Plot ---
# One small Q-Q plot per slope on grid row 4: the whole column in `color_line`
# first, then the slope's own subset drawn over it in the slope's color.
for idx, slp in enumerate(slope_list):
    qq_ax = fig.add_subplot(gs[4, idx])
    sub_axes[idx] = qq_ax
    df_eda5_slope = df_eda5[df_eda5['slope']==slp]

    qqplot(df_eda5['trestbps'], ax=qq_ax, markerfacecolor=color_line, alpha=0.4, **qq_plot)
    qqplot(df_eda5_slope['trestbps'], ax=qq_ax, markerfacecolor=color_pallete[idx], alpha=0.5, **qq_plot)

    # Restyle the lines at positions 1 and 3 of the axes' line list as thin dashed lines.
    for line_pos in (1, 3):
        ref_line = qq_ax.get_lines()[line_pos]
        ref_line.set_color(colors[5])
        ref_line.set_linewidth(0.8)
        ref_line.set_linestyle('--')

    # Strip ticks, axis labels, legend and spines so only markers, lines and title remain.
    qq_ax.set_xticks([])
    qq_ax.set_yticks([])
    qq_ax.set_xlabel('')
    qq_ax.set_ylabel('')
    qq_ax.legend([], [], frameon=False)
    qq_ax.set_title(f'Q-Q Plot - Slope {slp}', fontsize=8, style='italic', weight='bold', ha='center')
    for qq_spine in qq_ax.spines.values():
        qq_spine.set_color('None')
plt.show();

<div class="explain-box">
    The distribution plot and Q-Q plots above show that each slope type's distribution is <mark>moderately right-skewed</mark>. This is due to <b>outliers</b> (the distribution tail) on the right side of the plot. In addition, <b>the skewness value and the gap between the upper end of the Q-Q plots and the 45-degree line</b> also show that the distribution in this column is not normal.
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

# <div class="header1">6. | Data Preprocessing ⚙️</div>
<div class="explain-box">
    This section will <b>prepare the dataset</b> before building the machine learning models.
</div>

## <div class="header2">6.1 | Features Separating and Splitting 🪓</div>
<div class="explain-box">
     In this section, <mark>the 'target' (dependent) column will be separated from the independent columns</mark>. Also, the dataset will be split in an <mark>80:20 ratio</mark> (80% training and 20% testing).
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- Separating Target from Features ---
# `y` is the 'target' column; `x` is every other column.
y = df['target']
x = df.drop(columns=['target'])

# --- Splitting Dataset (80% train / 20% test) ---
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## <div class="header2">6.2 | Processing Pipeline 🪠</div>
<div class="explain-box">
    This section will <mark>create a preprocessing pipeline</mark> for numerical and categorical columns and <mark>apply it to the <code>x_train</code> and <code>x_test</code> data</mark>. Not all columns will go through preprocessing. For <mark>all numerical columns</mark>, scaling will be carried out using a <mark>robust scaler</mark> since the dataset used is a <b>small dataset</b> where the presence of outliers dramatically affects the performance of a model. Meanwhile, for <mark>categorical columns with more than two categories, one-hot encoding will be carried out</mark>.</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- Numerical Pipeline ---
# Columns scaled with RobustScaler.
num_column = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
num_pipeline = Pipeline([
    ('scaling', RobustScaler())
])

# --- Categorical Pipeline ---
# Columns one-hot encoded, dropping the first category of each.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2;
# switch the argument if the runtime uses a newer release.
cat_column = ['cp', 'slope', 'thal']
cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(drop='first', sparse=False))
])

# --- Combine Both Pipelines into Transformer ---
# Columns not listed above pass through unchanged.
preprocessor = ColumnTransformer([
    ('categorical', cat_pipeline, cat_column)
    , ('numerical', num_pipeline, num_column)]
    , remainder='passthrough')

# --- Apply Transformer to Pipeline ---
process_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

# --- Apply to Dataframe ---
# Fit on the training split only, then reuse the fitted encoder and scaler on
# the test split. Refitting on the test data would leak test statistics into
# preprocessing and could yield a different set of one-hot columns.
x_train_process = process_pipeline.fit_transform(x_train)
x_test_process = process_pipeline.transform(x_test)

# <div class="header1">7. | Model Implementation 🛠️</div>
<div class="explain-box">
    This section will <b>implement various machine learning models</b> as mentioned in the Introduction section. In addition, an explanation of each model will also be provided.
</div>

In [None]:
# --- Functions: Model Fitting and Performance Evaluation ---
def _keep_spines(ax, visible_sides):
    """Blank out every spine of `ax`, then re-show `visible_sides` in `color_line`."""
    for spine in ax.spines.values():
        spine.set_color('None')
    for side in visible_sides:
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color(color_line)


def fit_ml_models(algo, algo_param, algo_name):
    """Tune `algo` with a 10-fold grid search, print its scores and plot a report.

    The estimator is wrapped in a single-step Pipeline named 'algo', so every
    key in `algo_param` must carry the `algo__` prefix. Training and evaluation
    use the module-level `x_train_process`, `y_train`, `x_test_process` and
    `y_test`.

    The report is a 2 x 2 figure: confusion matrix, ROC AUC curve, learning
    curve, and feature importances (top 5) or, when the feature-importance
    visualizer raises, a precision-recall curve.

    Args:
        algo: Unfitted estimator to tune.
        algo_param: Parameter grid (dict or list of dicts) for GridSearchCV.
        algo_name: Display name used in the printed output and figure title.

    Returns:
        Tuple `(acc_score_train, acc_score_test, best_score)`: train and test
        accuracy in percent (rounded to 3 decimals) and the grid search's
        `best_score_` rounded to 4 decimals.
    """
    # --- Algorithm Pipeline ---
    algo = Pipeline([('algo', algo)])

    # --- Apply Grid Search ---
    model = GridSearchCV(algo, param_grid=algo_param, cv=10, n_jobs=-1, verbose=1)

    # --- Fitting Model ---
    print(clr.start+f".:. Fitting {algo_name} .:."+clr.end)
    model.fit(x_train_process, y_train)

    # --- Model Best Parameters ---
    best_params = model.best_params_
    print("\n>> Best Parameters: "+clr.start+f"{best_params}"+clr.end)

    # --- Best Estimator & Score ---
    # Unwrap the tuned estimator through the public `named_steps` accessor
    # rather than the private `_final_estimator` attribute.
    best_estimator = model.best_estimator_.named_steps['algo']
    best_score = round(model.best_score_, 4)
    print(">> Best Score: "+clr.start+"{:.3f}".format(best_score)+clr.end)

    # --- Create Prediction for Train & Test ---
    y_pred_train = model.predict(x_train_process)
    y_pred_test = model.predict(x_test_process)

    # --- Train & Test Accuracy Score ---
    acc_score_train = round(accuracy_score(y_train, y_pred_train)*100, 3)
    acc_score_test = round(accuracy_score(y_test, y_pred_test)*100, 3)
    print("\n"+clr.start+f".:. Train and Test Accuracy Score for {algo_name} .:."+clr.end)
    print("\t>> Train Accuracy: "+clr.start+"{:.2f}%".format(acc_score_train)+clr.end)
    print("\t>> Test Accuracy: "+clr.start+"{:.2f}%".format(acc_score_test)+clr.end)

    # --- Classification Report ---
    print("\n"+clr.start+f".:. Classification Report for {algo_name} .:."+clr.end)
    print(classification_report(y_test, y_pred_test))

    # --- Figures Settings ---
    xy_label = dict(fontweight='bold', fontsize=12)
    grid_style = dict(color=color_grid, linestyle='dotted', zorder=1)
    title_style = dict(fontsize=14, fontweight='bold')
    tick_params = dict(length=3, width=1, color=color_line)
    legend_style = dict(loc='upper center', bbox_to_anchor=(0.5, -0.12), ncol=2, borderpad=2, frameon=False, fontsize=10)
    set_palette(color_yb)
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 14))

    # --- Confusion Matrix ---
    conf_matrix = ConfusionMatrix(best_estimator, ax=ax1, cmap='Reds')
    conf_matrix.fit(x_train_process, y_train)
    conf_matrix.score(x_test_process, y_test)
    conf_matrix.finalize()
    conf_matrix.ax.set_title('Confusion Matrix\n', **title_style)
    conf_matrix.ax.tick_params(axis='both', labelsize=10, bottom='on', left='on', **tick_params)
    for spine in conf_matrix.ax.spines.values():
        spine.set_color(color_line)
    conf_matrix.ax.set_xlabel('\nPredicted Class', **xy_label)
    conf_matrix.ax.set_ylabel('True Class\n', **xy_label)
    # NOTE(review): the y labels are listed in the opposite order to the x
    # labels, presumably because the visualizer draws the y axis in reverse
    # class order; confirm against the rendered plot.
    conf_matrix.ax.xaxis.set_ticklabels(['False', 'True'], rotation=0)
    conf_matrix.ax.yaxis.set_ticklabels(['True', 'False'])

    # --- ROC AUC ---
    roc_auc = ROCAUC(best_estimator, classes=['False', 'True'], ax=ax2, colors=color_yb)
    roc_auc.fit(x_train_process, y_train)
    roc_auc.score(x_test_process, y_test)
    roc_auc.finalize()
    roc_auc.ax.set_title('ROC AUC Curve\n', **title_style)
    roc_auc.ax.tick_params(axis='both', labelsize=10, bottom='on', left='on', **tick_params)
    roc_auc.ax.grid(axis='both', alpha=0.4, **grid_style)
    _keep_spines(roc_auc.ax, ['bottom', 'left'])
    roc_auc.ax.legend(**legend_style)
    roc_auc.ax.set_xlabel('\nFalse Positive Rate', **xy_label)
    roc_auc.ax.set_ylabel('True Positive Rate\n', **xy_label)

    # --- Learning Curve ---
    lcurve = LearningCurve(best_estimator, scoring='f1_weighted', ax=ax3, colors=color_yb)
    lcurve.fit(x_train_process, y_train)
    lcurve.finalize()
    lcurve.ax.set_title('Learning Curve\n', **title_style)
    lcurve.ax.tick_params(axis='both', labelsize=10, bottom='on', left='on', **tick_params)
    lcurve.ax.grid(axis='both', alpha=0.4, **grid_style)
    _keep_spines(lcurve.ax, ['bottom', 'left'])
    lcurve.ax.legend(**legend_style)
    lcurve.ax.set_xlabel('\nTraining Instances', **xy_label)
    lcurve.ax.set_ylabel('Scores\n', **xy_label)

    # --- Feature Importance or Precision Recall Curve ---
    # Fall back to a precision-recall curve when the feature-importance
    # visualizer fails (presumably for estimators that expose neither
    # coefficients nor importances). `Exception` is caught instead of a bare
    # `except` so KeyboardInterrupt/SystemExit still propagate.
    try:
        feat_importance = FeatureImportances(best_estimator, labels=columns_list_onehot, ax=ax4, topn=5, colors=color_yb_importance)
        feat_importance.fit(x_train_process, y_train)
        feat_importance.finalize()
        feat_importance.ax.set_title('Feature Importances (Top 5 Features)\n', **title_style)
        feat_importance.ax.tick_params(axis='both', labelsize=10, bottom='on', left='on', **tick_params)
        feat_importance.ax.grid(axis='x', alpha=0.4, **grid_style)
        feat_importance.ax.grid(axis='y', alpha=0, **grid_style)
        _keep_spines(feat_importance.ax, ['bottom'])
        feat_importance.ax.set_xlabel('\nRelative Importance', **xy_label)
        feat_importance.ax.set_ylabel('Features\n', **xy_label)
    except Exception:
        prec_curve = PrecisionRecallCurve(best_estimator, ax=ax4, ap_score=True, iso_f1_curves=True)
        prec_curve.fit(x_train_process, y_train)
        prec_curve.score(x_test_process, y_test)
        prec_curve.finalize()
        prec_curve.ax.set_title('Precision-Recall Curve\n', **title_style)
        prec_curve.ax.tick_params(axis='both', labelsize=10, bottom='on', left='on', **tick_params)
        _keep_spines(prec_curve.ax, ['bottom', 'left'])
        prec_curve.ax.legend(**legend_style)
        prec_curve.ax.set_xlabel('\nRecall', **xy_label)
        prec_curve.ax.set_ylabel('Precision\n', **xy_label)

    plt.suptitle(f'\n{algo_name} Performance Evaluation Report\n', fontsize=18, fontweight='bold')
    plt.gcf().text(0.88, 0.02, 'kaggle.com/caesarmario', style='italic', fontsize=10)
    plt.tight_layout();

    return acc_score_train, acc_score_test, best_score

## <div class="header2">7.1 | Logistic Regression</div>
<div class="explain-box">
    <blockquote style="color: #000000;">
        <mark><b>Logistic regression</b></mark> is a statistical method that is used for building machine learning models where <b>the dependent variable is dichotomous: i.e. binary</b>. Logistic regression is used to describe data and <b>the relationship between one dependent variable and one or more independent variables</b>. The independent variables can be nominal, ordinal, or of interval type.<br><br>
    The name "logistic regression" is derived from the concept of the logistic function that it uses. <b>The logistic function is also known as the sigmoid function</b>. The value of this logistic function lies between zero and one.<br><br>
        <center>
            <img src="https://www.simplilearn.com/ice9/free_resources_article_thumb/years-2.JPG" alt="Logistic Regression" width="40%"><br>
            <i>🖼 Logistic Function by Simplilearn</i>
        </center>
    </blockquote>
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- Logistic Regression: Hyperparameter Grid ---
# Keys use the `algo__` prefix because fit_ml_models wraps the estimator in a
# Pipeline step named 'algo'.
parameter_lr = {
    "algo__solver": ["lbfgs", "saga", "newton-cg"],
    "algo__C": [0.1, 0.2, 0.5, 0.8],
}

# --- Logistic Regression: Base Estimator ---
algo_lr = LogisticRegression(penalty="l2", random_state=42, n_jobs=-1)

# --- Logistic Regression: Tune, Fit and Report ---
acc_score_train_lr, acc_score_test_lr, best_score_lr = fit_ml_models(
    algo_lr,
    parameter_lr,
    "Logistic Regression",
)

## <div class="header2">7.2 | K-Nearest Neighbour (KNN)</div>
<div class="explain-box">
    <blockquote style="color: #000000;">
        <mark><b>The k-nearest neighbors (KNN)</b></mark> algorithm is a data classification method <b>for estimating the likelihood that a data point will become a member of one group or another</b> based on what group the data points nearest to it belong to. The k-nearest neighbor algorithm is a type of supervised machine learning algorithm used <b>to solve classification and regression problems</b>.<br><br>
    It's called a <b>lazy learning algorithm or lazy learner</b> because it doesn't perform any training when you supply the training data. Instead, it just stores the data during the training time and doesn't perform any calculations. It doesn't build a model until a query is performed on the dataset. This makes KNN ideal for data mining.<br><br>
        <center>
            <img src="https://1.bp.blogspot.com/-D6REhf2XBwQ/XZcWn0cwSEI/AAAAAAAAAvs/LUCN8jxvzcMjkkDK4FAXSuR7MBDW8SBJgCLcBGAsYHQ/s1600/KNN_final_a1mrv9.jpg" alt="KNN" width="35%"><br>
            <i>🖼 KNN by Kita Informatika</i>
        </center>
    </blockquote>
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- KNN: Hyperparameter Grid ---
# Keys use the `algo__` prefix because fit_ml_models wraps the estimator in a
# Pipeline step named 'algo'.
parameter_knn = {
    "algo__n_neighbors": [2, 5, 10, 17],
    "algo__leaf_size": [1, 10, 11, 30],
}

# --- KNN: Base Estimator ---
algo_knn = KNeighborsClassifier(n_jobs=-1)

# --- KNN: Tune, Fit and Report ---
acc_score_train_knn, acc_score_test_knn, best_score_knn = fit_ml_models(
    algo_knn,
    parameter_knn,
    "K-Nearest Neighbour (KNN)",
)

## <div class="header2">7.3 | Support Vector Machine (SVM)</div>
<div class="explain-box">
    <blockquote style="color: #000000;">
        <mark><b>Support Vector Machine (SVM)</b></mark> is one of the most popular Supervised Learning algorithms, which is used for Classification as well as Regression problems. The goal of the SVM algorithm is <b>to create the best line or decision boundary that can segregate n-dimensional space into classes</b> so that we can easily put the new data point in the correct category in the future. This best decision boundary is called a hyperplane.<br><br>
        SVM chooses the <b>extreme points/vectors</b> that help in creating the hyperplane. These extreme cases are called as support vectors, and hence algorithm is termed as Support Vector Machine.<br>
        <center>
            <img src="https://static.javatpoint.com/tutorial/machine-learning/images/support-vector-machine-algorithm.png" alt="SVM" width="40%"><br>
            <i>🖼 SVM by JavaTPoint</i>
        </center>
    </blockquote>
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- SVM: Hyperparameter Grids ---
# One grid per kernel so that kernel-specific parameters (gamma for 'rbf',
# degree for 'poly') are only searched together with their own kernel.
parameter_svc = [
    {
        'algo__kernel': ['rbf'],
        'algo__gamma': np.arange(0.1, 1, 0.1),
        'algo__C': np.arange(0.1, 1, 0.1),
    },
    {
        'algo__kernel': ['linear'],
        'algo__C': np.arange(0.1, 1, 0.1),
    },
    {
        'algo__kernel': ['poly'],
        'algo__degree' : np.arange(1, 10, 1),
        'algo__C': np.arange(0.1, 1, 0.1),
    },
]

# --- SVM: Base Estimator ---
algo_svc = SVC(random_state=1, probability=True)

# --- SVM: Tune, Fit and Report ---
acc_score_train_svc, acc_score_test_svc, best_score_svc = fit_ml_models(
    algo_svc,
    parameter_svc,
    "Support Vector Machine (SVM)",
)

## <div class="header2">7.4 | Gaussian Naive Bayes</div>
<div class="explain-box">
    <blockquote style="color: #000000;">
        <mark><b>Naive Bayes Classifiers</b></mark> are based on the Bayes Theorem, in which <b>one assumption taken is strong independence between the features</b>. These classifiers assume that the value of a particular feature is independent of the value of any other feature. In a supervised learning situation, Naive Bayes Classifiers are trained very efficiently. Naive Bayes classifiers <b>need only a small amount of training data to estimate the parameters needed for classification</b>. Naive Bayes Classifiers have a simple design and implementation, and they can be applied to many real-life situations.<br><br>
        <mark><b>Gaussian Naive Bayes</b></mark> is a <b>variant of Naive Bayes that follows Gaussian normal distribution and supports continuous data</b>. When working with continuous data, an assumption often taken is that the continuous values associated with each class are distributed according to a normal (or Gaussian) distribution.<br>
        <center>
            <img src="https://iq.opengenus.org/content/images/2020/02/Illustration-of-how-a-Gaussian-Naive-Bayes-GNB-classifier-works-For-each-data-point.png" alt="GNB" width="35%"><br>
            <i>🖼 Gaussian Naive Bayes by OpenGenus</i>
        </center>
    </blockquote>
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- Gaussian NB: Hyperparameter Grid ---
# Key uses the `algo__` prefix because fit_ml_models wraps the estimator in a
# Pipeline step named 'algo'.
parameter_gnb = {
    "algo__var_smoothing": [1e-2, 1e-3, 1e-4, 1e-6],
}

# --- Gaussian NB: Base Estimator ---
algo_gnb = GaussianNB()

# --- Gaussian NB: Tune, Fit and Report ---
acc_score_train_gnb, acc_score_test_gnb, best_score_gnb = fit_ml_models(
    algo_gnb,
    parameter_gnb,
    "Gaussian Naive Bayes",
)

## <div class="header2">7.5 | Decision Tree</div>
<div class="explain-box">
    <blockquote style="color: #000000;">
        <mark><b>Decision Tree</b></mark> is a Supervised learning technique that can be used for both classification and Regression problems, but mostly it is preferred for solving Classification problems. It is a tree-structured classifier, where <b>internal nodes represent the features of a dataset, branches represent the decision rules and each leaf node represents the outcome</b>.<br><br>
    In a Decision tree, there are <b>two nodes</b>, which are the <mark><b>Decision Node and Leaf Node</b></mark>. Decision nodes are used to make any decision and have multiple branches, whereas Leaf nodes are the output of those decisions and do not contain any further branches.<br>
        <center>
            <img src="https://static.javatpoint.com/tutorial/machine-learning/images/decision-tree-classification-algorithm.png" alt="DT" width="35%"><br>
            <i>🖼 Decision Tree by Javatpoint</i>
        </center>
    </blockquote>
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- Decision Tree: Hyperparameter Grid ---
# Key uses the `algo__` prefix because fit_ml_models wraps the estimator in a
# Pipeline step named 'algo'.
parameter_dt = {
    "algo__max_depth": [1, 2, 3],
}

# --- Decision Tree: Base Estimator ---
algo_dt = DecisionTreeClassifier(random_state=42)

# --- Decision Tree: Tune, Fit and Report ---
acc_score_train_dt, acc_score_test_dt, best_score_dt = fit_ml_models(
    algo_dt,
    parameter_dt,
    "Decision Tree",
)

## <div class="header2">7.6 | Random Forest</div>
<div class="explain-box">
    <blockquote style="color: #000000;">
        <mark><b>Random Forest</b></mark> is a tree-based machine learning algorithm that <b>leverages the power of multiple decision trees for making decisions</b>. Each individual tree in the random forest spits out a class prediction and the class with the most votes becomes our model’s prediction. <b>A large number of relatively uncorrelated models (trees) operating as a committee will outperform any of the individual constituent models</b>.<br>
        <center>
            <img src="https://cdn.analyticsvidhya.com/wp-content/uploads/2020/02/rfc_vs_dt1.png" alt="RF" width="35%"><br>
            <i>🖼 Random Forest by Abhishek Sharma</i>
        </center>
    </blockquote>
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- Random Forest: Hyperparameter Grid ---
# Searches max_depth over 1..5. The key uses the `algo__` prefix because
# fit_ml_models wraps the estimator in a Pipeline step named 'algo'.
parameter_rf = {
    "algo__max_depth": np.arange(1, 6, 1),
}

# --- Random Forest: Base Estimator ---
algo_rf = RandomForestClassifier(random_state=99, n_jobs=-1)

# --- Random Forest: Tune, Fit and Report ---
acc_score_train_rf, acc_score_test_rf, best_score_rf = fit_ml_models(
    algo_rf,
    parameter_rf,
    "Random Forest",
)

## <div class="header2">7.7 | Extra Tree Classifier</div>
<div class="explain-box">
    <blockquote style="color: #000000;">
        <mark><b>Extra Trees Classifier</b></mark> is a type of ensemble learning technique which <b>aggregates the results of multiple de-correlated decision trees collected in a "forest" to output its classification result</b>. In concept, it is very similar to a Random Forest Classifier and only differs from it in the manner of construction of the decision trees in the forest.<br><br>
        Each Decision Tree in the Extra Trees Forest is <b>constructed from the original training sample</b>. Then, at each test node, each tree is provided with a <b>random sample of k features</b> from the feature-set from which each decision tree must select the best feature to split the data based on some mathematical criteria (typically the Gini Index). This random sample of features leads to the creation of multiple de-correlated decision trees.
    </blockquote>
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- Extra Trees: Hyperparameter Grid ---
# Keys use the `algo__` prefix because fit_ml_models wraps the estimator in a
# Pipeline step named 'algo'.
parameter_et = {
    "algo__max_depth": [2, 3],
    "algo__max_leaf_nodes": [3, 5, 7],
}

# --- Extra Trees: Base Estimator ---
algo_et = ExtraTreesClassifier(random_state=42, n_jobs=-1)

# --- Extra Trees: Tune, Fit and Report ---
acc_score_train_et, acc_score_test_et, best_score_et = fit_ml_models(
    algo_et,
    parameter_et,
    "Extra Tree Classifier",
)

## <div class="header2">7.8 | Gradient Boosting</div>
<div class="explain-box">
    <blockquote style="color: #000000;">
        <mark><b>Boosting</b></mark> is a method of <b>converting weak learners into strong learners</b>. In boosting, <b>each new tree is a fit on a modified version</b> of the original data set. It strongly relies on the prediction that the next model will reduce prediction errors when blended with previous ones. The main idea is <b>to establish target outcomes for this upcoming model to minimize errors</b>.<br><br>
        <mark><b>Gradient Boosting</b></mark> trains many models in <b>a gradual, additive and sequential manner</b>. The term gradient boosting emerged because every case’s target outcomes are based on the gradient’s error with regards to the predictions. Every model reduces prediction errors by taking a step in the correct direction.<br>
        <center>
            <img src="https://www.researchgate.net/publication/345327934/figure/fig3/AS:1022810793209856@1620868504478/Flow-chart-of-XGBoost.png" alt="GB" width="35%"><br>
            <i>🖼 Boosting Algorithm by Rui Guo et al.</i>
        </center>
    </blockquote>
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- Gradient Boosting: Hyperparameter Grid ---
# NOTE(review): the "algo__" prefix presumably targets the pipeline step named
# "algo" that fit_ml_models builds — confirm against that helper.
parameter_gb = {
    "algo__learning_rate": [0.1, 0.3, 0.5],
    "algo__n_estimators": [2, 4, 6],
    "algo__min_weight_fraction_leaf": [0.1, 0.2, 0.5],
}

# --- Gradient Boosting: Base Estimator (exponential loss, seeded) ---
algo_gb = GradientBoostingClassifier(random_state=2, loss="exponential")

# --- Gradient Boosting: Run Shared Helper ---
# The three returned values feed the 'Accuracy Train', 'Accuracy Test' and
# 'Best Score' columns of the model comparison table.
(
    acc_score_train_gb,
    acc_score_test_gb,
    best_score_gb,
) = fit_ml_models(algo_gb, parameter_gb, "Gradient Boosting")

## <div class="header2">7.9 | AdaBoost</div>
<div class="explain-box">
    <blockquote style="color: #000000;">
        <mark><b>AdaBoost</b></mark>, also called <b>Adaptive Boosting</b>, is a technique in Machine Learning used as an Ensemble Method. The most common algorithm used with AdaBoost is <b>decision trees with one level</b>, that is, decision trees with only one split. These trees are also called <mark><b>Decision Stumps</b></mark>. <b>AdaBoost builds a model and gives equal weights to all the data points</b>. It then assigns higher weights to points that are wrongly classified. Now, all the points which have higher weights are given more importance in the next model. It will keep training models until a lower error is achieved.
    </blockquote>
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- AdaBoost: Hyperparameter Grid ---
# NOTE(review): the "algo__" prefix presumably targets the pipeline step named
# "algo" that fit_ml_models builds — confirm against that helper.
parameter_ab = {
    "algo__n_estimators": [6, 7, 10],
    "algo__learning_rate": [0.2, 0.4, 0.8],
}

# --- AdaBoost: Base Estimator (seeded for reproducible runs) ---
algo_ab = AdaBoostClassifier(random_state=1)

# --- AdaBoost: Run Shared Helper ---
# The three returned values feed the 'Accuracy Train', 'Accuracy Test' and
# 'Best Score' columns of the model comparison table.
(
    acc_score_train_ab,
    acc_score_test_ab,
    best_score_ab,
) = fit_ml_models(algo_ab, parameter_ab, "AdaBoost")

## <div class="header2">7.10 | Model Comparison 👀</div>
<div class="explain-box">
    After implementing and tuning 9 models, this section will <mark>compare the accuracy and best score of all machine learning models</mark>.
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- Accuracy Comparison Table ---
# One row per model; every column lists its values in the same model order
# (lr, knn, svc, gnb, dt, rf, et, gb, ab).
df_compare = pd.DataFrame(
    {
        'Model': [
            'Logistic Regression',
            'K-Nearest Neighbour',
            'Support Vector Machine',
            'Gaussian NB',
            'Decision Tree',
            'Random Forest',
            'Extra Tree Classifier',
            'Gradient Boosting',
            'AdaBoost',
        ],
        'Accuracy Train': [
            acc_score_train_lr,
            acc_score_train_knn,
            acc_score_train_svc,
            acc_score_train_gnb,
            acc_score_train_dt,
            acc_score_train_rf,
            acc_score_train_et,
            acc_score_train_gb,
            acc_score_train_ab,
        ],
        'Accuracy Test': [
            acc_score_test_lr,
            acc_score_test_knn,
            acc_score_test_svc,
            acc_score_test_gnb,
            acc_score_test_dt,
            acc_score_test_rf,
            acc_score_test_et,
            acc_score_test_gb,
            acc_score_test_ab,
        ],
        'Best Score': [
            best_score_lr,
            best_score_knn,
            best_score_svc,
            best_score_gnb,
            best_score_dt,
            best_score_rf,
            best_score_et,
            best_score_gb,
            best_score_ab,
        ],
    }
)

# --- Show Comparison Table (highest 'Best Score' first) ---
print(clr.start + ".:. Models Comparison .:." + clr.end)
print(clr.color + '*' * 26)
# NOTE(review): Styler.hide_index() is deprecated in recent pandas releases —
# confirm the pandas version this notebook is pinned to.
(
    df_compare
    .sort_values(by='Best Score', ascending=False)
    .style
    .apply(acc_train_vs_test, axis=1)
    .hide_index()
)

<div class="explain-box">
    From the <mark>train and test accuracy</mark> results above, <b>most models experienced overfitting or underfitting</b>. However, <b>several models have a good fit</b>, where the difference between train and test accuracy is small. These models are <b>random forest, AdaBoost, Gaussian Naive Bayes, and extra tree classifier</b>. As seen in the data frame above, of the four models, <mark>random forest and Gaussian NB have the highest accuracy compared to the other models</mark>. This is also supported by the ROC AUC curve figure for random forest and Gaussian NB, <mark>where the AUC value for both models is close to 1</mark>, which means that both models can predict well whether patients have heart disease. The <b>confusion matrix</b> shows that the prediction results between the actual target and the predicted target for the random forest and Gaussian NB models in each class in the test data are <mark>better</mark> than those of other models.<br><br>
    Judging from the <mark>F1 scores</mark> of both models, both models do a very good job differentiating sick patients from those who are not (scores above <b>0.85</b>). Looking at the <mark>precision value for Gaussian NB, 93% of all the patients that the model predicted to have heart disease actually have it. For the random forest, only 88% of all the patients that the model predicted to have heart disease actually have it, slightly lower than the Gaussian NB precision value</mark>. Looking at the <b>recall value, Gaussian NB correctly identifies only 81% of all heart disease patients. The random forest does better than Gaussian NB here: 88% of all patients who do have heart disease are predicted to have it</b>.<br><br>
    Furthermore, in the <mark>learning curve</mark> between Gaussian NB and random forest, <mark>the learning curve for the random forest is more ideal</mark> than Gaussian NB. This is because <b>both training and validation scores of Gaussian NB stay too close together</b> (indicates low variance and high bias). This will more likely result in <mark>poor fit and especially poor generalization of the data (towards the data it has not seen before)</mark>. Whereas in a <b>random forest</b>, <mark>the validation score constantly improves as the number of training set sizes gets larger (notice the difference in the x-axis scale from the previous two curves)</mark>. Both the training and validation scores also converge to nearly similar values. This is a model that can generalize very well. From the analysis above, <mark>it can be concluded that the random forest model best predicts whether a person has heart disease</mark>.<br><br>
    In the <mark>random forest feature importance plot</mark>, the <b>five most important features</b> are <mark>major vessels number (ca), fixed defect thalassemia (thal_2), ST depression induced by exercise relative to rest (oldpeak), reversible defect thalassemia (thal_3), and exercise-induced angina (exang)</mark>. <mark>Major vessel number</mark> might be significant since a lower number of major vessels can lead to a reduced blood supply to the heart muscle, which can result in ischemia (lack of oxygen and nutrients) and potentially lead to heart disease. For example, individuals with single-vessel disease have a greater risk of adverse cardiac events than those with multi-vessel disease. <mark>Fixed defects and reversible defects of thalassemia</mark> can affect the heart by reducing the number of red blood cells and oxygen delivery to the heart muscles, which can lead to the development of myocardial ischemia, coronary artery disease, and other forms of heart disease. Early detection and appropriate treatment of thalassemia can help prevent these complications and improve cardiovascular health.<br><br>
    <mark>ST depression induced by exercise relative to rest</mark> can be an important finding that suggests underlying heart disease and may warrant further evaluation and management to reduce the risk of adverse cardiovascular events. Early detection and appropriate treatment of underlying heart disease can help prevent complications and improve cardiovascular health. Finally, <mark>exercise-induced angina</mark> is also important because it can suggest underlying CAD and may warrant further evaluation and management to reduce the risk of adverse cardiovascular events. Individuals with exercise-induced angina may need further testing, such as stress testing or coronary angiography, to assess the extent of their coronary artery disease and determine the most appropriate treatment options.
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

# <div class="header1">8. | Miscellaneous 🧪</div>
<div class="explain-box">
    This section focuses on <mark>creating a complete pipeline</mark>, starting from data processing to a machine learning pipeline, using the best model concluded in the previous section and <mark>exporting it to <code>joblib</code> and <code>pickle (.pkl)</code> files</mark>. Besides that, <mark>test dataset predicted results would also be exported</mark> along with actual results in CSV and JSON files. Moreover, this section will also <mark>make predictions on dummy data</mark> (data generated using Python functions) and <mark>export them to CSV and JSON files</mark>.
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

## <div class="header2">8.1 | Creating Outputs 📤</div>
<div class="explain-box">
    <mark>The complete pipeline will be exported in this section</mark>. The pipeline will be stored using the joblib library into <code>joblib</code> and <code>pickle (.pkl)</code> files. This section will also <mark>show the test data frame before exporting the predicted results and the actual results to the CSV and JSON files</mark>.
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- Complete Pipeline: Preprocessor & RF ---
# Chains the shared `preprocessor` with a random forest limited to depth 3;
# the fixed random_state keeps repeated fits reproducible.
import os

rf_pipeline = Pipeline([
    ('preprocessor', preprocessor)
    , ('algo', RandomForestClassifier(max_depth=3, random_state=99, n_jobs=-1))
])

# --- Fit Pipeline Before Exporting ---
# Dumping the pipeline before it is fitted would persist an untrained estimator
# that cannot predict once loaded, so train on the training split first.
rf_pipeline.fit(x_train, y_train)

# --- Save Complete Pipeline (joblib and pickle) ---
# Both files are written with joblib.dump; only the extension differs.
os.makedirs('pipeline', exist_ok=True)  # the dump fails if the folder is missing
file_name = 'pipeline_heart_disease_random_forest_caesarmario'
for ext in ['joblib', 'pkl']:
    joblib.dump(rf_pipeline, f'pipeline/{file_name}.{ext}')

In [None]:
# --- Fit on the Training Split and Predict the Test Split ---
rf_pipeline.fit(x_train, y_train)
y_pred_rf = rf_pipeline.predict(x_test)

# --- Pieces of the Test Output Dataframe ---
# reset_index() moves the original row labels into an 'index' column so the
# three frames line up positionally when concatenated side by side.
pred_target = pd.DataFrame(data=y_pred_rf, columns=['pred_target'])
x_test_output = x_test.reset_index()
actual_target = y_test.to_frame(name='actual_target').reset_index()

# --- Features + Actual Target + Predicted Target ---
# The helper 'index' columns are dropped again after the concat.
df_test_output = (
    pd.concat([x_test_output, actual_target, pred_target], axis=1)
    .drop('index', axis=1)
)

# --- Showing Sample Test Output Dataframe (10 rows, fixed seed) ---
print(clr.start + '.: Sample Test Dataframe :.' + clr.end)
print(clr.color + '*' * 28)
(
    df_test_output
    .sample(n=10, random_state=0)
    .style
    .apply(act_vs_pred, axis=1)
    .hide_index()
)

In [None]:
# --- Export to CSV and JSON Files ---
# Writes the test features with actual and predicted targets to
# test_data/<output_name>.csv and test_data/<output_name>.json.
import os

output_name = 'test_data_heart_disease_caesarmario'
os.makedirs('test_data', exist_ok=True)  # to_csv/to_json do not create missing folders
df_test_output.to_csv(f'test_data/{output_name}.csv', index=False, sep=',', encoding='utf-8')
df_test_output.to_json(f'test_data/{output_name}.json', orient='index')

## <div class="header2">8.2 | Prediction Case 🧐</div>
<div class="explain-box">
    This second section will <mark>predict the dummy data</mark> generated using Python functions. Then, <mark>the prediction results will be exported as CSV and JSON files</mark>, along with dummy data.
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- Prediction Case Dataframe: 50 Dummy Rows ---
# create_prediction_case receives the training features and the row count.
df_pred_case = create_prediction_case(x_train, 50)

# --- Preview Six Sampled Rows (fixed seed, red gradient) ---
print(clr.start + '.: Prediction Case Dataframe :.' + clr.end)
print(clr.color + '*' * 32)
(
    df_pred_case
    .sample(n=6, random_state=24)
    .style
    .background_gradient(cmap='Reds')
    .hide_index()
)

<div class="explain-box">
    The above data frame is <mark>six samples from 50 dummy data</mark> generated using Python. Furthermore, using the best model in the next section, <b>predictions will be made on dummy data, displaying the prediction results in a data frame before exporting it to CSV and JSON files</b>.
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- Predict the Dummy Rows with the RF Pipeline ---
y_pred_case = rf_pipeline.predict(df_pred_case)

# --- Append Predictions as a 'pred_target' Column ---
# The side-by-side concat aligns on the row index of both frames.
pred_case_target = pd.DataFrame(data=y_pred_case, columns=['pred_target'])
df_pred_case = pd.concat(
    [df_pred_case, pred_case_target],
    axis=1,
)

# --- Preview Six Sampled Rows of the Final Dataframe (fixed seed) ---
print(clr.start + '.: Final Prediction Case Dataframe :.' + clr.end)
print(clr.color + '*' * 38)
(
    df_pred_case
    .sample(n=6, random_state=24)
    .style
    .apply(coloring_target_col)
    .hide_index()
)

In [None]:
# --- Export to CSV and JSON Files ---
# Writes the dummy rows with their predicted target to
# pred_case/<pred_output_name>.csv and pred_case/<pred_output_name>.json.
import os

pred_output_name = 'pred_case_heart_disease_caesarmario'
os.makedirs('pred_case', exist_ok=True)  # to_csv/to_json do not create missing folders
df_pred_case.to_csv(f'pred_case/{pred_output_name}.csv', index=False, sep=',', encoding='utf-8')
df_pred_case.to_json(f'pred_case/{pred_output_name}.json', orient='index')

# <div class="header1">9. | Conclusions and Future Improvements 🧐</div>
<div class="explain-box">
    From the results of dataset analysis and implementation of machine learning models in the previous section, <mark>it can be concluded as follows</mark>:
    <blockquote style="color: #000000;">
        <ul>
            <li><mark>Random forest is the best model</mark> out of 9 machine-learning models implemented in this notebook. This is because <b>this model fits well with train and test data</b>. In addition, <b>this model also performs better than other models when predicting the test data</b> (can be seen from the performance evaluation graph and classification report of each model).</li>
            <li>Based on previous findings, <mark>medical workers can focus more on examining the five variables previously mentioned</mark>. This is because these five variables most influence whether a patient has heart disease.</li>
            <li><mark>The prediction results on test data, dummy data, and the complete machine learning pipeline have been successfully exported</mark> for other purposes. In addition, data exploration has also been successfully carried out using the <code>ydata-profiling</code>, <code>seaborn</code>, and <code>matplotlib</code> libraries.</li>
            <li><mark>Several improvements can be implemented in the following research/notebook</mark>. For example, by carrying out A/B Testing on patients with the same major vessel number in one group. Another example is performing advanced hyperparameter tuning experiments to obtain higher accuracy (~90%).</li>
        </ul>
    </blockquote>
</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

# <div class="header1">10. | References 🔗</div>
<div class="references">
    <ul><b><u>Kaggle Notebook 📚</u></b>
        <li><a style="color: #3D5A80" href="https://www.kaggle.com/vivek468/what-visualizations-should-you-use">What Visualizations Should You Use? by Vivek Chowdhury</a></li>
        <li><a style="color: #3D5A80" href="https://www.kaggle.com/code/sonalisingh1411/eda-on-train-test-dataset-price-prediction">EDA On Train & Test Dataset+🏡 Price💸Prediction🤔 by Sonali Singh</a></li>
        <li><a style="color: #3D5A80" href="https://www.kaggle.com/cdabakoglu/heart-disease-classifications-machine-learning">Heart Disease - Classifications (Machine Learning) by Caner Dabakoglu</a></li>
        <li><a style="color: #3D5A80" href="https://www.kaggle.com/code/asimislam/heart-disease-uci-eda-and-ml-w-lr">Heart Disease UCI - EDA and ML w/LR by Asim Islam</a></li>
        <li><a style="color: #3D5A80" href="https://www.kaggle.com/code/kellibelcher/heart-disease-predictions-with-shapley">Heart Disease Predictions with Shapley by Kelli Belcher</a></li>
    </ul>
    <ul><b><u>Online Articles 🌏</u></b>
        <li><a style="color: #3D5A80" href="https://www.simplilearn.com/tutorials/machine-learning-tutorial/logistic-regression-in-python">An Introduction to Logistic Regression in Python by Simplilearn</a></li>
        <li><a style="color: #3D5A80" href="https://learn.g2.com/k-nearest-neighbor">What Is K-Nearest Neighbor? An ML Algorithm to Classify Data by Amal Joby</a></li>
        <li><a style="color: #3D5A80" href="https://www.javatpoint.com/machine-learning-support-vector-machine-algorithm">Support Vector Machine Algorithm by Javatpoint</a></li>
        <li><a style="color: #3D5A80" href="https://iq.opengenus.org/gaussian-naive-bayes/">Gaussian Naive Bayes by OpenGenus</a></li>
        <li><a style="color: #3D5A80" href="https://www.javatpoint.com/machine-learning-decision-tree-classification-algorithm">Decision Tree Classification Algorithm by Javatpoint</a></li>
        <li><a style="color: #3D5A80" href="https://www.analyticsvidhya.com/blog/2020/05/decision-tree-vs-random-forest-algorithm/">Decision Tree vs. Random Forest – Which Algorithm Should you Use? by Abhishek Sharma</a></li>
        <li><a style="color: #3D5A80" href="https://towardsdatascience.com/understanding-random-forest-58381e0602d2">Understanding Random Forest by Tony Yiu</a></li>
        <li><a style="color: #3D5A80" href="https://datascience.eu/machine-learning/gradient-boosting-what-you-need-to-know/">Gradient Boosting – What You Need to Know by Data Science.EU</a></li>
        <li><a style="color: #3D5A80" href="https://towardsdatascience.com/understanding-gradient-boosting-machines-9be756fe76ab">Understanding Gradient Boosting Machines by Harshdeep Singh</a></li>
        <li><a style="color: #3D5A80" href="https://www.analyticsvidhya.com/blog/2021/09/adaboost-algorithm-a-complete-guide-for-beginners/">AdaBoost Algorithm – A Complete Guide for Beginners by Anshul Saini</a></li>
        <li><a style="color: #3D5A80" href="https://www.geeksforgeeks.org/ml-extra-tree-classifier-for-feature-selection/">ML | Extra Tree Classifier for Feature Selection by GeeksforGeeks</a></li>
    </ul>
</div>

<hr>
<center>
    <span class="thanks">.: Thanks for Viewing :.</span><br><br>
    <span class="thanks-explain">📌 Like this notebook? You can support me by giving <mark><b>upvote</b></mark> 😆👍🔼</span><br>
    <span class="three-dots2">...</span><br><br>
    <span class="thanks-explain">Similar project on <b><u>GitHub</u></b>:</span><br><br>
    <span class="promotion">▸▸ Disease Prediction using <b>SAS Studio</b> <a href="https://github.com/caesarmario/heart-disease-prediction-with-logistic-regression-SAS-studio">here</a> ◂◂</span><br><br>
    <span class="thanks-watermark"><u>Support me!</u></span><br>
    <span class="ko-fi">
        <a href='https://ko-fi.com/D1D3JU963' target='_blank'><img src='https://ko-fi.com/img/githubbutton_sm.svg' alt='Support me on Ko-fi Button'/></a>
    </span><br>
    <span class="thanks-watermark"><u>Follow me on other platforms</u></span><br>
    <div align="center" class="social-media">
        <ul>
            <li><a href="https://www.kaggle.com/caesarmario"><img src="https://i.imgur.com/K6QyzaJ.png"></a></li>
            <li><a href="https://public.tableau.com/app/profile/caesarmario"><img src="https://i.imgur.com/JVxVkeQ.png"></a></li>
            <li><a href="https://github.com/caesarmario"><img src="https://i.imgur.com/Orp40Ys.png"></a></li>
            <li><a href="https://caesarmario.medium.com/"><img src="https://i.imgur.com/6TrHyu0.png"></a></li>
            <li><a href="https://www.linkedin.com/in/caesarmario"><img src="https://i.imgur.com/vVYd0aI.png"></a></li>
        </ul>
    </div>
    <img src="https://i.imgur.com/Xy0J9G0.png" width=65% alt="WM">
</center>