<a href="https://colab.research.google.com/github/lenin438/prettymapp/blob/main/Heart_Failure_Prediction%F0%9F%AB%80%F0%9F%8F%A5%7C_Streamlit_App%F0%9F%9A%80.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the 'fedesoriano/heart-failure-prediction' dataset through kagglehub and
# keep the returned value (presumably the local download directory -- confirm
# against the kagglehub documentation).
fedesoriano_heart_failure_prediction_path = kagglehub.dataset_download('fedesoriano/heart-failure-prediction')

print('Data source import complete.')


<div style="color:white;
            display:fill;
            border-radius:15px;
            background-color:black;
            font-size:100%;
            font-family:Verdana;
            letter-spacing:1px">
    <h1 style='padding: 20px;
              color:white;
              text-align:center;'>
        Import Needed Libraries
    </h1>
    </div>

In [None]:
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.offline import iplot , plot
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
import plotly.io as pio
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler , LabelEncoder
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import  ConfusionMatrixDisplay, classification_report
import warnings
warnings.filterwarnings('ignore')

<div style="color:white;
            display:fill;
            border-radius:15px;
            background-color:black;
            font-size:100%;
            font-family:Verdana;
            letter-spacing:1px">
    <h1 style='padding: 20px;
              color:white;
              text-align:center;'>
        Reading Dataset And Get Info
    </h1>
    </div>

In [None]:
import os

# Load heart.csv from the directory the kagglehub download cell reported
# (fedesoriano_heart_failure_prediction_path), so the notebook also runs outside
# Kaggle (e.g. on Colab) where '/kaggle/input' does not exist. If that cell was
# deleted, fall back to the standard Kaggle input mount.
data_dir = globals().get(
    'fedesoriano_heart_failure_prediction_path',
    '/kaggle/input/heart-failure-prediction',
)
df = pd.read_csv(os.path.join(data_dir, 'heart.csv'))

In [None]:
# Display 5 randomly chosen rows of the raw data
df.sample(5)

In [None]:
# Print the number of rows and columns of the raw data
print(f"Number of Row : {df.shape[0]}\nNumber of Columns : {df.shape[1]}")

In [None]:
# Print pandas' summary of the frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Count NaN values in each column
df.isna().sum()

In [None]:
# Summary statistics of the numeric columns
df.describe()

#### These are all errors in data collection
- The minimum value of 0 in two columns (RestingBP, Cholesterol) is not realistic
- An Oldpeak value of -2.6 mm is not realistic because ST depression (Oldpeak) cannot be negative

In [None]:
# Per-column overview: total row count, number and percentage of nulls,
# and number of distinct values (cardinality)
pd.DataFrame({'Count':df.shape[0],
              'Null':df.isnull().sum(),
              'Null %':df.isnull().mean() * 100,
              'Cardinality':df.nunique()
})

In [None]:
# Count fully duplicated rows
df.duplicated().sum()

<div style="color:white;
            display:fill;
            border-radius:15px;
            background-color:black;
            font-size:100%;
            font-family:Verdana;
            letter-spacing:1px">
    <h1 style='padding: 20px;
              color:white;
              text-align:center;'>
        Handling Missing Data (Nulls)
    </h1>
    </div>

## Handle unrealistic values (RestingBP, Cholesterol, Oldpeak)

#### The negative values are real measurements that were entered with the wrong sign, because Oldpeak should be positive

In [None]:
# Show the rows whose Oldpeak value is negative
df[df['Oldpeak']<0]

In [None]:
# Replace every Oldpeak value with its absolute value (negative -> positive)
df['Oldpeak'] = df['Oldpeak'].abs()

#### Convert the 0 values in these two columns (RestingBP, Cholesterol) to NaN to determine how many values are missing in each feature

In [None]:
# Treat 0 in RestingBP / Cholesterol as missing by replacing it with NaN
df[['RestingBP','Cholesterol']] = df[['RestingBP','Cholesterol']].replace(0,np.nan)

In [None]:
# Count the missing values (NaN) in RestingBP and Cholesterol
df[['RestingBP','Cholesterol']].isna().sum()

In [None]:
# Show the rows whose RestingBP is missing, to inspect their HeartDisease status
df[df['RestingBP'].isna()]

In [None]:
# Fill each missing RestingBP with the median RestingBP of the row's own
# HeartDisease class (the same per-class strategy used for Cholesterol),
# instead of always using the median of the HeartDisease == 1 group.
# transform('median') returns one value per row, aligned to df's index.
df['RestingBP'] = df['RestingBP'].fillna(
    df.groupby('HeartDisease')['RestingBP'].transform('median')
)

In [None]:
# Impute missing Cholesterol with the median of the row's HeartDisease class
temp = df.groupby('HeartDisease')['Cholesterol'].median()
for label in (0, 1):
    missing_in_class = df['Cholesterol'].isna() & df['HeartDisease'].eq(label)
    df.loc[missing_in_class, 'Cholesterol'] = temp[label]

In [None]:
# Count nulls per column again, after the missing values were filled
df.isnull().sum()

<div style="color:white;
            display:fill;
            border-radius:15px;
            background-color:black;
            font-size:100%;
            font-family:Verdana;
            letter-spacing:1px">
    <h1 style='padding: 20px;
              color:white;
              text-align:center;'>
        Outlier Detection
    </h1>
    </div>

In [None]:
def detect_outliers_iqr(df, columns, threshold = 1.5):
    """Locate IQR-based outliers in the given columns.

    A value counts as an outlier when it lies strictly below
    ``Q1 - threshold * IQR`` or strictly above ``Q3 + threshold * IQR``.

    Parameters
    ----------
    df : DataFrame to inspect (it is not modified).
    columns : iterable of column names to check.
    threshold : multiplier applied to the interquartile range.

    Returns
    -------
    dict mapping each column name to the list of index labels of its outliers.
    """
    def _outlier_labels(series):
        # Fences are derived from the 25th / 75th percentiles of this column
        first_q = series.quantile(0.25)
        third_q = series.quantile(0.75)
        spread = third_q - first_q
        too_low = series < first_q - threshold * spread
        too_high = series > third_q + threshold * spread
        return series[too_low | too_high].index.tolist()

    return {name: _outlier_labels(df[name]) for name in columns}

# Numeric columns that are checked for outliers
numerical_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
outliers_iqr = detect_outliers_iqr(df, numerical_columns)

# Report how many outliers were flagged in each column
for col, indices in outliers_iqr.items():
    print(f"{col}: {len(indices)} outliers detected")

In [None]:
# One box plot per numeric column of the raw data, on a 2x3 grid
plt.figure(figsize=(15, 6))

for i, col in enumerate(numerical_columns, start=1):
    axis = plt.subplot(2, 3, i)
    sns.boxplot(x=df[col], color="skyblue", ax=axis)
    axis.set_title(f"Box Plot of {col}")

plt.tight_layout()
plt.show()

In [None]:
def remove_outliers(df, columns, threshold=1.5):
    """Return a copy of ``df`` without rows that are IQR outliers.

    Columns are processed in the given order. For each column the quartiles
    are computed on the rows that survived the previous columns, and only rows
    whose value lies inside ``[Q1 - threshold * IQR, Q3 + threshold * IQR]``
    are kept.

    The fences are inclusive: a value sitting exactly on a fence is kept, in
    line with ``detect_outliers_iqr`` which only flags values strictly outside
    the fences, and a column whose IQR is 0 no longer loses every row.
    Rows with NaN in a processed column are dropped (NaN is inside no interval).

    Parameters
    ----------
    df : input DataFrame (left unmodified).
    columns : iterable of column names to filter on.
    threshold : multiplier applied to the interquartile range.

    Returns
    -------
    DataFrame containing only the rows kept for every listed column; the
    original index labels are preserved.
    """
    df_clean = df.copy()
    for col in columns:
        Q1 = df_clean[col].quantile(0.25)  # First quartile (25%)
        Q3 = df_clean[col].quantile(0.75)  # Third quartile (75%)
        IQR = Q3 - Q1  # Interquartile range
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        # between() is inclusive on both ends by default
        df_clean = df_clean[df_clean[col].between(lower_bound, upper_bound)]

    return df_clean

# Build the cleaned frame by removing outlier rows for the numeric columns
df_cleaned = remove_outliers(df, numerical_columns)

In [None]:
# Print the shape of the cleaned data
print(f"Number of Row : {df_cleaned.shape[0]}\nNumber of Columns : {df_cleaned.shape[1]}")

#### Cleaned Data BoxPlot To Show Outliers

In [None]:
# One box plot per numeric column of the cleaned data, on a 2x3 grid
plt.figure(figsize=(15, 6))

for i, col in enumerate(numerical_columns, start=1):
    axis = plt.subplot(2, 3, i)
    sns.boxplot(x=df_cleaned[col], color="skyblue", ax=axis)
    axis.set_title(f"Box Plot of {col}")

plt.tight_layout()
plt.show()

In [None]:
# Replace the index with a fresh 0..n-1 range (old labels are discarded),
# then display 5 random rows
df_cleaned.reset_index(inplace = True , drop = True)
df_cleaned.sample(5)

In [None]:
# Print pandas' summary of the cleaned frame (columns, dtypes, non-null counts)
df_cleaned.info()

<div style="color:white;
            display:fill;
            border-radius:15px;
            background-color:black;
            font-size:100%;
            font-family:Verdana;
            letter-spacing:1px">
    <h1 style='padding: 20px;
              color:white;
              text-align:center;'>
         EDA of Data and Get Insights
    </h1>
    </div>

## Age distribution in the data

In [None]:
# Age distribution per sex, side by side.
# The row filter is built from df_cleaned itself: a mask taken from the original
# df is matched by index label, and because df_cleaned was filtered and then
# re-indexed, those labels no longer refer to the same patients.
plt.figure(figsize=(15, 6))

age_panels = [
    ('M', 'blue', 'Age Distribution for Male'),
    ('F', 'pink', 'Age Distribution for Female'),
]
for position, (sex, color, title) in enumerate(age_panels, start=1):
    plt.subplot(1, 2, position)
    sns.histplot(df_cleaned.loc[df_cleaned['Sex'] == sex, 'Age'],
                 color=color, kde=True, bins=20, stat='density')
    plt.title(title)

plt.show()

- Most ages fall between 50 and 60

In [None]:
# Share of each sex in the cleaned data.
# Slice labels are derived from the value_counts index, so every slice keeps the
# correct label regardless of which category happens to be the most frequent.
sex_counts = df_cleaned['Sex'].value_counts()
sex_labels = {'M': 'Male', 'F': 'Female'}
fig = px.pie(values = sex_counts,
             names = [sex_labels.get(code, str(code)) for code in sex_counts.index],
             template = 'plotly_dark',
             title = 'The Percentage of Type of Gender in the Data'
             ).update_traces(textinfo='label+percent')
pio.renderers.default = 'iframe'
fig.show()

- Males are the dominant Category in the data

## Chest Pain Type for Each Sex

In [None]:
# Chest pain type frequencies among male patients
male_pain_counts = df_cleaned.loc[df_cleaned['Sex'] == 'M', 'ChestPainType'].value_counts()
fig = px.bar(
    male_pain_counts,
    color=male_pain_counts.index,
    template='plotly_dark',
    title='Most Chest Pain Type For Male',
    labels={'value': 'Count'},
    text_auto=True,
)
fig.show()

In [None]:
# Chest pain type frequencies among female patients
female_pain_counts = df_cleaned.loc[df_cleaned['Sex'] == 'F', 'ChestPainType'].value_counts()
fig = px.bar(
    female_pain_counts,
    color=female_pain_counts.index,
    template='plotly_dark',
    title='Most Chest Pain Type For Female',
    labels={'value': 'Count'},
    text_auto=True,
)
fig.show()

In [None]:
# Chest pain type frequencies over all patients
pain_counts = df_cleaned['ChestPainType'].value_counts()
fig = px.bar(
    pain_counts,
    color=pain_counts.index,
    template='plotly_dark',
    title='Most Chest Pain Type In General',
    labels={'value': 'Count'},
    text_auto=True,
)
fig.show()

#### Most Common Chest Pain Types
- Most common for both males and females -> ASY: Asymptomatic
- Least common for both males and females -> TA: Typical Angina

## Distribution of Resting Blood Pressure

In [None]:
# RestingBP distribution for every (sex, heart disease) combination, 2x2 grid.
# Both filter conditions come from df_cleaned: a Sex mask taken from the original
# df is matched by index label, and since df_cleaned was filtered and re-indexed
# those labels no longer refer to the same patients.
plt.figure(figsize=(20, 12))

resting_bp_panels = [
    ('M', 0, 'blue', 'Resting Blood Pressure [mm Hg] Distribution for Male Has Not Heart Disease'),
    ('M', 1, 'blue', 'Resting Blood Pressure [mm Hg] Distribution for Male Has Heart Disease'),
    ('F', 0, 'pink', 'Resting Blood Pressure [mm Hg] Distribution for Female Has Not Heart Disease'),
    ('F', 1, 'pink', 'Resting Blood Pressure [mm Hg] Distribution for Female Has Heart Disease'),
]
for position, (sex, heart_disease, color, title) in enumerate(resting_bp_panels, start=1):
    plt.subplot(2, 2, position)
    in_group = (df_cleaned['Sex'] == sex) & (df_cleaned['HeartDisease'] == heart_disease)
    sns.histplot(df_cleaned.loc[in_group, 'RestingBP'],
                 color=color, kde=True, bins=20, stat='density')
    plt.title(title)

plt.show()

## Distribution of Cholesterol

In [None]:
# Cholesterol distribution for every (sex, heart disease) combination, 2x2 grid.
# Both filter conditions come from df_cleaned: a Sex mask taken from the original
# df is matched by index label, and since df_cleaned was filtered and re-indexed
# those labels no longer refer to the same patients.
plt.figure(figsize=(20, 12))

cholesterol_panels = [
    ('M', 0, 'blue', 'Cholesterol Distribution for Male Has Not Heart Disease'),
    ('M', 1, 'blue', 'Cholesterol Distribution for Male Has Heart Disease'),
    ('F', 0, 'pink', 'Cholesterol Distribution for Female Has Not Heart Disease'),
    ('F', 1, 'pink', 'Cholesterol Distribution for Female Has Heart Disease'),
]
for position, (sex, heart_disease, color, title) in enumerate(cholesterol_panels, start=1):
    plt.subplot(2, 2, position)
    in_group = (df_cleaned['Sex'] == sex) & (df_cleaned['HeartDisease'] == heart_disease)
    sns.histplot(df_cleaned.loc[in_group, 'Cholesterol'],
                 color=color, kde=True, bins=20, stat='density')
    plt.title(title)

plt.show()

## Percentage of Fasting Blood Sugar (FastingBS) for People With and Without Heart Disease

In [None]:
# FastingBS share among people without heart disease.
# Slice labels are looked up from the value_counts index (0 / 1), so they stay
# correct regardless of which value is the most frequent in this subgroup.
fbs_counts = df_cleaned.loc[df_cleaned['HeartDisease'] == 0, 'FastingBS'].value_counts()
fbs_labels = {0: 'otherwise', 1: 'FastingBS > 120 mg'}
fig = px.pie(values = fbs_counts,
             names = [fbs_labels.get(flag, str(flag)) for flag in fbs_counts.index],
             template = 'plotly_dark',
             title = 'The Percentage of FastingBS in For People has Not Heart Disease'
             ).update_traces(textinfo='label+percent')

fig.show()

In [None]:
# FastingBS share among people with heart disease.
# Slice labels are looked up from the value_counts index (0 / 1), so they stay
# correct regardless of which value is the most frequent in this subgroup.
fbs_counts = df_cleaned.loc[df_cleaned['HeartDisease'] == 1, 'FastingBS'].value_counts()
fbs_labels = {0: 'otherwise', 1: 'FastingBS > 120 mg'}
fig = px.pie(values = fbs_counts,
             names = [fbs_labels.get(flag, str(flag)) for flag in fbs_counts.index],
             template = 'plotly_dark',
             title = 'The Percentage of FastingBS in For People has Heart Disease'
             ).update_traces(textinfo='label+percent')

fig.show()

- Elevated fasting blood sugar (FastingBS > 120 mg) is more frequent among people with heart disease

### Resting Electrocardiogram Results (RestingECG) for People With and Without Heart Disease

In [None]:
# RestingECG share among people without heart disease.
# The slice labels are the category values taken from the value_counts index,
# so each count is always paired with its own category instead of relying on a
# hand-written label order.
ecg_counts = df_cleaned.loc[df_cleaned['HeartDisease'] == 0, 'RestingECG'].value_counts()
fig = px.pie(values = ecg_counts,
             names = ecg_counts.index.tolist(),
             template = 'plotly_dark',
             title = 'The Percentage of Resting Electrocardiogram Results (RestingECG) in For People has Not Heart Disease'
             ).update_traces(textinfo='label+percent')

fig.show()

In [None]:
# RestingECG share among people with heart disease.
# The slice labels are the category values taken from the value_counts index,
# so each count is always paired with its own category instead of relying on a
# hand-written label order.
ecg_counts = df_cleaned.loc[df_cleaned['HeartDisease'] == 1, 'RestingECG'].value_counts()
fig = px.pie(values = ecg_counts,
             names = ecg_counts.index.tolist(),
             template = 'plotly_dark',
             title = 'The Percentage of Resting Electrocardiogram Results (RestingECG) in For People has Heart Disease'
             ).update_traces(textinfo='label+percent')

fig.show()

## Distribution of Maximum Heart Rate Achieved (MaxHR)

In [None]:
# MaxHR distribution for every (sex, heart disease) combination, 2x2 grid.
# Both filter conditions come from df_cleaned: a Sex mask taken from the original
# df is matched by index label, and since df_cleaned was filtered and re-indexed
# those labels no longer refer to the same patients.
plt.figure(figsize=(20, 12))

max_hr_panels = [
    ('M', 0, 'blue', 'Maximum Heart Rate Achieved Distribution for Male Has Not Heart Disease'),
    ('M', 1, 'blue', 'Maximum Heart Rate Achieved Distribution for Male Has Heart Disease'),
    ('F', 0, 'pink', 'Maximum Heart Rate Achieved Distribution for Female Has Not Heart Disease'),
    ('F', 1, 'pink', 'Maximum Heart Rate Achieved Distribution for Female Has Heart Disease'),
]
for position, (sex, heart_disease, color, title) in enumerate(max_hr_panels, start=1):
    plt.subplot(2, 2, position)
    in_group = (df_cleaned['Sex'] == sex) & (df_cleaned['HeartDisease'] == heart_disease)
    sns.histplot(df_cleaned.loc[in_group, 'MaxHR'],
                 color=color, kde=True, bins=20, stat='density')
    plt.title(title)

plt.show()

## Oldpeak for People With and Without Heart Disease

In [None]:
# Oldpeak distribution for every (sex, heart disease) combination, 2x2 grid.
# Both filter conditions come from df_cleaned: a Sex mask taken from the original
# df is matched by index label, and since df_cleaned was filtered and re-indexed
# those labels no longer refer to the same patients.
plt.figure(figsize=(20, 12))

oldpeak_panels = [
    ('M', 0, 'blue', 'Oldpeak Distribution for Male Has Not Heart Disease'),
    ('M', 1, 'blue', 'Oldpeak Distribution for Male Has Heart Disease'),
    ('F', 0, 'pink', 'Oldpeak Distribution for Female Has Not Heart Disease'),
    ('F', 1, 'pink', 'Oldpeak Distribution for Female Has Heart Disease'),
]
for position, (sex, heart_disease, color, title) in enumerate(oldpeak_panels, start=1):
    plt.subplot(2, 2, position)
    in_group = (df_cleaned['Sex'] == sex) & (df_cleaned['HeartDisease'] == heart_disease)
    sns.histplot(df_cleaned.loc[in_group, 'Oldpeak'],
                 color=color, kde=True, bins=20, stat='density')
    plt.title(title)

plt.show()

- Higher Oldpeak values are strongly correlated with the presence of Heart Disease, indicating more severe heart ischemia or coronary artery disease.

## Total Insights
- Most ages fall between 50 and 60
- Males are the dominant category in the data
- Most common chest pain type for both males and females -> ASY: Asymptomatic
- Least common chest pain type for both males and females -> TA: Typical Angina
- Elevated fasting blood sugar (FastingBS > 120 mg) is more frequent among people with heart disease
- Exercise is associated with a higher probability of angina pectoris occurring
- Higher Oldpeak values are strongly correlated with the presence of heart disease, indicating more severe heart ischemia or coronary artery disease.
- A flat ST slope (Flat) is more strongly correlated with heart disease, while upsloping (Up) and downsloping (Down) slopes are typically associated with normal heart function or less severe heart issues.


<div style="color:white;
            display:fill;
            border-radius:15px;
            background-color:black;
            font-size:100%;
            font-family:Verdana;
            letter-spacing:1px">
    <h1 style='padding: 20px;
              color:white;
              text-align:center;'>
         Data Preprocessing
    </h1>
    </div>

## Handling Object DataType

### Encode Object DataType

In [None]:
# Label-encode every object (string) column of the cleaned data.
# The column list is taken from df_cleaned itself (the frame being encoded), and
# each column gets its own fitted encoder stored in label_encoders, so the
# category -> integer mapping of every column stays available afterwards
# (e.g. for inverse_transform or for encoding new input for the saved model).
column_to_encode = df_cleaned.select_dtypes(object).columns.tolist()
label_encoders = {}
for column in column_to_encode:
    le = LabelEncoder()
    df_cleaned[column] = le.fit_transform(df_cleaned[column])
    label_encoders[column] = le

In [None]:
# Show the first rows of the data after encoding
df_cleaned.head()

In [None]:
# Final column / dtype summary after encoding
df_cleaned.info()

## Assign feature and target variables

In [None]:
# Target is the HeartDisease column; features are all remaining columns
y = df_cleaned['HeartDisease']
X = df_cleaned.drop(columns=['HeartDisease'])

## Splitting Data

In [None]:
# Hold out 25% of the rows for testing; shuffled split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=44,
)

In [None]:
# Print the shapes of the train / test feature matrices and target vectors
print(f'Shape of X_Train {X_train.shape}')
print(f'Shape of X_Test {X_test.shape}')
print(f'Shape of Y_Train {y_train.shape}')
print(f'Shape of Y_Test {y_test.shape}')

<div style="color:white;
            display:fill;
            border-radius:15px;
            background-color:black;
            font-size:100%;
            font-family:Verdana;
            letter-spacing:1px">
    <h1 style='padding: 20px;
              color:white;
              text-align:center;'>
         Modeling
    </h1>
    </div>

## Try Different Model Algorithms

In [None]:
def Kfold(model,model_name):
    """Print the mean 10-fold cross-validation score of ``model``.

    The estimator is evaluated with ``cross_val_score(cv=10)`` on the
    notebook-level ``X`` / ``y``; the average of the fold scores is printed
    as a percentage, prefixed with ``model_name``. Nothing is returned.
    """
    fold_scores = cross_val_score(estimator=model, X=X, y=y, cv=10)
    model_score = fold_scores.mean()
    print(f"{model_name} score on cross validation: {model_score * 100}%")

def train(model,model_name):
    """Fit ``model`` on the training split and print its train / test scores.

    Uses the notebook-level ``X_train`` / ``y_train`` for fitting and
    ``model.score`` on both the training and the testing split; both scores
    are printed as percentages, prefixed with ``model_name``. The model is
    fitted in place and nothing is returned.
    """
    model.fit(X_train, y_train)
    # Scoring only reads the fitted model, so the two evaluations are independent
    model_test_score = model.score(X_test, y_test)
    model_train_score = model.score(X_train, y_train)
    print(f"{model_name} model score on Training data: {model_train_score * 100}%\n{model_name} model score on Testing data: {model_test_score * 100}%")

def class_report(model):
    """Print sklearn's classification report of ``model`` on the test split.

    Predictions are made on the notebook-level ``X_test`` and compared with
    ``y_test``. Nothing is returned.
    """
    print(classification_report(y_test, model.predict(X_test)))

## Random Forest

In [None]:
# Random Forest with 100 trees of depth <= 50.
# random_state is fixed (same seed as the train/test split) so the reported
# scores and the model saved later are reproducible from run to run.
rf = RandomForestClassifier(n_estimators = 100 , max_depth = 50 , random_state = 44)
Kfold(rf, "Random Forest")
train(rf, "Random Forest")

In [None]:
# Plot the confusion matrix of the Random Forest on the test split
ConfusionMatrixDisplay.from_estimator(rf,
                                       X_test,
                                       y_test,
                                       xticks_rotation=45
    );

In [None]:
# Print the classification report of the Random Forest on the test split
class_report(rf)

## XGBoost

In [None]:
# Gradient-boosted trees; the estimator is bound to both `xgboost` and `model`
xgboost = model = XGBClassifier(
    objective="binary:logistic",
    learning_rate=0.1,
    max_depth=3,
    min_child_weight=3,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.5,
)
Kfold(xgboost, "Xgboost")
train(xgboost, "Xgboost")

In [None]:
# Plot the confusion matrix of the XGBoost model on the test split
ConfusionMatrixDisplay.from_estimator(xgboost,
                                       X_test,
                                       y_test,
                                       xticks_rotation=45
    );

In [None]:
# Print the classification report of the XGBoost model on the test split
class_report(xgboost)

- Best Model is RandomForestClassifier with recall 90.0%

## Save RandomForestClassifier Model

In [None]:
# Persist the fitted Random Forest to 'RandomForestClassifier_model.sav' with joblib
joblib.dump(rf,'RandomForestClassifier_model.sav')

# Streamlit Application
##  [🚀 Click here to go to the GitHub repo](https://github.com/ahmedismaiill/AI-Projects-Main/tree/main/1-%20Machine%20Learning%20/2-%20Classification/Heart%20Failure%20Prediction)