# Background

# Dataset

# Objective

# Method

<h1> packages installation </h1>

In [34]:
!pip install kagglehub




<h1> Imports </h1>


In [None]:
import sys

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

<h1> Dataset path </h1>

In [None]:
# Decide where heart.csv is read from, based on which runtime modules are loaded.
if 'vscode' in sys.modules:
    # A module named 'vscode' is loaded (presumably a VS Code kernel -- verify):
    # read the CSV from the current working directory.
    DATASET_PATH = './heart.csv'
elif 'google.colab' in sys.modules:
    # kagglehub dataset handles have the form '<owner>/<dataset>'; a single file
    # inside the dataset is requested through `path=`, and the call then returns
    # the local path of that file. Appending the file name to the handle (as the
    # previous version did) is rejected as an invalid handle.
    DATASET_PATH = kagglehub.dataset_download(
        'fedesoriano/heart-failure-prediction',
        path='heart.csv',
    )
else:
    # Neither known runtime was detected, so the dataset location is undefined.
    raise Exception('Unknown environment')

## Data Exploration and visualization

In [None]:

# Load the dataset: read the CSV at DATASET_PATH into the working DataFrame `df`
df = pd.read_csv(DATASET_PATH)
# Bare last expression so the notebook renders the DataFrame as the cell output
df

In [None]:
# One small figure per feature: every column except the last one is plotted.
selected = df.iloc[:, :-1]

for column in selected.columns:
    fig, ax = plt.subplots(figsize=(2, 2))

    # Decide by "is this numeric?" rather than `dtype == 'object'`: text columns
    # are not guaranteed to be reported as 'object' (newer pandas versions can
    # infer a dedicated string dtype), which would send them to the histogram.
    if pd.api.types.is_numeric_dtype(selected[column]):
        # Histogram for numerical columns
        selected[column].hist(ax=ax)
    else:
        # Frequency bar chart for categorical columns
        selected[column].value_counts().plot(kind='bar', ax=ax)

    ax.set_title(column)
    plt.show()  # Render this column's figure before moving to the next one

In [None]:
# Density curve for each continuous feature, one subplot per column.
# `selected` is reused by the following cell, so the name is kept.
selected = df[['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']]
selected.plot.density(
    subplots=True,
    layout=(3, 5),
    sharex=False,
    figsize=(15, 15),
)

In [None]:
# Density plot per column of `selected`, annotated with mean, median and mode.
# The number of rows follows the number of columns instead of a hard-coded 5.
num_features = len(selected.columns)
fig, axes = plt.subplots(nrows=num_features, ncols=1, figsize=(10, 5 * num_features))
# plt.subplots returns a bare Axes (not an array) when there is a single row.
axes = np.atleast_1d(axes)

for i, column in enumerate(selected.columns):
    ax = axes[i]

    # Plotting the density
    df[column].plot(kind='density', ax=ax, color='blue', alpha=0.5, label='PDF')

    # Calculate mean, median, and mode
    mean = df[column].mean()
    median = df[column].median()
    mode = df[column].mode()[0]  # mode() may return several values; use the first

    # Mark each statistic with a vertical line at its actual x position.
    # (Previously the lines were drawn at value / 2, so they did not match the
    # numbers shown in the legend.)
    ax.axvline(x=mean, color='red', linestyle='--', label=f'Mean: {mean:.2f}')
    ax.axvline(x=median, color='green', linestyle='--', label=f'Median: {median:.2f}')
    ax.axvline(x=mode, color='purple', linestyle='--', label=f'Mode: {mode:.2f}')

    # Adding titles and labels
    ax.set_title(f'Probability Density Function for {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid()

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

In [None]:
# Skewness of the numeric features plus Sex encoded as a number.
# `assign` builds a new frame instead of writing into a slice of `df`, and
# `map` yields a numeric column directly ('M' -> 1, 'F' -> 0) rather than
# relying on `replace` to downcast an object column. Any value other than
# 'M'/'F' becomes NaN and is skipped by skew().
selected = df.loc[
    :, ['Age', 'Sex', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
].assign(Sex=lambda frame: frame['Sex'].map({'M': 1, 'F': 0}))
selected.skew()

In [None]:
# Box plot per continuous feature. Outliers matter here because they can skew
# the data, so these plots are used to spot values that need to be removed.
box_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
selected = df[box_columns]
selected.plot.box(
    subplots=True,
    sharex=False,
    sharey=False,
    layout=(5, 3),
    figsize=(15, 10),
)

In [None]:
# Summary statistics of `df`, rendered as the cell output
df.describe()

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
df.isna().sum()

# Optional: the same information can be visualised with a seaborn heatmap of
# df.isnull() (cbar=False, cmap='viridis') followed by plt.show().

## Feature extraction and selection

## Data cleaning and processing

### Handling missing data
Because the dataset contains no missing values in any feature, there was no need to impute missing values, drop rows with missing values, or drop any features.

### Removing outliers

In [None]:
# Keep only rows whose Age lies within 1.5 * IQR of the quartiles (bounds inclusive).
# Dropping the rest is intended to pull the skew of the column closer to 0.
Q1, Q3 = np.percentile(df['Age'], [25, 75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df = df[df['Age'].between(lower_fence, upper_fence)]

In [None]:
# Keep only rows whose RestingBP lies within 1.5 * IQR of the quartiles (bounds inclusive).
# Dropping the rest is intended to pull the skew of the column closer to 0.
Q1, Q3 = np.percentile(df['RestingBP'], [25, 75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df = df[df['RestingBP'].between(lower_fence, upper_fence)]

In [None]:
# Keep only rows whose Cholesterol lies within 1.5 * IQR of the quartiles (bounds inclusive).
# Dropping the rest is intended to pull the skew of the column closer to 0.
Q1, Q3 = np.percentile(df['Cholesterol'], [25, 75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df = df[df['Cholesterol'].between(lower_fence, upper_fence)]

In [None]:
# Keep only rows whose MaxHR lies within 1.5 * IQR of the quartiles (bounds inclusive).
# Dropping the rest is intended to pull the skew of the column closer to 0.
Q1, Q3 = np.percentile(df['MaxHR'], [25, 75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df = df[df['MaxHR'].between(lower_fence, upper_fence)]

In [None]:
# Keep only rows whose Oldpeak lies within 1.5 * IQR of the quartiles (bounds inclusive).
# Dropping the rest is intended to pull the skew of the column closer to 0.
Q1, Q3 = np.percentile(df['Oldpeak'], [25, 75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df = df[df['Oldpeak'].between(lower_fence, upper_fence)]

In [None]:
# Display `df` as it stands after the IQR filters on Age, RestingBP, Cholesterol, MaxHR and Oldpeak
df

In [None]:
# Re-draw the box plots on the filtered data to see which outliers remain
# (outliers can skew the data, so any that are left still need to be removed).
box_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
selected = df[box_columns]
selected.plot.box(
    subplots=True,
    sharex=False,
    sharey=False,
    layout=(5, 3),
    figsize=(15, 10),
)

In [None]:
# Second IQR pass on Cholesterol: the quartiles are recomputed on the already
# filtered frame, then rows outside 1.5 * IQR of them are dropped (bounds inclusive).
# This is intended to pull the skew of the column closer to 0.
Q1, Q3 = np.percentile(df['Cholesterol'], [25, 75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df = df[df['Cholesterol'].between(lower_fence, upper_fence)]

In [None]:
# Box plots after the second Cholesterol pass, to check for remaining outliers
# (outliers can skew the data, so any that are left still need to be removed).
box_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
selected = df[box_columns]
selected.plot.box(
    subplots=True,
    sharex=False,
    sharey=False,
    layout=(5, 3),
    figsize=(15, 10),
)

In [None]:
# Third IQR pass on Cholesterol: the quartiles are recomputed on the current
# frame, then rows outside 1.5 * IQR of them are dropped (bounds inclusive).
# This is intended to pull the skew of the column closer to 0.
Q1, Q3 = np.percentile(df['Cholesterol'], [25, 75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df = df[df['Cholesterol'].between(lower_fence, upper_fence)]

In [None]:
# Final box plots after the third Cholesterol pass
# (used to confirm whether outliers that could skew the data are still present).
box_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
selected = df[box_columns]
selected.plot.box(
    subplots=True,
    sharex=False,
    sharey=False,
    layout=(5, 3),
    figsize=(15, 10),
)

In [None]:
# Display `df` after all outlier-removal passes
df

In [None]:
# Skewness of the numeric features (plus Sex encoded as a number) on the
# current, outlier-filtered `df`.
# `assign` builds a new frame instead of writing into a slice of `df`, and
# `map` yields a numeric column directly ('M' -> 1, 'F' -> 0) rather than
# relying on `replace` to downcast an object column. Any value other than
# 'M'/'F' becomes NaN and is skipped by skew().
selected = df.loc[
    :, ['Age', 'Sex', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
].assign(Sex=lambda frame: frame['Sex'].map({'M': 1, 'F': 0}))
selected.skew()

### Dropping Duplicates
The dataset contains no duplicate rows, so no duplicates needed to be removed.

### Correlation

In [None]:
# Pairwise correlation between the continuous features of `df`.
# (The pandas / seaborn / matplotlib re-imports that used to sit here were
# dropped: this cell uses none of them directly.)
selected_df_corr = df.loc[:, ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']]

# corr() with its default settings computes the correlation matrix of the selected columns
correlation_matrix = selected_df_corr.corr()

In [None]:
# Annotated heatmap of `correlation_matrix`; seaborn and matplotlib come from
# the notebook's import cell rather than being re-imported here.
fig, ax = plt.subplots()
sns.heatmap(correlation_matrix, cmap='Reds', annot=True, fmt='.2f', ax=ax)
ax.set_title('Correlation between numerical features')

plt.show()

In [None]:
# Scatter plot of Oldpeak against Age
df.plot.scatter(x='Age', y='Oldpeak')

Since the correlations between the numerical features are not strong, we decided not to exclude any of them. Even the strongest correlation, between Oldpeak and Age, is not strong enough to justify dropping either feature.

### Feature Scaling

We opt for standardization because these features have different units (e.g., mmHg, mg/dL) and wide ranges. Standardization puts them on a comparable scale (mean 0, standard deviation 1) while preserving the shape of each distribution.

In [None]:
# Standardise the continuous features (StandardScaler comes from the import cell).
features = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
sc = StandardScaler()

# `df` is the result of boolean filtering in the outlier-removal cells; take an
# explicit copy so the column assignment below writes to an independent frame
# instead of triggering pandas' SettingWithCopyWarning.
df = df.copy()
df[features] = sc.fit_transform(df[features])

In [None]:
# Display `df` after the feature-scaling cell
df

## PCA analysis

In [None]:


# PCA on the continuous features (PCA comes from the notebook's import cell).
# Assumes `df` already holds the scaled features.
features = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
X = df[features]

# Apply PCA with 2 components. The code previously requested a single
# component while both this comment and the printed summary refer to two.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)

# Share of the total variance captured by each retained component, and their sum
explained_variance_ratio = pca.explained_variance_ratio_
print(explained_variance_ratio)
total_variance_explained = np.sum(explained_variance_ratio)
print(f"Total variance explained by the two components: {total_variance_explained}")

QQ-Plot

## Model selection

## Model training

## Model evaluation

# Results

## Clustering

## Classification

# Discussion and Future Insight