In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set the seaborn theme once for the whole notebook: "whitegrid" style and the
# "muted" colour palette become the defaults for the figures drawn below.
# (sns.set is an alias of sns.set_theme in current seaborn releases.)
sns.set(style="whitegrid", palette="muted")

# 1. Load the Dataset

In [None]:
# Read the raw CSV; the path is relative to the notebook's working directory
raw_csv_path = "dataset/heart_disease.csv"
df = pd.read_csv(raw_csv_path)

# Report the (rows, columns) shape, then preview the leading rows as the cell output
print(f"Shape: {df.shape}")
df.head()

# 2. Data Cleaning & Missing Value Analysis
- Check for duplicates.
- Identify and handle missing values (the original dataset marks some of them with '?').

In [None]:
# Count rows that are exact copies of an earlier row (all columns equal)
duplicates = df.duplicated().sum()
print(f"Number of duplicates: {duplicates}")

# Drop duplicates if any.
# Rebind `df` rather than using inplace=True so the frame shown by earlier cells
# is not mutated behind the reader's back, and renumber the index so the
# removed rows do not leave gaps in it.
if duplicates > 0:
    df = df.drop_duplicates(ignore_index=True)
    print("Duplicates removed.")

In [None]:
# Turn the '?' placeholder into NaN so it is counted as a missing value
df = df.replace(to_replace='?', value=np.nan)

# Cast each column to a numeric dtype; errors='coerce' turns any value that
# still cannot be parsed as a number into NaN instead of raising
df = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))

# Tally NaNs per column and print only the columns that actually have some
missing_values = df.isna().sum()
columns_with_gaps = missing_values.loc[missing_values > 0]
print("Missing values per column:\n", columns_with_gaps)

In [None]:
# Handle missing values by imputation.
# Strategy: median for numerical/ordinal features, to be robust to outliers.
#
# The diagnosis label 'num' is deliberately NOT imputed: filling a missing label
# with the column median would fabricate ground truth (and could even produce a
# fractional class). Rows without a label are dropped instead.
# NOTE(review): for purely nominal code columns a median can land between two
# valid codes when the count is even; consider the mode for those - confirm
# which columns are nominal before changing this.
label_col = 'num'

n_missing_labels = df[label_col].isnull().sum()
if n_missing_labels > 0:
    # .copy() gives an independent frame so the column assignments below do not
    # act on a filtered selection of the previous one
    df = df.dropna(subset=[label_col]).copy()
    print(f"Dropped {n_missing_labels} rows with a missing '{label_col}' label")

# Impute every remaining column that still contains NaN with its own median
for col in df.columns.drop(label_col):
    if df[col].isnull().any():
        median_val = df[col].median()
        df[col] = df[col].fillna(median_val)
        print(f"Filled missing values in '{col}' with median: {median_val}")

# Sanity check: total NaN count across the whole frame
print("Remaining missing values:", df.isnull().sum().sum())

# 3. Target Variable Analysis (Binary Classification)
The dataset originally has a target variable 'num' ranging from 0 to 4.
- 0 = No Disease
- 1, 2, 3, 4 = Disease Presence

For the purpose of this analysis (Binary Classification), we will analyze the target as:
- 0 -> No Disease
- 1 -> Disease (mapped from > 0)

In [None]:
# Create a binary target column for analysis.
# 'num' is kept as-is; 'target' is 1 where num > 0 (any disease level) and 0
# otherwise. The vectorized comparison replaces a row-by-row apply(lambda).
df['target'] = (df['num'] > 0).astype(int)

# Check class distribution
target_counts = df['target'].value_counts()
print("Target Variable Distribution (0=No Disease, 1=Disease):")
print(target_counts)

# Share of positive rows. The mean of a 0/1 column equals count(1) / len(df),
# and unlike target_counts[1] it does not raise KeyError when the data
# contains no positive cases.
disease_ratio = df['target'].mean()
print(f"Disease Ratio: {disease_ratio:.2%}")

# 4. Statistical Summary
Report the mean, median (shown as the `50%` row of `describe()`), and standard deviation for the numerical features.

In [None]:
# Summary statistics per column, transposed so each feature is one row
df.describe().transpose()

# 5. Visualizations

### Distribution of the Binary Target Variable

In [None]:
# Bar chart of the class counts, drawn on an explicit Axes object
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=df, x='target', palette='coolwarm', ax=ax)

# Title, axis labels, and readable tick labels for the two classes
ax.set_title('Distribution of Target Variable (Binary)')
ax.set_xlabel('Diagnosis (0 = No Disease, 1 = Disease)')
ax.set_ylabel('Count')
ax.set_xticks([0, 1])
ax.set_xticklabels(['No Disease (0)', 'Disease (1)'])
plt.show()

### Histograms for Age and Cholesterol

In [None]:
# Histogram for Age vs Target
#
# The hue is a labelled column ('No'/'Yes') built from 'target', so seaborn
# creates the legend itself and every label is tied to the colour of its class.
# Overriding the legend with plt.legend(labels=[...]) instead pairs labels with
# artists by draw order, which can mislabel the classes.
# df.assign returns a temporary copy; the notebook's `df` is not modified.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df.assign(Disease=df['target'].map({0: 'No', 1: 'Yes'})),
    x='age',
    hue='Disease',              # the column name doubles as the legend title
    hue_order=['No', 'Yes'],    # fixed order keeps colours stable across runs
    bins=20,
    kde=True,
    palette='coolwarm',
    multiple='stack',
    ax=ax,
)
ax.set_title('Age Distribution by Target Class')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram for Cholesterol vs Target
#
# The hue is a labelled column ('No'/'Yes') built from 'target', so seaborn
# creates the legend itself and every label is tied to the colour of its class.
# Overriding the legend with plt.legend(labels=[...]) instead pairs labels with
# artists by draw order, which can mislabel the classes.
# df.assign returns a temporary copy; the notebook's `df` is not modified.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df.assign(Disease=df['target'].map({0: 'No', 1: 'Yes'})),
    x='chol',
    hue='Disease',              # the column name doubles as the legend title
    hue_order=['No', 'Yes'],    # fixed order keeps colours stable across runs
    bins=20,
    kde=True,
    palette='viridis',
    multiple='stack',
    ax=ax,
)
ax.set_title('Cholesterol Distribution by Target Class')
ax.set_xlabel('Cholesterol (mg/dl)')
ax.set_ylabel('Frequency')
plt.show()

### Correlation Heatmap

In [None]:
# Pairwise correlations of every column except the multi-level 'num',
# so the binary 'target' is the only outcome column in the matrix
cols_to_corr = df.drop(columns='num')
corr = cols_to_corr.corr()

# Annotated heatmap on an explicit Axes; the colour scale is pinned to [-1, 1]
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr,
    annot=True,
    cmap='RdBu_r',
    fmt='.2f',
    linewidths=0.5,
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Correlation Heatmap (Binary Target)')
plt.show()

# 6. Save Processed Data
Save the cleaned dataset as a CSV file. The file contains the imputed values, the original 'num' column (preserved for downstream preprocessing steps), and the new binary 'target' column.

In [None]:
# Create the output directory if needed.
# exist_ok=True makes this a single idempotent call: no separate existence
# check, and no failure if the directory appears between check and creation.
output_dir = "processed_data"
os.makedirs(output_dir, exist_ok=True)

# Save the cleaned dataframe.
# We keep 'num' just in case downstream notebooks expect it for their own
# transformations, but 'target' is available for immediate use.
# index=False leaves the row index out of the file.
output_path = os.path.join(output_dir, "heart_disease_cleaned.csv")
df.to_csv(output_path, index=False)

print(f"Cleaned dataset saved to: {output_path}")
print("Columns included:", df.columns.tolist())