In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Apply seaborn's white-grid theme to every figure drawn in this notebook
sns.set_theme(style="whitegrid")

# 1. Load the Dataset

In [None]:
# Read the heart-disease data from its CSV file (path is relative to the working directory)
raw_csv_path = "dataset/heart_disease.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first five rows as the cell output
df.head(n=5)

# 2. Missing Value Analysis
Identify and handle missing values (the original dataset marks some missing entries with `'?'`).

In [None]:
# Missing entries appear as the literal string '?'; swap them for NaN so that
# pandas recognises them as missing.
df = df.replace(to_replace='?', value=np.nan)

# Cast every column to a numeric dtype (columns containing '?' were read as object).
# errors='coerce' turns any value that still cannot be parsed into NaN instead of raising.
df = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [None]:
# Tally the NaN entries column by column
missing_values = df.isna().sum()
print("Missing values per column:")
# Only list the columns that actually contain gaps
has_missing = missing_values > 0
print(missing_values[has_missing])

In [None]:
# Missing-value strategy: remove every row that contains at least one NaN.
# Imputing with the mean/median would be the alternative; dropping is the simpler
# first pass and costs little when only a few rows are affected.

df = df.dropna(axis=0, how='any')

# Sanity check: total NaN count over the whole frame (should print 0 after the drop)
remaining_missing = df.isna().sum().sum()
print("Remaining missing values:", remaining_missing)

# 3. Statistical Summary
Report mean, median, and standard deviation for numerical features.

In [None]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
# The "50%" row of the result is the median.
summary_stats = df.describe()
summary_stats

# 4. Visualizations

### Distribution of Target Variable
Check for class imbalance.

In [None]:
# Class balance of the target column 'num'.
# 0 means no disease, while values above 0 typically indicate heart disease.
# They could be collapsed into a single positive class (1) for binary classification later;
# for EDA the raw values are shown as they are.

fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='num', data=df, ax=ax)
ax.set_title('Distribution of Target Variable (num)')
ax.set_xlabel('Diagnosis (0 = No Disease, 1-4 = Disease Severity)')
ax.set_ylabel('Count')
plt.show()

### Histograms for Age and Cholesterol

In [None]:
# Age distribution: 20-bin histogram with a kernel-density curve drawn on top
age_values = df['age']
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(age_values, bins=20, kde=True, ax=ax)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Cholesterol distribution: 20-bin histogram with a kernel-density curve, drawn in orange
chol_values = df['chol']
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(chol_values, bins=20, kde=True, color='orange', ax=ax)
ax.set_title('Cholesterol Level Distribution')
ax.set_xlabel('Cholesterol (mg/dl)')
ax.set_ylabel('Frequency')
plt.show()

### Correlation Heatmap
Inspect the pairwise correlations between features.

In [None]:
# Pairwise correlation between every pair of columns (pandas' default method, Pearson)
corr = df.corr()

# Draw the matrix as an annotated heatmap.
# vmin/vmax/center pin the diverging 'coolwarm' palette to the full [-1, 1]
# correlation range with its neutral midpoint at 0, so red always means a positive
# and blue always a negative correlation. Without them the colour scale is stretched
# to the observed min/max and the neutral colour lands on an arbitrary value.
plt.figure(figsize=(12, 10))
sns.heatmap(
    corr,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    vmin=-1,
    vmax=1,
    center=0,
)
plt.title('Correlation Heatmap')
plt.show()

# 5. Save Processed Data
Save the cleaned dataset as a CSV file in a new directory.

In [None]:
# Make sure the output directory exists. exist_ok=True makes the call a no-op when
# the directory is already there and avoids the check-then-create race of testing
# os.path.exists() first.
output_dir = "processed_data"
os.makedirs(output_dir, exist_ok=True)

# Write the cleaned dataframe as CSV; index=False leaves the row index out of the file
output_path = os.path.join(output_dir, "heart_disease_cleaned.csv")
df.to_csv(output_path, index=False)

print(f"Cleaned dataset saved to: {output_path}")