In [None]:
# Cell 1: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Cell 2: Load Data
# Read the raw CSV into the notebook-wide DataFrame `df`, report its
# (rows, columns) shape, and leave the first rows as the cell's output.
df = pd.read_csv(filepath_or_buffer='../data/raw/heart_disease.csv')
print(f"Dataset shape: {df.shape}")
df.head()

# Cell 3: Basic Info
# Three quick looks at `df`: per-column dtypes, per-column null counts,
# and the describe() summary table (left as the cell's final expression).
print("Data Types:", df.dtypes, sep="\n")
print("\nMissing Values:", df.isnull().sum(), sep="\n")
print("\nBasic Statistics:")
df.describe()

# Cell 4: Target Distribution
# Bar chart of how many rows carry each value of the 'num' target column,
# saved to the reports directory and then displayed.
plt.figure(figsize=(8, 5))
# value_counts() returns counts in descending-frequency order; sort_index()
# restores class-label order so the bars read left to right in the same
# order as the values named in the x-axis label.
df['num'].value_counts().sort_index().plot(kind='bar')
plt.title('Heart Disease Distribution')
plt.xlabel('Disease Presence (0=No, 1-4=Yes)')
plt.ylabel('Count')
plt.savefig('../reports/target_distribution.png', dpi=300)
plt.show()

# Cell 5: Correlation Heatmap
# Annotated heatmap of the pairwise correlations between the columns of `df`,
# saved to the reports directory and then displayed.
plt.figure(figsize=(12, 10))
# Correlate only numeric/boolean columns. On pandas >= 2.0, DataFrame.corr()
# defaults to numeric_only=False and raises if any column is non-numeric
# (e.g. strings); select_dtypes works the same on older pandas versions too.
sns.heatmap(
    df.select_dtypes(include=['number', 'bool']).corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.savefig('../reports/correlation_heatmap.png', dpi=300)
plt.show()

# Cell 6: Age Distribution
# 20-bin histogram of the 'age' column with outlined bars, written to the
# reports directory and then shown.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['age'], bins=20, edgecolor='black')
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.savefig('../reports/age_distribution.png', dpi=300)
plt.show()

# Cell 7: Pairplot of Key Features
# Pairwise scatter grid over a handful of columns, coloured by the 'num'
# target, with KDE curves on the diagonal. 'num' is listed so that it is
# present in the subset passed to pairplot for use as the hue.
key_features = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'num',
]
sns.pairplot(data=df.loc[:, key_features], hue='num', diag_kind='kde')
plt.savefig('../reports/pairplot.png', dpi=300)
plt.show()