# In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the patient records CSV into a DataFrame
df = pd.read_csv('../data/synthetic_patients.csv')

# Peek at the first rows
df.head()

# Dimensions, then column dtypes and non-null counts
print("Shape:", df.shape)
df.info()

# Per-column count of missing entries; report only the columns that have any
missing = df.isna().sum()
print("Missing values:\n", missing.loc[missing > 0])

# Descriptive statistics
df.describe()

# Bar chart of how many rows fall into each cardio_risk class
ax = sns.countplot(data=df, x='cardio_risk')
ax.set_title("Cardiovascular Risk Distribution")
ax.set_xlabel("Risk Level (0 = Low, 1 = High)")
ax.set_ylabel("Count")
plt.show()

# Correlation matrix
# Correlate only numeric/boolean columns. DataFrame.corr() no longer drops
# non-numeric columns silently: since pandas 2.0 it raises when the frame
# contains values it cannot convert to float. The categorical features in
# this dataset (gender, ethnicity, ...) are presumably string-valued -- confirm.
numeric_df = df.select_dtypes(include=["number", "bool"])
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature Correlation Matrix")
plt.show()

# Histogram of age (20 bins, with KDE overlay), coloured by cardio_risk
ax = sns.histplot(
    data=df,
    x='age',
    hue='cardio_risk',
    bins=20,
    kde=True,
)
ax.set_title("Age Distribution by Risk Level")
plt.show()

# Box plot of the cholesterol column, one box per cardio_risk value
ax = sns.boxplot(data=df, x='cardio_risk', y='cholesterol')
ax.set_title("Cholesterol Levels by Risk Group")
plt.show()

# Pairplot (optional for small datasets)
# sns.pairplot(df, hue='cardio_risk')

# Categorical feature analysis: one count plot per feature, split by cardio_risk
categorical = [
    'gender',
    'smoking_status',
    'diabetes_status',
    'ethnicity',
]
for col in categorical:
    # Start a new figure so each feature gets its own chart
    plt.figure()
    ax = sns.countplot(data=df, x=col, hue='cardio_risk')
    ax.set_title(f"{col} vs Cardiovascular Risk")
    # Rotate the x tick labels by 45 degrees
    plt.xticks(rotation=45)
    plt.show()
