# Hospital Readmission EDA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_preprocessing import load_data, clean_data

# Load the raw CSV, then pass it through the project's cleaning step.
# clean_data returns a pair: the cleaned frame and a preprocessor object.
df = load_data('../data/diabetic_data.csv')
df_clean, preprocessor = clean_data(df)

# Summarize the cleaned frame: its shape, then the mean of 'readmitted'
# rendered as a percentage.
# NOTE(review): this presumes clean_data leaves 'readmitted' numeric
# (e.g. a 0/1 flag) so that .mean() reads as a rate -- confirm.
cleaned_shape = df_clean.shape
print(f"Dataset shape: {cleaned_shape}")
readmission_rate = df_clean['readmitted'].mean()
print(f"Readmission rate: {readmission_rate:.2%}")

In [None]:
# Readmission distribution: one bar per distinct 'readmitted' value,
# drawn on an explicitly created 8x5 inch figure.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df_clean, x='readmitted', ax=ax)
ax.set_title('Readmission Distribution')
ax.set_xlabel('Readmitted within 30 days')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Age vs Readmission: box plot of 'age' grouped by the 'readmitted' value.
# NOTE(review): a box plot on the y-axis is only meaningful if clean_data
# leaves 'age' as a numeric column -- confirm against the preprocessing code.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_clean, x='readmitted', y='age', ax=ax)
ax.set_title('Age vs Readmission')
ax.set_xlabel('Readmitted within 30 days')
ax.set_ylabel('Age')
plt.show()

In [None]:
# Time in hospital vs Readmission: box plot of 'time_in_hospital' for each
# 'readmitted' value, on an explicitly created 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_clean, x='readmitted', y='time_in_hospital', ax=ax)
ax.set(
    title='Time in Hospital vs Readmission',
    xlabel='Readmitted within 30 days',
    ylabel='Days in Hospital',
)
plt.show()

In [None]:
# Correlation matrix
# Select every numeric dtype rather than only int64/float64, so numeric
# columns stored at other widths (e.g. int32, float32, uint8) are not
# silently dropped from the heatmap.
numeric_cols = df_clean.select_dtypes(include='number').columns

# Pairwise correlation between the selected columns (DataFrame.corr with its
# default method).
corr_matrix = df_clean[numeric_cols].corr()

# Draw the matrix as a heatmap; annot=False keeps the cells free of text.
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()