# Hospital Readmission EDA

Exploratory Data Analysis for the hospital readmission dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: seaborn "whitegrid" style, then a 10x6 default figure size.
# The figure size is written after the seaborn call, as before, so the order
# of rcParams updates is unchanged.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})


In [None]:
# Read data/train_df.csv into a DataFrame
df = pd.read_csv("data/train_df.csv")

# Normalise the headers in two steps: lower-case, then trim surrounding whitespace
lowered_names = df.columns.str.lower()
df.columns = lowered_names.str.strip()

# Print the (rows, columns) tuple, then the per-column report from DataFrame.info()
print("Dataset shape:", df.shape)
df.info()


In [None]:
# Count null entries per column and print only the columns that have at least one
missing = df.isna().sum()
print("Missing values per column:")
print(missing.loc[missing.gt(0)])


In [None]:
# Descriptive statistics over all columns, transposed so each column becomes a row
df.describe(include="all").T


In [None]:
# Print a frequency table for each column listed in categorical_cols
categorical_cols = ["gender", "primary_diagnosis", "discharge_to", "readmitted"]
for col in categorical_cols:
    # The newline must be written as the "\n" escape: a literal line break
    # inside a single-quoted (f-)string is an unterminated-string SyntaxError.
    print(f"Value counts for {col}:\n")
    # The trailing "\n" argument leaves blank space between successive tables
    print(df[col].value_counts(), "\n")


In [None]:
# Draw one 15-bin histogram per column in numeric_cols on a 12x8 figure
numeric_cols = ["age", "days_in_hospital", "num_procedures", "comorbidity_score"]
df.hist(column=numeric_cols, bins=15, figsize=(12, 8))
plt.suptitle("Distribution of Numeric Features")
plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between the numeric_cols columns
corr = df.loc[:, numeric_cols].corr()

# Render the matrix as a heatmap with each cell's value annotated
sns.heatmap(data=corr, cmap="coolwarm", annot=True)
plt.title("Correlation Heatmap of Numeric Features")
plt.show()
