# Hospital Readmission EDA

Exploratory Data Analysis for the hospital readmission dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults for the whole notebook: seaborn's "whitegrid" theme first,
# then the default figure size.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

In [None]:
# Read the training data (path is relative to the working directory).
df = pd.read_csv("data/train_df.csv")

# Normalize headers: lowercase with surrounding whitespace removed, so the
# lowercase column names used in the cells below resolve.
df.columns = [name.lower().strip() for name in df.columns]

# Report the (rows, columns) shape, then the per-column overview from info().
print("Dataset shape:", df.shape)
df.info()

In [None]:
# Count null entries per column and list only the columns that have any.
missing = df.isna().sum()
print("\nMissing values per column:")
print(missing.loc[missing.gt(0)])

In [None]:
# Descriptive statistics for every column (numeric and non-numeric),
# transposed so each feature is a row.
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter built-in — confirm if this is ever run as a plain script.
print("\nSummary Statistics:")
summary_table = df.describe(include="all").T
display(summary_table)

In [None]:
# For each categorical feature: print its level frequencies, then draw a
# count plot of the same column.
# NOTE(review): newer seaborn releases (0.13+) reportedly warn when `palette`
# is passed without `hue` — confirm against the installed version.
categorical_cols = ["gender", "primary_diagnosis", "discharge_to", "readmitted"]
for col in categorical_cols:
    level_counts = df[col].value_counts()
    print(f"\nValue counts for {col}:")
    print(level_counts, "\n")

    ax = sns.countplot(data=df, x=col, palette="Set2")
    ax.set_title(f"Distribution of {col}")
    # Tilt the category labels so long names do not overlap.
    plt.xticks(rotation=45)
    ax.figure.tight_layout()
    plt.show()

In [None]:
# One histogram per numeric feature, drawn as a grid on a single figure.
numeric_cols = ["age", "days_in_hospital", "num_procedures", "comorbidity_score"]
df.hist(column=numeric_cols, bins=15, figsize=(12, 8), edgecolor="black")

# Title the whole grid on the figure that hist() just created.
hist_fig = plt.gcf()
hist_fig.suptitle("Distribution of Numeric Features", fontsize=14)
hist_fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric features, shown as an annotated
# heatmap with two-decimal cell labels and square cells.
corr = df.loc[:, numeric_cols].corr()

heatmap_options = {
    "annot": True,
    "cmap": "coolwarm",
    "fmt": ".2f",
    "square": True,
}
sns.heatmap(corr, **heatmap_options)
plt.title("Correlation Heatmap of Numeric Features")
plt.show()

In [None]:
# Compare each numeric feature's distribution across the "readmitted" groups,
# one boxplot figure per feature.
# NOTE(review): newer seaborn releases (0.13+) reportedly warn when `palette`
# is passed without `hue` — confirm against the installed version.
for col in numeric_cols:
    ax = sns.boxplot(data=df, x="readmitted", y=col, palette="Set3")
    ax.set_title(f"{col} vs. Readmission")
    ax.figure.tight_layout()
    plt.show()

In [None]:
# Share of rows in each "readmitted" class (proportions rather than counts).
readmit_counts = df["readmitted"].value_counts(normalize=True)
print("\nClass Distribution (Readmitted):")
print(readmit_counts)

# Bar chart of the same proportions; the y-axis is pinned to [0, 1].
# NOTE(review): newer seaborn releases (0.13+) reportedly warn when `palette`
# is passed without `hue` — confirm against the installed version.
ax = sns.barplot(
    x=readmit_counts.index,
    y=readmit_counts.to_numpy(),
    palette="Set1",
)
ax.set(
    title="Readmission Class Balance",
    ylabel="Proportion",
    xlabel="Readmitted",
    ylim=(0, 1),
)
ax.figure.tight_layout()
plt.show()