In [27]:
import pandas as pd
import os
import warnings
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import ast

# Silence every warning for the rest of the notebook. Note that this is a blanket
# filter: it also hides pandas/matplotlib deprecation and index-alignment warnings.
warnings.filterwarnings("ignore")

In [None]:
# Load the collated dataset, generating it from the per-patient JSON files the first
# time the notebook runs (i.e. when the CSV does not exist yet).
# The CSV is always re-read after generation so the frame has the same on-disk
# representation (nested columns as strings) regardless of which path was taken.
collated_path = "./collated_dataset.csv"
patients_dir = "./data/patients_new"

if not os.path.exists(collated_path):
    # Read each patient file once and concatenate in a single call rather than
    # growing a DataFrame inside the loop.
    patient_frames = [
        pd.read_json(os.path.join(patients_dir, file))
        for file in sorted(os.listdir(patients_dir))
    ]
    pd.concat(patient_frames, ignore_index=True).to_csv(collated_path, index=False)

df = pd.read_csv(collated_path)

In [None]:
# Normalize admit and discharge times onto one reference year, and calculate duration of stay.
# 2024 is used as the reference because it is a leap year, so a Feb 29 admission stays valid.
REFERENCE_YEAR = 2024

df = df.sort_values(by=["admittime"]).reset_index(drop=True)

admit_actual = pd.to_datetime(df["admittime"])
discharge_actual = pd.to_datetime(df["dischtime"])

# Duration is taken from the real timestamps. Deriving it from year-replaced
# timestamps is off by a day whenever a stay crosses the end of February and the
# real year's leap status differs from the reference year's, and it goes negative
# for stays that end in the same calendar month of a later year.
stay_length = discharge_actual - admit_actual

df["admittime_wy"] = admit_actual.apply(lambda ts: ts.replace(year=REFERENCE_YEAR))
# The normalized discharge time is the normalized admission plus the true stay length,
# so the two "_wy" columns always agree with duration_of_stay.
df["dischtime_wy"] = df["admittime_wy"] + stay_length
df["duration_of_stay"] = stay_length
df.head()

In [None]:
# Take the sub-datasets (patients, hosp, poe, etc.), flatten them and append them to the main dataframe
def normalize_jsons(column, dropper, sorter):
    """Flatten a column of serialized sub-dataset records into one DataFrame.

    Each cell is expected to hold a Python-literal string of a list of dicts
    (as produced by round-tripping nested data through CSV); an already-parsed
    list/tuple/dict is accepted as well. Only the first record of each cell is
    used. Missing cells (None / NaN) and empty lists are skipped.

    Parameters
    ----------
    column : iterable
        The cells to flatten, e.g. ``df["patients"]``.
    dropper : list of str
        Column names passed to ``drop_duplicates(subset=...)``.
    sorter : list of str
        Column names passed to ``sort_values(by=...)``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``dropper`` key, sorted by ``sorter`` with a fresh
        0..n-1 index. An empty DataFrame is returned when no cell yields a record.
    """
    records = []
    for datum in column:
        parsed = ast.literal_eval(datum) if isinstance(datum, str) else datum
        # NaN is the only float that is not equal to itself.
        if parsed is None or (isinstance(parsed, float) and parsed != parsed):
            continue
        if not parsed:
            continue
        records.append(parsed if isinstance(parsed, dict) else parsed[0])

    if not records:
        return pd.DataFrame()

    # Normalize all records in one call instead of building a frame per row.
    column_df = (
        pd.json_normalize(records)
        .drop_duplicates(subset=dropper)
        .sort_values(by=sorter)
        .reset_index(drop=True)
    )
    return column_df

In [None]:
# Flatten the per-stay "patients" records and attach the patient-level columns to every stay.
patients_df = normalize_jsons(df["patients"], ["subject_id"], ["anchor_year", "subject_id"])

# Bring in only the columns df does not already have (plus the join key). This keeps
# the cell re-runnable: on a second run the patient columns are already present and
# merging them again would produce overlapping column names.
new_columns = [
    col for col in patients_df.columns
    if col == "subject_id" or col not in df.columns
]

# validate= makes pandas check that each subject_id appears at most once on the right,
# so the left join can never duplicate stays.
df = pd.merge(
    df,
    patients_df[new_columns],
    how="left",
    on="subject_id",
    validate="many_to_one",
)
df.head()

In [None]:
# Visualize admit and discharge time deltas (duration of stay) for all patients.
# Every stay is drawn twice on the row given by its normalized admission time:
# once at its admission time (steelblue) and once at its discharge time (firebrick),
# so the horizontal gap between the two markers is the duration of stay.
# The notebook treats duration of stay as an indicator of the value of care derived
# by class of patient.
admit_times = pd.to_datetime(df['admittime_wy'])
discharge_times = pd.to_datetime(df['dischtime_wy'])

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(admit_times, df['admittime_wy'], color="steelblue")
ax.scatter(discharge_times, df['admittime_wy'], color="firebrick")

ax.set_xlabel('Admit & Discharge Time')
ax.set_ylabel('Admit Time')
ax.set_title('Admission and Discharge Times')
ax.tick_params(axis="x", labelrotation=45)
ax.grid(True)
plt.show()

In [None]:
# We'll explore duration of stay based on four demographic indicators: class, race, gender and age.
# Race and gender are provided in the data. We'll use insurance (Medicaid or not) as a proxy for class. Age has been obfuscated in the dataset to "anchor age", which is approximately accurate.

In [None]:
# Sanity-check the age distribution: one histogram over all stays (steelblue) and one
# over unique patients (firebrick), using one bin per distinct anchor age.
# Note that the spike at age 91 is because everyone above 89 is assigned 91 for privacy.
n_age_bins = df["anchor_age"].nunique()
ages_per_stay = df["anchor_age"]
ages_per_patient = df.drop_duplicates(subset="subject_id")["anchor_age"]

fig, ax = plt.subplots()
ages_per_stay.hist(ax=ax, bins=n_age_bins, xlabelsize=10, ylabelsize=6, color="steelblue")
ages_per_patient.hist(ax=ax, bins=n_age_bins, xlabelsize=10, ylabelsize=6, color="firebrick")

ax.set_title('Distribution of ages across stays and patients', weight='bold')
ax.set_xlabel('Anchor age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Author's finding: Medicaid patients are discharged, on average, slightly quicker than other patients.
# Split the stays on the insurance column; everything that is not exactly "Medicaid" goes to main_df.
is_medicaid = df["insurance"] == "Medicaid"
medicaid_df = df[is_medicaid]
main_df = df[~is_medicaid]

# Stay lengths in days (86400 seconds per day) for all stays and for the Medicaid subset.
stay_days_all = df["duration_of_stay"].dt.total_seconds() / 86400
stay_days_medicaid = medicaid_df["duration_of_stay"].dt.total_seconds() / 86400

# All stays in steelblue, Medicaid stays overlaid in firebrick.
fig, ax = plt.subplots()
stay_days_all.hist(ax=ax, bins=12, xlabelsize=10, ylabelsize=6, color="steelblue")
stay_days_medicaid.hist(ax=ax, bins=12, xlabelsize=10, ylabelsize=6, color="firebrick")
ax.set_title('Duration of stay of all patients vs Medicaid insured patients', weight='bold')
ax.set_xlabel('Duration (days)')
ax.set_ylabel('Frequency')
plt.show()

print(f"Non-Medicaid-insured patients: {main_df['duration_of_stay'].mean()}")
print(f"Medicaid-insured patients: {medicaid_df['duration_of_stay'].mean()}")

In [None]:
# Author's finding: non-white patients are discharged, on average, significantly quicker than white patients.
# A stay counts as "white" when its race value contains the substring "WHITE".
is_white = df["race"].str.contains("WHITE")
non_white_df = df[~is_white]
main_df = df[is_white]

# Stay lengths in days (86400 seconds per day) for all stays and for the non-white subset.
stay_days_all = df["duration_of_stay"].dt.total_seconds() / 86400
stay_days_non_white = non_white_df["duration_of_stay"].dt.total_seconds() / 86400

# All stays in steelblue, non-white stays overlaid in firebrick.
fig, ax = plt.subplots()
stay_days_all.hist(ax=ax, bins=12, xlabelsize=10, ylabelsize=6, color="steelblue")
stay_days_non_white.hist(ax=ax, bins=12, xlabelsize=10, ylabelsize=6, color="firebrick")
ax.set_title('Duration of stay of all patients vs non-white patients', weight='bold')
ax.set_xlabel('Duration (days)')
ax.set_ylabel('Frequency')
plt.show()

print(f"White patients: {main_df['duration_of_stay'].mean()}")
print(f"Non-white patients: {non_white_df['duration_of_stay'].mean()}")

In [None]:
# Author's finding: female patients are discharged much quicker than male patients.
# Stays whose gender is exactly "F" form female_df; every other stay goes to main_df.
is_female = df["gender"] == "F"
female_df = df[is_female]
main_df = df[~is_female]

# Stay lengths in days (86400 seconds per day).
stay_days_all = df["duration_of_stay"].dt.total_seconds() / 86400
stay_days_female = female_df["duration_of_stay"].dt.total_seconds() / 86400

# The histogram overlays the female subset on ALL stays (not on male stays), so the
# title and legend say exactly that.
fig, ax = plt.subplots()
stay_days_all.hist(ax=ax, bins=12, xlabelsize=10, ylabelsize=6, color="steelblue", label="All patients")
stay_days_female.hist(ax=ax, bins=12, xlabelsize=10, ylabelsize=6, color="firebrick", label="Female patients")
ax.set_title('Duration of stay of all patients vs female patients', weight='bold')
ax.set_xlabel('Duration (days)')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

print(f"Male patients: {main_df['duration_of_stay'].mean()}")
print(f"Female patients: {female_df['duration_of_stay'].mean()}")

In [None]:
# Author's finding: there doesn't appear to be a significant trend of duration with age past 45 or so.
# Stays are split at an anchor age of 65: old_df holds 65 and over, main_df the rest.
is_old = df["anchor_age"] >= 65
old_df = df[is_old]
main_df = df[~is_old]

# Stay length in days (86400 seconds per day); 65+ in firebrick, everyone else in steelblue.
stay_days = df["duration_of_stay"].dt.total_seconds() / 86400
point_colours = np.where(is_old, "firebrick", "steelblue")

fig, ax = plt.subplots()
ax.scatter(df["anchor_age"], stay_days, c=point_colours)
ax.set_xlabel('Anchor Age')
ax.set_ylabel('Duration of Stay (days)')
ax.set_title('Duration of Stay vs Anchor Age', weight='bold')
plt.show()

print(f"Young patients: {main_df['duration_of_stay'].mean()}")
print(f"Old patients: {old_df['duration_of_stay'].mean()}")

In [None]:
# We can also look at the interactions of protected attributes to see if a trend emerges. For example, are black women treated worse than other patients?

In [None]:
# Author's finding: age and race don't reveal a meaningful pattern.
# A stay counts as white when its race value CONTAINS "WHITE", matching the substring
# rule used for the white / non-white duration comparison elsewhere in this notebook.
# An exact `== "WHITE"` test would colour any race value that merely contains "WHITE"
# as non-white. Missing race values are treated as non-white (na=False).
is_white = df["race"].str.contains("WHITE", na=False)

# Stay length in days (86400 seconds per day); white in steelblue, non-white in firebrick.
stay_days = df["duration_of_stay"].dt.total_seconds() / 86400

fig, ax = plt.subplots()
ax.scatter(df["anchor_age"], stay_days, c=np.where(is_white, "steelblue", "firebrick"))
ax.set_xlabel('Anchor Age')
ax.set_ylabel('Duration of Stay (days)')
ax.set_title('Duration of Stay against Anchor Age and Race', weight='bold')
plt.show()

In [None]:
# Duration of stay against anchor age, coloured by gender:
# stays with gender "F" in firebrick, all other stays in steelblue.
# TODO: the original note for this cell was left unfinished ("Age and gender reveal that ...");
# record the actual finding here once it has been read off the plot.
stay_days = df["duration_of_stay"].dt.total_seconds() / 86400
gender_colours = np.where(df["gender"] == "F", "firebrick", "steelblue")

fig, ax = plt.subplots()
ax.scatter(df["anchor_age"], stay_days, c=gender_colours)
ax.set_xlabel('Anchor Age')
ax.set_ylabel('Duration of Stay (days)')
ax.set_title('Duration of Stay against Anchor Age and Gender', weight='bold')
plt.show()

In [None]:
# We can now look at variation in stay duration based on other information, such as admission circumstances.

In [None]:
# Author's finding: patients admitted under emergency conditions are in hospital for
# significantly longer than other patients.
# An admission counts as an emergency when its type contains "EMER" or "URGENT".
admission_type = df["admission_type"].str
is_emergency = admission_type.contains("EMER") | admission_type.contains("URGENT")
emergency_df = df[is_emergency]
main_df = df[~is_emergency]

# Stay lengths in days (86400 seconds per day) for all stays and for the emergency subset.
stay_days_all = df["duration_of_stay"].dt.total_seconds() / 86400
stay_days_emergency = emergency_df["duration_of_stay"].dt.total_seconds() / 86400

# All stays in steelblue, emergency admissions overlaid in firebrick.
fig, ax = plt.subplots()
stay_days_all.hist(ax=ax, bins=12, xlabelsize=10, ylabelsize=6, color="steelblue")
stay_days_emergency.hist(ax=ax, bins=12, xlabelsize=10, ylabelsize=6, color="firebrick")
ax.set_title('Duration of stay of all patients vs emergency-admitted patients', weight='bold')
ax.set_xlabel('Duration (days)')
ax.set_ylabel('Frequency')
plt.show()

print(f"Otherwise-admitted patients: {main_df['duration_of_stay'].mean()}")
print(f"Emergency-admitted patients: {emergency_df['duration_of_stay'].mean()}")

In [None]:
# Author's finding: there is seasonality to stay duration; patients admitted in the
# summer months are discharged much quicker.
# "Summer" here means an admission month from May (5) to September (9), inclusive.
is_summer = df["admittime_wy"].dt.month.between(5, 9)
summer_df = df[is_summer]
main_df = df[~is_summer]

# Stay lengths in days (86400 seconds per day).
stay_days_all = df["duration_of_stay"].dt.total_seconds() / 86400
stay_days_summer = summer_df["duration_of_stay"].dt.total_seconds() / 86400

# The histogram overlays the summer subset on ALL stays (not on the non-summer stays),
# so the title and legend say exactly that.
fig, ax = plt.subplots()
stay_days_all.hist(ax=ax, bins=12, xlabelsize=10, ylabelsize=6, color="steelblue", label="All patients")
stay_days_summer.hist(ax=ax, bins=12, xlabelsize=10, ylabelsize=6, color="firebrick", label="Summer admits")
ax.set_title('Duration of stay of all patients vs summer admits', weight='bold')
ax.set_xlabel('Duration (days)')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

print(f"Patients admitted outside summer: {main_df['duration_of_stay'].mean()}")
print(f"Patients admitted during summer: {summer_df['duration_of_stay'].mean()}")

In [None]:
# Could age be the reason? Author's finding: while age does account for some of the
# difference, it doesn't really seem to capture the effect.
# Stays are split at an anchor age of 65: old_df holds 65 and over, main_df the rest.
is_old = df["anchor_age"] >= 65
old_df = df[is_old]
main_df = df[~is_old]

# Stay length in days (86400 seconds per day); 65+ in firebrick, everyone else in steelblue.
stay_days = df["duration_of_stay"].dt.total_seconds() / 86400

fig, ax = plt.subplots()
ax.scatter(df["admittime_wy"], stay_days, c=np.where(is_old, "firebrick", "steelblue"))
ax.set_xlabel('Admit Time')
ax.set_ylabel('Duration of Stay (days)')
ax.set_title('Duration of Stay against Admit Time & Age', weight='bold')
ax.tick_params(axis="x", labelrotation=45)
plt.show()

# "Winter" is the complement of the May-September window: admission month < 5 or > 9.
# Both masks are built on df and applied to df, so no index realignment is involved,
# and each label is paired with the matching age group.
admit_month = df["admittime_wy"].dt.month
is_winter = (admit_month < 5) | (admit_month > 9)

print(f"Young winter patients: {df.loc[is_winter & ~is_old, 'duration_of_stay'].mean()}")
print(f"Old winter patients: {df.loc[is_winter & is_old, 'duration_of_stay'].mean()}")