Notebook to compute demographic statistics on the dataset

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
from femr.ontology import Ontology
from matplotlib.ticker import FuncFormatter

from file_paths import MIMIC_DIR, OUTPUT_DIR, MAPPING_DIR, ATHENA_PATH

# Make the project's "src" folder importable. It is resolved relative to the
# kernel's working directory (assumed to be the folder containing this notebook).
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
src_dir = os.path.join(parent_dir, "src")
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from models.clmbr_t_base import get_tokenizer
from utility.codes import ICD9_PREFIX, ICD10_PREFIX

# Input tables are read from the "hosp" folder under MIMIC_DIR;
# figures are written to the "plots" folder under OUTPUT_DIR.
DATA_DIR = os.path.join(MIMIC_DIR, "hosp")
PLOT_DIR = os.path.join(OUTPUT_DIR, "plots")

# Create the plot folder up front so later savefig calls do not fail on a missing directory.
os.makedirs(PLOT_DIR, exist_ok=True)

In [None]:
# Read the patients table from DATA_DIR into a Polars DataFrame
patients_csv = os.path.join(DATA_DIR, "patients.csv")
patients_df = pl.read_csv(patients_csv)

In [None]:
# Summary statistics of the anchor_age column
age_series = patients_df["anchor_age"]
median_age = age_series.median()
mean_age = age_series.mean()
std_dev_age = age_series.std()
min_age = age_series.min()
max_age = age_series.max()
# Compute each quartile once and derive the interquartile range from them
q1 = age_series.quantile(0.25)
q3 = age_series.quantile(0.75)
iqr_age = q3 - q1

# Print the results
print(f"Median Age: {median_age} years")
print(f"Mean Age: {mean_age:.1f} years")
print(f"Standard Deviation: {std_dev_age:.1f} years")
print(f"Minimum Age: {min_age} years")
print(f"Maximum Age: {max_age} years")
print(f"Interquartile Range: {iqr_age:.1f} years ({q1:.1f} - {q3:.1f} years)")

# Sex distribution: number of rows per gender and their share of all rows in percent.
# `group_by` is the current Polars spelling (the `groupby` alias is deprecated and
# removed in Polars 1.0); the rest of this notebook already uses `group_by`.
n_patients = len(patients_df)
sex_distribution = patients_df.group_by("gender").agg([
    pl.count("subject_id").alias("count"),
    (pl.count("subject_id") / n_patients * 100).alias("percentage")
])


print("\nSex distribution:")
print(sex_distribution)

In [None]:
# One histogram bin per year of age; the edges run from 18 to 100
# (the age range is an assumption, not derived from the data)
bins = range(18, 101)
anchor_ages = patients_df["anchor_age"]

# Draw the histogram through the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(anchor_ages, bins=bins, color='skyblue', edgecolor='black')
ax.set_xlabel('Age at first data entry')
ax.set_ylabel('Patients')
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Write the figure to the plot folder, then render it
fig.tight_layout()
fig.savefig(os.path.join(PLOT_DIR, "distribution_anchor_ages.png"))
plt.show()


In [None]:

# Distribution of patients over anchor_year_group.
# This cell works on the patients_df that was loaded from the configured DATA_DIR
# near the top of the notebook. It deliberately does not reassign DATA_DIR to a
# machine-specific absolute path, so the notebook stays runnable on other machines
# and later cells keep reading from the configured location.
year_group_distribution = patients_df.group_by("anchor_year_group").agg([
    pl.count("subject_id").alias("count")
])

# Convert to Pandas for plotting
year_group_distribution_pd = year_group_distribution.to_pandas()

# Turn the group label into an ordered categorical so that sorting follows the
# chronological order listed here instead of the arbitrary group_by output order
year_group_order = ["2008 - 2010", "2011 - 2013", "2014 - 2016", "2017 - 2019", "2020 - 2022"]
year_group_distribution_pd["anchor_year_group"] = pd.Categorical(
    year_group_distribution_pd["anchor_year_group"],
    categories=year_group_order,
    ordered=True,
)

# Sort the DataFrame by the specified order
year_group_distribution_pd = year_group_distribution_pd.sort_values("anchor_year_group")

# Plot the distribution as a bar chart (displayed only, not saved)
plt.figure(figsize=(10, 6))
plt.bar(year_group_distribution_pd["anchor_year_group"], year_group_distribution_pd["count"], color='skyblue')
plt.xlabel('Anchor Year Group')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Count the rows whose anchor_year_group is "2020 - 2022"
in_2020_2022 = pl.col("anchor_year_group") == "2020 - 2022"
count_2020_2022 = len(patients_df.filter(in_2020_2022))

print(f"Number of patients in the year group 2020 - 2022: {count_2020_2022}")

In [None]:
def format_yaxis(value, _):
    """Format a tick value as an integer with thousands separators (e.g. 12,345).

    Takes the (value, position) pair that FuncFormatter passes to its callback;
    the position argument is not used.
    """
    return f'{int(value):,}'


# Distribution of patients over anchor_year_group, computed on the patients_df
# loaded near the top of the notebook (no need to read the CSV again).
# `group_by` replaces the deprecated `groupby` alias (removed in Polars 1.0).
year_group_distribution = patients_df.group_by("anchor_year_group").agg([
    pl.count("subject_id").alias("count")
])

# Keep only year groups with more than two patients
# NOTE(review): presumably this drops near-empty groups from the plot; confirm the threshold
year_group_distribution = year_group_distribution.filter(pl.col("count") > 2)


# Convert to Pandas for plotting
year_group_distribution_pd = year_group_distribution.to_pandas()

# Turn the group label into an ordered categorical so that sorting follows the
# chronological order listed here instead of the arbitrary group_by output order
year_group_order = ["2008 - 2010", "2011 - 2013", "2014 - 2016", "2017 - 2019", "2020 - 2022"]
year_group_distribution_pd["anchor_year_group"] = pd.Categorical(
    year_group_distribution_pd["anchor_year_group"],
    categories=year_group_order,
    ordered=True,
)

# Sort the DataFrame by the specified order
year_group_distribution_pd = year_group_distribution_pd.sort_values("anchor_year_group")

# Share of each remaining group in percent (relative to the groups kept by the filter above)
total_count = year_group_distribution_pd["count"].sum()
year_group_distribution_pd["percentage"] = (year_group_distribution_pd["count"] / total_count) * 100

# Plot the distribution
plt.figure(figsize=(10, 6))
bars = plt.bar(year_group_distribution_pd["anchor_year_group"], year_group_distribution_pd["count"], color='skyblue')

# Add percentage labels on top of the bars. The x anchor is the bar centre, so the
# text must be centre-aligned horizontally, otherwise the label starts at the centre
# and hangs off to the right.
for bar, percentage in zip(bars, year_group_distribution_pd["percentage"]):
    yval = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        yval,
        f'{percentage:.2f}%',
        ha='center',
        va='bottom',
    )

# Show patient counts on the y-axis with thousands separators
plt.gca().yaxis.set_major_formatter(FuncFormatter(format_yaxis))

plt.xlabel('Anchor Year Group')
plt.ylabel('Patient Count')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(PLOT_DIR, "barplot_anchor_year_group.png"))
plt.show()


Generate code sets for the tasks based on the OMOP CDM

In [None]:
# Build the femr Ontology from ATHENA_PATH (imported from file_paths). The cells
# below query it with get_all_children to turn SNOMED concepts into code lists.
ontology = Ontology(ATHENA_PATH)

Acute Kidney Failure

In [None]:
# Everything the ontology returns as children of SNOMED/14669001
aki_codes = ontology.get_all_children("SNOMED/14669001")

# Keep only the codes that carry the ICD9_PREFIX or ICD10_PREFIX, in sorted order
aki_label_codes = sorted(
    code
    for code in aki_codes
    if any(code.startswith(prefix) for prefix in (ICD9_PREFIX, ICD10_PREFIX))
)
aki_label_codes

Hyperlipidaemia

In [None]:
# Everything the ontology returns as children of SNOMED/55822004
hyperlipidaemia_codes = ontology.get_all_children("SNOMED/55822004")

# Keep only the codes that carry the ICD9_PREFIX or ICD10_PREFIX, in sorted order
hyperlipidaemia_label_codes = sorted(
    code
    for code in hyperlipidaemia_codes
    if any(code.startswith(prefix) for prefix in (ICD9_PREFIX, ICD10_PREFIX))
)
hyperlipidaemia_label_codes

Chronic Kidney Disease (CKD)

In [None]:
# Everything the ontology returns as children of SNOMED/709044004
ckd_codes = ontology.get_all_children("SNOMED/709044004")

# Keep only the codes that carry the ICD9_PREFIX or ICD10_PREFIX, in sorted order
ckd_label_codes = sorted(
    code
    for code in ckd_codes
    if any(code.startswith(prefix) for prefix in (ICD9_PREFIX, ICD10_PREFIX))
)
ckd_label_codes