The code in this notebook computes disease-frequency statistics that were used to select appropriate disease-prediction tasks.

In [None]:
import polars as pl
import os
from datetime import datetime, timedelta
from tabulate import tabulate
import matplotlib.pyplot as plt
import numpy as np

from file_paths import MIMIC_DIR, OUTPUT_DIR

In [None]:
# Directory under OUTPUT_DIR that receives the generated tables; it is
# created up front so later writes do not fail on a missing folder.
TABLE_DIR = os.path.join(OUTPUT_DIR, "tables")
os.makedirs(TABLE_DIR, exist_ok=True)

In [None]:
def save_table(table, filename):
    """Render a table as LaTeX and write it to a file inside TABLE_DIR.

    Args:
        table: Frame exposing ``to_pandas()``; it is converted to pandas
            before being handed to ``tabulate``.
        filename: Name of the output file, relative to TABLE_DIR. An
            existing file with the same name is overwritten.
    """
    latex_table = tabulate(table.to_pandas(), headers='keys', tablefmt='latex')

    # Pin the encoding so the written file does not depend on the platform's
    # default locale (the tables contain free-text diagnosis titles).
    with open(os.path.join(TABLE_DIR, filename), "w", encoding="utf-8") as f:
        f.write(latex_table)

In [None]:
# Load the raw diagnosis and admission tables.
# icd_code is forced to a string column: the codes are identifiers, so dtype
# inference must not turn numeric-looking codes into integers, and the column
# has to match the Utf8 icd_code of d_icd_diagnoses that it is joined against
# when descriptions are attached.
diagnosis_df = pl.read_csv(
    os.path.join(MIMIC_DIR, "hosp/diagnoses_icd.csv"),
    schema_overrides={"icd_code": pl.Utf8},
)
admissions_df = pl.read_csv(os.path.join(MIMIC_DIR, "hosp/admissions.csv"))

In [None]:
# Parse the textual admission/discharge timestamps into datetime columns.
admissions_df = admissions_df.with_columns([
    pl.col('admittime').str.strptime(pl.Datetime, '%Y-%m-%d %H:%M:%S').alias('admit_time'),
    pl.col('dischtime').str.strptime(pl.Datetime, '%Y-%m-%d %H:%M:%S').alias('discharge_time')
])

# Order each patient's admissions chronologically so that shift(1) below
# picks up the immediately preceding stay.
admissions_df = admissions_df.sort(by=['subject_id', 'admit_time'])

# Discharge time of the patient's previous admission (null for the first one).
admissions_df = admissions_df.with_columns([
    pl.col('discharge_time').shift(1).over('subject_id').alias('prev_discharge_time')
])

admissions_df = admissions_df.select(["subject_id", "hadm_id", "admit_time", "discharge_time", "prev_discharge_time"])

# Attach the admission timing to every diagnosis row.
merged_df = diagnosis_df.join(admissions_df, on=['subject_id', 'hadm_id'])

# Sort by subject_id and admit_time so the running count follows time order.
merged_df = merged_df.sort(by=['subject_id', 'admit_time'])

# Running count of how often each diagnosis has been recorded for a patient.
# icd_version is part of the partition because a code string only identifies
# a diagnosis together with its ICD version (the frequency counts and the
# description lookup key on both columns as well).
merged_df = merged_df.with_columns([
    pl.col('icd_code').cum_count().over(['subject_id', 'icd_version', 'icd_code']).alias('icd_count')
])
merged_df

In [None]:
# Keep only the rows whose running per-patient code count is exactly 1
# (intended as the first recorded occurrence of a code for a patient),
# then discard the helper column, which is no longer needed.
unique_diagnoses_df = merged_df.filter(pl.col('icd_count') == 1).drop('icd_count')

unique_diagnoses_df

In [None]:
# Largest allowed gap (in days, as reported by dt.total_days()) between the
# previous discharge and the admission that carries the diagnosis.
timeframe_days = 365

# Rows without an earlier admission have no previous discharge time.
has_previous_stay = pl.col('prev_discharge_time').is_not_null()

# Time elapsed since the previous discharge; a zero duration is used as a
# placeholder where there is no previous stay (those rows are removed below).
gap_since_discharge = (
    pl.when(has_previous_stay)
    .then(pl.col('admit_time') - pl.col('prev_discharge_time'))
    .otherwise(pl.duration(days=0))
)

filtered_diagnoses_df = unique_diagnoses_df.with_columns(
    gap_since_discharge.alias('time_diff')
)

# Retain diagnoses from admissions that follow an earlier stay within the
# configured time frame.
filtered_diagnoses_df = filtered_diagnoses_df.filter(
    has_previous_stay & (pl.col('time_diff').dt.total_days() <= timeframe_days)
)

filtered_diagnoses_df


In [None]:
# Count the occurrences of each (icd_version, icd_code) pair.
icd_code_counts = filtered_diagnoses_df.group_by(['icd_version', 'icd_code']).agg([
    pl.count('icd_code').alias('count')
])

# Most frequent first; version and code break ties so the ranking is
# reproducible between runs.
sorted_counts = icd_code_counts.sort(
    ['count', 'icd_version', 'icd_code'], descending=[True, False, False]
)

icd_def_df = pl.read_csv(os.path.join(MIMIC_DIR, 'hosp/d_icd_diagnoses.csv'), schema_overrides={'icd_code': pl.Utf8})

# A join is not guaranteed to keep the row order of its left input, so the
# ranking is re-established after the descriptions are attached and before
# the top rows are taken.
counts_with_description = sorted_counts.join(
    icd_def_df, on=['icd_code', 'icd_version'], how='left'
).sort(
    ['count', 'icd_version', 'icd_code'], descending=[True, False, False]
)

# The 20 most frequent diagnoses, with their descriptions.
top_diagnoses = counts_with_description.select(["icd_version", "icd_code", "long_title", "count"]).head(20)

save_table(top_diagnoses, "diagnoses_within_year.tex")