In [None]:
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath('/workspaces'))

from LCA_Analysis.utils.data_postprocessing import *
from LCA_Analysis.utils.visualization import *

In [None]:
# Notebook configuration: input file locations (relative to this notebook) and the LCA class count.
#
# LCA_POSTERIOR_PROBABILITIES_PATH: stores the posterior probabilities for all the latent classes,
#   so that small classes can be combined into larger classes.
# LCA_LATENT_CLASS_DATA_PATH: stores the patients classified into the different subgroups by LCA.

LCA_POSTERIOR_PROBABILITIES_PATH = "../data/processed_data/LCA_posterior_probabilities.csv"
LCA_LATENT_CLASS_DATA_PATH = "../data/processed_data/LCA_latent_class_data.csv"
# Raw input tables; each is read with pd.read_csv in the loading cell below.
SOFA_DATA_PATH = "../data/raw_data/sofa.csv"
OASIS_DATA_PATH = "../data/raw_data/oasis.csv"
ANGUS_DATA_PATH = "../data/raw_data/angus.csv"
SEPSIS_DATA_PATH = "../data/raw_data/sepsis.csv"
PATIENT_DATA_PATH = "../data/raw_data/patients.csv"
# Passed as `num_classes` to reassign_classes when the data is loaded.
NUM_LATENT_CLASSES = 6

### Read the CSVs and reassign classes if there are more than 6 latent classes

In [None]:
# LCA outputs: per-patient class assignments and the matching posterior probabilities.
df = pd.read_csv(LCA_LATENT_CLASS_DATA_PATH)
df_prob = pd.read_csv(LCA_POSTERIOR_PROBABILITIES_PATH)

# Raw tables, read in the same order as the path constants are listed.
sofa, oasis, angus, sepsis, patients = (
    pd.read_csv(csv_path)
    for csv_path in (
        SOFA_DATA_PATH,
        OASIS_DATA_PATH,
        ANGUS_DATA_PATH,
        SEPSIS_DATA_PATH,
        PATIENT_DATA_PATH,
    )
)

# Reassign classes using the posterior probabilities, targeting NUM_LATENT_CLASSES classes.
df, classes_distribution = reassign_classes(
    df,
    df_prob,
    num_classes=NUM_LATENT_CLASSES,
)
display(df.head(5))
print(classes_distribution)

# Class relabelling handed to process_morbidity_data.
# NOTE(review): presumably {current label: new label} -- confirm against the helper.
classes_mapping = {
    6: 1,
    2: 2,
    5: 3,
    4: 4,
    3: 5,
    1: 6,
}
df = process_morbidity_data(df, classes_mapping)

display(df.head(5))

### The plots below visually demonstrate the heterogeneity of the subgroups

In [None]:
# Subgroup-characteristics figure drawn from the relabelled class data in `df`.
plot_subgroup_characteristics(
    df,
    bubble_size_scale=10,
    save_plots=True,
)

In [None]:
# Columns handed to plot_roc_curves (named as the columns used for the LCA).
cols_used_LCA = [
    "admission_type",
    "gender",
    "age_at_admission",
    "congestive_heart_failure",
    "cardiac_arrhythmias",
    "valvular_disease",
    "pulmonary_circulation",
    "peripheral_vascular",
    "hypertension",
    "paralysis",
    "other_neurological",
    "chronic_pulmonary",
    "diabetes_uncomplicated",
    "diabetes_complicated",
    "hypothyroidism",
    "renal_failure",
    "liver_disease",
    "peptic_ulcer",
    "aids",
    "lymphoma",
    "metastatic_cancer",
    "solid_tumor",
    "rheumatoid_arthritis",
    "coagulopathy",
    "obesity",
    "weight_loss",
    "fluid_electrolyte",
    "blood_loss_anemia",
    "deficiency_anemias",
    "alcohol_abuse",
    "drug_abuse",
    "psychoses",
    "depression",
]

# Six colours, matching the six latent classes configured for this notebook.
colors = ["black", "red", "green", "blue", "cyan", "magenta"]

plot_roc_curves(
    df,
    cols_used_LCA,
    colors,
    save_plots=True,
)

In [None]:
# Columns left out of the prevalence calculation; every other column of `df`
# is averaged below (presumably the per-patient morbidity indicators -- confirm).
exclude_columns = ['subject_id', 'hadm_id', 'icustay_id', 'deathtime', 'gender',
       'age_at_admission', 'admission_type', 'los_icu_days',
       'los_hospital_days','class_assignment','count_morbidity','percent','dichotomized_class']
# "class_assignment" is re-added up front because it is the grouping key for the subgroup means.
target_columns = ["class_assignment"] + [col for col in df.columns if col not in exclude_columns]

# Mean of each remaining column: over all patients, and per latent class.
mean_prevalence = df[target_columns].drop(columns=["class_assignment"]).mean()
mean_prevalence_subgroup = df[target_columns].groupby("class_assignment").mean()

# Cap the prevalence at 50% so both polar plots share the same maximum.
normalized_prevalence = mean_prevalence.clip(upper=0.5)
normalized_prevalence_subgroup = mean_prevalence_subgroup.clip(upper=0.5)

# Fix: the overall plot previously received the un-capped `mean_prevalence`, leaving
# `normalized_prevalence` unused and the two plots on inconsistent scales.
plot_polar_all(normalized_prevalence, save_plots=True)
plot_polar_subgroup(normalized_prevalence_subgroup, save_plots=True)

In [None]:
# True when the patient has no recorded `dod` value (i.e. `dod` is missing).
# The column name "dod_converion" (sic) is kept as-is: it is a runtime key that
# other cells may rely on.
patients["dod_converion"] = patients["dod"].isna()

# Join keys are spelled out explicitly. pd.merge without `on` joins on *every*
# shared column name, so an accidental column overlap would silently change the join.
admission_keys = ["subject_id", "hadm_id"]

# Work on a copy so `df` itself is not modified by the merges.
df_plot = df.copy()
df_plot = pd.merge(df_plot, sofa[admission_keys + ["sofa"]], on=admission_keys, how="inner")
df_plot = pd.merge(df_plot, oasis[admission_keys + ["oasis"]], on=admission_keys, how="inner")
df_plot = pd.merge(
    df_plot,
    angus[admission_keys + ["organ_dysfunction", "explicit_sepsis"]],
    on=admission_keys,
    how="inner",
)
df_plot = pd.merge(df_plot, sepsis[admission_keys + ["sepsis"]], on=admission_keys, how="inner")
# The patients table is joined per patient rather than per admission.
df_plot = pd.merge(df_plot, patients[["subject_id", "dod_converion"]], on="subject_id", how="inner")

# NOTE(review): if any lookup table holds several rows per (subject_id, hadm_id),
# these inner joins duplicate patient rows -- confirm the keys are unique.
plot_boxplot_by_subgroup(df_plot, "sofa", save_plots=True)
plot_boxplot_by_subgroup(df_plot, "oasis", save_plots=True)

In [None]:
# Prevalence of each listed condition, computed from the merged frame and drawn as a bar chart.
conditions = ["organ_dysfunction", "sepsis"]
percentages = calculate_prevalence(df_plot, conditions, "Subgroup")
plot_bar(
    percentages,
    y_label="Percent prevalence",
    colors=["gray", "red", "green", "blue", "cyan", "pink"],
    save_plots=True,
)