#### Explore cohort characteristics after data preprocessing

In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from tableone import TableOne
from tqdm import tqdm

In [2]:
def load_pickle(filepath: str):
    """Read a pickle file from disk and return the object stored in it.

    Args:
        filepath (str): Location of the pickle (.pkl) file to read.

    Returns:
        Any: The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only use on trusted files.
    with open(filepath, "rb") as pkl_file:
        return pickle.load(pkl_file)

def save_pickle(target: dict, filepath: str, fname: str = "mm_feat.pkl"):
    """Pickle an object to the file ``fname`` inside the directory ``filepath``.

    Args:
        target (dict): Object to serialise.
        filepath (str): Directory in which the pickle file is written.
        fname (str): Name of the pickle file. Defaults to "mm_feat.pkl".

    Returns:
        None
    """
    # Relies on the module-level `import os` in the notebook's import cell.
    with open(os.path.join(filepath, fname), "wb") as f:
        pickle.dump(target, f)

##### Load data dictionary and train/val/test ids

In [11]:
# Id tables for the train / validation / test splits of the ICU-admission task.
# NOTE(review): train_ids and train_ids_og are read from the same CSV here, yet they
# are later compared as "undersampled" vs "original" training cohorts — confirm
# whether train_ids should point at the undersampled id file instead.
train_ids_og = pd.read_csv('../outputs/prep_data/training_ids_icu_admission.csv')
train_ids = pd.read_csv('../outputs/prep_data/training_ids_icu_admission.csv')
test_ids = pd.read_csv('../outputs/prep_data/testing_ids_icu_admission.csv')
val_ids = pd.read_csv('../outputs/prep_data/validation_ids_icu_admission.csv')

# Stack the three splits (train, val, test order) into one id table.
split_frames = [train_ids, val_ids, test_ids]
pt_set = pd.concat(split_frames)
print(pt_set.shape)

(20130, 1)


In [12]:
# Pickled mapping of per-patient features; its keys are matched against
# `subject_id` further down when filtering the static EHR table.
pt_embs = load_pickle("../outputs/prep_data_us/mmfair_feat.pkl")
print(len(pt_embs))

20130


In [13]:
# Static EHR table; shape and number of distinct subjects are printed after each step.
ehr_data = pd.read_csv('../outputs/ext_data/ehr_static.csv')
print(ehr_data.shape, ehr_data['subject_id'].nunique())

# Keep only subjects that have an entry in the feature dictionary.
ehr_data = ehr_data[ehr_data['subject_id'].isin(list(pt_embs.keys()))]
print(ehr_data.shape, ehr_data['subject_id'].nunique())

# Keep only subjects that belong to one of the splits. The explicit .copy() makes the
# result an independent frame, so adding the 'set' column below cannot raise
# SettingWithCopyWarning when this filter happens to drop rows.
ehr_data = ehr_data[ehr_data['subject_id'].isin(pt_set['subject_id'].unique())].copy()
print(ehr_data.shape, ehr_data['subject_id'].nunique())

# Label each row with its split; anything not in train or val is labelled 'test'.
in_train = ehr_data['subject_id'].isin(train_ids['subject_id'])
in_val = ehr_data['subject_id'].isin(val_ids['subject_id'])
ehr_data['set'] = np.where(in_train, 'train', np.where(in_val, 'val', 'test'))

(35999, 214) 35999
(20130, 214) 20130
(20130, 214) 20130


In [14]:
# Sanity check: frame shape, number of distinct subjects, and row count per split.
(
    ehr_data.shape,
    ehr_data['subject_id'].nunique(),
    ehr_data['set'].value_counts(),
)

((20130, 215),
 20130,
 set
 train    16104
 test      2013
 val       2013
 Name: count, dtype: int64)

In [15]:
### Display labels: raw column name -> label shown in the summary table
disp_dict = dict(
    anchor_age="Age",
    gender="Gender",
    race_group="Ethnicity",
    insurance="Insurance",
    marital_status="Marital status",
    in_hosp_death="In-hospital death",
    ext_stay_7="Extended stay",
    non_home_discharge="Non-home discharge",
    icu_admission="ICU admission",
    is_multimorbid="Multimorbidity",
    is_complex_multimorbid="Complex multimorbidity",
)

### Display labels treated as continuous; every other label is handled as categorical
cont_cols = ["Age"]

### Non-normally distributed columns (listed here for re-scaling)
## TODO: Would need to move this to an appropriate config file
_overall_count_cols = ["total_n_presc", "n_unique_conditions"]
_presc_count_cols = [
    "n_presc_acetaminophen",
    "n_presc_acyclovir",
    "n_presc_albuterol_neb_soln",
    "n_presc_amlodipine",
    "n_presc_apixaban",
    "n_presc_aspirin",
    "n_presc_atorvastatin",
    "n_presc_calcium_carbonate",
    "n_presc_carvedilol",
    "n_presc_cefepime",
    "n_presc_ceftriaxone",
    "n_presc_docusate_sodium",
    "n_presc_famotidine",
    "n_presc_folic_acid",
    "n_presc_furosemide",
    "n_presc_gabapentin",
    "n_presc_heparin",
    "n_presc_hydralazine",
    "n_presc_hydromorphone_dilaudid",
    "n_presc_insulin",
    "n_presc_ipratropium_albuterol_neb",
    "n_presc_lactulose",
    "n_presc_levetiracetam",
    "n_presc_levothyroxine_sodium",
    "n_presc_lisinopril",
    "n_presc_lorazepam",
    "n_presc_metoprolol_succinate_xl",
    "n_presc_metoprolol_tartrate",
    "n_presc_metronidazole",
    "n_presc_midodrine",
    "n_presc_morphine_sulfate",
    "n_presc_multivitamins",
    "n_presc_omeprazole",
    "n_presc_ondansetron",
    "n_presc_oxycodone",
    "n_presc_pantoprazole",
    "n_presc_piperacillin_tazobactam",
    "n_presc_polyethylene_glycol",
    "n_presc_potassium_chloride",
    "n_presc_prednisone",
    "n_presc_rifaximin",
    "n_presc_senna",
    "n_presc_sevelamer_carbonate",
    "n_presc_tacrolimus",
    "n_presc_thiamine",
    "n_presc_vancomycin",
    "n_presc_vitamin_d",
    "n_presc_warfarin",
]
_pon_cols = [
    "pon_nutrition",
    "pon_cardiology",
    "pon_respiratory",
    "pon_neurology",
    "pon_radiology",
    "pon_tpn",
    "pon_hemodialysis",
]
nn_cols = _overall_count_cols + _presc_count_cols + _pon_cols

### Display labels of the 0/1 flag columns (recoded to "N"/"Y" before tabulating)
cat_cols = [
    disp_dict[raw_name]
    for raw_name in (
        "in_hosp_death",
        "ext_stay_7",
        "non_home_discharge",
        "icu_admission",
        "is_multimorbid",
        "is_complex_multimorbid",
    )
]

In [16]:
# Switch to display labels, then recode the 0/1 flag columns as "N"/"Y".
samples_disp = ehr_data.rename(columns=disp_dict)
samples_disp = samples_disp.assign(
    **{flag: samples_disp[flag].replace({0: "N", 1: "Y"}) for flag in cat_cols}
)

# Columns to tabulate; everything except the continuous ones is categorical.
table_columns = list(disp_dict.values())
categorical_columns = [label for label in table_columns if label not in cont_cols]

# Summary table grouped by split, with p-values and the name of the test used.
sum_table = TableOne(
    samples_disp,
    table_columns,
    categorical=categorical_columns,
    nonnormal=nn_cols,
    groupby="set",
    order={"set": ["train", "val", "test"]},
    overall=True,
    pval=True,
    htest_name=True,
    tukey_test=True,
)

In [17]:
# Display the TableOne summary grouped by train / val / test split.
sum_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by set,Grouped by set,Grouped by set,Grouped by set,Grouped by set,Grouped by set,Grouped by set
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,train,val,test,P-Value,Test
n,,,20130,16104,2013,2013,,
"Age, mean (SD)",,0.0,62.9 (17.4),62.9 (17.4),63.2 (17.4),62.8 (17.4),0.775,One-way ANOVA
"Gender, n (%)",F,,10262 (51.0),8209 (51.0),1026 (51.0),1027 (51.0),0.999,Chi-squared
"Gender, n (%)",M,,9868 (49.0),7895 (49.0),987 (49.0),986 (49.0),,
"Ethnicity, n (%)",Asian,,794 (3.9),636 (3.9),79 (3.9),79 (3.9),1.0,Chi-squared
"Ethnicity, n (%)",Black,,3421 (17.0),2737 (17.0),341 (16.9),343 (17.0),,
"Ethnicity, n (%)",Hispanic/Latino,,1145 (5.7),915 (5.7),115 (5.7),115 (5.7),,
"Ethnicity, n (%)",Other,,748 (3.7),599 (3.7),75 (3.7),74 (3.7),,
"Ethnicity, n (%)",White,,14022 (69.7),11217 (69.7),1403 (69.7),1402 (69.6),,
"Insurance, n (%)",Medicaid,,3486 (17.3),2791 (17.3),363 (18.0),332 (16.5),0.873,Chi-squared


#### Examine train distribution only before/after undersampling

In [58]:
# Compare the training cohort currently labelled in `ehr_data` ("train_us") with the
# cohort given by `train_ids_og` ("train_og").
# NOTE(review): train_ids and train_ids_og are read from the same CSV in this notebook;
# unless train_ids is pointed at the undersampled id file the two groups are identical —
# confirm the intended path.

# Rows labelled 'train' by the split assignment; .assign returns a new frame, so the
# relabelling never writes into a slice of `ehr_data` (no SettingWithCopyWarning).
ehr_us = ehr_data.loc[ehr_data['set'] == 'train'].assign(set='train_us')

# Re-read the static table under its own name instead of rebinding `ehr_data`: the
# original rebinding dropped the 'set' column, so this cell failed on a second run.
ehr_static_all = pd.read_csv('../outputs/ext_data/ehr_static.csv')
print(ehr_static_all.shape, ehr_static_all['subject_id'].nunique())
ehr_static_all = ehr_static_all[ehr_static_all['subject_id'].isin(list(pt_embs.keys()))]

# Original training cohort, taken from the re-read table.
ehr_og = ehr_static_all.loc[
    ehr_static_all['subject_id'].isin(train_ids_og['subject_id'])
].assign(set='train_og')

# Stack both cohorts and switch to display labels; recode 0/1 flags as "N"/"Y".
ehr_full = pd.concat([ehr_us, ehr_og], axis=0).reset_index(drop=True)
samples_disp = ehr_full.rename(columns=disp_dict)
for col in cat_cols:
    samples_disp[col] = samples_disp[col].replace({0: "N", 1: "Y"})

# Summary table grouped by cohort, with p-values and the name of the test used.
sum_table = TableOne(
    samples_disp,
    [col for col in disp_dict.values()],
    order={"set": ["train_us", "train_og"]},
    categorical=[col for col in disp_dict.values() if col not in cont_cols],
    overall=True,
    groupby="set",
    pval=True,
    htest_name=True,
    tukey_test=True,
    nonnormal=nn_cols,
)

(35999, 214) 35999


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ehr_og['set'] = 'train_og'


In [59]:
# Display the TableOne summary comparing the train_us and train_og cohorts.
sum_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by set,Grouped by set,Grouped by set,Grouped by set,Grouped by set,Grouped by set
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,train_us,train_og,P-Value,Test
n,,,25028,8924,16104,,
"Age, mean (SD)",,0.0,63.2 (17.3),63.6 (17.0),63.0 (17.4),0.009,Welch’s T-test
"Gender, n (%)",F,,12716 (50.8),4506 (50.5),8210 (51.0),0.467,Chi-squared
"Gender, n (%)",M,,12312 (49.2),4418 (49.5),7894 (49.0),,
"Ethnicity, n (%)",Asian,,997 (4.0),361 (4.0),636 (3.9),0.724,Chi-squared
"Ethnicity, n (%)",Black,,4268 (17.1),1531 (17.2),2737 (17.0),,
"Ethnicity, n (%)",Hispanic/Latino,,1391 (5.6),475 (5.3),916 (5.7),,
"Ethnicity, n (%)",Other,,915 (3.7),317 (3.6),598 (3.7),,
"Ethnicity, n (%)",White,,17457 (69.7),6240 (69.9),11217 (69.7),,
"Insurance, n (%)",Medicaid,,4334 (17.3),1532 (17.2),2802 (17.4),0.442,Chi-squared
