# Statistics and information about the cohort

In [None]:
import pandas as pd

In [None]:
# Dictionary tables used further down to turn raw identifiers into labels:
# lab items and chart items (ITEMID -> LABEL) and ICD-9 diagnoses
# (ICD9_CODE -> LONG_TITLE).
d_lab = pd.read_csv('data/D_LABITEMS.csv')
d_items = pd.read_csv('data/D_ITEMS.csv')
d_icd = pd.read_csv('data/D_ICD_DIAGNOSES.csv')

# Long-format data table; the cells below read its variable_name,
# variable_value, TRAIN, HF_LABEL and ID columns.
df = pd.read_csv('data_nolowfreq_statsmeanstd_mrmr30_knn9distweight/data_filtered.csv')

In [None]:
def decode(var_name):
    """Translate a raw variable name into a human-readable label.

    Lookups use the module-level tables ``d_items``, ``d_lab`` and ``d_icd``.

    * An all-digit name is treated as an ITEMID and resolved against
      ``d_items`` first, then ``d_lab``; the matching ``LABEL`` is returned.
    * A name containing ``'ICD9:'`` has the text after ``'ICD9: '`` looked up
      in ``d_icd['ICD9_CODE']``; a match returns ``'<LONG_TITLE> (<code>)'``.
    * Anything else, including identifiers with no match and non-string
      values such as NaN, is returned unchanged.

    Args:
        var_name: Raw variable name as stored in the data table.

    Returns:
        The decoded label, or ``var_name`` itself when nothing matches.
    """
    # Missing names (NaN/None) have no string methods; pass them through.
    if not isinstance(var_name, str):
        return var_name

    if var_name.isdigit():
        item_id = int(var_name)
        # Order matters: a chart-item match wins over a lab-item match.
        for table in (d_items, d_lab):
            labels = table.loc[table['ITEMID'] == item_id, 'LABEL']
            if not labels.empty:
                return labels.iloc[0]
        return var_name

    if 'ICD9:' in var_name:
        code = var_name.split('ICD9: ')[-1]
        titles = d_icd.loc[d_icd['ICD9_CODE'] == code, 'LONG_TITLE']
        if not titles.empty:
            return titles.iloc[0] + f' ({code})'

    return var_name

# One row per distinct variable name, paired with its decoded label.
# NOTE: `vars` shadows the builtin of the same name; the name is kept because
# later cells (the merge into df and the ICD grouping) refer to it.
vars = pd.DataFrame(df['variable_name'].unique(), columns=['variable_name'])
# Series.map applies decode() element-wise; unlike a row-wise
# DataFrame.apply(axis=1) it always yields a Series, even for an empty table.
vars['variable_name_decoded'] = vars['variable_name'].map(decode)

In [None]:
# Attach the decoded label to every row; the left join keeps all rows of df.
df = df.merge(vars, how='left', on='variable_name')

In [None]:
from icd9cms.icd9 import search

In [None]:
# Copy the number of rows per diagnosis group, largest first, to the clipboard.
# NOTE(review): the 'group' column is attached to df by the merge against
# icd_vars in the next code cell, so this cell presumably has to be run after
# that one — confirm the intended execution order.
group_counts = df['group'].value_counts()
group_counts.sort_values(ascending=False).to_clipboard()

In [None]:
# Build a diagnosis-group label for every ICD-9 variable and attach it to df.

def _icd_group_description(icd_group):
    """Return the short description icd9cms has for `icd_group`, or None."""
    node = search(icd_group)  # looked up once per value instead of twice
    return node.short_desc if node else None


# .copy() makes icd_vars an independent frame so the column assignments below
# do not trigger SettingWithCopyWarning; na=False treats missing names as
# non-ICD instead of producing a NaN mask.
icd_vars = vars[vars['variable_name'].str.contains('ICD9:', na=False)].copy()
icd_vars['code'] = icd_vars['variable_name'].str.split('ICD9: ').str[1]
# The group is the first three characters of the code.
# NOTE(review): ICD-9 E-codes use a four-character category (e.g. E878), so a
# three-character prefix presumably finds no description for them — confirm
# whether E-codes need separate handling.
icd_vars['icd_group'] = icd_vars['code'].str.slice(0, 3)
icd_vars['description'] = icd_vars['icd_group'].apply(_icd_group_description)
icd_vars['group'] = icd_vars['description'] + ' (' + icd_vars['icd_group'] + ')'

# Drop a 'group' column left over from an earlier run of this cell; merging a
# second time would otherwise yield group_x/group_y and break later lookups
# of df['group'].
df = pd.merge(
    df.drop(columns='group', errors='ignore'),
    icd_vars[['variable_name', 'group']],
    on='variable_name',
    how='left',
)

In [None]:
# Decoded variable names summarised as mean ± std in the continuous table.
descriptors_cont = [
    'RR', 'HR', 'SpO2', 'SysBP', 'DiaBP', 'Temperature',
    'GCS - Eye Opening', 'GCS - Verbal Response', 'GCS - Motor Response',
    'Potassium', 'Hematocrit', 'Sodium', 'Chloride', 'Creatinine',
    'Urea Nitrogen', 'Bicarbonate', 'Anion Gap', 'Glucose', 'Magnesium',
    'Platelet Count', 'Hemoglobin', 'White Blood Cells', 'Red Blood Cells',
    'RDW', 'MCHC', 'MCH', 'MCV', 'Phosphate', 'Calcium, Total',
    'AGE',
]

# Decoded variable names reported as category counts in the demographics table.
# NOTE(review): the demographics cell also lists 'Gender' when one-hot
# encoding, which is absent here — confirm which spelling is intended.
descriptors_demo = ['MARITAL_STATUS', 'ETHNICITY', 'GENDER', 'Race']

# Diagnosis groups, written as '<description> (<3-character ICD-9 prefix>)',
# that are matched against df['group'].
descriptors_icd = [
    'Cardiac dysrhythmias (427)',
    'Disorders of fluid electrolyte and acid-base balance (276)',
    'Disorders of lipoid metabolism (272)',
    'Essential hypertension (401)',
    'Diabetes mellitus (250)',
    'Acute kidney failure (584)',
    'Chronic kidney disease (ckd) (585)',
    'Hypotension (458)',
]

Now build tables with the mean (standard deviation) of the continuous variables and the frequencies of demographic categories and diagnoses in the train/test and case/control sets.

In [None]:
# Split the rows of df into the three descriptor families reported below:
# continuous variables and demographics are selected by decoded name,
# diagnoses by their group label.
decoded_names = df['variable_name_decoded']
df_cont = df.loc[decoded_names.isin(descriptors_cont)]
df_demo = df.loc[decoded_names.isin(descriptors_demo)]
df_icd = df.loc[df['group'].isin(descriptors_icd)]

In [None]:
# Demographics table: one row per one-hot encoded category, one column per
# (TRAIN, HF_LABEL) set, each cell formatted as "count (percent%)".

# One row per subject, one column per demographic variable (first value wins).
df_demo = df_demo.pivot_table(index=['TRAIN','HF_LABEL','ID'], columns=['variable_name_decoded'], values='variable_value', aggfunc='first')

# Encode only the demographic columns that are actually present: get_dummies
# raises KeyError for a listed column that is missing from the frame (e.g.
# 'Gender', which is not part of descriptors_demo).
demo_columns = [c for c in ['ETHNICITY','GENDER','Gender','MARITAL_STATUS','Race'] if c in df_demo.columns]
df_demo = pd.get_dummies(df_demo, columns=demo_columns, dummy_na=True).reset_index()
df_demo = df_demo.melt(id_vars=['TRAIN','HF_LABEL','ID'])

# Number of distinct subjects in each (TRAIN, HF_LABEL) set.
sample_sizes = df_demo.groupby(['TRAIN','HF_LABEL'])['ID'].nunique()

count = df_demo.groupby(['TRAIN','HF_LABEL','variable'])['value'].sum()
# Name the share explicitly rather than relying on the implicit column name 0.
perc = (count / sample_sizes).rename('fraction')

df_demo = pd.concat([count, perc], axis=1)
# Round again after scaling: 0.57 * 100 is 56.99999999999999 in floating
# point, which would otherwise be printed verbatim.
percent_text = (df_demo['fraction'].round(2) * 100).round(1).astype(str)
df_demo['val'] = df_demo['value'].astype(str) + ' (' + percent_text + '%)'

df_demo.pivot_table(index=['TRAIN','HF_LABEL'], columns='variable', values='val', aggfunc='first').T.to_clipboard()

In [None]:
# Continuous-variable table: mean ± std per (TRAIN, HF_LABEL) set.

# Discard values containing any letter, since they cannot be cast to float.
# na=False marks missing values as "no letters" directly, which keeps the mask
# a plain boolean Series (no object-dtype fillna/downcast needed before `~`).
has_letters = df_cont['variable_value'].str.contains('[a-zA-Z]', regex=True, na=False)
# .copy() so the assignment below modifies an independent frame instead of a
# slice of df (avoids SettingWithCopyWarning).
df_cont = df_cont[~has_letters].copy()
df_cont['variable_value'] = df_cont['variable_value'].str.strip().astype(float)

df_cont_stats = df_cont.groupby(['TRAIN','HF_LABEL','variable_name_decoded'])['variable_value'].agg(['mean', 'std'])
df_cont_stats['val'] = df_cont_stats['mean'].round(2).astype(str) + ' ± ' + df_cont_stats['std'].round(2).astype(str)

df_cont_stats.pivot_table(index=['TRAIN','HF_LABEL'], columns='variable_name_decoded', values=['val'], aggfunc='first').T.to_clipboard()

In [None]:
# Diagnosis table: summed variable_value per diagnosis group (rows) and
# (TRAIN, HF_LABEL) set (columns).

# A group is a three-character code prefix, so several variables can map to the
# same group; keep one row per subject and group. .copy() makes the result an
# independent frame so the assignment below does not raise
# SettingWithCopyWarning.
# NOTE(review): drop_duplicates keeps the first row's variable_value; this
# presumably assumes every retained row marks presence (value 1) — confirm.
df_icd = df_icd.drop_duplicates(subset=['TRAIN','HF_LABEL','ID','group']).copy()
df_icd['variable_value'] = df_icd['variable_value'].astype(int)
df_icd_count = df_icd.pivot_table(index=['TRAIN','HF_LABEL'], columns=['group'], values='variable_value', aggfunc='sum').T

# Distinct subjects per set among the rows of df_icd. Percentages are not
# derived from it here; note that it only covers subjects that have at least
# one row in the selected diagnosis groups.
sample_sizes = df_icd.groupby(['TRAIN','HF_LABEL'])['ID'].nunique()

In [None]:
# Copy the diagnosis count table (groups as rows, sets as columns) to the clipboard.
df_icd_count.to_clipboard()