In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.ticker as mtick
from scipy import stats
from matplotlib.dates import DateFormatter
from datetime import timedelta
from datetime import datetime
from tqdm import tqdm
from tableone import TableOne

from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import gc

In [None]:
### Helper functions
def timer(st=None):
    """Start or report a wall-clock timer.

    Called with no (or a falsy) start time, returns the current ``datetime``.
    Called with a start time, prints the time elapsed since then as whole
    hours / minutes / seconds and returns ``None``.
    """
    if st:
        elapsed = (datetime.now() - st).total_seconds()
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        # %i truncates, so the printed seconds are whole seconds.
        print('Elapsed time: %i hours: %i mins: %i secs.' % (hours, minutes, round(seconds, 2)))
        return None
    return datetime.now()

##### Feature selection and train/val split pipeline

In [None]:
# Column-name -> dtype lookup, read from a two-column CSV whose header row is skipped.
r_types = pd.read_csv('', names=['item', 'dtype'], skiprows=1)
dtype_dict = dict(zip(r_types['item'], r_types['dtype']))

In [None]:
#### Load index cohort
# Read the cohort table, forcing each column to the dtype listed in `dtype_dict`.
inp_data = pd.read_csv('', low_memory=True, dtype=dtype_dict)

In [None]:
# Supplementary table keyed by `ppid`; only the columns selected below are carried forward.
mdata = pd.read_csv('')
mdata_sel = mdata[['ppid', 'total_n_disciplines', 'total_count_ooh_all', 'gt_eld', 'gt_eld_d1', 'gt_eld_d2', 'gt_eld_d3']]

In [None]:
# Display every column name of the freshly loaded cohort.
inp_data.columns.tolist()

#### Set group variables

In [None]:
### Set age groups
# Decade bands for ages 50-89. Every other value falls through to '90+'.
# NOTE(review): ages below 50 or missing would also be labelled '90+' — confirm the cohort excludes them.
age_at_adm = inp_data['AgeAtAdmission']
inp_data['age_gr'] = np.select(
    [(age_at_adm >= 50) & (age_at_adm < 60),
     (age_at_adm >= 60) & (age_at_adm < 70),
     (age_at_adm >= 70) & (age_at_adm < 80),
     (age_at_adm >= 80) & (age_at_adm < 90)],
    ['50-59', '60-69', '70-79', '80-89'],
    default='90+')

### Set care intensity outcome and groups
# Box-Cox with lambda fixed at 0.0 for the continuous outcome, plus a tercile grouping of the raw count.
inp_data['total_count_all_tf'] = stats.boxcox(inp_data['total_count_all'], lmbda=0.0)
tercile_groups = pd.qcut(inp_data['total_count_all'], q=3, labels=['Low', 'Medium', 'High'])
inp_data['total_count_cts_gr'] = tercile_groups.astype('O')
print(inp_data['total_count_cts_gr'].value_counts())

In [None]:
#### Specify features to retain
# Columns that are always kept: they are excluded from the correlation filter and from the
# Boruta candidate matrix, and are re-attached to the selected features afterwards.
lkup_features = ['ppid',
 'EpisodeNumber',
 'AdmissionDate',
 'ED_adate_dt',
 'IndexAttDate',
 'HOSP_adt',
 'DischargeDate',
 'HOSP_ddt',
 'breq_dt',
 'HOSP_FCC_dt',
 'HOSP_FAS_dt',
 'AgeAtAdmission',
 'DateOfDeath',
 'simd_dec',
 'gt_m',
 'gt_cc',
 'gt_es_hosp',
 'gt_dd',
 'total_count_all',
 'total_count_rehab',
 'Sex_F',
 'arrival_mode_B',
 'arrival_mode_E',
 'arrival_mode_O',
 'arrival_mode_PR',
 'arrival_mode_PU',
 'arrival_mode_R',
 'arrival_mode_U',
 'arrival_mode_Unk',
 'arrival_mode_W',
 'age_gr',
 'triage_code',
 'total_count_all_tf',
 'total_count_cts_gr'
]

In [None]:
#### Drop any features before preprocessing step
# Explicit list of columns removed from `inp_data` before any feature selection.
# Every name must exist in `inp_data`; the drop that follows raises on a missing label.
to_drop = ['PresentingCondition_Acute_myocardial_infarction',
 'PresentingCondition_Alcohol_abuse',
 'PresentingCondition_Asthma',
 'PresentingCondition_Atrial_fibrillation',
 'PresentingCondition_COPD',
 'PresentingCondition_Delirium',
 'PresentingCondition_Heart_failure',
 'PresentingCondition_Stable_angina',
 'PresentingCondition_Stroke',
 'PresentingCondition_Transient_ischaemic_attack',
 'coh_idx',
 'HOSP_adt_s','AttendedED', 'total_unique_conditions',
 ### Features after ED attendance
 'ww_prev_syst_code_cardiovascular',
 'ww_prev_syst_code_gastrointestinal',
 'ww_prev_syst_code_renal',
 'ww_prev_syst_code_respiratory',
 'ww_prev_syst_code_unk',
 'ww_prev_surg_code_cardiac',
 'ww_prev_surg_code_gastrointestinal',
 'ww_prev_surg_code_unk',
 'ww_prev_surg_code_urological',
 'ww_iv_need',
 'ww_cpap_need',
 'ww_02_b50pc',
 'ww_airway_eti',
 'ww_multi_iv_drugs',
 'ww_ac_renal_repl',
 'ww_inv_neuro_monitor',
 'ww_neuro_1to1_need',
 'ww_cns_depression',
 'ww_nutr_par_need',
 'ww_nutr_ent_need',
 'ww_antimicr',
 'ww_surgery',
 'ww_trauma',
 'ww_AP2',
 'ww_vent_days',
 'trQ_4at',
 'trQ_bwm_urinary_catheterisation',
 'trQ_bwm_urinary_incontinence',
 'trQ_bwm_dysuria',
 'trQ_bwm_>6times_per_day',
 'trQ_bwm_nocturia_>2_per_night',
 'trQ_bwm_faeces_incontinence',
 'trQ_bwm_constipation',
 'trQ_bwm_diarrhoea',
 'trQ_bwm_blood_in_stools',
 'trQ_bwm_medication',
 'trQ_falls_within_6_months',
 'trQ_falls_clinical_risk',
 'trQ_nutr_food_allergies',
 'trQ_nutr_swallowing_difficulty',
 'trQ_mrsa_infection_prevention',
 'trQ_mrsa_transfer_with_norovirus',
 'trQ_mrsa_resp_or_fever',
 'trQ_mrsa_rash_fever_or_flu',
 'trQ_mrsa_infectious_diseases_contact',
 'trQ_rub_nursing_falls_risk_assessment',
 'trQ_rub_at_risk_of_bed_fall',
 'trQ_waterlow_score',
 'trQ_MUST_score',
 'trQ_mobility_walking_ASSISTANCE',
 'trQ_mobility_walking_BED_REST',
 'trQ_mobility_walking_INDEPENDENT',
 'trQ_mobility_toileting_ASSISTANCE',
 'trQ_mobility_toileting_BED_REST',
 'trQ_mobility_toileting_INDEPENDENT',
 'trQ_mobility_bathing_ASSISTANCE',
 'trQ_mobility_bathing_BED_REST',
 'trQ_mobility_bathing_INDEPENDENT',
 'trQ_mobility_bed_rolling_ASSISTANCE',
 'trQ_mobility_bed_rolling_INDEPENDENT',
 'trQ_mobility_bed_moveup_ASSISTANCE',
 'trQ_mobility_bed_moveup_INDEPENDENT',
 'trQ_mobility_bed_out_ASSISTANCE',
 'trQ_mobility_bed_out_BED_REST',
 'trQ_mobility_bed_out_INDEPENDENT',
 'trQ_mobility_bed_in_ASSISTANCE',
 'trQ_mobility_bed_in_BED_REST',
 'trQ_mobility_bed_in_INDEPENDENT',
 'trQ_mobility_sss_ASSISTANCE',
 'trQ_mobility_sss_BED_REST',
 'trQ_mobility_sss_INDEPENDENT',
 'trQ_mobility_lateral_ASSISTANCE',
 'trQ_mobility_lateral_BED_REST',
 'trQ_mobility_lateral_INDEPENDENT',
 'trQ_mobility_floorup_ASSISTANCE',
 'trQ_mobility_floorup_BED_REST',
 'trQ_mobility_floorup_INDEPENDENT'
 ]
    
# Remove the explicit exclusions first, then every remaining column that ends in
# `_r` / `_d` or starts with `dsf_`.
inp_data = inp_data.drop(columns=to_drop)
pattern_drops = [col for col in inp_data.columns
                 if col.endswith(('_r', '_d')) or col.startswith('dsf_')]
inp_data = inp_data.drop(columns=pattern_drops)

In [None]:
#### Change formatting in categorical data
# Cast every column whose name contains one of these fragments to int8.
for name_fragment in ('arrival_mode', 'triage_code', 'trQ_mobility'):
    matched_cols = [col for col in inp_data.columns if name_fragment in col]
    inp_data[matched_cols] = inp_data[matched_cols].astype(np.int8)

##### Remove correlated features

In [None]:
def remove_corr_feat(x, threshold, method='pearson'):
    """Drop the later column of every pair whose absolute correlation reaches `threshold`.

    The correlation matrix of `x` is scanned over its strict upper triangle,
    column by column. A column is flagged when it correlates with any column
    positioned before it at ``abs(corr) >= threshold``. Each hit is printed as
    ``flagged column || earlier column || abs(corr)``. Undefined (NaN)
    correlations never flag a column.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table; correlations come from ``x.corr(method=method)``.
    threshold : float
        Inclusive cut-off on the absolute correlation.
    method : str, default 'pearson'
        Passed unchanged to ``DataFrame.corr``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the flagged columns; `x` itself is not modified.
    """
    corr_mat = x.corr(method=method)
    names = corr_mat.columns
    abs_corr = np.abs(corr_mat.to_numpy())
    drop_cols = []
    # One vectorised comparison per column instead of slicing a 1x1 frame per cell.
    for col_pos in range(1, len(names)):
        hit_rows = np.flatnonzero(abs_corr[:col_pos, col_pos] >= threshold)
        for row_pos in hit_rows:
            print(names[col_pos], "||", names[row_pos], "||", round(abs_corr[row_pos, col_pos], 2))
            drop_cols.append(names[col_pos])

    # De-duplicate while keeping discovery order so the report is identical on every run.
    drops = list(dict.fromkeys(drop_cols))
    x = x.drop(columns=drops)
    print('Removed columns {}'.format(drops))
    return x

In [None]:
# Keep the lookup columns aside and run the correlation filter on everything else.
reh_data_i = inp_data[lkup_features]
reh_data_s = remove_corr_feat(inp_data.drop(columns=lkup_features), threshold=0.9)

In [None]:
# Lookup columns first, then the features that survived the correlation filter.
all_cols = [*reh_data_i.columns, *reh_data_s.columns]
inp_data = inp_data[all_cols]

In [None]:
# Frame dimensions next to the number of distinct `ppid` values.
inp_data.shape, inp_data.ppid.nunique()

In [None]:
# Column names remaining after the correlation filter.
inp_data.columns.tolist()

##### Setup automated feature selection across all health outcomes

In [None]:
# Candidate matrix for Boruta = everything except the lookup columns; `gt_m` is the first target.
bor_y = inp_data['gt_m']
bor_data_lk = inp_data[lkup_features]
bor_X_tr = inp_data.drop(columns=lkup_features)

In [None]:
def boruta_auto_sel(X_data, y_data, task='clf', outcome='gt_m', file_out='',
                   n_est=200, n_est_b='auto', md=5, es=5, random_state=None):
    """Run Boruta feature selection for one outcome and return the confirmed features.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Candidate feature matrix.
    y_data : array-like
        Target aligned with `X_data`.
    task : str, default 'clf'
        'clf' wraps a class-balanced ``RandomForestClassifier``; any other
        value wraps a ``GradientBoostingRegressor`` with Huber loss.
    outcome : str, default 'gt_m'
        Label used only in the printed summary.
    file_out : str, default ''
        Where the feature ranking CSV is written. When empty, the ranking is
        not saved (previously an empty path raised after the fit had finished).
    n_est : int, default 200
        ``n_estimators`` given to the wrapped estimator.
        NOTE(review): BorutaPy appears to reset the estimator's n_estimators
        from its own `n_estimators` setting — confirm whether `n_est` has any effect.
    n_est_b : int or 'auto', default 'auto'
        ``n_estimators`` given to ``BorutaPy``.
    md : int, default 5
        ``max_depth`` of the wrapped estimator.
    es : int, default 5
        ``n_iter_no_change`` for Boruta's early stopping.
    random_state : int or None, default None
        Seed forwarded to the estimator and to ``BorutaPy``. ``None`` keeps the
        previous unseeded behaviour; pass an int for repeatable selections.

    Returns
    -------
    pandas.DataFrame
        `X_data` restricted to the columns Boruta marked as supported.
    """
    if task == 'clf':
        rfc = RandomForestClassifier(n_estimators=n_est, n_jobs=-1, class_weight='balanced',
                                     max_depth=md, random_state=random_state)
    else:
        rfc = GradientBoostingRegressor(loss='huber', n_estimators=n_est, max_depth=md,
                                        random_state=random_state)
    b_sel = BorutaPy(rfc, n_estimators=n_est_b, verbose=2, early_stopping=True,
                     n_iter_no_change=es, random_state=random_state)
    start_time = timer(None)
    b_sel.fit(X_data, y_data)
    timer(start_time)
    print('Total N selected features {0}, for outcome {1}'.format(b_sel.n_features_, outcome))
    feat_df = pd.DataFrame(X_data.columns.tolist(), columns=['features'])
    feat_df['rank'] = b_sel.ranking_
    feat_df = feat_df.sort_values('rank', ascending=True).reset_index(drop=True)
    if file_out:
        print('Saving ranking to disk.')
        feat_df.to_csv(file_out, index=False)
    else:
        # Do not throw away a finished fit just because no path was supplied.
        print('No output path given; ranking not saved.')
    feat_sel = X_data.columns[b_sel.support_]
    X_data_sel = X_data[feat_sel]
    return X_data_sel

In [None]:
# Boruta selection against `gt_m`, using the function's default settings.
X_sel_m = boruta_auto_sel(bor_X_tr, bor_y)

In [None]:
# Rows x number of features retained for `gt_m`.
X_sel_m.shape

In [None]:
# Boruta selection against `gt_cc`, with early-stopping patience raised to 10.
X_sel_cc = boruta_auto_sel(bor_X_tr, inp_data['gt_cc'], outcome='gt_cc', 
                           file_out='', es=10)

In [None]:
# Boruta selection against `gt_es_hosp`, with early-stopping patience raised to 10.
X_sel_es = boruta_auto_sel(bor_X_tr, inp_data['gt_es_hosp'], outcome='gt_es_hosp', 
                           file_out='', es=10)

In [None]:
# Boruta selection against `gt_dd`, with early-stopping patience raised to 10.
X_sel_dd = boruta_auto_sel(bor_X_tr, inp_data['gt_dd'], outcome='gt_dd', 
                           file_out='', es=10)

In [None]:
# Regression variant (task='reg') against the transformed contact count, with a
# smaller estimator (n_est=10) and a fixed Boruta n_estimators of 50.
X_sel_contacts = boruta_auto_sel(bor_X_tr, inp_data['total_count_all_tf'], outcome='total_count_all_tf', 
                           file_out='', n_est=10, n_est_b=50, es=10, task='reg')

In [None]:
#### Perform cross-outcome feature selection
# A feature is kept when its Boruta rank is below `rank_threshold` for at least one
# of the five outcomes (union over the saved ranking files).
rank_threshold = 10


def _ranked_features(path, max_rank):
    """Return the features listed in the ranking CSV at `path` with rank below `max_rank`."""
    ranking = pd.read_csv(path)
    return ranking.loc[ranking['rank'] < max_rank, 'features'].tolist()


X_m_feat = _ranked_features('', rank_threshold)
X_cc_feat = _ranked_features('', rank_threshold)
X_es_feat = _ranked_features('', rank_threshold)
X_dd_feat = _ranked_features('', rank_threshold)
X_cts_feat = _ranked_features('', rank_threshold)

# Ordered union (first appearance wins) instead of `list(set(...))`, whose order
# changes between kernel sessions.
bor_cross_feat = list(dict.fromkeys(X_m_feat + X_cc_feat + X_es_feat + X_dd_feat + X_cts_feat))
cross_feat_set = set(bor_cross_feat)

# Select in `inp_data` column order so the exported files have a stable column layout.
bor_X_sel = inp_data[[col for col in inp_data.columns if col in cross_feat_set]]

selected_cols = [col for col in bor_X_tr.columns if col in cross_feat_set]
dropped_cols = [col for col in bor_X_tr.columns if col not in cross_feat_set]
print('Features selected:')
print(selected_cols)
print(len(selected_cols))
print('Features dropped:')
print(dropped_cols)
print(len(dropped_cols))
print('Output shape:', bor_X_sel.shape)

In [None]:
#bor_X_sel = bor_X_tr # Skip Boruta and build full feature set
# Lookup columns followed by the cross-outcome selected features.
all_feat = [*lkup_features, *bor_X_sel.columns]
inp_data = inp_data[all_feat]

In [None]:
# Column names after feature selection.
inp_data.columns.tolist()

##### Create train/val splits

##### In-hospital death

In [None]:
# Left-join the supplementary columns onto the cohort by `ppid`.
# NOTE(review): if `mdata_sel` can hold more than one row per `ppid`, this join duplicates cohort rows — confirm uniqueness.
# Re-running this cell merges a second time and produces suffixed duplicate columns.
inp_data = inp_data.merge(mdata_sel, how='left', on='ppid')

In [None]:
# Band the discipline count as '1', '2' or '3'; every other value falls into '4+'.
# NOTE(review): rows with no match in the left merge (missing count) also land in '4+' — confirm this is intended.
# Built as an explicit pandas categorical: `ndarray.astype(pd.Categorical)` does not
# produce a categorical column.
n_disciplines = inp_data['total_n_disciplines']
inp_data['total_n_disciplines_gr'] = pd.Categorical(
    np.select([n_disciplines == 1, n_disciplines == 2, n_disciplines == 3],
              ['1', '2', '3'],
              default='4+'),
    categories=['1', '2', '3', '4+'])

In [None]:
# Age-band counts within each `gt_m` value, before splitting.
inp_data[['age_gr', 'gt_m']].groupby('gt_m')['age_gr'].value_counts()

In [None]:
# 70/30 split, stratified jointly on age band, sex and the outcome (`gt_m`).
strat_cols = inp_data[['age_gr', 'Sex_F', 'gt_m']]
r_x_train, r_x_val, r_y_train, r_y_val = train_test_split(
    inp_data.drop(columns=['gt_m', 'Sex_F', 'age_gr']),
    strat_cols,
    test_size=0.3,
    random_state=42,
    stratify=strat_cols)
print(r_x_train.shape, r_x_val.shape)

In [None]:
# Re-attach the stratification columns to each partition, tag it, then stack both.
r_x_train = pd.concat([r_x_train, r_y_train], axis=1).assign(set='train')
r_x_val = pd.concat([r_x_val, r_y_val], axis=1).assign(set='validation')
r_x = pd.concat([r_x_train, r_x_val], axis=0)
for partition in (r_x_train, r_x_val):
    print(partition.groupby('gt_m')['age_gr'].value_counts())

In [None]:
# Outcome prevalence in train, then validation.
for partition in (r_x_train, r_x_val):
    print(partition['gt_m'].value_counts(normalize=True))

In [None]:
# Row count per discipline band.
inp_data.total_n_disciplines_gr.value_counts()

In [None]:
# Train-vs-validation descriptive table for the in-hospital mortality split.
dm_data_tb = r_x.copy()
dm_data_tb['Sex'] = np.where(dm_data_tb['Sex_F'] == 1, 'F', 'M')
# SIMD decile -> quintile; deciles outside 1-10 (including missing) are coded -1.
dm_data_tb['simd_quint'] = np.select(
    [dm_data_tb['simd_dec'].isin(pair) for pair in ([1, 2], [3, 4], [5, 6], [7, 8], [9, 10])],
    [1, 2, 3, 4, 5],
    default=-1)
# Quintile -> reporting band: '-1', '1', '2-4' or '5'.
dm_data_tb['simd_gr'] = np.select(
    [dm_data_tb['simd_quint'] == -1, dm_data_tb['simd_quint'] < 2, dm_data_tb['simd_quint'] < 5],
    ['-1', '1', '2-4'],
    default='5')
dm_data_tb = dm_data_tb.rename(columns={
    'AgeAtAdmission': 'Age',
    'age_gr': 'Age group',
    'simd_gr': 'SIMD (1 - most deprived, 5 - least deprived)',
    'gt_m': 'In-hospital mortality',
    'total_count_all': 'Total health contacts',
    'total_count_rehab': 'Total rehabilitation contacts',
    'total_count_ooh_all': 'Out-of-hours contacts',
    'total_n_disciplines_gr': 'Number of disciplines involved',
})
t_cols = ['Age', 'Total health contacts', 'Total rehabilitation contacts', 'Out-of-hours contacts']
groupby = ['set']
categorical = ['Age group', 'Sex', 'SIMD (1 - most deprived, 5 - least deprived)',
               'In-hospital mortality', 'Number of disciplines involved']
table_cols = ['set', 'Age', 'Age group', 'Sex', 'SIMD (1 - most deprived, 5 - least deprived)',
              'In-hospital mortality', 'Total health contacts', 'Total rehabilitation contacts',
              'Out-of-hours contacts', 'Number of disciplines involved']
dm_data_tb = dm_data_tb[table_cols]
sum_table = TableOne(dm_data_tb, table_cols,
                     categorical=categorical, groupby=groupby,
                     nonnormal=['Age', 'Total health contacts'], decimals={'Age': 0},
                     overall=True, missing=True, pval=True, htest_name=True, tukey_test=True)
sum_table

In [None]:
## Export
# Strip the partition tag, then write the training dtypes and both partitions.
r_x_train = r_x_train.drop(columns='set')
r_x_val = r_x_val.drop(columns='set')
r_x_train.dtypes.to_frame().to_csv('')
r_x_train.to_csv('', index=False)
r_x_val.to_csv('', index=False)

##### Prolonged length-of-stay

In [None]:
# 70/30 split, stratified jointly on age band, sex and the outcome (`gt_es_hosp`).
strat_cols = inp_data[['age_gr', 'Sex_F', 'gt_es_hosp']]
r_x_train, r_x_val, r_y_train, r_y_val = train_test_split(
    inp_data.drop(columns=['gt_es_hosp', 'Sex_F', 'age_gr']),
    strat_cols,
    test_size=0.3,
    random_state=42,
    stratify=strat_cols)
print(r_x_train.shape, r_x_val.shape)

In [None]:
# Re-attach the stratification columns to each partition, tag it, then stack both.
r_x_train = pd.concat([r_x_train, r_y_train], axis=1).assign(set='train')
r_x_val = pd.concat([r_x_val, r_y_val], axis=1).assign(set='validation')
r_x = pd.concat([r_x_train, r_x_val], axis=0)
for partition in (r_x_train, r_x_val):
    print(partition.groupby('gt_es_hosp')['age_gr'].value_counts())

In [None]:
# Outcome prevalence in train, then validation.
for partition in (r_x_train, r_x_val):
    print(partition['gt_es_hosp'].value_counts(normalize=True))

In [None]:
# Train-vs-validation descriptive table for the extended-stay split.
dm_data_tb = r_x.copy()
dm_data_tb['Sex'] = np.where(dm_data_tb['Sex_F'] == 1, 'F', 'M')
# SIMD decile -> quintile; deciles outside 1-10 (including missing) are coded -1.
dm_data_tb['simd_quint'] = np.select(
    [dm_data_tb['simd_dec'].isin(pair) for pair in ([1, 2], [3, 4], [5, 6], [7, 8], [9, 10])],
    [1, 2, 3, 4, 5],
    default=-1)
# Quintile -> reporting band: '-1', '1', '2-4' or '5'.
dm_data_tb['simd_gr'] = np.select(
    [dm_data_tb['simd_quint'] == -1, dm_data_tb['simd_quint'] < 2, dm_data_tb['simd_quint'] < 5],
    ['-1', '1', '2-4'],
    default='5')
dm_data_tb = dm_data_tb.rename(columns={
    'AgeAtAdmission': 'Age',
    'age_gr': 'Age group',
    'simd_gr': 'SIMD (1 - most deprived, 5 - least deprived)',
    'gt_es_hosp': 'Extended stay (>=14 days)',
    'total_count_all': 'Total health contacts',
    'total_count_rehab': 'Total rehabilitation contacts',
    'total_count_ooh_all': 'Out-of-hours contacts',
    'total_n_disciplines_gr': 'Number of disciplines involved',
})
t_cols = ['Age', 'Total health contacts', 'Total rehabilitation contacts', 'Out-of-hours contacts']
groupby = ['set']
categorical = ['Age group', 'Sex', 'SIMD (1 - most deprived, 5 - least deprived)',
               'Extended stay (>=14 days)', 'Number of disciplines involved']
table_cols = ['set', 'Age', 'Age group', 'Sex', 'SIMD (1 - most deprived, 5 - least deprived)',
              'Extended stay (>=14 days)', 'Total health contacts', 'Total rehabilitation contacts',
              'Out-of-hours contacts', 'Number of disciplines involved']
dm_data_tb = dm_data_tb[table_cols]
sum_table = TableOne(dm_data_tb, table_cols,
                     categorical=categorical, groupby=groupby,
                     nonnormal=['Age', 'Total health contacts'], decimals={'Age': 0},
                     overall=True, missing=True, pval=True, htest_name=True, tukey_test=True)
sum_table

In [None]:
## Export
# Strip the partition tag, then write the training dtypes and both partitions.
r_x_train = r_x_train.drop(columns='set')
r_x_val = r_x_val.drop(columns='set')
r_x_train.dtypes.to_frame().to_csv('')
r_x_train.to_csv('', index=False)
r_x_val.to_csv('', index=False)

##### ICU/HDU admission

In [None]:
# 70/30 split, stratified jointly on age band, sex and the outcome (`gt_cc`).
strat_cols = inp_data[['age_gr', 'Sex_F', 'gt_cc']]
r_x_train, r_x_val, r_y_train, r_y_val = train_test_split(
    inp_data.drop(columns=['gt_cc', 'Sex_F', 'age_gr']),
    strat_cols,
    test_size=0.3,
    random_state=42,
    stratify=strat_cols)
print(r_x_train.shape, r_x_val.shape)

In [None]:
# Re-attach the stratification columns to each partition, tag it, then stack both.
r_x_train = pd.concat([r_x_train, r_y_train], axis=1).assign(set='train')
r_x_val = pd.concat([r_x_val, r_y_val], axis=1).assign(set='validation')
r_x = pd.concat([r_x_train, r_x_val], axis=0)
for partition in (r_x_train, r_x_val):
    print(partition.groupby('gt_cc')['age_gr'].value_counts())

In [None]:
# Outcome prevalence in train, then validation.
for partition in (r_x_train, r_x_val):
    print(partition['gt_cc'].value_counts(normalize=True))

In [None]:
# Train-vs-validation descriptive table for the ICU/HDU admission split.
dm_data_tb = r_x.copy()
dm_data_tb['Sex'] = np.where(dm_data_tb['Sex_F'] == 1, 'F', 'M')
# SIMD decile -> quintile; deciles outside 1-10 (including missing) are coded -1.
dm_data_tb['simd_quint'] = np.select(
    [dm_data_tb['simd_dec'].isin(pair) for pair in ([1, 2], [3, 4], [5, 6], [7, 8], [9, 10])],
    [1, 2, 3, 4, 5],
    default=-1)
# Quintile -> reporting band: '-1', '1', '2-4' or '5'.
dm_data_tb['simd_gr'] = np.select(
    [dm_data_tb['simd_quint'] == -1, dm_data_tb['simd_quint'] < 2, dm_data_tb['simd_quint'] < 5],
    ['-1', '1', '2-4'],
    default='5')
dm_data_tb = dm_data_tb.rename(columns={
    'AgeAtAdmission': 'Age',
    'age_gr': 'Age group',
    'simd_gr': 'SIMD (1 - most deprived, 5 - least deprived)',
    'gt_cc': 'ICU/HDU admission',
    'total_count_all': 'Total health contacts',
    'total_count_rehab': 'Total rehabilitation contacts',
    'total_count_ooh_all': 'Out-of-hours contacts',
    'total_n_disciplines_gr': 'Number of disciplines involved',
})
t_cols = ['Age', 'Total health contacts', 'Total rehabilitation contacts', 'Out-of-hours contacts']
groupby = ['set']
categorical = ['Age group', 'Sex', 'SIMD (1 - most deprived, 5 - least deprived)',
               'ICU/HDU admission', 'Number of disciplines involved']
table_cols = ['set', 'Age', 'Age group', 'Sex', 'SIMD (1 - most deprived, 5 - least deprived)',
              'ICU/HDU admission', 'Total health contacts', 'Total rehabilitation contacts',
              'Out-of-hours contacts', 'Number of disciplines involved']
dm_data_tb = dm_data_tb[table_cols]
sum_table = TableOne(dm_data_tb, table_cols,
                     categorical=categorical, groupby=groupby,
                     nonnormal=['Age', 'Total health contacts'], decimals={'Age': 0},
                     overall=True, missing=True, pval=True, htest_name=True, tukey_test=True)
sum_table

In [None]:
## Export
# Strip the partition tag, then write the training dtypes and both partitions.
r_x_train = r_x_train.drop(columns='set')
r_x_val = r_x_val.drop(columns='set')
r_x_train.dtypes.to_frame().to_csv('')
r_x_train.to_csv('', index=False)
r_x_val.to_csv('', index=False)

##### Discharge disposition

In [None]:
### Exclude in-hospital deaths from the outcome
# Rows with gt_m == 1 are kept but have gt_dd forced to 0; no rows are removed.
# This overwrites inp_data['gt_dd'] in place, so every later cell sees the recoded values.
inp_data['gt_dd'] = np.where(inp_data['gt_m'] == 1, 0, inp_data['gt_dd'])

In [None]:
# Share of each `gt_dd` value after the recode.
inp_data['gt_dd'].value_counts(normalize=True)

In [None]:
# 70/30 split, stratified jointly on age band, sex and the outcome (`gt_dd`).
strat_cols = inp_data[['age_gr', 'Sex_F', 'gt_dd']]
r_x_train, r_x_val, r_y_train, r_y_val = train_test_split(
    inp_data.drop(columns=['gt_dd', 'Sex_F', 'age_gr']),
    strat_cols,
    test_size=0.3,
    random_state=42,
    stratify=strat_cols)
print(r_x_train.shape, r_x_val.shape)

In [None]:
# Re-attach the stratification columns to each partition, tag it, then stack both.
r_x_train = pd.concat([r_x_train, r_y_train], axis=1).assign(set='train')
r_x_val = pd.concat([r_x_val, r_y_val], axis=1).assign(set='validation')
r_x = pd.concat([r_x_train, r_x_val], axis=0)
for partition in (r_x_train, r_x_val):
    print(partition.groupby('gt_dd')['age_gr'].value_counts())

In [None]:
# Outcome prevalence in train, then validation.
for partition in (r_x_train, r_x_val):
    print(partition['gt_dd'].value_counts(normalize=True))

In [None]:
# Train-vs-validation descriptive table for the discharge-disposition split.
dm_data_tb = r_x.copy()
dm_data_tb['Sex'] = np.where(dm_data_tb['Sex_F'] == 1, 'F', 'M')
# SIMD decile -> quintile; deciles outside 1-10 (including missing) are coded -1.
dm_data_tb['simd_quint'] = np.select(
    [dm_data_tb['simd_dec'].isin(pair) for pair in ([1, 2], [3, 4], [5, 6], [7, 8], [9, 10])],
    [1, 2, 3, 4, 5],
    default=-1)
# Quintile -> reporting band: '-1', '1', '2-4' or '5'.
dm_data_tb['simd_gr'] = np.select(
    [dm_data_tb['simd_quint'] == -1, dm_data_tb['simd_quint'] < 2, dm_data_tb['simd_quint'] < 5],
    ['-1', '1', '2-4'],
    default='5')
dm_data_tb = dm_data_tb.rename(columns={
    'AgeAtAdmission': 'Age',
    'age_gr': 'Age group',
    'simd_gr': 'SIMD (1 - most deprived, 5 - least deprived)',
    'gt_dd': 'Home discharge',
    'total_count_all': 'Total health contacts',
    'total_count_rehab': 'Total rehabilitation contacts',
    'total_count_ooh_all': 'Out-of-hours contacts',
    'total_n_disciplines_gr': 'Number of disciplines involved',
})
t_cols = ['Age', 'Total health contacts', 'Total rehabilitation contacts', 'Out-of-hours contacts']
groupby = ['set']
categorical = ['Age group', 'Sex', 'SIMD (1 - most deprived, 5 - least deprived)',
               'Home discharge', 'Number of disciplines involved']
table_cols = ['set', 'Age', 'Age group', 'Sex', 'SIMD (1 - most deprived, 5 - least deprived)',
              'Home discharge', 'Total health contacts', 'Total rehabilitation contacts',
              'Out-of-hours contacts', 'Number of disciplines involved']
dm_data_tb = dm_data_tb[table_cols]
sum_table = TableOne(dm_data_tb, table_cols,
                     categorical=categorical, groupby=groupby,
                     nonnormal=['Age', 'Total health contacts'], decimals={'Age': 0},
                     overall=True, missing=True, pval=True, htest_name=True, tukey_test=True)
sum_table

In [None]:
## Export
# Strip the partition tag, then write the training dtypes and both partitions.
r_x_train = r_x_train.drop(columns='set')
r_x_val = r_x_val.drop(columns='set')
r_x_train.dtypes.to_frame().to_csv('')
r_x_train.to_csv('', index=False)
r_x_val.to_csv('', index=False)

##### Any rehab

In [None]:
# Binary int8 flag: 1 when `total_count_rehab` is above zero, otherwise 0.
# The column is added to inp_data itself, so it is also present in any split made after this cell.
inp_data['gt_rehab'] = np.where(inp_data['total_count_rehab'] > 0, 1, 0).astype(np.int8)

In [None]:
# 70/30 split, stratified jointly on age band, sex and the outcome (`gt_rehab`).
strat_cols = inp_data[['age_gr', 'Sex_F', 'gt_rehab']]
r_x_train, r_x_val, r_y_train, r_y_val = train_test_split(
    inp_data.drop(columns=['gt_rehab', 'Sex_F', 'age_gr']),
    strat_cols,
    test_size=0.3,
    random_state=42,
    stratify=strat_cols)
print(r_x_train.shape, r_x_val.shape)

In [None]:
# Re-attach the stratification columns to each partition, tag it, then stack both.
r_x_train = pd.concat([r_x_train, r_y_train], axis=1).assign(set='train')
r_x_val = pd.concat([r_x_val, r_y_val], axis=1).assign(set='validation')
r_x = pd.concat([r_x_train, r_x_val], axis=0)
for partition in (r_x_train, r_x_val):
    print(partition.groupby('gt_rehab')['age_gr'].value_counts())

In [None]:
# Outcome prevalence in train, then validation.
for partition in (r_x_train, r_x_val):
    print(partition['gt_rehab'].value_counts(normalize=True))

In [None]:
# Train-vs-validation descriptive table for the any-rehab split.
dm_data_tb = r_x.copy()
dm_data_tb['Sex'] = np.where(dm_data_tb['Sex_F'] == 1, 'F', 'M')
# SIMD decile -> quintile; deciles outside 1-10 (including missing) are coded -1.
dm_data_tb['simd_quint'] = np.select(
    [dm_data_tb['simd_dec'].isin(pair) for pair in ([1, 2], [3, 4], [5, 6], [7, 8], [9, 10])],
    [1, 2, 3, 4, 5],
    default=-1)
# Quintile -> reporting band: '-1', '1', '2-4' or '5'.
dm_data_tb['simd_gr'] = np.select(
    [dm_data_tb['simd_quint'] == -1, dm_data_tb['simd_quint'] < 2, dm_data_tb['simd_quint'] < 5],
    ['-1', '1', '2-4'],
    default='5')
dm_data_tb = dm_data_tb.rename(columns={
    'AgeAtAdmission': 'Age',
    'age_gr': 'Age group',
    'simd_gr': 'SIMD (1 - most deprived, 5 - least deprived)',
    'gt_rehab': 'Received rehab',
    'total_count_all': 'Total health contacts',
    'total_count_rehab': 'Total rehabilitation contacts',
    'total_count_ooh_all': 'Out-of-hours contacts',
    'total_n_disciplines_gr': 'Number of disciplines involved',
})
t_cols = ['Age', 'Total health contacts', 'Total rehabilitation contacts', 'Out-of-hours contacts']
groupby = ['set']
categorical = ['Age group', 'Sex', 'SIMD (1 - most deprived, 5 - least deprived)',
               'Received rehab', 'Number of disciplines involved']
table_cols = ['set', 'Age', 'Age group', 'Sex', 'SIMD (1 - most deprived, 5 - least deprived)',
              'Received rehab', 'Total health contacts', 'Total rehabilitation contacts',
              'Out-of-hours contacts', 'Number of disciplines involved']
dm_data_tb = dm_data_tb[table_cols]
sum_table = TableOne(dm_data_tb, table_cols,
                     categorical=categorical, groupby=groupby,
                     nonnormal=['Age', 'Total health contacts'], decimals={'Age': 0},
                     overall=True, missing=True, pval=True, htest_name=True, tukey_test=True)
sum_table

In [None]:
## Export
# Strip the partition tag, then write the training dtypes and both partitions.
r_x_train = r_x_train.drop(columns='set')
r_x_val = r_x_val.drop(columns='set')
r_x_train.dtypes.to_frame().to_csv('')
r_x_train.to_csv('', index=False)
r_x_val.to_csv('', index=False)

##### MoE

In [None]:
# 70/30 split, stratified jointly on age band, sex and the outcome (`gt_eld`).
strat_cols = inp_data[['age_gr', 'Sex_F', 'gt_eld']]
r_x_train, r_x_val, r_y_train, r_y_val = train_test_split(
    inp_data.drop(columns=['gt_eld', 'Sex_F', 'age_gr']),
    strat_cols,
    test_size=0.3,
    random_state=42,
    stratify=strat_cols)
print(r_x_train.shape, r_x_val.shape)

In [None]:
# Re-attach the stratification columns to each partition, tag it, then stack both.
r_x_train = pd.concat([r_x_train, r_y_train], axis=1).assign(set='train')
r_x_val = pd.concat([r_x_val, r_y_val], axis=1).assign(set='validation')
r_x = pd.concat([r_x_train, r_x_val], axis=0)
for partition in (r_x_train, r_x_val):
    print(partition.groupby('gt_eld')['age_gr'].value_counts())

In [None]:
# Outcome prevalence in train, then validation.
for partition in (r_x_train, r_x_val):
    print(partition['gt_eld'].value_counts(normalize=True))

In [None]:
# Train-vs-validation descriptive table for the MoE split.
# The outcome shown is `gt_eld`, the column this section stratifies on. The previous
# version was copied from the discharge-disposition section and reported `gt_dd`
# ("Home discharge") instead.
dm_data_tb = r_x.copy()
dm_data_tb['Sex'] = np.where(dm_data_tb['Sex_F'] == 1, 'F', 'M')
# SIMD decile -> quintile; deciles outside 1-10 (including missing) are coded -1.
dm_data_tb['simd_quint'] = np.select(
    [dm_data_tb['simd_dec'].isin(pair) for pair in ([1, 2], [3, 4], [5, 6], [7, 8], [9, 10])],
    [1, 2, 3, 4, 5],
    default=-1)
# Quintile -> reporting band: '-1', '1', '2-4' or '5'.
dm_data_tb['simd_gr'] = np.select(
    [dm_data_tb['simd_quint'] == -1, dm_data_tb['simd_quint'] < 2, dm_data_tb['simd_quint'] < 5],
    ['-1', '1', '2-4'],
    default='5')
# NOTE(review): 'MoE' is taken from this section's heading — replace with the full outcome wording if preferred.
dm_data_tb = dm_data_tb.rename(columns={
    'AgeAtAdmission': 'Age',
    'age_gr': 'Age group',
    'simd_gr': 'SIMD (1 - most deprived, 5 - least deprived)',
    'gt_eld': 'MoE',
    'total_count_all': 'Total health contacts',
    'total_count_rehab': 'Total rehabilitation contacts',
    'total_count_ooh_all': 'Out-of-hours contacts',
    'total_n_disciplines_gr': 'Number of disciplines involved',
})
t_cols = ['Age', 'Total health contacts', 'Total rehabilitation contacts', 'Out-of-hours contacts']
groupby = ['set']
categorical = ['Age group', 'Sex', 'SIMD (1 - most deprived, 5 - least deprived)',
               'MoE', 'Number of disciplines involved']
table_cols = ['set', 'Age', 'Age group', 'Sex', 'SIMD (1 - most deprived, 5 - least deprived)',
              'MoE', 'Total health contacts', 'Total rehabilitation contacts',
              'Out-of-hours contacts', 'Number of disciplines involved']
dm_data_tb = dm_data_tb[table_cols]
sum_table = TableOne(dm_data_tb, table_cols,
                     categorical=categorical, groupby=groupby,
                     nonnormal=['Age', 'Total health contacts'], decimals={'Age': 0},
                     overall=True, missing=True, pval=True, htest_name=True, tukey_test=True)
sum_table

In [None]:
## Export
# Strip the bookkeeping 'set' label before writing the splits to disk.
# errors='ignore' keeps this cell re-runnable: r_x_train / r_x_val are rebound
# here, so on a second execution the column is already gone and a plain drop
# would raise KeyError.
r_x_train = r_x_train.drop(columns=['set'], errors='ignore')
r_x_val = r_x_val.drop(columns=['set'], errors='ignore')
# Write the training-frame dtypes first, then the two splits without the index.
# NOTE(review): all three destination paths are empty strings in this copy of
# the notebook; each call needs its own target file before this cell can run.
pd.DataFrame(r_x_train.dtypes).to_csv('')
r_x_train.to_csv('', index=False)
r_x_val.to_csv('', index=False)

##### Total health contacts

In [None]:
### Set survivors
# NOTE(review): no filter is applied here. inp_data_s is bound to the same
# DataFrame object as inp_data (an alias, not a copy), so the "survivors"
# cohort is currently the full cohort. Confirm this is intended.
inp_data_s = inp_data

In [None]:
# 70/30 train/validation split with a fixed seed, stratified jointly on
# age group, sex and the total_count_cts_gr label. Those three columns are
# removed from the feature frame and returned separately as the "y" frames.
r_x_train, r_x_val, r_y_train, r_y_val = train_test_split(
    inp_data_s.drop(columns=['total_count_cts_gr', 'Sex_F', 'age_gr']),
    inp_data_s[['age_gr', 'Sex_F', 'total_count_cts_gr']],
    stratify=inp_data_s[['age_gr', 'Sex_F', 'total_count_cts_gr']],
    test_size=0.3,
    random_state=42,
)
print(r_x_train.shape, r_x_val.shape)

In [None]:
# Put the stratification columns back next to the features and label each
# row with the split it belongs to.
r_x_train = pd.concat([r_x_train, r_y_train], axis=1).assign(set='train')
r_x_val = pd.concat([r_x_val, r_y_val], axis=1).assign(set='validation')
# Stack both splits row-wise into a single frame for the summary table.
r_x = pd.concat([r_x_train, r_x_val])
# Age-group counts within each total_count_cts_gr level: training first, then validation.
print(
    r_x_train.groupby('total_count_cts_gr')['age_gr'].value_counts(),
    r_x_val.groupby('total_count_cts_gr')['age_gr'].value_counts(),
    sep='\n',
)

In [None]:
# Relative frequency of each total_count_cts_gr level: training first, then validation.
print(
    r_x_train['total_count_cts_gr'].value_counts(normalize=True),
    r_x_val['total_count_cts_gr'].value_counts(normalize=True),
    sep='\n',
)

In [None]:
# Descriptive table comparing the train and validation splits.
dm_data_tb = r_x.copy()
dm_data_tb['Sex'] = np.where(r_x['Sex_F'] == 1, 'F', 'M')

# Collapse SIMD deciles into quintiles (1-2 -> 1, ..., 9-10 -> 5);
# any value outside 1-10 gets the sentinel -1.
dm_data_tb['simd_quint'] = np.select(
    [dm_data_tb['simd_dec'].isin(pair) for pair in ([1, 2], [3, 4], [5, 6], [7, 8], [9, 10])],
    [1, 2, 3, 4, 5],
    default=-1,
)
# Group quintiles into '1', '2-4' and '5', keeping the sentinel as '-1'.
# The first matching condition wins, so -1 must be tested before "< 2".
dm_data_tb['simd_gr'] = np.select(
    [dm_data_tb['simd_quint'] == -1, dm_data_tb['simd_quint'] < 2, dm_data_tb['simd_quint'] < 5],
    ['-1', '1', '2-4'],
    default='5',
)

# Human-readable labels for the table rows.
dm_data_tb = dm_data_tb.rename(
    columns={
        'AgeAtAdmission': 'Age',
        'age_gr': 'Age group',
        'simd_gr': 'SIMD (1 - most deprived, 5 - least deprived)',
        'gt_m': 'In-hospital death',
        'gt_es_hosp': 'Extended stay (>=14 days)',
        'gt_cc': 'ICU/HDU admission',
        'gt_dd': 'Home discharge',
        'total_count_all': 'Total health contacts',
        'total_count_rehab': 'Total rehabilitation contacts',
        'total_count_ooh_all': 'Out-of-hours contacts',
        'total_n_disciplines_gr': 'Number of disciplines involved',
    }
)
t_cols = ['Age', 'Total health contacts', 'Total rehabilitation contacts', 'Out-of-hours contacts']
groupby = ['set']
categorical = [
    'Age group',
    'Sex',
    'SIMD (1 - most deprived, 5 - least deprived)',
    'In-hospital death',
    'Extended stay (>=14 days)',
    'ICU/HDU admission',
    'Home discharge',
    'Number of disciplines involved',
]
# Keep only the split label plus the variables reported in the table, in display order.
dm_data_tb = dm_data_tb.loc[:, [
    'set',
    'Age',
    'Age group',
    'Sex',
    'SIMD (1 - most deprived, 5 - least deprived)',
    'In-hospital death',
    'Extended stay (>=14 days)',
    'ICU/HDU admission',
    'Home discharge',
    'Total health contacts',
    'Total rehabilitation contacts',
    'Out-of-hours contacts',
    'Number of disciplines involved',
]]
sum_table = TableOne(
    dm_data_tb,
    dm_data_tb.columns.tolist(),
    categorical=categorical,
    groupby=groupby,
    nonnormal=['Age', 'Total health contacts'],
    decimals={'Age': 0},
    overall=True,
    missing=True,
    pval=True,
    htest_name=True,
    tukey_test=True,
)
sum_table

In [None]:
# NOTE(review): this cell rebuilds the same train-vs-validation table as the
# preceding cell (same input frame, same steps, same output names); it looks
# like a leftover duplicate and is a candidate for removal.
dm_data_tb = r_x.copy()
dm_data_tb['Sex'] = np.where(r_x['Sex_F'] == 1, 'F', 'M')

# Collapse SIMD deciles into quintiles (1-2 -> 1, ..., 9-10 -> 5);
# any value outside 1-10 gets the sentinel -1.
dm_data_tb['simd_quint'] = np.select(
    [dm_data_tb['simd_dec'].isin(pair) for pair in ([1, 2], [3, 4], [5, 6], [7, 8], [9, 10])],
    [1, 2, 3, 4, 5],
    default=-1,
)
# Group quintiles into '1', '2-4' and '5', keeping the sentinel as '-1'.
# The first matching condition wins, so -1 must be tested before "< 2".
dm_data_tb['simd_gr'] = np.select(
    [dm_data_tb['simd_quint'] == -1, dm_data_tb['simd_quint'] < 2, dm_data_tb['simd_quint'] < 5],
    ['-1', '1', '2-4'],
    default='5',
)

# Human-readable labels for the table rows.
dm_data_tb = dm_data_tb.rename(
    columns={
        'AgeAtAdmission': 'Age',
        'age_gr': 'Age group',
        'simd_gr': 'SIMD (1 - most deprived, 5 - least deprived)',
        'gt_m': 'In-hospital death',
        'gt_es_hosp': 'Extended stay (>=14 days)',
        'gt_cc': 'ICU/HDU admission',
        'gt_dd': 'Home discharge',
        'total_count_all': 'Total health contacts',
        'total_count_rehab': 'Total rehabilitation contacts',
        'total_count_ooh_all': 'Out-of-hours contacts',
        'total_n_disciplines_gr': 'Number of disciplines involved',
    }
)
t_cols = ['Age', 'Total health contacts', 'Total rehabilitation contacts', 'Out-of-hours contacts']
groupby = ['set']
categorical = [
    'Age group',
    'Sex',
    'SIMD (1 - most deprived, 5 - least deprived)',
    'In-hospital death',
    'Extended stay (>=14 days)',
    'ICU/HDU admission',
    'Home discharge',
    'Number of disciplines involved',
]
# Keep only the split label plus the variables reported in the table, in display order.
dm_data_tb = dm_data_tb.loc[:, [
    'set',
    'Age',
    'Age group',
    'Sex',
    'SIMD (1 - most deprived, 5 - least deprived)',
    'In-hospital death',
    'Extended stay (>=14 days)',
    'ICU/HDU admission',
    'Home discharge',
    'Total health contacts',
    'Total rehabilitation contacts',
    'Out-of-hours contacts',
    'Number of disciplines involved',
]]
sum_table = TableOne(
    dm_data_tb,
    dm_data_tb.columns.tolist(),
    categorical=categorical,
    groupby=groupby,
    nonnormal=['Age', 'Total health contacts'],
    decimals={'Age': 0},
    overall=True,
    missing=True,
    pval=True,
    htest_name=True,
    tukey_test=True,
)
sum_table

In [None]:
## Export
# Strip the bookkeeping 'set' label before writing the splits to disk.
# errors='ignore' keeps this cell re-runnable: r_x_train / r_x_val are rebound
# here, so on a second execution the column is already gone and a plain drop
# would raise KeyError.
r_x_train = r_x_train.drop(columns=['set'], errors='ignore')
r_x_val = r_x_val.drop(columns=['set'], errors='ignore')
# Write the training-frame dtypes first, then the two splits without the index.
# NOTE(review): all three destination paths are empty strings in this copy of
# the notebook; each call needs its own target file before this cell can run.
pd.DataFrame(r_x_train.dtypes).to_csv('')
r_x_train.to_csv('', index=False)
r_x_val.to_csv('', index=False)

In [None]:
# Show the column names of the training frame as a plain list.
list(r_x_train.columns)