In [1]:
import pandas as pd
import numpy as np

# Notebook-wide display settings. Always use the fully-qualified option names:
# the abbreviated 'max_rows' pattern matches more than one registered option on
# recent pandas releases (e.g. 'display.max_rows' and 'styler.render.max_rows')
# and set_option then raises OptionError instead of setting anything.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 300)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:20,.3f}'.format)
pd.set_option('display.max_colwidth', None)

# Seed numpy's legacy global RNG so any stochastic step below is repeatable.
np.random.seed(666)

In [2]:
# Column holding the prediction target; it is used as the dependent variable
# for target encoding and dropped from the train frame before correlations.
TARGET_COL = 'diabetes_mellitus'

# Columns listed as categorical. NOTE(review): not referenced by any other
# cell visible in this notebook section.
cat_cols = [
    'ethnicity',
    'gender',
    'hospital_admit_source',
    'icu_admit_source',
    'icu_stay_type',
    'icu_type',
]

def target_encode(var, dv, weight = 36):
    """Return a smoothed per-category mean of ``dv`` keyed by the values of ``var``.

    Each category's observed mean is shrunk towards the global mean of ``dv``
    by adding pseudo-observations whose mean equals the global mean. The
    pseudo-observations contribute ``weight * max(1, odds)`` to the numerator
    (``odds`` = global mean / (1 - global mean)) and that amount divided by
    the global mean to the denominator.

    Parameters
    ----------
    var : grouping key accepted by ``Series.groupby`` (e.g. a Series aligned
        with ``dv``). Missing keys form their own group (``dropna=False``).
    dv : pandas Series with the target values.
    weight : smoothing strength; larger values pull categories harder towards
        the global mean.

    Returns
    -------
    pandas Series indexed by category with the smoothed encoding.

    Notes
    -----
    A global mean of exactly 0 or 1 makes one of the divisions below divide
    by zero; no special handling is applied.
    """
    prior = dv.mean()
    odds = prior / (1 - prior)
    pseudo_positives = weight * max(1, odds)

    per_category = dv.groupby(var, dropna=False).agg(['count', 'mean'])
    n_obs = per_category['count']
    observed_rate = per_category['mean']

    numerator = n_obs * observed_rate + pseudo_positives
    denominator = n_obs + pseudo_positives / prior
    return numerator / denominator

def target_encode_train_test(var_train, dv_train, var_test, weight = 36):
    """Target-encode a train column and apply the same mapping to a test column.

    The mapping is fitted with ``target_encode`` on the training data only.
    Test values that have no entry in the mapping are filled with the global
    mean of ``dv_train``.

    Returns
    -------
    tuple ``(encoded_train, encoded_test)`` of pandas Series.
    """
    mapping = target_encode(var_train, dv_train, weight)
    fallback = dv_train.mean()
    encoded_train = var_train.map(mapping)
    encoded_test = var_test.map(mapping).fillna(fallback)
    return encoded_train, encoded_test

In [3]:
# Load the train and test frames. Paths are relative to the notebook's working
# directory. NOTE(review): the file names suggest already capped / ratio-
# engineered data; confirm against the notebook that produces these files.
train = pd.read_csv('Data/train_capped_ratio_all.csv')
test = pd.read_csv('Data/test_capped_ratio_all.csv')

In [4]:
# Target-encode these columns; each gets a '<name>_encoded' companion column
# in both frames, fitted on the training target only.
vars_to_encode = ['icu_id', 'apache_3j_diagnosis']
for col in vars_to_encode:
    encoded_name = col + '_encoded'
    train[encoded_name], test[encoded_name] = target_encode_train_test(
        train[col], train[TARGET_COL], test[col]
    )

# Columns left out of the correlation analysis: the raw encoded columns plus
# two id columns, except that the raw 'apache_3j_diagnosis' stays in.
features_to_ignore = [
    name for name in vars_to_encode if name != 'apache_3j_diagnosis'
] + ['hospital_id', 'encounter_id']

In [5]:
# Stack train (without the target column) on top of test so correlations are
# estimated on all available rows. The original row labels are kept, so index
# values can repeat between the two parts.
combined = pd.concat(
    [train.drop(columns=[TARGET_COL]), test]
)

In [None]:
# Rank-based (Spearman-style) correlation matrix: every column is ranked over
# its own non-missing values, then the default (Pearson) correlation of those
# ranks is taken. The matrix is also saved to disk.
corrs_spearman = (
    combined
    .drop(columns=features_to_ignore)
    .rank()
    .corr()
)
corrs_spearman.to_csv('corrs_spearman_train_capped_ratio.csv')

In [None]:
# Pearson correlation matrix over the same set of columns, saved alongside the
# rank-based one.
corrs_pearson = (
    combined
    .drop(columns=features_to_ignore)
    .corr()
)
corrs_pearson.to_csv('corrs_pearson_train_capped_ratio.csv')

In [11]:
# Pairwise complete-observation counts: entry (a, b) is the number of rows in
# `combined` where both column a and column b are non-missing.
#
# Computed as one matrix product of the 0/1 "is present" mask with itself
# (present.T @ present) instead of a Python double loop with a .loc assignment
# per cell. Values are kept as float64 and the labels are taken from
# `corrs_spearman`, so the frame and the CSV written below match what the
# element-wise loop produced (assumes the correlation matrix has identical
# row and column labels, as DataFrame.corr() returns).
present = combined[list(corrs_spearman.columns)].notna().to_numpy(dtype=float)
notna_counts = pd.DataFrame(
    present.T @ present,
    index=corrs_spearman.index,
    columns=corrs_spearman.columns,
)
notna_counts.to_csv('count_notna_pairwise.csv')

In [73]:
# Select the correlation matrix that drives feature pruning (rank-based here).
# `corrs` holds absolute correlations used for thresholding; `corrs2_notna`
# weights each squared correlation by the number of rows both columns share.
corrs = corrs_spearman.abs()
corrs2_notna = notna_counts * corrs_spearman ** 2
# Pearson variant, kept disabled:
# corrs = np.abs(corrs_pearson)
# corrs2_notna = notna_counts*corrs_pearson.pow(2)

In [74]:
# Build, for every variable, the list of other variables whose absolute
# correlation with it exceeds 0.8.
#   high_corrs_dict: variable -> list of its highly correlated partners
#   corr_list:       variables that have at least one such partner
high_corrs_dict = {}
corr_list = []

# Zero the diagonal BEFORE iterating so a variable is never reported as
# correlated with itself. Doing this inside the iterrows() loop only works
# when the yielded row happens to be a view of the frame; pandas does not
# guarantee that (and Copy-on-Write makes the row independent), in which case
# the self-correlation of 1.0 would pass the threshold.
for var in corrs.index:
    corrs.loc[var, var] = 0

for index, row in corrs.iterrows():
    corr_vars = list(row.index[row > .8])
    if corr_vars:
        high_corrs_dict[index] = corr_vars
        corr_list.append(index)

In [75]:
# Greedy pruning of highly correlated variables.
#
# While any variable still has a highly correlated partner, score every such
# variable by the sum of (shared non-missing rows * squared correlation) over
# all columns not yet removed, divided by the variable's own non-missing row
# count; remove the highest-scoring one; then update its partners' lists and
# drop from the candidates any variable left without partners.
vars_to_remove = []

# Work on copies: the loop consumes its bookkeeping structures, and mutating
# `corr_list` / `high_corrs_dict` directly would make a second run of this
# cell silently produce an empty `vars_to_remove`.
remaining = list(corr_list)
partners = {var: list(others) for var, others in high_corrs_dict.items()}

# Loop-invariant: non-missing row count for each candidate variable.
notna_totals = combined[remaining].notna().sum()

while remaining:
    scores = (
        corrs2_notna.drop(vars_to_remove, axis=1)
        .loc[remaining]
        .sum(axis=1)
        # Select in `remaining` order so positions line up with the list.
        .divide(notna_totals.loc[remaining])
    )
    # argmax is positional, matching the order of `remaining`.
    var_to_remove = remaining.pop(scores.argmax())
    vars_to_remove.append(var_to_remove)
    for var in partners[var_to_remove]:
        partners[var].remove(var_to_remove)
        if not partners[var]:
            # No highly correlated partner left: this variable is kept.
            remaining.remove(var)

In [71]:
# Display how many variables the pruning loop selected for removal.
len(vars_to_remove)

80

In [76]:
# Save the removal list as a one-column CSV named 'feature' (to_csv also
# writes the positional index as the first column by default).
pd.Series(vars_to_remove,name='feature').to_csv('corr_80_spearman.csv')

In [77]:
# Hand-maintained list of feature names; it is written to
# 'unimportant_vars_86457.csv' in the next cell.
# NOTE(review): the list is hard-coded rather than derived from
# `vars_to_remove`; its provenance is not visible in this notebook section.
high_corr_vars = [
    'd1_glucose_diff','elective_surgery','apache_post_operative','d1_mbp_noninvasive_min','h1_mbp_noninvasive_max',
    'h1_mbp_noninvasive_min','h1_heartrate_max','h1_spo2_min','h1_spo2_max','h1_calcium_max','d1_hematocrit_min',
    'd1_inr_max','h1_arterial_po2_max','d1_arterial_po2_diff','d1_arterial_po2_min_h1_diff','h1_arterial_po2_diff',
    'd1_resprate_min_h1_diff','d1_mbp_noninvasive_diff','h1_arterial_pco2_measured','d1_sysbp_noninvasive_diff',
    'h1_diasbp_invasive_diff','h1_sysbp_invasive_diff','d1_max_sysbp_diasbp_ratio','h1_pao2fio2ratio_min',
    'h1_min_sysbp_diasbp_diff','d1_min_sysbp_diasbp_diff',
#-------------------- .85 cutoff
    # NOTE(review): the marker above splits the list into two groups around a
    # 0.85 cutoff; which side is above and which below is not stated — confirm.
    'weight','gcs_eyes_apache','paco2_apache','d1_lactate_max','h1_arterial_ph_max',
    'h1_hco3_sodium_bun_creatinine_measured','h1_blood_tests_measured','d1_spo2_diff','d1_pao2fio2ratio_max_h1_diff',
    'h1_pao2fio2ratio_diff','h1_bp_invasive_flag','d1_heartrate_max','d1_sodium_min'
]

In [78]:
# Save the hand-maintained list as a one-column CSV named 'feature'.
# NOTE(review): the variable is called `high_corr_vars` but the file name says
# "unimportant_vars"; confirm which description is intended.
pd.Series(high_corr_vars,name='feature').to_csv('unimportant_vars_86457.csv')