In [2]:
import pandas as pd
import numpy as np
import os
import warnings
from los_functions_v3 import *
warnings.filterwarnings('ignore')

In [11]:
def View(df, rows=None, cols=None, width=None):
    """Render the head of a DataFrame in the notebook, in the spirit of R's View().

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to show.
    rows : int or None
        Number of leading rows to show; used both for ``df.head`` and for the
        ``display.max_rows`` option. ``None`` (the default) lifts the limit.
    cols : int or None
        Value for ``display.max_columns``; ``None`` shows every column.
    width : int or None
        Value for ``display.max_colwidth``; ``None`` leaves cell text untruncated.

    Notes
    -----
    The pandas display options are overridden only for the duration of the
    call. The function relies on a ``display`` callable being in scope, which
    IPython/Jupyter provides.
    """
    # Option name -> temporary value; line wrapping of wide frames is always off.
    temporary_options = {
        "display.max_rows": rows,
        "display.max_columns": cols,
        "display.max_colwidth": width,
        "display.expand_frame_repr": False,
    }
    # option_context expects a flat (name, value, name, value, ...) sequence.
    flat_args = [part for pair in temporary_options.items() for part in pair]

    preview = df.head(rows)
    with pd.option_context(*flat_args):
        display(preview)

In [15]:
# All hospitalized patients: read the master LOS extract (first CSV column
# becomes the index) and recode the collapsed service column value-by-value
# with map_med (presumably provided by the los_functions_v3 star import).
df_filt = pd.read_csv(
    '/gpfs/milgram/project/rtaylor/imc33/LOS/data/master_los.csv',
    index_col=[0],
).assign(
    viz_service_collapsed=lambda frame: frame["viz_service_collapsed"].apply(map_med)
)

In [16]:
fp_simple = '/gpfs/milgram/project/rtaylor/imc33/LOS/data/features_los_simple.xlsx'

features_los = pd.read_excel(fp_simple)

# The spreadsheet assigns each feature (col_name) one of five preprocessing types:
#   1. drop       - features that need to be dropped
#   2. category   - already categorical; to be one-hot encoded (add missing value
#                   indicator, impute missing by adding a 'missing' category)
#   3. binary     - binary features to be converted to categorical (add missing
#                   value indicator / prefer: fill missing with 0)
#   4. continuous - continuous/numerical variables; need outlier handling and
#                   normalization (add missing value indicator, fill missing with median)
#   5. discrete   - discrete/numerical variables; need to be discretized
#                   (fill missing with 0)
# One list of column names is built per type, in the order listed above.
drop_list, category_list, binary_list, continuous_list, discrete_list = (
    features_los.loc[features_los['type'] == feature_type, 'col_name'].tolist()
    for feature_type in ('drop', 'category', 'binary', 'continuous', 'discrete')
)

# Split into features (X) and target (y)
X = df_filt.drop(['viz_outcome_prolonged_los_yn'], axis=1)
y = df_filt['viz_outcome_prolonged_los_yn']

#inna-updated
# Columns removed from X are the union of:
#   - columns with >= 99% missing values,
#   - everything in drop_list,
#   - every continuous feature,
# with "viz_age" explicitly kept.
missing_pct = X.isnull().mean() * 100
cols_to_drop = missing_pct[missing_pct >= 99].index.tolist()
# print("Columns with >99% missing values", cols_to_drop)
# Set difference instead of list.remove(): remove() raises ValueError when
# "viz_age" is not in the combined list, although in that case it is already kept.
final_drop_list = list(set(drop_list + cols_to_drop + continuous_list) - {"viz_age"})

X = X.drop(columns=final_drop_list)
# print("Final drop list", final_drop_list)

# Turn the existing index into an ordinary column and fall back to a default
# integer index. Note that y becomes a two-column DataFrame at this point.
X = X.reset_index()
y = y.reset_index()

# Prune every per-type feature list so it no longer mentions dropped columns.
category_list, continuous_list, discrete_list, binary_list = (
    [feature for feature in feature_names if feature not in final_drop_list]
    for feature_names in (category_list, continuous_list, discrete_list, binary_list)
)
#update 8/23: do not discretize hospitalist counts
#discrete_list.remove("con_service_hospitalist_service_count")

# Cast the remaining categorical features to pandas' 'category' dtype
X[category_list] = X[category_list].astype('category')

# Group-aware shuffle split into a train set and a validation set, grouped on
# 'pat_mrn_id' (GroupShuffleSplit and SEED are presumably provided by the
# los_functions_v3 star import; if this is scikit-learn's splitter, rows that
# share a pat_mrn_id stay on the same side of the split).
gss = GroupShuffleSplit(n_splits=2, test_size=0.2, random_state=SEED)

# Only the first generated split is consumed.
train_ix, val_ix = next(gss.split(X, y, groups=X['pat_mrn_id']))

# split() yields positional row indices, so select by position (.iloc) rather
# than by label (.loc); the two only coincide while X and y carry a default
# integer index.
X_train = X.iloc[train_ix]
y_train = y.iloc[train_ix]

X_val = X.iloc[val_ix]
y_val = y.iloc[val_ix]

# Drop 'pat_mrn_id' from the features and index everything by 'pat_enc_csn_id'
X_train = X_train.drop(['pat_mrn_id'], axis=1).set_index('pat_enc_csn_id')
X_val = X_val.drop(['pat_mrn_id'], axis=1).set_index('pat_enc_csn_id')
y_train = y_train.set_index('pat_enc_csn_id')
y_val = y_val.set_index('pat_enc_csn_id')

# Columns with >= 99% missing values

In [19]:
# Inspect the columns flagged as >= 99% missing in X (cols_to_drop is built in the preprocessing cell).
cols_to_drop

['summary_sw_consult_order_time',
 'summary_sw_consult_order_day',
 'con_max_consult_order_to_sign_colon_and_rectal_hrs',
 'con_max_consult_order_to_sign_lab_medicine_hrs',
 'con_max_consult_order_to_sign_picc_hrs',
 'con_max_consult_order_to_sign_radiation_oncology_hrs',
 'con_max_consult_order_to_sign_pharmacy_hrs',
 'con_max_consult_order_to_sign_internal_medicine_hrs',
 'con_max_consult_order_to_sign_neuro_oncology_hrs',
 'con_max_consult_order_to_sign_ophthalmology_hrs',
 'con_max_consult_order_to_sign_trauma_hrs',
 'con_max_consult_order_to_sign_rheumatology_hrs',
 'con_max_consult_order_to_sign_sw_hrs',
 'con_max_consult_note_to_sign_colon_and_rectal_hrs',
 'con_max_consult_note_to_sign_lab_medicine_hrs',
 'con_max_consult_note_to_sign_picc_hrs',
 'con_max_consult_note_to_sign_radiation_oncology_hrs',
 'con_max_consult_note_to_sign_pharmacy_hrs',
 'con_max_consult_note_to_sign_internal_medicine_hrs',
 'con_max_consult_note_to_sign_neuro_oncology_hrs',
 'con_max_consult_note_to_s

In [22]:
def miss_var_summary(df):
    """Summarise missing values per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, with columns
        ``variable`` (column name), ``n_missing`` (count of missing values) and
        ``pct_missing`` (share of rows missing, in percent). Rows are ordered
        by ``pct_missing`` from highest to lowest and re-indexed from 0.
    """
    n_rows = len(df)
    summary = (
        df.isna().sum()
        .reset_index()
        .set_axis(['variable', 'n_missing'], axis=1)
        .assign(pct_missing=lambda tbl: (tbl['n_missing'] / n_rows) * 100)
        # Columns without any missing value are left out of the summary.
        .loc[lambda tbl: tbl['n_missing'] > 0]
        .sort_values(by='pct_missing', ascending=False)
        .reset_index(drop=True)
    )
    return summary


In [26]:
# Show the per-column missing-value summary for df_filt; View is called with its defaults (rows/cols/width all None).
View(miss_var_summary(df_filt))

Unnamed: 0,variable,n_missing,pct_missing
0,summary_sw_consult_order_time,12771,100.0
1,con_max_admit_order_to_consult_order_sw_quintile,12771,100.0
2,con_avg_admit_order_to_consult_order_picc_hrs,12771,100.0
3,con_avg_consult_order_to_sign_sw_hrs,12771,100.0
4,con_max_admit_order_to_consult_completion_sw_quintile,12771,100.0
5,con_avg_admit_order_to_consult_order_colon_and_rectal_hrs,12771,100.0
6,con_max_admit_order_to_consult_order_colon_and_rectal_quintile,12771,100.0
7,con_max_admit_order_to_consult_order_sw_hrs,12771,100.0
8,con_avg_consult_note_to_sign_picc_quintile,12771,100.0
9,con_max_consult_order_to_sign_picc_quintile,12771,100.0
