In [1]:
import pandas as pd
import numpy as np
from mrmr import mrmr_classif
from sklearn.impute import KNNImputer
import os

CASE = 'CASE'  # name of the target column read from the cohort table
ID = 'ID'  # key the feature tables and the cohort table are merged on
data_dir = 'data'  # folder the input CSVs are read from and results are written under
subdir = 'experiment'  # sub-folder of data_dir that receives the selected feature sets
# k = 20 # number of features in each feature set

#### Load files

In [8]:
# Load the raw inputs. Every table is read with its first CSV column as the index;
# each train/test pair is read in one statement.
cohort = pd.read_csv(f'{data_dir}/cohort.csv', index_col=0)

# latest
X_latest_train, X_latest_test = (
    pd.read_csv(f'{data_dir}/X_latest_{split}.csv', index_col=0)
    for split in ('train', 'test')
)

# demographics
S_train, S_test = (
    pd.read_csv(f'{data_dir}/S_{split}.csv', index_col=0)
    for split in ('train', 'test')
)

# labs vitals stats
X_cont_stats_train, X_cont_stats_test = (
    pd.read_csv(f'{data_dir}/X_cont_stats_{split}.csv', index_col=0)
    for split in ('train', 'test')
)

# Dx Rx one hot encoded
X_dx_rx_history_train, X_dx_rx_history_test = (
    pd.read_csv(f'{data_dir}/X_dx_rx_history_{split}.csv', index_col=0)
    for split in ('train', 'test')
)

# Dx Rx phenotypes
pheno_dir = f'{data_dir}/phenotypes_50_dxrx_HALS-exact'
X_train_dxrx, X_test_dxrx = (
    pd.read_csv(f'{pheno_dir}/pheno_patient_membership_{split}.csv', index_col=0)
    for split in ('train', 'test')
)

# labs vitals phenotypes (pheno_dir is deliberately re-pointed to the second folder)
pheno_dir = f'{data_dir}/phenotypes_30_lv_HALS-exact'
X_train_lv, X_test_lv = (
    pd.read_csv(f'{pheno_dir}/pheno_patient_membership_{split}.csv', index_col=0)
    for split in ('train', 'test')
)

In [2]:
# Reload the feature sets that save() wrote earlier, so the combination cells
# further down can be run without repeating process().
cohort = pd.read_csv(f'{data_dir}/cohort.csv', index_col=0)

# Built from the config constants instead of a second hard-coded literal.
# The trailing slash is kept so the value is unchanged ('data/experiment/').
data_path = f'{data_dir}/{subdir}/'

X_train = pd.read_csv(f'{data_path}20_latest+demo/X_train.csv', index_col=0)
X_test = pd.read_csv(f'{data_path}20_latest+demo/X_test.csv', index_col=0)
y_train = pd.read_csv(f'{data_path}20_latest+demo/y_train.csv', index_col=0)
y_test = pd.read_csv(f'{data_path}20_latest+demo/y_test.csv', index_col=0)

# The targets of the aggregate and phenotype sets are loaded too: the final
# save cell references y_agg_* and y_phe_*, which would otherwise be undefined
# when the notebook is resumed from this cell on a fresh kernel.
X_agg_train = pd.read_csv(f'{data_path}30_aggregate/X_train.csv', index_col=0)
X_agg_test = pd.read_csv(f'{data_path}30_aggregate/X_test.csv', index_col=0)
y_agg_train = pd.read_csv(f'{data_path}30_aggregate/y_train.csv', index_col=0)
y_agg_test = pd.read_csv(f'{data_path}30_aggregate/y_test.csv', index_col=0)

X_phe_train = pd.read_csv(f'{data_path}30_phenotypes/X_train.csv', index_col=0)
X_phe_test = pd.read_csv(f'{data_path}30_phenotypes/X_test.csv', index_col=0)
y_phe_train = pd.read_csv(f'{data_path}30_phenotypes/y_train.csv', index_col=0)
y_phe_test = pd.read_csv(f'{data_path}30_phenotypes/y_test.csv', index_col=0)

#### Combine Files

In [4]:
def rm_cols(df):
    """Return a copy of ``df`` without any column whose name contains 'px_' or 'dx_Z'."""
    unwanted = [name for name in df.columns if 'px_' in name or 'dx_Z' in name]
    return df.drop(columns=unwanted)


def prep(X_train, X_test, cohort):
    """Look up the targets and keep only the well-populated training columns.

    Parameters
    ----------
    X_train, X_test : feature tables that can be merged with ``cohort`` on ``ID``.
    cohort : table holding the ``CASE`` target column.

    Returns
    -------
    (X_train, X_test, y_train, y_test) where ``X_train`` keeps the columns that
    are non-missing in more than 60% of its rows, ``X_test`` loses every column
    that is not in the reduced ``X_train``, and the targets are the ``CASE``
    column of the (inner) merge of each feature table with ``cohort``.
    """
    # Targets are taken from the merge of the *unfiltered* tables with the cohort.
    y_train = pd.merge(X_train, cohort, on=ID)[CASE]
    y_test = pd.merge(X_test, cohort, on=ID)[CASE]

    # Share of non-missing values per training column; keep those above 60%.
    present_share = X_train.notna().mean()
    kept_columns = present_share[present_share > 0.6].index
    X_train = X_train[kept_columns]

    # Remove from the test table whatever did not survive in the training table.
    kept_names = set(X_train.columns)
    surplus = [name for name in X_test.columns if name not in kept_names]
    X_test = X_test.drop(columns=surplus)

    return X_train, X_test, y_train, y_test


def run_mrmr(X_train, X_test, y_train, k):
    """Reduce both feature tables to the ``k`` columns selected by mRMR.

    The selection is computed from the training data only
    (``mrmr_classif`` with ``relevance='rf'``) and the same columns, in the
    same order, are then taken from ``X_test``.

    Parameters
    ----------
    X_train, X_test : feature tables; ``X_test`` must contain every selected column.
    y_train : target passed to ``mrmr_classif`` alongside ``X_train``.
    k : number of features to select.

    Returns
    -------
    (X_train, X_test) restricted to the selected columns.
    """
    # With return_scores=True the result is indexed as
    # (selected features, relevance, redundancy); only the selection is needed.
    selected = mrmr_classif(
        X=X_train, y=y_train, K=k,
        show_progress=False, return_scores=True, relevance='rf',
    )[0]

    return X_train.loc[:, selected], X_test.loc[:, selected]


def impute(X_train, X_test, cols, n):
    """Fill missing values in ``cols`` and binarise code / status columns.

    Steps, applied to copies of both tables (the inputs are left untouched):

    1. Columns whose name contains 'dx_', 'px_' or 'rx_' get missing values
       replaced by 0 and values above 1 capped at 1.
    2. ``cols`` are imputed with a ``KNNImputer`` fitted on the training table
       only and then applied to the test table.
    3. Columns of ``cols`` whose name contains one of the binary status names
       are rounded after imputation.

    Parameters
    ----------
    X_train, X_test : feature tables containing every label in ``cols``.
    cols : labels of the columns to impute.
    n : number of neighbours used by the imputer.

    Returns
    -------
    (X_train, X_test) as new DataFrames.
    """
    code_markers = ('dx_', 'px_', 'rx_')
    binary_features = ('SMOKING_STATUS', 'ALCOHOL_USE_STATUS', 'ILLICIT_DRUG_USE', 'CARDIAC_HX')

    def make_bool(df):
        # Work on a copy so the caller's frame is not modified behind its back.
        df = df.copy()
        code_cols = [c for c in df.columns if any(marker in c for marker in code_markers)]
        if code_cols:
            # Missing code -> 0, counts above 1 -> 1 (vectorised instead of a nested apply).
            df[code_cols] = df[code_cols].fillna(0).clip(upper=1)
        return df

    # impute dx, rx, and px variables with 0, replace values >1 with 1.
    X_train = make_bool(X_train)
    X_test = make_bool(X_test)

    cols = list(cols)
    if cols:
        # Fit and transform exactly the columns that are assigned back, in the
        # same order, so the imputed values cannot end up under the wrong label
        # when `cols` is a subset or a re-ordering of the frame's columns.
        # NOTE: KNNImputer drops columns that are entirely missing in the
        # training data, which would make the assignment fail; callers are
        # expected to have filtered such columns out beforehand.
        imputer = KNNImputer(n_neighbors=n, weights='distance')
        X_train[cols] = imputer.fit_transform(X_train[cols])
        X_test[cols] = imputer.transform(X_test[cols])

    # if boolean variable, round to 0 or 1.
    # Selected by label rather than by position in `cols`, so the right columns
    # are rounded even if the frames order their columns differently.
    binary_cols = [c for c in cols if any(flag in c for flag in binary_features)]
    if binary_cols:
        X_train[binary_cols] = X_train[binary_cols].round()
        X_test[binary_cols] = X_test[binary_cols].round()

    return X_train, X_test

def process(X1_train, X1_test, X2_train, X2_test, cohort, k=20, n=5, suffixes=('', '')):
    """Build one feature set from two source tables.

    The two sources are left-merged on ``ID``, passed through ``rm_cols`` and
    ``prep``, reduced to ``k`` features with ``run_mrmr`` and finally imputed
    with ``impute``.

    Parameters
    ----------
    X1_train, X1_test : first source table (left side of the merge).
    X2_train, X2_test : second source table (right side of the merge).
    cohort : table holding the target column, forwarded to ``prep``.
    k : number of features to keep.
    n : number of neighbours for the KNN imputation.
    suffixes : suffixes given to overlapping column names by the merge.

    Returns
    -------
    (X_train, X_test, y_train, y_test); the feature tables carry the row index
    they had after ``prep``.

    Raises
    ------
    ValueError
        If ``prep`` returns a different number of targets than feature rows.
    """
    # left because a couple of the tables have too few or too many patients
    X_train = pd.merge(X1_train, X2_train, on=ID, how='left', suffixes=suffixes)
    X_test = pd.merge(X1_test, X2_test, on=ID, how='left', suffixes=suffixes)
    X_train, X_test, y_train, y_test = prep(rm_cols(X_train), rm_cols(X_test), cohort)

    # Features and targets are paired purely by position below, so a row-count
    # mismatch (e.g. IDs missing from or duplicated in the cohort) must not
    # pass silently.
    if len(X_train) != len(y_train) or len(X_test) != len(y_test):
        raise ValueError(
            'features and targets have different lengths: '
            f'train {len(X_train)} vs {len(y_train)}, test {len(X_test)} vs {len(y_test)}'
        )

    train_index = X_train.index
    test_index = X_test.index

    # Feature selection runs on positionally indexed copies
    # (presumably so X and y line up inside mrmr_classif -- TODO confirm).
    X_train, X_test = run_mrmr(
        X_train.reset_index(drop=True),
        X_test.reset_index(drop=True),
        y_train.reset_index(drop=True),
        k
    )

    X_train, X_test = impute(X_train, X_test, X_train.columns, n)

    # Put the original row labels back on the feature tables. y_train / y_test
    # were never re-indexed themselves, so they need no restoring.
    X_train.index = train_index
    X_test.index = test_index

    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    return X_train, X_test, y_train, y_test

def save(X_train, X_test, y_train, y_test, data_dir, name):
    """Write the four splits as CSV files into ``{data_dir}/{name}``.

    The folder is created when missing; the files are named ``X_train.csv``,
    ``X_test.csv``, ``y_train.csv`` and ``y_test.csv``.
    """
    target_dir = f'{data_dir}/{name}'
    os.makedirs(target_dir, exist_ok=True)

    splits = (
        ('X_train', X_train),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_test', y_test),
    )
    for stem, table in splits:
        table.to_csv(f'{target_dir}/{stem}.csv')

def combine(X1, X2, y):
    """Join two feature tables on their index and fetch the matching targets.

    Returns ``(X, y)`` where ``X`` is the index-on-index merge of ``X1`` and
    ``X2`` and ``y`` is the 'CASE' column obtained by merging ``X`` with the
    given ``y`` on the index.
    """
    on_index = dict(left_index=True, right_index=True)
    features = pd.merge(X1, X2, **on_index)
    target = pd.merge(features, y, **on_index)['CASE']
    return features, target

In [11]:
# latest values + demographics, 20 features
X_train, X_test, y_train, y_test = process(
    X_latest_train, X_latest_test,
    S_train, S_test,
    cohort,
    k=20,
)

# Dx/Rx history + labs/vitals statistics, 30 features
X_agg_train, X_agg_test, y_agg_train, y_agg_test = process(
    X_dx_rx_history_train, X_dx_rx_history_test,
    X_cont_stats_train, X_cont_stats_test,
    cohort,
    k=30,
)

# Dx/Rx phenotypes + labs/vitals phenotypes, 30 features; overlapping column
# names are told apart by the suffixes
X_phe_train, X_phe_test, y_phe_train, y_phe_test = process(
    X_train_dxrx, X_test_dxrx,
    X_train_lv, X_test_lv,
    cohort,
    k=30,
    suffixes=['_dxrx', '_lv'],
)

(5570, 20) (2358, 20) (5570,) (2358,)
(5570, 30) (2358, 30) (5570,) (2358,)
(5570, 30) (2358, 30) (5570,) (2358,)


In [12]:
# Persist the three base feature sets; the folder name starts with the number
# of selected features.
save(
    X_train, X_test, y_train, y_test,
    data_dir, f'{subdir}/20_latest+demo',
)
save(
    X_agg_train, X_agg_test, y_agg_train, y_agg_test,
    data_dir, f'{subdir}/30_aggregate',
)
save(
    X_phe_train, X_phe_test, y_phe_train, y_phe_test,
    data_dir, f'{subdir}/30_phenotypes',
)

In [5]:
def _select_keep_index(X_tr, X_te, y_tr, k):
    """Run ``run_mrmr`` on positionally indexed copies and restore the row labels.

    Without the restore the selected tables would keep the 0..n-1 index created
    by ``reset_index(drop=True)`` while their targets keep the original labels,
    so the saved X and y files would no longer match by index.
    """
    X_tr_sel, X_te_sel = run_mrmr(
        X_tr.reset_index(drop=True),
        X_te.reset_index(drop=True),
        y_tr.reset_index(drop=True),
        k,
    )
    X_tr_sel.index = X_tr.index
    X_te_sel.index = X_te.index
    return X_tr_sel, X_te_sel


# latest + demographics + aggregates, reduced to 20 features
X_latest_demo_agg_train, y_latest_demo_agg_train = combine(X_train, X_agg_train, y_train)
X_latest_demo_agg_test, y_latest_demo_agg_test = combine(X_test, X_agg_test, y_test)
X_latest_demo_agg_train, X_latest_demo_agg_test = _select_keep_index(
    X_latest_demo_agg_train,
    X_latest_demo_agg_test,
    y_latest_demo_agg_train,
    20)

# latest + demographics + phenotypes, reduced to 30 features
X_latest_demo_phe_train, y_latest_demo_phe_train = combine(X_train, X_phe_train, y_train)
X_latest_demo_phe_test, y_latest_demo_phe_test = combine(X_test, X_phe_test, y_test)
X_latest_demo_phe_train, X_latest_demo_phe_test = _select_keep_index(
    X_latest_demo_phe_train,
    X_latest_demo_phe_test,
    y_latest_demo_phe_train,
    30)

# combine all: the three base sets are joined on the index, with combine()
# doing the last join and the target lookup
X_all_train, y_all_train = combine(
    pd.merge(X_train, X_agg_train, left_index=True, right_index=True),
    X_phe_train,
    y_train,
)
X_all_test, y_all_test = combine(
    pd.merge(X_test, X_agg_test, left_index=True, right_index=True),
    X_phe_test,
    y_test,
)

# save(X_all_train, X_all_test, y_all_train, y_all_test, data_dir, f'{subdir}/all')

X_all_train, X_all_test = _select_keep_index(
    X_all_train,
    X_all_test,
    y_all_train,
    60)

In [6]:
# Base feature sets
save(
    X_train, X_test, y_train, y_test,
    data_dir, f'{subdir}/20_latest+demo',
)
save(
    X_agg_train, X_agg_test, y_agg_train, y_agg_test,
    data_dir, f'{subdir}/30_aggregate',
)
save(
    X_phe_train, X_phe_test, y_phe_train, y_phe_test,
    data_dir, f'{subdir}/30_phenotypes',
)

# Combined feature sets
save(
    X_latest_demo_agg_train, X_latest_demo_agg_test,
    y_latest_demo_agg_train, y_latest_demo_agg_test,
    data_dir, f'{subdir}/20_latest+demo+aggregate',
)
save(
    X_latest_demo_phe_train, X_latest_demo_phe_test,
    y_latest_demo_phe_train, y_latest_demo_phe_test,
    data_dir, f'{subdir}/30_latest+demo+phenotypes',
)
save(
    X_all_train, X_all_test, y_all_train, y_all_test,
    data_dir, f'{subdir}/60_all',
)