# Functional status (mRS) prediction

## 4. Preprocessing
- Load raw data
- Transform data - favorable functional status (mRS <= 2)
- Transform data - functional status (ordinal)
- Transform data - mortality (mRS = 6)
- Transform data - death or severe disability (mRS 4-6)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from pickle import dump, load

from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import resample

In [2]:
# raise pandas' display limits so wide or long frames are shown without truncation
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

In [3]:
# helper function for preprocessing training features

def X_processor(X, include_mt_data = True):
    """Derive binary indicator features and drop columns that are not used for modelling.

    Three 'Y'/'N' indicator columns are added:

    - ``tici_success``: 'Y' when ``tici`` contains '3' or '2B'
    - ``coll_full``:    'Y' when ``coll_score`` equals 3.0
    - ``pre_mrs_0``:    'Y' when ``pre_mrs`` equals 0.0

    A missing source value yields 'N' for all three indicators.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It must contain ``tici``, ``coll_score``, ``pre_mrs`` and every
        column named in the drop lists below. The input frame is not modified.
    include_mt_data : bool, default True
        If False, the procedure-related (mechanical thrombectomy) variables,
        including the derived ``tici_success``, are dropped as well.

    Returns
    -------
    pandas.DataFrame
        A new frame with the indicator columns appended and the unused columns removed.

    Raises
    ------
    KeyError
        If any column that should be read or dropped is absent from ``X``.
    """

    print('Loading {} samples...'.format(X.shape[0]))

    # assign() returns a new frame, so the caller's DataFrame is left untouched and the
    # function can be called repeatedly on the same input.
    # na = False: for a missing `tici`, str.contains returns a missing value rather than
    # False; np.where does not treat that as False, so the row would not be labelled 'N'
    # like the other two indicators do for missing values.
    X = X.assign(
        tici_success = np.where(X['tici'].str.contains('3|2B', regex = True, na = False), 'Y', 'N'),
        coll_full = np.where(X['coll_score'] == 3.0, 'Y', 'N'),
        pre_mrs_0 = np.where(X['pre_mrs'] == 0.0, 'Y', 'N'),
    )

    # remove variables that have too many missing values/don't provide much information or have been transformed
    vars_to_drop = ['tici', 'coll_score', 'pre_mrs', 'approach', 'gen_anes_vol', 'tmax_6', 'tmax_10', 'cbf_30',
                    'mismatch_vol', 'hypoperf_index', 'cbv_index', 'stent_ret_length', 'bgc',
                    'rescue_ia', 'trevo', 'solitaire', 'embotrap', 'capture', 'map', 'a1c']

    if not include_mt_data:
        # if not including data from mechanical thrombectomy, remove the procedure-related variables
        vars_to_drop += ['heparin', 'ptas', 'stent_ret', 'aspiration', 'num_pass', 'first_pass_reperf',
                         'procedure_ap', 'gen_anes', 'fluoro_time', 'hypoten_mt',
                         'time_to_puncture', 'time_to_first_pass', 'time_to_reperf', 'tici_success']

    return X.drop(vars_to_drop, axis = 1)

In [4]:
def quality_check(X_train, X_test, check_missing = False):
    """Verify that the training and testing feature tables have identical columns.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables to compare.
    check_missing : bool, default False
        If True, also require that neither table contains missing values.
        Off by default, so missing values are not checked unless requested.

    Raises
    ------
    ValueError
        If the column labels (or their order) differ, or if ``check_missing`` is
        True and either table contains a missing value.
    """

    if check_missing:
        if X_train.isnull().any().any() or X_test.isnull().any().any():
            raise ValueError('There is missing data')
        print('No missing data')

    # Compare plain lists instead of `columns == columns`: the element-wise Index
    # comparison raises its own length error when the column counts differ, so the
    # intended 'Features dont match' error would never be reached in that case.
    if list(X_train.columns) == list(X_test.columns):
        print('Features in training and testing sets match')
    else:
        raise ValueError('Features dont match')

In [5]:
# helper function to load data

def load_data(outcome_dir):
    """Load the pickled train/test splits stored under ``splits/<outcome_dir>``.

    The files are picked by their position in the alphabetically sorted directory
    listing, which is expected to be: X_test, X_train, y_test, y_train.

    Parameters
    ----------
    outcome_dir : str
        Name of the sub-directory of ``splits`` that holds the four split files.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by ``pandas.read_pickle``.

    Raises
    ------
    FileNotFoundError
        If the directory is missing or holds fewer than four non-hidden files.
    """

    split_dir = os.path.join('splits', outcome_dir)

    # Skip hidden entries such as '.ipynb_checkpoints' or '.DS_Store': they sort ahead
    # of the split files and would shift every positional index used below.
    files = sorted(name for name in os.listdir(split_dir) if not name.startswith('.'))

    if len(files) < 4:
        raise FileNotFoundError(
            'Expected 4 split files in {}, found {}: {}'.format(split_dir, len(files), files))

    # NOTE(review): unpickling executes arbitrary code - only load split files you created yourself.
    X_test = pd.read_pickle(os.path.join(split_dir, files[0]))
    X_train = pd.read_pickle(os.path.join(split_dir, files[1]))
    y_test = pd.read_pickle(os.path.join(split_dir, files[2]))
    y_train = pd.read_pickle(os.path.join(split_dir, files[3]))

    return X_train, X_test, y_train, y_test

In [6]:
# helper function to transform and save data

def transformer(X_train, X_test, outcome_dir, X_processor, quality_check, include_mt_data):
    """Preprocess the train/test feature tables, validate them and pickle the results.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Raw feature tables.
    outcome_dir : str
        Output directory for this outcome. Files are written to the sub-directory
        ``mt_data`` (with thrombectomy variables) or ``no_mt_data`` (without), which
        is created if it does not exist yet.
    X_processor : callable
        Called as ``X_processor(X = ..., include_mt_data = ...)``; must return the
        transformed table.
    quality_check : callable
        Called as ``quality_check(X_train = ..., X_test = ...)`` on the transformed
        tables before anything is written.
    include_mt_data : bool
        Whether the mechanical-thrombectomy variables are kept.

    Returns
    -------
    tuple
        ``(X_train_transformed, X_test_transformed)``.
    """

    if include_mt_data:
        print('Saving transformed files with variables from mechanical thrombectomy')
        subdir, suffix = 'mt_data', 'mt'
    else:
        print('Saving transformed files without variables from mechanical thrombectomy')
        subdir, suffix = 'no_mt_data', 'nomt'

    keep_mt = bool(include_mt_data)
    X_train_trans = X_processor(X = X_train, include_mt_data = keep_mt)
    X_test_trans = X_processor(X = X_test, include_mt_data = keep_mt)

    quality_check(X_train = X_train_trans, X_test = X_test_trans)

    # to_pickle does not create missing directories; make sure the target exists so the
    # notebook also runs on a fresh checkout.
    target_dir = os.path.join(outcome_dir, subdir)
    os.makedirs(target_dir, exist_ok = True)

    X_train_trans.to_pickle(os.path.join(target_dir, 'X_train_trans_{}.pkl'.format(suffix)))
    X_test_trans.to_pickle(os.path.join(target_dir, 'X_test_trans_{}.pkl'.format(suffix)))

    return X_train_trans, X_test_trans

In [7]:
# helper function to transform and save labels

def label_transformer(y_train, y_test, outcome_dir):
    """Integer-encode the outcome labels and save them as ``.npy`` files.

    The encoder is fitted on ``y_train`` only and then applied to both sets.
    The fitted encoder itself is neither returned nor saved.

    Parameters
    ----------
    y_train, y_test : array-like
        Outcome labels for the training and testing sets.
    outcome_dir : str
        Directory that receives ``y_train_trans.npy`` and ``y_test_trans.npy``;
        created if it does not exist yet.

    Returns
    -------
    tuple
        ``(y_train_trans, y_test_trans)`` as returned by ``LabelEncoder.transform``.
    """

    encoder = LabelEncoder()
    encoder.fit(y_train)
    y_train_trans = encoder.transform(y_train)
    # NOTE(review): presumably this raises if y_test holds a label absent from y_train - confirm.
    y_test_trans = encoder.transform(y_test)

    # np.save does not create missing directories; make sure the target exists first.
    os.makedirs(outcome_dir, exist_ok = True)

    np.save(os.path.join(outcome_dir, 'y_train_trans.npy'), y_train_trans)
    np.save(os.path.join(outcome_dir, 'y_test_trans.npy'), y_test_trans)

    return y_train_trans, y_test_trans

### Favorable functional status

Dichotomized as favorable (mRS <=2) or unfavorable (mRS >=3)

In [67]:
# read the pickled train/test splits for the favorable-functional-status outcome
X_train, X_test, y_train, y_test = load_data('fav_functional_status')

In [68]:
# feature sets that keep the mechanical-thrombectomy variables
X_train_trans_mt, X_test_trans_mt = transformer(
    X_train = X_train,
    X_test = X_test,
    outcome_dir = 'transformed_datasets/fav_functional_status',
    X_processor = X_processor,
    quality_check = quality_check,
    include_mt_data = True,
)

Saving transformed files with variables from mechanical thrombectomy
Loading 285 samples...
Loading 72 samples...
Features in training and testing sets match


In [69]:
# feature sets without the mechanical-thrombectomy variables
X_train_trans_nomt, X_test_trans_nomt = transformer(
    X_train = X_train,
    X_test = X_test,
    outcome_dir = 'transformed_datasets/fav_functional_status',
    X_processor = X_processor,
    quality_check = quality_check,
    include_mt_data = False,
)

Saving transformed files without variables from mechanical thrombectomy
Loading 285 samples...
Loading 72 samples...
Features in training and testing sets match


In [70]:
# encode and save the favorable-functional-status labels
y_train_trans, y_test_trans = label_transformer(
    y_train = y_train,
    y_test = y_test,
    outcome_dir = 'transformed_datasets/fav_functional_status',
)

### Functional status

Ordinal mRS (0 - 6)

In [73]:
# read the pickled train/test splits for the ordinal functional-status outcome
X_train, X_test, y_train, y_test = load_data('functional_status')

In [74]:
# feature sets that keep the mechanical-thrombectomy variables
X_train_trans_mt, X_test_trans_mt = transformer(
    X_train = X_train,
    X_test = X_test,
    outcome_dir = 'transformed_datasets/functional_status',
    X_processor = X_processor,
    quality_check = quality_check,
    include_mt_data = True,
)

Saving transformed files with variables from mechanical thrombectomy
Loading 285 samples...
Loading 72 samples...
Features in training and testing sets match


In [75]:
# feature sets without the mechanical-thrombectomy variables
X_train_trans_nomt, X_test_trans_nomt = transformer(
    X_train = X_train,
    X_test = X_test,
    outcome_dir = 'transformed_datasets/functional_status',
    X_processor = X_processor,
    quality_check = quality_check,
    include_mt_data = False,
)

Saving transformed files without variables from mechanical thrombectomy
Loading 285 samples...
Loading 72 samples...
Features in training and testing sets match


In [76]:
# encode and save the ordinal functional-status labels
y_train_trans, y_test_trans = label_transformer(
    y_train = y_train,
    y_test = y_test,
    outcome_dir = 'transformed_datasets/functional_status',
)

### Mortality
mRS = 6

In [77]:
# read the pickled train/test splits for the mortality outcome
X_train, X_test, y_train, y_test = load_data('mortality')

In [78]:
# feature sets that keep the mechanical-thrombectomy variables
X_train_trans_mt, X_test_trans_mt = transformer(
    X_train = X_train,
    X_test = X_test,
    outcome_dir = 'transformed_datasets/mortality',
    X_processor = X_processor,
    quality_check = quality_check,
    include_mt_data = True,
)

Saving transformed files with variables from mechanical thrombectomy
Loading 285 samples...
Loading 72 samples...
Features in training and testing sets match


In [79]:
# feature sets without the mechanical-thrombectomy variables
X_train_trans_nomt, X_test_trans_nomt = transformer(
    X_train = X_train,
    X_test = X_test,
    outcome_dir = 'transformed_datasets/mortality',
    X_processor = X_processor,
    quality_check = quality_check,
    include_mt_data = False,
)

Saving transformed files without variables from mechanical thrombectomy
Loading 285 samples...
Loading 72 samples...
Features in training and testing sets match


In [80]:
# encode and save the mortality labels
y_train_trans, y_test_trans = label_transformer(
    y_train = y_train,
    y_test = y_test,
    outcome_dir = 'transformed_datasets/mortality',
)

### Death or severe disability
mRS = 4-6

In [8]:
# read the pickled train/test splits for the death-or-severe-disability outcome
X_train, X_test, y_train, y_test = load_data('dsd')

In [9]:
# feature sets that keep the mechanical-thrombectomy variables
X_train_trans_mt, X_test_trans_mt = transformer(
    X_train = X_train,
    X_test = X_test,
    outcome_dir = 'transformed_datasets/dsd',
    X_processor = X_processor,
    quality_check = quality_check,
    include_mt_data = True,
)

Saving transformed files with variables from mechanical thrombectomy
Loading 285 samples...
Loading 72 samples...
Features in training and testing sets match


In [10]:
# feature sets without the mechanical-thrombectomy variables
X_train_trans_nomt, X_test_trans_nomt = transformer(
    X_train = X_train,
    X_test = X_test,
    outcome_dir = 'transformed_datasets/dsd',
    X_processor = X_processor,
    quality_check = quality_check,
    include_mt_data = False,
)

Saving transformed files without variables from mechanical thrombectomy
Loading 285 samples...
Loading 72 samples...
Features in training and testing sets match


In [11]:
# encode and save the death-or-severe-disability labels
y_train_trans, y_test_trans = label_transformer(
    y_train = y_train,
    y_test = y_test,
    outcome_dir = 'transformed_datasets/dsd',
)