# Functional status (mRS) prediction

## 5. Training

### Favorable functional status (mRS >= 2)
1. with MT data
2. without MT data
### Mortality (mRS 6)
3. with MT data
4. without MT data
### Death/severe disability (mRS 4-6)
5. with MT data
6. without MT data

In [72]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, StratifiedKFold, GridSearchCV
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.feature_selection import RFECV, SelectPercentile, chi2, VarianceThreshold

import joblib
import pickle

import xgboost as xgb
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from imblearn.pipeline import Pipeline as imb_pipeline
from imblearn.over_sampling import SMOTE

In [2]:
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

In [3]:
# helper function for training/saving results with a baseline LR model

def baseline_lr(X_train, y_train, n_splits, n_repeats, scoring, fname):
    
    # make sure fname is valid
    if os.path.isdir(os.path.split(fname)[0]) == False:
        raise ValueError('Invalid directory specified')
    
    # identify categorical and continuous vars
    cont_columns_selector = selector(dtype_exclude = object)
    cat_columns_selector = selector(dtype_include = object)
    cont_columns = cont_columns_selector(X_train)
    cat_columns = cat_columns_selector(X_train)
    all_columns = list(X_train.columns)

    # instantiate the pre-processing pipelines for categorical and continuous variables
    cat_preprocessor = Pipeline(steps = [
            ('ordinal encoder', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = np.nan)),
            ('imputer', SimpleImputer(strategy = 'most_frequent')),
            ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore'))])
    cont_preprocessor = Pipeline(steps = [
            ('imputer', IterativeImputer(max_iter = 10000, random_state = 42)),
            ('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(
            transformers = [
            ('cont', cont_preprocessor, cont_columns), 
            ('cat', cat_preprocessor, cat_columns)])

    pipe = Pipeline(
        steps = [('preprocessor', preprocessor), 
                 ('logistic regression', LogisticRegression(penalty = 'none', random_state = 42, 
                                                            max_iter = 10000))])

    cv = RepeatedStratifiedKFold(n_splits = n_splits, n_repeats = n_repeats, random_state = 42)
    print('Evaluating baseline logistic regression model with CV using {} splits and {} repeats'.
          format(n_splits, n_repeats))
    scores = list(cross_val_score(pipe, X_train, y_train, scoring = scoring, cv = cv))

    pd.DataFrame(scores).to_pickle(fname)
    print('Mean AUC for baseline logistic regression model: {} +/- {}'.format(np.mean(scores), np.std(scores)))
    
    return pipe, pd.DataFrame(scores)

In [4]:
# helper function for screening models with default settings and minimal preprocessing

def screen_models(models, score, X_train, y_train, fname, fname_df):
    results = []
    result_file = open(fname, 'a')
    
    # identify categorical and continuous vars
    cont_columns_selector = selector(dtype_exclude = object)
    cat_columns_selector = selector(dtype_include = object)
    cont_columns = cont_columns_selector(X_train)
    cat_columns = cat_columns_selector(X_train)
    all_columns = list(X_train.columns)
    
    # instantiate the pre-processing pipelines for categorical and continuous variables
    cat_preprocessor = Pipeline(steps = [
        ('ordinal encoder', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = np.nan)),
        ('imputer', SimpleImputer(strategy = 'most_frequent')),
        ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore'))])
    cont_preprocessor = Pipeline(steps = [
        ('imputer', IterativeImputer(max_iter = 10000, random_state = 0)),
        ('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(
        transformers = [
        ('cont', cont_preprocessor, cont_columns), 
        ('cat', cat_preprocessor, cat_columns)])  
    
    for model_name, Model, params_list in models:
        print('Evaluating {}...'.format(model_name))
        for params in params_list:
            model = Pipeline(steps = [
                ('preprocessor', preprocessor),
                ('model', Model(**params))
            ])
            cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 10)
            scores = list(cross_val_score(model, X_train, y_train, scoring = score, cv = cv))
            results.append((model_name, model, params, np.median(scores), np.percentile(scores, [25, 75]), scores))
    
    results.sort(key = lambda x:x[-3], reverse = True)
    
    # write score summary to txt file
    result_file.write('\nmedian {} scores:\n\n'.format(score))
    for modelname, model, params, median, iqr, scores in results:
        result_file.write(str(modelname) + '\t' + str(params) + '\t' + str(median) + '\t' + '+/- ' + str(iqr) + '\n')
    result_file.close()
    
    # write scores to dataframe
    df = pd.DataFrame()
    for modelname, model, params, median, iqr, scores in results:
        column_name = str(modelname) + str(params)
        df[column_name] = scores
    df.to_pickle(fname_df)
    
    return df

In [5]:
# helper function for nested CV to find best parameters for sklearn models

def eval_params(fname_text, fname_results, tuning_model, param_grid, X_train, y_train):
    
    results = []
    result_file = open(fname_text, 'a')

    skf = StratifiedKFold(n_splits = 5, shuffle = True)
    fold_no = 1
    
    # identify categorical and continuous vars
    cont_columns_selector = selector(dtype_exclude = object)
    cat_columns_selector = selector(dtype_include = object)
    cont_columns = cont_columns_selector(X_train)
    cat_columns = cat_columns_selector(X_train)
    all_columns = list(X_train.columns)
    
    # instantiate the pre-processing pipelines for categorical and continuous variables
    cat_preprocessor = Pipeline(steps = [
        ('ordinal encoder', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = np.nan)),
        ('imputer', SimpleImputer(strategy = 'most_frequent')),
        ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore')),
        ('selector', SelectPercentile(chi2))])
    cont_preprocessor = Pipeline(steps = [
        ('imputer', IterativeImputer(max_iter = 10000, random_state = 0)),
        ('variance_threshold', VarianceThreshold()),
        ('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(
        transformers = [
        ('cont', cont_preprocessor, cont_columns), 
        ('cat', cat_preprocessor, cat_columns)])  
    
    for train_index, test_index in skf.split(X_train, y_train):

        X_train_split, X_test = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_split, y_test = y_train[train_index], y_train[test_index]
        
        # find best model params
        print('finding best model parameters for fold number {}'.format(fold_no))
        model = imb_pipeline([
            ('preprocessor', preprocessor),
            ('SMOTE', SMOTE()),
            ('classifier', tuning_model)
        ])
        grid = GridSearchCV(estimator = model, param_grid = param_grid, n_jobs = -1, cv = 5, error_score = 'raise', 
                           scoring = 'roc_auc', verbose = 3)
        grid.fit(X_train_split, y_train_split)
        
        # evaluate best model params on outer fold
        print('evaluating model for fold number {}'.format(fold_no))
        best_params = grid.best_params_
        print(best_params)
        best_model = grid.best_estimator_
        score = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
        results.append((best_params, score))
        print('parameters: {}'.format(str(best_params)))
        print('AUC score: {}'.format(score))
        fold_no += 1

    results.sort(key = lambda x:x[-1], reverse = True)
    result_file.write('\nAUC scores:\n\n')
    
    # write score summary to text file
    for best_params, score in results:
        result_file.write(str(best_params) + '\t' + str(score) + '\n')
    result_file.close()
    
    # write scores to dataframe:
    df = pd.DataFrame(results, columns = ['params', 'score'])
    df.to_pickle(fname_results)
    
    return df

In [71]:
# helper function for retraining tuned model + saving

def retrain_save(X, y, percentile, threshold, final_model, fname):
    
    # identify categorical and continuous vars
    cont_columns_selector = selector(dtype_exclude = object)
    cat_columns_selector = selector(dtype_include = object)
    cont_columns = cont_columns_selector(X)
    cat_columns = cat_columns_selector(X)
    all_columns = list(X.columns)
    
    # instantiate the pre-processing pipelines for categorical and continuous variables
    cat_preprocessor = Pipeline(steps = [
        ('ordinal encoder', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = np.nan)),
        ('imputer', SimpleImputer(strategy = 'most_frequent')),
        ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore')),
        ('selector', SelectPercentile(chi2, percentile = percentile))])
    cont_preprocessor = Pipeline(steps = [
        ('imputer', IterativeImputer(max_iter = 10000, random_state = 0)),
        ('variance_threshold', VarianceThreshold(threshold = threshold)),
        ('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(
        transformers = [
        ('cont', cont_preprocessor, cont_columns), 
        ('cat', cat_preprocessor, cat_columns)])  
    
    model = imb_pipeline([
            ('preprocessor', preprocessor),
            ('SMOTE', SMOTE()),
            ('classifier', final_model)
        ])
    
    model.fit(X, y)
    print('Training complete for {}'.format(final_model))
    
    pickle.dump(model, open(fname, 'wb'))
    if os.path.exists(fname):
        print('Model saved')
    else:
        print('Error saving model')
    
    return model

In [88]:
# helper function for training LR model + saving

def train_save_lr(X, y, fname):
    
    # identify categorical and continuous vars
    cont_columns_selector = selector(dtype_exclude = object)
    cat_columns_selector = selector(dtype_include = object)
    cont_columns = cont_columns_selector(X)
    cat_columns = cat_columns_selector(X)
    all_columns = list(X.columns)

    # instantiate the pre-processing pipelines for categorical and continuous variables
    cat_preprocessor = Pipeline(steps = [
            ('ordinal encoder', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = np.nan)),
            ('imputer', SimpleImputer(strategy = 'most_frequent')),
            ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore'))])
    cont_preprocessor = Pipeline(steps = [
            ('imputer', IterativeImputer(max_iter = 10000, random_state = 42)),
            ('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(
            transformers = [
            ('cont', cont_preprocessor, cont_columns), 
            ('cat', cat_preprocessor, cat_columns)])

    pipe = Pipeline(
        steps = [('preprocessor', preprocessor), 
                 ('logistic regression', LogisticRegression(penalty = 'none', random_state = 42, 
                                                            max_iter = 10000))])
    
    pipe.fit(X, y)
    print('Training complete')
    
    pickle.dump(pipe, open(fname, 'wb'))
    if os.path.exists(fname):
        print('Model saved')
    else:
        print('Error saving model')
    
    return pipe

In [37]:
svm_grid = [
    {
        'preprocessor__cat__selector__percentile': np.linspace(10, 100, num = 10), 
        'preprocessor__cont__variance_threshold__threshold': np.linspace(0.4, 0.6, num = 3), 
        'classifier__C': [0.01, 0.1, 1.0, 10], 
        'classifier__gamma': ['scale', 'auto']
    }    
]

In [38]:
rf_grid = [
    {
        'preprocessor__cat__selector__percentile': np.linspace(10, 100, num = 10), 
        'preprocessor__cont__variance_threshold__threshold': np.linspace(0.4, 0.6, num = 3), 
        'classifier__n_estimators': [int(x) for x in np.linspace(500, 2000, num = 4)],
        'classifier__max_depth': np.linspace(20, 100, num = 5),
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    }    
]

In [39]:
mlp_grid = [
    {
        'preprocessor__cat__selector__percentile': np.linspace(10, 100, num = 10), 
        'preprocessor__cont__variance_threshold__threshold': np.linspace(0.4, 0.6, num = 3), 
        'classifier__hidden_layer_sizes': [(10,), (10, 10,), (10, 10, 10,), (20,), (20, 20,), (20, 20, 20,), 
                                           (30,), (30, 30,), (30, 30, 30,)],
        'classifier__beta_1': np.linspace(0.3, 0.9, num = 3),
        'classifier__beta_2': np.linspace(0.3, 0.9, num = 3)
    }    
]

In [40]:
nb_grid = [
    {
        'preprocessor__cat__selector__percentile': np.linspace(10, 100, num = 10), 
        'preprocessor__cont__variance_threshold__threshold': np.linspace(0.4, 0.6, num = 3), 
        'classifier__var_smoothing': np.logspace(0, -9, num = 10)
    }
]

### 1. Favorable functional status prediction - with MT data

Model training includes variables from the mechanical thrombectomy

In [68]:
X_train_mt = pd.read_pickle('transformed_datasets/fav_functional_status/mt_data/X_train_trans_mt.pkl')
y_train = np.load('transformed_datasets/fav_functional_status/y_train_trans.npy')

In [12]:
# evaluate baseline LR model

pipe_lr_mt, scores_lr_mt = baseline_lr(X_train = X_train_mt, y_train = y_train, n_splits = 5, n_repeats = 10, 
                                       scoring = 'roc_auc', 
                                       fname = 'experiments/fav_functional_status/mt_data/baseline_lr.pkl')

Evaluating baseline logistic regression model with CV using 5 splits and 10 repeats
Mean AUC for baseline logistic regression model: 0.7718741032998565 +/- 0.0568870610395341


In [16]:
# models to screen

max_iter = 10000

en_params = [{'max_iter': max_iter}]
dec_tree_params = [{'criterion': 'gini'}, {'criterion': 'entropy'}]
rand_for_params = [{'criterion': 'gini', 'n_estimators': 500}, {'criterion': 'entropy', 'n_estimators': 500}]
kneighbors_params = [{'n_neighbors': 3}, {'n_neighbors': 5}]
naive_bayes_params = [{}]
svc_params = [{'C': 0.01}, {'C': 0.1}, {'C': 1}, {'C': 10}]
xgb_params = [{'use_label_encoder': False}]
lgbm_params = [{}]
mlp_params = [{'hidden_layer_sizes': (10,), 'max_iter': max_iter}, {'hidden_layer_sizes': (10, 10,), 'max_iter': max_iter}, 
              {'hidden_layer_sizes': (10, 10, 10,), 'max_iter': max_iter}]

models = [
    ['elastic net', ElasticNet, en_params],
    ['decision tree', DecisionTreeClassifier, dec_tree_params],
    ['random forest', RandomForestClassifier, rand_for_params],
    ['k neighbors', KNeighborsClassifier, kneighbors_params],
    ['naive bayes', GaussianNB, naive_bayes_params],
    ['support vector machines', SVC, svc_params],
    ['XG boost', xgb.XGBClassifier, xgb_params],
    ['Light GBM', LGBMClassifier, lgbm_params],
    ['MLP', MLPClassifier, mlp_params]
]

In [18]:
# screen models

fname = 'experiments/fav_functional_status/mt_data/init_screening_summary' + '_' + str(datetime.now().year) + \
    '_' + str(datetime.now().month) + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_df = 'experiments/fav_functional_status/mt_data/init_screening_results' + '_' + str(datetime.now().year) + \
    '_' + str(datetime.now().month) + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl' 

init_screen_mt = screen_models(models = models, score = 'roc_auc', X_train = X_train_mt, y_train = y_train, 
                              fname = fname, fname_df = fname_df)

Evaluating elastic net...
Evaluating decision tree...
Evaluating random forest...
Evaluating k neighbors...
Evaluating naive bayes...
Evaluating support vector machines...




Evaluating XG boost...
Evaluating Light GBM...
Evaluating MLP...


In [19]:
init_screen_mt.describe()

Unnamed: 0,support vector machines{'C': 1},support vector machines{'C': 0.01},support vector machines{'C': 0.1},support vector machines{'C': 10},"random forest{'criterion': 'gini', 'n_estimators': 500}","random forest{'criterion': 'entropy', 'n_estimators': 500}","MLP{'hidden_layer_sizes': (10,), 'max_iter': 10000}","MLP{'hidden_layer_sizes': (10, 10, 10), 'max_iter': 10000}",XG boost{'use_label_encoder': False},"MLP{'hidden_layer_sizes': (10, 10), 'max_iter': 10000}",Light GBM{},naive bayes{},k neighbors{'n_neighbors': 5},k neighbors{'n_neighbors': 3},decision tree{'criterion': 'entropy'},decision tree{'criterion': 'gini'},elastic net{'max_iter': 10000}
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.797484,0.796841,0.792268,0.775598,0.764498,0.758314,0.754566,0.746269,0.749872,0.74379,0.740431,0.715809,0.693907,0.683881,0.63102,0.638249,0.5
std,0.06623,0.054273,0.066496,0.061575,0.063794,0.06476,0.061257,0.067691,0.061448,0.065571,0.068307,0.062453,0.077251,0.055006,0.068572,0.072898,0.0
min,0.642647,0.669118,0.617647,0.619118,0.602896,0.641176,0.620588,0.555882,0.630882,0.58689,0.616176,0.533088,0.463235,0.554878,0.472059,0.480147,0.5
25%,0.765244,0.750762,0.750368,0.737132,0.725825,0.703676,0.716176,0.700139,0.709205,0.710294,0.692279,0.684926,0.641544,0.643674,0.586213,0.598498,0.5
50%,0.805882,0.804591,0.801704,0.784559,0.770588,0.7625,0.760482,0.748529,0.747794,0.747767,0.732353,0.718158,0.694118,0.685662,0.6375,0.630882,0.5
75%,0.844118,0.845588,0.844086,0.806618,0.803067,0.805147,0.796691,0.788971,0.793015,0.797982,0.792647,0.757023,0.744669,0.724265,0.675184,0.677112,0.5
max,0.911765,0.886765,0.913235,0.929412,0.903676,0.876471,0.860294,0.891176,0.875,0.857353,0.916176,0.825,0.860294,0.808088,0.807353,0.794853,0.5


In [43]:
# tune SVM

fname_text = 'experiments/fav_functional_status/mt_data/svm_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/fav_functional_status/mt_data/svm_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = SVC(probability = True), param_grid = svm_grid, X_train = X_train_mt, 
            y_train = y_train)

finding best model parameters for fold number 1
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
evaluating model for fold number 1
{'classifier__C': 1.0, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 50.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
parameters: {'classifier__C': 1.0, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 50.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
AUC score: 0.8455882352941175
finding best model parameters for fold number 2
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
evaluating model for fold number 2
{'classifier__C': 0.1, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 60.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
parameters: {'classifier__C': 0.1, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 60.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
AUC score: 0.885

Unnamed: 0,params,score
0,"{'classifier__C': 0.1, 'classifier__gamma': 'a...",0.885294
1,"{'classifier__C': 1.0, 'classifier__gamma': 'a...",0.845588
2,"{'classifier__C': 1.0, 'classifier__gamma': 'a...",0.822059
3,"{'classifier__C': 1.0, 'classifier__gamma': 'a...",0.796324
4,"{'classifier__C': 1.0, 'classifier__gamma': 'a...",0.746951


In [44]:
# tune RF

fname_text = 'experiments/fav_functional_status/mt_data/rf_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/fav_functional_status/mt_data/rf_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = RandomForestClassifier(), param_grid = rf_grid, X_train = X_train_mt, 
            y_train = y_train)

finding best model parameters for fold number 1
Fitting 5 folds for each of 5400 candidates, totalling 27000 fits
evaluating model for fold number 1
{'classifier__max_depth': 60.0, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 1500, 'preprocessor__cat__selector__percentile': 10.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
parameters: {'classifier__max_depth': 60.0, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 1500, 'preprocessor__cat__selector__percentile': 10.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
AUC score: 0.8073529411764706
finding best model parameters for fold number 2
Fitting 5 folds for each of 5400 candidates, totalling 27000 fits
evaluating model for fold number 2
{'classifier__max_depth': 40.0, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 1000, 'preprocessor__cat__selector__perc

Unnamed: 0,params,score
0,"{'classifier__max_depth': 60.0, 'classifier__m...",0.833824
1,"{'classifier__max_depth': 60.0, 'classifier__m...",0.830793
2,"{'classifier__max_depth': 60.0, 'classifier__m...",0.807353
3,"{'classifier__max_depth': 40.0, 'classifier__m...",0.773529
4,"{'classifier__max_depth': 60.0, 'classifier__m...",0.764706


In [45]:
# tune MLP

fname_text = 'experiments/fav_functional_status/mt_data/mlp_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/fav_functional_status/mt_data/mlp_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = MLPClassifier(max_iter = 10000), param_grid = mlp_grid, X_train = X_train_mt, 
            y_train = y_train)

finding best model parameters for fold number 1
Fitting 5 folds for each of 2430 candidates, totalling 12150 fits
evaluating model for fold number 1
{'classifier__beta_1': 0.9, 'classifier__beta_2': 0.9, 'classifier__hidden_layer_sizes': (10,), 'preprocessor__cat__selector__percentile': 70.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
parameters: {'classifier__beta_1': 0.9, 'classifier__beta_2': 0.9, 'classifier__hidden_layer_sizes': (10,), 'preprocessor__cat__selector__percentile': 70.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
AUC score: 0.75
finding best model parameters for fold number 2
Fitting 5 folds for each of 2430 candidates, totalling 12150 fits
evaluating model for fold number 2
{'classifier__beta_1': 0.3, 'classifier__beta_2': 0.3, 'classifier__hidden_layer_sizes': (30, 30, 30), 'preprocessor__cat__selector__percentile': 50.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
parameters: {'classifier__beta_1': 0.3, 'classifier__beta_

Unnamed: 0,params,score
0,"{'classifier__beta_1': 0.9, 'classifier__beta_...",0.826471
1,"{'classifier__beta_1': 0.3, 'classifier__beta_...",0.789706
2,"{'classifier__beta_1': 0.3, 'classifier__beta_...",0.760671
3,"{'classifier__beta_1': 0.9, 'classifier__beta_...",0.75
4,"{'classifier__beta_1': 0.9, 'classifier__beta_...",0.736765


In [77]:
# retrain best model on entire training set and save model

retrain_save(X = X_train_mt, 
             y = y_train, 
             percentile = 10, 
             threshold = 0.5, 
             final_model = RandomForestClassifier(max_depth = 60, 
                                                 min_samples_leaf = 4,
                                                 min_samples_split = 2, 
                                                 n_estimators = 1000), 
             fname = 'models/fav_functional_status/mt_data/final_rf_model.pkl')

Training complete for RandomForestClassifier(max_depth=60, min_samples_leaf=4, n_estimators=1000)
Model saved


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   IterativeImputer(max_iter=10000,
                                                                                    random_state=0)),
                                                                  ('variance_threshold',
                                                                   VarianceThreshold(threshold=0.5)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['glucose', 'calcium', 'mag',
                                                   'phos', 'inr', 'plt',
                                                   'plt_lymph', 'sbp',
                                                 

In [89]:
# train LR model on entire training set and save model

train_save_lr(X = X_train_mt, 
             y = y_train, 
             fname = 'models/fav_functional_status/mt_data/final_lr_model.pkl')

Training complete
Model saved


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   IterativeImputer(max_iter=10000,
                                                                                    random_state=42)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['glucose', 'calcium', 'mag',
                                                   'phos', 'inr', 'plt',
                                                   'plt_lymph', 'sbp',
                                                   'nih_admit', 'age', 'bmi',
                                                   'aspects', 'heparin',
                                                   'num_pass', 'fluoro_time',
          

### 2. Favorable functional status prediction - without MT data
Model training does not include variables from the mechanical thrombectomy

In [74]:
X_train_nomt = pd.read_pickle('transformed_datasets/fav_functional_status/no_mt_data/X_train_trans_nomt.pkl')
y_train = np.load('transformed_datasets/fav_functional_status/y_train_trans.npy')

In [14]:
# evaluate baseline LR model

pipe_lr_nomt, scores_lr_nomt = baseline_lr(X_train = X_train_nomt, y_train = y_train, n_splits = 5, n_repeats = 10, 
                                       scoring = 'roc_auc', 
                                       fname = 'experiments/fav_functional_status/no_mt_data/baseline_lr.pkl')

Evaluating baseline logistic regression model with CV using 5 splits and 10 repeats
Mean AUC for baseline logistic regression model: 0.7653073888091823 +/- 0.05668853285375565


In [20]:
# screen models

fname = 'experiments/fav_functional_status/no_mt_data/init_screening_summary' + '_' + str(datetime.now().year) + \
    '_' + str(datetime.now().month) + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_df = 'experiments/fav_functional_status/no_mt_data/init_screening_results' + '_' + str(datetime.now().year) + \
    '_' + str(datetime.now().month) + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl' 

init_screen_nomt = screen_models(models = models, score = 'roc_auc', X_train = X_train_nomt, y_train = y_train, 
                              fname = fname, fname_df = fname_df)

Evaluating elastic net...
Evaluating decision tree...
Evaluating random forest...
Evaluating k neighbors...
Evaluating naive bayes...
Evaluating support vector machines...
Evaluating XG boost...
Evaluating Light GBM...
Evaluating MLP...


In [21]:
init_screen_nomt.describe()

Unnamed: 0,support vector machines{'C': 1},"random forest{'criterion': 'entropy', 'n_estimators': 500}",support vector machines{'C': 0.1},support vector machines{'C': 0.01},"random forest{'criterion': 'gini', 'n_estimators': 500}","MLP{'hidden_layer_sizes': (10,), 'max_iter': 10000}","MLP{'hidden_layer_sizes': (10, 10, 10), 'max_iter': 10000}",support vector machines{'C': 10},naive bayes{},"MLP{'hidden_layer_sizes': (10, 10), 'max_iter': 10000}",XG boost{'use_label_encoder': False},Light GBM{},k neighbors{'n_neighbors': 5},k neighbors{'n_neighbors': 3},decision tree{'criterion': 'entropy'},decision tree{'criterion': 'gini'},elastic net{'max_iter': 10000}
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.739769,0.731725,0.735853,0.735799,0.726879,0.689289,0.690042,0.693569,0.684504,0.679984,0.680801,0.684174,0.643374,0.61545,0.576135,0.582048,0.5
std,0.05657,0.057517,0.04829,0.05581,0.06363,0.060079,0.073649,0.066492,0.076228,0.060335,0.068725,0.072771,0.072418,0.071068,0.063252,0.070848,0.0
min,0.613235,0.600735,0.626471,0.617647,0.564024,0.514706,0.482353,0.53811,0.502206,0.536765,0.525,0.457353,0.478676,0.45,0.438235,0.442647,0.5
25%,0.711029,0.701639,0.70161,0.697426,0.688215,0.648399,0.636029,0.653309,0.618936,0.633456,0.638971,0.644485,0.605882,0.579044,0.527094,0.539652,0.5
50%,0.743275,0.739136,0.735294,0.734559,0.732721,0.708824,0.703605,0.693382,0.690441,0.688971,0.688235,0.684451,0.647059,0.608528,0.576838,0.566544,0.5
75%,0.774632,0.769853,0.775,0.770588,0.765074,0.734559,0.743221,0.728842,0.740441,0.719118,0.723162,0.727941,0.690441,0.669669,0.610294,0.609191,0.5
max,0.897059,0.870588,0.820588,0.860294,0.874265,0.776471,0.835366,0.856707,0.844118,0.791176,0.839706,0.836765,0.806402,0.730882,0.727941,0.770579,0.5


In [47]:
# Tune SVM

fname_text = 'experiments/fav_functional_status/no_mt_data/svm_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/fav_functional_status/no_mt_data/svm_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = SVC(probability = True), param_grid = svm_grid, X_train = X_train_nomt, 
            y_train = y_train)

finding best model parameters for fold number 1
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
evaluating model for fold number 1
{'classifier__C': 0.1, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 20.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
parameters: {'classifier__C': 0.1, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 20.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
AUC score: 0.8205882352941176
finding best model parameters for fold number 2
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
evaluating model for fold number 2
{'classifier__C': 0.1, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 10.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
parameters: {'classifier__C': 0.1, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 10.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
AUC score: 0.869

Unnamed: 0,params,score
0,"{'classifier__C': 0.1, 'classifier__gamma': 'a...",0.869118
1,"{'classifier__C': 0.1, 'classifier__gamma': 'a...",0.820588
2,"{'classifier__C': 1.0, 'classifier__gamma': 'a...",0.737805
3,"{'classifier__C': 0.1, 'classifier__gamma': 's...",0.698529
4,"{'classifier__C': 0.01, 'classifier__gamma': '...",0.205882


In [48]:
# Tune RF

fname_text = 'experiments/fav_functional_status/no_mt_data/rf_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/fav_functional_status/no_mt_data/rf_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = RandomForestClassifier(), param_grid = rf_grid, X_train = X_train_nomt, 
            y_train = y_train)

finding best model parameters for fold number 1
Fitting 5 folds for each of 5400 candidates, totalling 27000 fits
evaluating model for fold number 1
{'classifier__max_depth': 100.0, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 1000, 'preprocessor__cat__selector__percentile': 20.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
parameters: {'classifier__max_depth': 100.0, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 1000, 'preprocessor__cat__selector__percentile': 20.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
AUC score: 0.7014705882352941
finding best model parameters for fold number 2
Fitting 5 folds for each of 5400 candidates, totalling 27000 fits
evaluating model for fold number 2
{'classifier__max_depth': 100.0, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 500, 'preprocessor__cat__selector__per

Unnamed: 0,params,score
0,"{'classifier__max_depth': 100.0, 'classifier__...",0.797059
1,"{'classifier__max_depth': 80.0, 'classifier__m...",0.770588
2,"{'classifier__max_depth': 100.0, 'classifier__...",0.701471
3,"{'classifier__max_depth': 100.0, 'classifier__...",0.661585
4,"{'classifier__max_depth': 80.0, 'classifier__m...",0.658824


In [49]:
# Tune MLP

fname_text = 'experiments/fav_functional_status/no_mt_data/mlp_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/fav_functional_status/no_mt_data/mlp_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = MLPClassifier(max_iter = 10000), param_grid = mlp_grid, X_train = X_train_nomt, 
            y_train = y_train)

finding best model parameters for fold number 1
Fitting 5 folds for each of 2430 candidates, totalling 12150 fits
evaluating model for fold number 1
{'classifier__beta_1': 0.9, 'classifier__beta_2': 0.3, 'classifier__hidden_layer_sizes': (10, 10, 10), 'preprocessor__cat__selector__percentile': 20.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
parameters: {'classifier__beta_1': 0.9, 'classifier__beta_2': 0.3, 'classifier__hidden_layer_sizes': (10, 10, 10), 'preprocessor__cat__selector__percentile': 20.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
AUC score: 0.6367647058823529
finding best model parameters for fold number 2
Fitting 5 folds for each of 2430 candidates, totalling 12150 fits
evaluating model for fold number 2
{'classifier__beta_1': 0.3, 'classifier__beta_2': 0.6000000000000001, 'classifier__hidden_layer_sizes': (10,), 'preprocessor__cat__selector__percentile': 50.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
parameters: {'classifi

Unnamed: 0,params,score
0,"{'classifier__beta_1': 0.6000000000000001, 'cl...",0.829268
1,"{'classifier__beta_1': 0.6000000000000001, 'cl...",0.732353
2,"{'classifier__beta_1': 0.9, 'classifier__beta_...",0.679412
3,"{'classifier__beta_1': 0.3, 'classifier__beta_...",0.645588
4,"{'classifier__beta_1': 0.9, 'classifier__beta_...",0.636765


In [78]:
# retrain best model on entire training set and save model

retrain_save(X = X_train_nomt, 
             y = y_train, 
             percentile = 10, 
             threshold = 0.4, 
             final_model = RandomForestClassifier(max_depth = 100, min_samples_leaf = 2, min_samples_split = 5, 
                                                  n_estimators = 500), 
             fname = 'models/fav_functional_status/no_mt_data/final_rf_model.pkl')

Training complete for RandomForestClassifier(max_depth=100, min_samples_leaf=2, min_samples_split=5,
                       n_estimators=500)
Model saved


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   IterativeImputer(max_iter=10000,
                                                                                    random_state=0)),
                                                                  ('variance_threshold',
                                                                   VarianceThreshold(threshold=0.4)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['glucose', 'calcium', 'mag',
                                                   'phos', 'inr', 'plt',
                                                   'plt_lymph', 'sbp',
                                                 

In [90]:
# train LR model on entire training set and save model

train_save_lr(X = X_train_nomt, 
             y = y_train, 
             fname = 'models/fav_functional_status/no_mt_data/final_lr_model.pkl')

Training complete
Model saved


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   IterativeImputer(max_iter=10000,
                                                                                    random_state=42)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['glucose', 'calcium', 'mag',
                                                   'phos', 'inr', 'plt',
                                                   'plt_lymph', 'sbp',
                                                   'nih_admit', 'age', 'bmi',
                                                   'aspects', 'time_to_arr']),
                                                 ('cat',
                         

### 3. Mortality prediction - with MT data
Model training includes variables from mechanical thrombectomy

In [79]:
X_train_mt_mort = pd.read_pickle('transformed_datasets/mortality/mt_data/X_train_trans_mt.pkl')
y_train_mort = np.load('transformed_datasets/mortality/y_train_trans.npy')

In [8]:
# evaluate baseline LR model

pipe_lr_mt_mort, scores_lr_mt_mort = baseline_lr(X_train = X_train_mt_mort, y_train = y_train_mort, n_splits = 5, 
                                                 n_repeats = 10, scoring = 'roc_auc', 
                                                 fname = 'experiments/mortality/mt_data/baseline_lr.pkl')

Evaluating baseline logistic regression model with CV using 5 splits and 10 repeats
Mean AUC for baseline logistic regression model: 0.7086845584083435 +/- 0.07527547643934501


In [11]:
# screen models

fname = 'experiments/mortality/mt_data/init_screening_summary' + '_' + str(datetime.now().year) + \
    '_' + str(datetime.now().month) + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_df = 'experiments/mortality/mt_data/init_screening_results' + '_' + str(datetime.now().year) + \
    '_' + str(datetime.now().month) + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl' 

init_screen_mt = screen_models(models = models, score = 'roc_auc', X_train = X_train_mt_mort, 
                               y_train = y_train_mort, fname = fname, fname_df = fname_df)

Evaluating elastic net...
Evaluating decision tree...
Evaluating random forest...
Evaluating k neighbors...
Evaluating naive bayes...
Evaluating support vector machines...




Evaluating XG boost...
Evaluating Light GBM...
Evaluating MLP...


In [12]:
init_screen_mt.describe()

Unnamed: 0,support vector machines{'C': 1},support vector machines{'C': 0.01},support vector machines{'C': 0.1},"random forest{'criterion': 'gini', 'n_estimators': 500}","random forest{'criterion': 'entropy', 'n_estimators': 500}","MLP{'hidden_layer_sizes': (10,), 'max_iter': 10000}","MLP{'hidden_layer_sizes': (10, 10), 'max_iter': 10000}",naive bayes{},"MLP{'hidden_layer_sizes': (10, 10, 10), 'max_iter': 10000}",support vector machines{'C': 10},Light GBM{},XG boost{'use_label_encoder': False},k neighbors{'n_neighbors': 5},k neighbors{'n_neighbors': 3},decision tree{'criterion': 'gini'},decision tree{'criterion': 'entropy'},elastic net{'max_iter': 10000}
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.745657,0.73979,0.745521,0.721038,0.723308,0.696333,0.700411,0.689227,0.683274,0.675561,0.680922,0.673149,0.614134,0.601035,0.581071,0.56581,0.5
std,0.051199,0.067034,0.058614,0.051313,0.066613,0.067169,0.070199,0.062833,0.067031,0.064342,0.056569,0.059451,0.067241,0.0692,0.061452,0.057465,0.0
min,0.638107,0.545455,0.611688,0.620205,0.58961,0.498721,0.537084,0.557143,0.55243,0.516883,0.568831,0.528571,0.42987,0.437662,0.41688,0.433766,0.5
25%,0.714194,0.689935,0.706035,0.681006,0.681006,0.662084,0.653453,0.652929,0.631625,0.637788,0.638747,0.636039,0.568039,0.559857,0.543019,0.528453,0.5
50%,0.756459,0.748701,0.743606,0.736144,0.71899,0.703964,0.698456,0.692966,0.690703,0.687013,0.681169,0.675192,0.626279,0.60692,0.590521,0.567208,0.5
75%,0.779194,0.785166,0.784091,0.761509,0.775889,0.735614,0.765584,0.735774,0.739286,0.71629,0.729545,0.718958,0.657245,0.64087,0.627877,0.605408,0.5
max,0.847826,0.855499,0.897403,0.823377,0.883632,0.86445,0.819693,0.827273,0.805627,0.787724,0.808184,0.812987,0.768831,0.737212,0.700767,0.715473,0.5


In [51]:
# Tune SVM

fname_text = 'experiments/mortality/mt_data/svm_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/mortality/mt_data/svm_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = SVC(probability = True), param_grid = svm_grid, X_train = X_train_mt_mort, 
            y_train = y_train_mort)

finding best model parameters for fold number 1
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
evaluating model for fold number 1
{'classifier__C': 0.01, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 20.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
parameters: {'classifier__C': 0.01, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 20.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
AUC score: 0.4772727272727273
finding best model parameters for fold number 2
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
evaluating model for fold number 2
{'classifier__C': 0.1, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 10.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
parameters: {'classifier__C': 0.1, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 10.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
AUC score: 0.7

Unnamed: 0,params,score
0,"{'classifier__C': 0.1, 'classifier__gamma': 'a...",0.808184
1,"{'classifier__C': 0.1, 'classifier__gamma': 'a...",0.72987
2,"{'classifier__C': 0.1, 'classifier__gamma': 'a...",0.600384
3,"{'classifier__C': 0.01, 'classifier__gamma': '...",0.477273
4,"{'classifier__C': 0.01, 'classifier__gamma': '...",0.223785


In [52]:
# Tune RF

fname_text = 'experiments/mortality/mt_data/rf_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/mortality/mt_data/rf_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = RandomForestClassifier(), param_grid = rf_grid, X_train = X_train_mt_mort, 
            y_train = y_train_mort)

finding best model parameters for fold number 1
Fitting 5 folds for each of 5400 candidates, totalling 27000 fits
evaluating model for fold number 1
{'classifier__max_depth': 80.0, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 500, 'preprocessor__cat__selector__percentile': 30.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
parameters: {'classifier__max_depth': 80.0, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 500, 'preprocessor__cat__selector__percentile': 30.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
AUC score: 0.7025974025974026
finding best model parameters for fold number 2
Fitting 5 folds for each of 5400 candidates, totalling 27000 fits
evaluating model for fold number 2
{'classifier__max_depth': 100.0, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 1000, 'preprocessor__cat__selector__percen

Unnamed: 0,params,score
0,"{'classifier__max_depth': 80.0, 'classifier__m...",0.794118
1,"{'classifier__max_depth': 80.0, 'classifier__m...",0.727621
2,"{'classifier__max_depth': 60.0, 'classifier__m...",0.709719
3,"{'classifier__max_depth': 80.0, 'classifier__m...",0.702597
4,"{'classifier__max_depth': 100.0, 'classifier__...",0.685714


In [53]:
# Tune MLP

fname_text = 'experiments/mortality/mt_data/mlp_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/mortality/mt_data/mlp_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = MLPClassifier(max_iter = 10000), param_grid = mlp_grid, X_train = X_train_mt_mort, 
            y_train = y_train_mort)

finding best model parameters for fold number 1
Fitting 5 folds for each of 2430 candidates, totalling 12150 fits
evaluating model for fold number 1
{'classifier__beta_1': 0.3, 'classifier__beta_2': 0.3, 'classifier__hidden_layer_sizes': (10, 10), 'preprocessor__cat__selector__percentile': 50.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
parameters: {'classifier__beta_1': 0.3, 'classifier__beta_2': 0.3, 'classifier__hidden_layer_sizes': (10, 10), 'preprocessor__cat__selector__percentile': 50.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
AUC score: 0.6935064935064936
finding best model parameters for fold number 2
Fitting 5 folds for each of 2430 candidates, totalling 12150 fits
evaluating model for fold number 2
{'classifier__beta_1': 0.3, 'classifier__beta_2': 0.6000000000000001, 'classifier__hidden_layer_sizes': (30, 30), 'preprocessor__cat__selector__percentile': 70.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
parameters: {'classifier__b

Unnamed: 0,params,score
0,"{'classifier__beta_1': 0.3, 'classifier__beta_...",0.735065
1,"{'classifier__beta_1': 0.3, 'classifier__beta_...",0.693506
2,"{'classifier__beta_1': 0.9, 'classifier__beta_...",0.653453
3,"{'classifier__beta_1': 0.9, 'classifier__beta_...",0.61509
4,"{'classifier__beta_1': 0.6000000000000001, 'cl...",0.567775


In [80]:
# retrain best model on entire training set and save model

retrain_save(X = X_train_mt_mort, 
             y = y_train_mort, 
             percentile = 60, 
             threshold = 0.6, 
             final_model = RandomForestClassifier(max_depth = 80, min_samples_leaf = 1, min_samples_split = 10, 
                                                 n_estimators = 500), 
             fname = 'models/mortality/mt_data/final_rf_model.pkl')

Training complete for RandomForestClassifier(max_depth=80, min_samples_split=10, n_estimators=500)
Model saved


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   IterativeImputer(max_iter=10000,
                                                                                    random_state=0)),
                                                                  ('variance_threshold',
                                                                   VarianceThreshold(threshold=0.6)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['glucose', 'calcium', 'mag',
                                                   'phos', 'inr', 'plt',
                                                   'plt_lymph', 'sbp',
                                                 

In [91]:
# train LR model on entire training set and save model

train_save_lr(X = X_train_mt_mort, 
             y = y_train_mort, 
             fname = 'models/mortality/mt_data/final_lr_model.pkl')

Training complete
Model saved


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   IterativeImputer(max_iter=10000,
                                                                                    random_state=42)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['glucose', 'calcium', 'mag',
                                                   'phos', 'inr', 'plt',
                                                   'plt_lymph', 'sbp',
                                                   'nih_admit', 'age', 'bmi',
                                                   'aspects', 'heparin',
                                                   'num_pass', 'fluoro_time',
          

### 4. Mortality prediction - without MT data
Model training does not include variables from mechanical thrombectomy

In [54]:
X_train_nomt_mort = pd.read_pickle('transformed_datasets/mortality/no_mt_data/X_train_trans_nomt.pkl')
y_train_mort = np.load('transformed_datasets/mortality/y_train_trans.npy')

In [14]:
# evaluate baseline LR model

pipe_lr_nomt_mort, scores_lr_nomt_mort = baseline_lr(X_train = X_train_nomt_mort, y_train = y_train_mort, 
                                                     n_splits = 5, n_repeats = 10, scoring = 'roc_auc', 
                                                 fname = 'experiments/mortality/no_mt_data/baseline_lr.pkl')

Evaluating baseline logistic regression model with CV using 5 splits and 10 repeats
Mean AUC for baseline logistic regression model: 0.7172533962201483 +/- 0.06451499139163538


In [15]:
# screen models

fname = 'experiments/mortality/no_mt_data/init_screening_summary' + '_' + str(datetime.now().year) + \
    '_' + str(datetime.now().month) + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_df = 'experiments/mortality/no_mt_data/init_screening_results' + '_' + str(datetime.now().year) + \
    '_' + str(datetime.now().month) + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl' 

init_screen_nomt = screen_models(models = models, score = 'roc_auc', X_train = X_train_nomt_mort, 
                               y_train = y_train_mort, fname = fname, fname_df = fname_df)

Evaluating elastic net...
Evaluating decision tree...
Evaluating random forest...
Evaluating k neighbors...
Evaluating naive bayes...
Evaluating support vector machines...
Evaluating XG boost...
Evaluating Light GBM...
Evaluating MLP...


In [16]:
init_screen_nomt.describe()

Unnamed: 0,support vector machines{'C': 1},"random forest{'criterion': 'gini', 'n_estimators': 500}",support vector machines{'C': 0.1},"random forest{'criterion': 'entropy', 'n_estimators': 500}",support vector machines{'C': 0.01},naive bayes{},Light GBM{},"MLP{'hidden_layer_sizes': (10, 10, 10), 'max_iter': 10000}",support vector machines{'C': 10},"MLP{'hidden_layer_sizes': (10, 10), 'max_iter': 10000}","MLP{'hidden_layer_sizes': (10,), 'max_iter': 10000}",XG boost{'use_label_encoder': False},k neighbors{'n_neighbors': 5},k neighbors{'n_neighbors': 3},decision tree{'criterion': 'gini'},decision tree{'criterion': 'entropy'},elastic net{'max_iter': 10000}
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.741238,0.729185,0.735184,0.728952,0.734906,0.700698,0.678547,0.664485,0.670452,0.678682,0.668774,0.671861,0.664241,0.627415,0.586481,0.566081,0.5
std,0.049287,0.072826,0.067367,0.05516,0.06149,0.075322,0.061403,0.065246,0.06055,0.051789,0.068489,0.054594,0.06244,0.073552,0.054524,0.064239,0.0
min,0.632992,0.554545,0.592072,0.621483,0.58312,0.530691,0.514286,0.517903,0.515345,0.568831,0.521739,0.557143,0.521739,0.42987,0.474425,0.393506,0.5
25%,0.713875,0.67289,0.693506,0.696928,0.703792,0.648377,0.638747,0.635065,0.640903,0.64513,0.626598,0.643588,0.629822,0.58376,0.554668,0.518831,0.5
50%,0.742327,0.736793,0.735934,0.734015,0.727457,0.694255,0.682864,0.679114,0.677749,0.676471,0.67336,0.664319,0.662404,0.634049,0.585714,0.570332,0.5
75%,0.767006,0.783644,0.786125,0.763584,0.784329,0.741973,0.729962,0.705243,0.707801,0.719949,0.713555,0.711957,0.703921,0.669318,0.62987,0.618766,0.5
max,0.872727,0.875959,0.894805,0.85422,0.869565,0.907928,0.785166,0.786445,0.796104,0.773657,0.815584,0.776623,0.797954,0.783117,0.70844,0.672634,0.5


In [55]:
# Tune SVM

fname_text = 'experiments/mortality/no_mt_data/svm_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/mortality/no_mt_data/svm_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = SVC(probability = True), param_grid = svm_grid, X_train = X_train_nomt_mort, 
            y_train = y_train_mort)

finding best model parameters for fold number 1
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
evaluating model for fold number 1
{'classifier__C': 0.01, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 80.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
parameters: {'classifier__C': 0.01, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 80.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
AUC score: 0.27402597402597406
finding best model parameters for fold number 2
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
evaluating model for fold number 2
{'classifier__C': 1.0, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 90.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
parameters: {'classifier__C': 1.0, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 90.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
AUC score: 0.

Unnamed: 0,params,score
0,"{'classifier__C': 1.0, 'classifier__gamma': 'a...",0.690909
1,"{'classifier__C': 0.1, 'classifier__gamma': 'a...",0.677749
2,"{'classifier__C': 0.01, 'classifier__gamma': '...",0.274026
3,"{'classifier__C': 0.01, 'classifier__gamma': '...",0.232097
4,"{'classifier__C': 0.01, 'classifier__gamma': '...",0.231458


In [56]:
# Tune RF

fname_text = 'experiments/mortality/no_mt_data/rf_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/mortality/no_mt_data/rf_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = RandomForestClassifier(), param_grid = rf_grid, X_train = X_train_nomt_mort, 
            y_train = y_train_mort)

finding best model parameters for fold number 1
Fitting 5 folds for each of 5400 candidates, totalling 27000 fits
evaluating model for fold number 1
{'classifier__max_depth': 20.0, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 500, 'preprocessor__cat__selector__percentile': 40.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
parameters: {'classifier__max_depth': 20.0, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 500, 'preprocessor__cat__selector__percentile': 40.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
AUC score: 0.6090909090909091
finding best model parameters for fold number 2
Fitting 5 folds for each of 5400 candidates, totalling 27000 fits
evaluating model for fold number 2
{'classifier__max_depth': 60.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 500, 'preprocessor__cat__selector__percent

Unnamed: 0,params,score
0,"{'classifier__max_depth': 60.0, 'classifier__m...",0.798701
1,"{'classifier__max_depth': 40.0, 'classifier__m...",0.736573
2,"{'classifier__max_depth': 40.0, 'classifier__m...",0.735294
3,"{'classifier__max_depth': 20.0, 'classifier__m...",0.609091
4,"{'classifier__max_depth': 100.0, 'classifier__...",0.592072


In [57]:
# Tune NB

fname_text = 'experiments/mortality/no_mt_data/nb_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/mortality/no_mt_data/nb_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = GaussianNB(), param_grid = nb_grid, X_train = X_train_nomt_mort, 
            y_train = y_train_mort)

finding best model parameters for fold number 1
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
evaluating model for fold number 1
{'classifier__var_smoothing': 1e-09, 'preprocessor__cat__selector__percentile': 60.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
parameters: {'classifier__var_smoothing': 1e-09, 'preprocessor__cat__selector__percentile': 60.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
AUC score: 0.6993506493506494
finding best model parameters for fold number 2
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
evaluating model for fold number 2
{'classifier__var_smoothing': 1.0, 'preprocessor__cat__selector__percentile': 20.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
parameters: {'classifier__var_smoothing': 1.0, 'preprocessor__cat__selector__percentile': 20.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
AUC score: 0.7558441558441559
finding best model parameters for fold number 3
Fi

Unnamed: 0,params,score
0,"{'classifier__var_smoothing': 1.0, 'preprocess...",0.755844
1,"{'classifier__var_smoothing': 1.0, 'preprocess...",0.70844
2,"{'classifier__var_smoothing': 1.0, 'preprocess...",0.699488
3,"{'classifier__var_smoothing': 1e-09, 'preproce...",0.699351
4,"{'classifier__var_smoothing': 1e-08, 'preproce...",0.616368


In [81]:
# retrain best model on entire training set and save model

retrain_save(X = X_train_nomt_mort, 
             y = y_train_mort, 
             percentile = 20, 
             threshold = 0.6, 
             final_model = RandomForestClassifier(max_depth = 60, min_samples_leaf = 1, min_samples_split = 10, 
                                                 n_estimators = 500), 
             fname = 'models/mortality/no_mt_data/final_rf_model.pkl')

Training complete for RandomForestClassifier(max_depth=60, min_samples_split=10, n_estimators=500)
Model saved


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   IterativeImputer(max_iter=10000,
                                                                                    random_state=0)),
                                                                  ('variance_threshold',
                                                                   VarianceThreshold(threshold=0.6)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['glucose', 'calcium', 'mag',
                                                   'phos', 'inr', 'plt',
                                                   'plt_lymph', 'sbp',
                                                 

In [92]:
# train LR model on entire training set and save model

train_save_lr(X = X_train_nomt_mort, 
             y = y_train_mort, 
             fname = 'models/mortality/no_mt_data/final_lr_model.pkl')

Training complete
Model saved


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   IterativeImputer(max_iter=10000,
                                                                                    random_state=42)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['glucose', 'calcium', 'mag',
                                                   'phos', 'inr', 'plt',
                                                   'plt_lymph', 'sbp',
                                                   'nih_admit', 'age', 'bmi',
                                                   'aspects', 'time_to_arr']),
                                                 ('cat',
                         

### 5. Death or severe disability prediction - with MT data

In [83]:
X_train_mt_dsd = pd.read_pickle('transformed_datasets/dsd/mt_data/X_train_trans_mt.pkl')
y_train_dsd = np.load('transformed_datasets/dsd/y_train_trans.npy')

In [15]:
# evaluate baseline LR model

pipe_lr_mt_dsd, scores_lr_mt_dsd = baseline_lr(X_train = X_train_mt_dsd, y_train = y_train_dsd, n_splits = 5, 
                                                 n_repeats = 10, scoring = 'roc_auc', 
                                                 fname = 'experiments/dsd/mt_data/baseline_lr.pkl')

Evaluating baseline logistic regression model with CV using 5 splits and 10 repeats
Mean AUC for baseline logistic regression model: 0.7564733044733045 +/- 0.05511069163625624


In [17]:
# screen models

fname = 'experiments/dsd/mt_data/init_screening_summary' + '_' + str(datetime.now().year) + \
    '_' + str(datetime.now().month) + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_df = 'experiments/dsd/mt_data/init_screening_results' + '_' + str(datetime.now().year) + \
    '_' + str(datetime.now().month) + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl' 

init_screen_mt = screen_models(models = models, score = 'roc_auc', X_train = X_train_mt_dsd, 
                               y_train = y_train_dsd, fname = fname, fname_df = fname_df)

Evaluating elastic net...
Evaluating decision tree...
Evaluating random forest...




Evaluating k neighbors...
Evaluating naive bayes...
Evaluating support vector machines...




Evaluating XG boost...
Evaluating Light GBM...
Evaluating MLP...


In [18]:
init_screen_mt.describe()

Unnamed: 0,support vector machines{'C': 0.1},support vector machines{'C': 1},support vector machines{'C': 0.01},"random forest{'criterion': 'entropy', 'n_estimators': 500}",support vector machines{'C': 10},"random forest{'criterion': 'gini', 'n_estimators': 500}","MLP{'hidden_layer_sizes': (10, 10, 10), 'max_iter': 10000}","MLP{'hidden_layer_sizes': (10,), 'max_iter': 10000}","MLP{'hidden_layer_sizes': (10, 10), 'max_iter': 10000}",Light GBM{},XG boost{'use_label_encoder': False},k neighbors{'n_neighbors': 5},naive bayes{},k neighbors{'n_neighbors': 3},decision tree{'criterion': 'entropy'},decision tree{'criterion': 'gini'},elastic net{'max_iter': 10000}
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.767939,0.7774,0.777472,0.742439,0.750694,0.741964,0.712674,0.716121,0.706828,0.707649,0.698947,0.68575,0.676217,0.662056,0.616725,0.614753,0.5
std,0.072417,0.063064,0.06556,0.063575,0.054117,0.043757,0.062829,0.066173,0.071583,0.065011,0.062387,0.054579,0.060334,0.069102,0.066822,0.056164,0.0
min,0.532468,0.637566,0.590909,0.592208,0.657143,0.658442,0.557143,0.595238,0.529101,0.559524,0.592593,0.56746,0.537037,0.494709,0.430556,0.492063,0.5
25%,0.723876,0.742587,0.736995,0.703247,0.712963,0.712455,0.675758,0.657407,0.655339,0.665675,0.645455,0.645972,0.651741,0.610227,0.581349,0.577679,0.5
50%,0.784404,0.776527,0.774471,0.755249,0.748353,0.742725,0.729497,0.72215,0.71164,0.706349,0.688107,0.685786,0.680556,0.672078,0.61434,0.614286,0.5
75%,0.822751,0.823413,0.828193,0.785886,0.786358,0.772493,0.757738,0.756944,0.759921,0.743831,0.747078,0.724495,0.702213,0.706385,0.660958,0.646807,0.5
max,0.891534,0.915344,0.912698,0.869481,0.872727,0.827381,0.831169,0.864935,0.827273,0.887013,0.863757,0.786376,0.843915,0.877273,0.763636,0.742063,0.5


In [59]:
# Tune SVM

fname_text = 'experiments/dsd/mt_data/svm_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/dsd/mt_data/svm_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = SVC(probability = True), param_grid = svm_grid, X_train = X_train_mt_dsd, 
            y_train = y_train_dsd)

finding best model parameters for fold number 1
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
evaluating model for fold number 1
{'classifier__C': 1.0, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 70.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
parameters: {'classifier__C': 1.0, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 70.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
AUC score: 0.7922077922077921
finding best model parameters for fold number 2
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
evaluating model for fold number 2
{'classifier__C': 1.0, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 100.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
parameters: {'classifier__C': 1.0, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 100.0, 'preprocessor__cont__variance_threshold__threshold': 0.6}
AUC score: 0.7

Unnamed: 0,params,score
0,"{'classifier__C': 1.0, 'classifier__gamma': 'a...",0.812169
1,"{'classifier__C': 1.0, 'classifier__gamma': 'a...",0.792208
2,"{'classifier__C': 1.0, 'classifier__gamma': 'a...",0.785714
3,"{'classifier__C': 1.0, 'classifier__gamma': 'a...",0.750649
4,"{'classifier__C': 0.01, 'classifier__gamma': '...",0.208995


In [60]:
# Tune RF

fname_text = 'experiments/dsd/mt_data/rf_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/dsd/mt_data/rf_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = RandomForestClassifier(), param_grid = rf_grid, X_train = X_train_mt_dsd, 
            y_train = y_train_dsd)

finding best model parameters for fold number 1
Fitting 5 folds for each of 5400 candidates, totalling 27000 fits
evaluating model for fold number 1
{'classifier__max_depth': 80.0, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 500, 'preprocessor__cat__selector__percentile': 100.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
parameters: {'classifier__max_depth': 80.0, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 500, 'preprocessor__cat__selector__percentile': 100.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
AUC score: 0.6311688311688312
finding best model parameters for fold number 2
Fitting 5 folds for each of 5400 candidates, totalling 27000 fits
evaluating model for fold number 2
{'classifier__max_depth': 80.0, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 500, 'preprocessor__cat__selector__perce

Unnamed: 0,params,score
0,"{'classifier__max_depth': 100.0, 'classifier__...",0.851852
1,"{'classifier__max_depth': 60.0, 'classifier__m...",0.80291
2,"{'classifier__max_depth': 80.0, 'classifier__m...",0.792208
3,"{'classifier__max_depth': 20.0, 'classifier__m...",0.650794
4,"{'classifier__max_depth': 80.0, 'classifier__m...",0.631169


In [61]:
# Tune MLP

fname_text = 'experiments/dsd/mt_data/mlp_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/dsd/mt_data/mlp_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = MLPClassifier(max_iter = 10000), param_grid = mlp_grid, X_train = X_train_mt_dsd, 
            y_train = y_train_dsd)

finding best model parameters for fold number 1
Fitting 5 folds for each of 2430 candidates, totalling 12150 fits
evaluating model for fold number 1
{'classifier__beta_1': 0.3, 'classifier__beta_2': 0.6000000000000001, 'classifier__hidden_layer_sizes': (20, 20, 20), 'preprocessor__cat__selector__percentile': 70.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
parameters: {'classifier__beta_1': 0.3, 'classifier__beta_2': 0.6000000000000001, 'classifier__hidden_layer_sizes': (20, 20, 20), 'preprocessor__cat__selector__percentile': 70.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
AUC score: 0.6857142857142857
finding best model parameters for fold number 2
Fitting 5 folds for each of 2430 candidates, totalling 12150 fits
evaluating model for fold number 2
{'classifier__beta_1': 0.9, 'classifier__beta_2': 0.6000000000000001, 'classifier__hidden_layer_sizes': (10, 10), 'preprocessor__cat__selector__percentile': 40.0, 'preprocessor__cont__variance_threshold__thresh

Unnamed: 0,params,score
0,"{'classifier__beta_1': 0.6000000000000001, 'cl...",0.785714
1,"{'classifier__beta_1': 0.9, 'classifier__beta_...",0.775325
2,"{'classifier__beta_1': 0.6000000000000001, 'cl...",0.756614
3,"{'classifier__beta_1': 0.3, 'classifier__beta_...",0.685714
4,"{'classifier__beta_1': 0.6000000000000001, 'cl...",0.619709


In [84]:
# retrain best model on entire training set and save model

retrain_save(X = X_train_mt_dsd, 
             y = y_train_dsd, 
             percentile = 30, 
             threshold = 0.4, 
             final_model = RandomForestClassifier(max_depth = 100, min_samples_leaf = 4, min_samples_split = 2, 
                                                 n_estimators = 2000), 
             fname = 'models/dsd/mt_data/final_rf_model.pkl')

Training complete for RandomForestClassifier(max_depth=100, min_samples_leaf=4, n_estimators=2000)
Model saved


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   IterativeImputer(max_iter=10000,
                                                                                    random_state=0)),
                                                                  ('variance_threshold',
                                                                   VarianceThreshold(threshold=0.4)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['glucose', 'calcium', 'mag',
                                                   'phos', 'inr', 'plt',
                                                   'plt_lymph', 'sbp',
                                                 

In [93]:
# train LR model on entire training set and save model

train_save_lr(X = X_train_mt_dsd, 
             y = y_train_dsd, 
             fname = 'models/dsd/mt_data/final_lr_model.pkl')

Training complete
Model saved


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   IterativeImputer(max_iter=10000,
                                                                                    random_state=42)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['glucose', 'calcium', 'mag',
                                                   'phos', 'inr', 'plt',
                                                   'plt_lymph', 'sbp',
                                                   'nih_admit', 'age', 'bmi',
                                                   'aspects', 'heparin',
                                                   'num_pass', 'fluoro_time',
          

### 6. Death or severe disability prediction - without MT data

In [85]:
X_train_nomt_dsd = pd.read_pickle('transformed_datasets/dsd/no_mt_data/X_train_trans_nomt.pkl')
y_train_dsd = np.load('transformed_datasets/dsd/y_train_trans.npy')

In [12]:
# evaluate baseline LR model

pipe_lr_nomt_dsd, scores_lr_nomt_dsd = baseline_lr(X_train = X_train_nomt_dsd, y_train = y_train_dsd, 
                                                     n_splits = 5, n_repeats = 10, scoring = 'roc_auc', 
                                                 fname = 'experiments/dsd/no_mt_data/baseline_lr.pkl')

Evaluating baseline logistic regression model with CV using 5 splits and 10 repeats
Mean AUC for baseline logistic regression model: 0.710118807118807 +/- 0.07409231233684198


In [19]:
# screen models

fname = 'experiments/dsd/no_mt_data/init_screening_summary' + '_' + str(datetime.now().year) + \
    '_' + str(datetime.now().month) + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_df = 'experiments/dsd/no_mt_data/init_screening_results' + '_' + str(datetime.now().year) + \
    '_' + str(datetime.now().month) + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl' 

init_screen_nomt = screen_models(models = models, score = 'roc_auc', X_train = X_train_nomt_dsd, 
                               y_train = y_train_dsd, fname = fname, fname_df = fname_df)

Evaluating elastic net...
Evaluating decision tree...
Evaluating random forest...
Evaluating k neighbors...
Evaluating naive bayes...
Evaluating support vector machines...
Evaluating XG boost...
Evaluating Light GBM...
Evaluating MLP...


In [20]:
init_screen_nomt.describe()

Unnamed: 0,support vector machines{'C': 0.1},"random forest{'criterion': 'gini', 'n_estimators': 500}","random forest{'criterion': 'entropy', 'n_estimators': 500}",support vector machines{'C': 0.01},support vector machines{'C': 1},"MLP{'hidden_layer_sizes': (10,), 'max_iter': 10000}",XG boost{'use_label_encoder': False},Light GBM{},naive bayes{},k neighbors{'n_neighbors': 5},"MLP{'hidden_layer_sizes': (10, 10, 10), 'max_iter': 10000}",support vector machines{'C': 10},k neighbors{'n_neighbors': 3},"MLP{'hidden_layer_sizes': (10, 10), 'max_iter': 10000}",decision tree{'criterion': 'entropy'},decision tree{'criterion': 'gini'},elastic net{'max_iter': 10000}
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.703814,0.689284,0.703496,0.706095,0.695691,0.646703,0.642336,0.64251,0.649688,0.620783,0.629017,0.612499,0.621181,0.618557,0.566118,0.567456,0.5
std,0.069848,0.051269,0.05883,0.064084,0.057399,0.06122,0.067535,0.055169,0.077715,0.076589,0.064498,0.063417,0.058193,0.067454,0.06627,0.056477,0.0
min,0.51455,0.52381,0.583995,0.588624,0.545455,0.480519,0.470899,0.484127,0.476623,0.375325,0.482804,0.443122,0.511905,0.480159,0.393506,0.450649,0.5
25%,0.660053,0.653274,0.651455,0.661454,0.656006,0.604004,0.596771,0.607804,0.613757,0.584656,0.595238,0.580026,0.57705,0.571429,0.524675,0.53373,0.5
50%,0.706373,0.700397,0.699916,0.696104,0.692857,0.655844,0.651455,0.650794,0.650673,0.630291,0.623846,0.621032,0.617641,0.616655,0.567172,0.565873,0.5
75%,0.759055,0.724351,0.748181,0.75,0.736111,0.688173,0.684854,0.684193,0.714084,0.677579,0.667659,0.656818,0.652477,0.674026,0.603671,0.605159,0.5
max,0.820106,0.78961,0.822727,0.851948,0.816138,0.777778,0.784392,0.738961,0.787013,0.77013,0.75974,0.755291,0.746032,0.762338,0.708333,0.684524,0.5


In [63]:
# Tune SVM

fname_text = 'experiments/dsd/no_mt_data/svm_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/dsd/no_mt_data/svm_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = SVC(probability = True), param_grid = svm_grid, X_train = X_train_nomt_dsd, 
            y_train = y_train_dsd)

finding best model parameters for fold number 1
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
evaluating model for fold number 1
{'classifier__C': 0.1, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 10.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
parameters: {'classifier__C': 0.1, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 10.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
AUC score: 0.6753246753246753
finding best model parameters for fold number 2
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
evaluating model for fold number 2
{'classifier__C': 0.01, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 30.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
parameters: {'classifier__C': 0.01, 'classifier__gamma': 'auto', 'preprocessor__cat__selector__percentile': 30.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
AUC score: 0.3

Unnamed: 0,params,score
0,"{'classifier__C': 0.1, 'classifier__gamma': 'a...",0.846561
1,"{'classifier__C': 1.0, 'classifier__gamma': 'a...",0.710317
2,"{'classifier__C': 0.1, 'classifier__gamma': 's...",0.685847
3,"{'classifier__C': 0.1, 'classifier__gamma': 'a...",0.675325
4,"{'classifier__C': 0.01, 'classifier__gamma': '...",0.318182


In [64]:
# Tune RF

fname_text = 'experiments/dsd/no_mt_data/rf_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/dsd/no_mt_data/rf_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = RandomForestClassifier(), param_grid = rf_grid, X_train = X_train_nomt_dsd, 
            y_train = y_train_dsd)

finding best model parameters for fold number 1
Fitting 5 folds for each of 5400 candidates, totalling 27000 fits
evaluating model for fold number 1
{'classifier__max_depth': 40.0, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 500, 'preprocessor__cat__selector__percentile': 40.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
parameters: {'classifier__max_depth': 40.0, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 500, 'preprocessor__cat__selector__percentile': 40.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
AUC score: 0.7818181818181817
finding best model parameters for fold number 2
Fitting 5 folds for each of 5400 candidates, totalling 27000 fits
evaluating model for fold number 2
{'classifier__max_depth': 80.0, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 1500, 'preprocessor__cat__selector__percen

Unnamed: 0,params,score
0,"{'classifier__max_depth': 20.0, 'classifier__m...",0.828042
1,"{'classifier__max_depth': 80.0, 'classifier__m...",0.785714
2,"{'classifier__max_depth': 40.0, 'classifier__m...",0.781818
3,"{'classifier__max_depth': 80.0, 'classifier__m...",0.62963
4,"{'classifier__max_depth': 100.0, 'classifier__...",0.574074


In [65]:
# Tune MLP

fname_text = 'experiments/dsd/no_mt_data/mlp_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.txt'

fname_results = 'experiments/dsd/no_mt_data/mlp_tuning' + '_' + str(datetime.now().year) + '_' + str(datetime.now().month) \
    + '_' + str(datetime.now().day) + '_' + str(datetime.now().hour) + '_' + \
    str(datetime.now().minute) + '.pkl'

eval_params(fname_text = fname_text, fname_results = fname_results, 
            tuning_model = MLPClassifier(max_iter = 10000), param_grid = mlp_grid, X_train = X_train_nomt_dsd, 
            y_train = y_train_dsd)

finding best model parameters for fold number 1
Fitting 5 folds for each of 2430 candidates, totalling 12150 fits
evaluating model for fold number 1
{'classifier__beta_1': 0.9, 'classifier__beta_2': 0.3, 'classifier__hidden_layer_sizes': (20,), 'preprocessor__cat__selector__percentile': 50.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
parameters: {'classifier__beta_1': 0.9, 'classifier__beta_2': 0.3, 'classifier__hidden_layer_sizes': (20,), 'preprocessor__cat__selector__percentile': 50.0, 'preprocessor__cont__variance_threshold__threshold': 0.4}
AUC score: 0.6597402597402597
finding best model parameters for fold number 2
Fitting 5 folds for each of 2430 candidates, totalling 12150 fits
evaluating model for fold number 2
{'classifier__beta_1': 0.3, 'classifier__beta_2': 0.9, 'classifier__hidden_layer_sizes': (10, 10, 10), 'preprocessor__cat__selector__percentile': 80.0, 'preprocessor__cont__variance_threshold__threshold': 0.5}
parameters: {'classifier__beta_1': 0.3, 'cla

Unnamed: 0,params,score
0,"{'classifier__beta_1': 0.3, 'classifier__beta_...",0.672727
1,"{'classifier__beta_1': 0.9, 'classifier__beta_...",0.65974
2,"{'classifier__beta_1': 0.6000000000000001, 'cl...",0.657407
3,"{'classifier__beta_1': 0.3, 'classifier__beta_...",0.636243
4,"{'classifier__beta_1': 0.6000000000000001, 'cl...",0.571429


In [86]:
# retrain best model on entire training set and save model

retrain_save(X = X_train_nomt_dsd, 
             y = y_train_dsd, 
             percentile = 80, 
             threshold = 0.4, 
             final_model = RandomForestClassifier(max_depth = 20, min_samples_leaf = 4, min_samples_split = 5, 
                                                 n_estimators = 1500), 
             fname = 'models/dsd/no_mt_data/final_rf_model.pkl')

Training complete for RandomForestClassifier(max_depth=20, min_samples_leaf=4, min_samples_split=5,
                       n_estimators=1500)
Model saved


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   IterativeImputer(max_iter=10000,
                                                                                    random_state=0)),
                                                                  ('variance_threshold',
                                                                   VarianceThreshold(threshold=0.4)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['glucose', 'calcium', 'mag',
                                                   'phos', 'inr', 'plt',
                                                   'plt_lymph', 'sbp',
                                                 

In [94]:
# train LR model on entire training set and save model

train_save_lr(X = X_train_nomt_dsd, 
             y = y_train_dsd, 
             fname = 'models/dsd/no_mt_data/final_lr_model.pkl')

Training complete
Model saved


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   IterativeImputer(max_iter=10000,
                                                                                    random_state=42)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['glucose', 'calcium', 'mag',
                                                   'phos', 'inr', 'plt',
                                                   'plt_lymph', 'sbp',
                                                   'nih_admit', 'age', 'bmi',
                                                   'aspects', 'time_to_arr']),
                                                 ('cat',
                         