# mRS prediction

## 1. Load data

- Remove duplicates
- Remove cases with missing labels
- Remove non-LVOs

## 2. Wrangling

- Remove MRNs
- Create time variables
- Create alternate outcomes: favorable functional status (mRS <= 2), death/severe disability (mRS >= 4), and mortality (mRS = 6)
- Create train/test sets and save as pkl

In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
# show up to 999 columns / rows when a frame is displayed
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 999)

In [3]:
# load each csv file as a pd dataframe, drop duplicate mrn's, and add them to a list
#
# - file names are sorted: os.listdir returns entries in arbitrary order, and the
#   first frame in `dfs` becomes the left-hand side of the merge that follows
# - entries that are not csv files (hidden/system files, checkpoints) are skipped
#   so that pd.read_csv is only ever handed an export file

dfs = []

for file in sorted(os.listdir('data_export_3523')):
    if not file.lower().endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join('data_export_3523', file))
    print('There are {} patients before removing duplicates'.format(df.shape[0]))
    # keep the first row seen for each mrn
    df = df.drop_duplicates(subset = 'mrn', keep = 'first')
    print('There are {} patients after removing duplicates'.format(df.shape[0]))
    dfs.append(df)

There are 410 patients before removing duplicates
There are 408 patients after removing duplicates
There are 410 patients before removing duplicates
There are 408 patients after removing duplicates
There are 410 patients before removing duplicates
There are 408 patients after removing duplicates
There are 410 patients before removing duplicates
There are 408 patients after removing duplicates
There are 410 patients before removing duplicates
There are 408 patients after removing duplicates
There are 410 patients before removing duplicates
There are 408 patients after removing duplicates


In [4]:
# left-join every frame in the list onto the first one, matching rows on mrn

for position, frame in enumerate(dfs):
    dat = frame if position == 0 else dat.merge(frame, how = 'left', on = 'mrn')

In [5]:
# drop the two 'Unnamed' columns

unnamed_columns = ['Unnamed: 21', 'Unnamed: 9']
dat = dat.drop(columns = unnamed_columns)

In [6]:
# keep only the rows whose mrs_90 label is present

has_label = dat['mrs_90'].notna()
dat = dat[has_label]
print('there are {} cases after removing patients with missing labels'.format(len(dat)))

there are 373 cases after removing patients with missing labels


In [7]:
# sanity check: evaluates to False when every remaining row has an mrs_90 value

any(dat['mrs_90'].isna())

False

In [8]:
# keep only LVO cases, i.e. rows whose occ_site is one of the listed sites

sites_of_occlusion = ['M1', 'M2', 'ICA']

is_lvo = dat['occ_site'].isin(sites_of_occlusion)
dat = dat[is_lvo]
print('there are {} cases after removing non-LVOs'.format(len(dat)))

there are 357 cases after removing non-LVOs


In [9]:
# the mrn identifier is no longer needed once the frames are merged; drop it

dat = dat.drop(columns = 'mrn')

In [10]:
# parse the timestamp columns as UTC datetimes; values that cannot be parsed become NaT

time_columns = ['lkw', 'arr', 'skin_puncture', 'first_pass_time', 'reperf']
dat[time_columns] = dat[time_columns].apply(pd.to_datetime, errors = 'coerce', utc = True)

In [11]:
# derive elapsed-time features (in hours) from pairs of timestamp columns

SECONDS_PER_HOUR = 3600

# new column -> (start timestamp column, end timestamp column)
interval_endpoints = {
    'time_to_arr': ('lkw', 'arr'),
    'time_to_puncture': ('arr', 'skin_puncture'),
    'time_to_first_pass': ('skin_puncture', 'first_pass_time'),
    'time_to_reperf': ('lkw', 'reperf'),
}

for new_col, (start_col, end_col) in interval_endpoints.items():
    dat[new_col] = (dat[end_col] - dat[start_col]).dt.total_seconds() / SECONDS_PER_HOUR

In [12]:
# the raw timestamps have been replaced by the interval features; drop them

raw_time_columns = ['lkw', 'arr', 'skin_puncture', 'first_pass_time', 'reperf']
dat = dat.drop(columns = raw_time_columns)

In [13]:
# a missing heparin value is recorded as 0

dat = dat.assign(heparin = dat['heparin'].fillna(0))

In [14]:
# add one 'Y'/'N' column per stent retriever, set to 'Y' when its name occurs in
# stent_ret_type (missing values count as 'N'), then drop the free-text column

stent_retriever_types = ['Trevo', 'Solitaire', 'Embotrap', 'Capture']

for retriever in stent_retriever_types:
    mentioned = dat['stent_ret_type'].str.contains(retriever, na = False)
    dat[retriever.lower()] = np.where(mentioned, 'Y', 'N')

dat = dat.drop(columns = 'stent_ret_type')

In [15]:
# these categorical variables are not parsed as such on load; cast them to object dtype

categorical_vars = ['stroke_etiol', 'mrs_90', 'ich_type', 'pre_mrs', 'coll_score', 'hyperdense']
dat = dat.astype({var: object for var in categorical_vars})

In [16]:
# mortality outcome: 1 when mrs_90 equals 6, otherwise 0

died = dat['mrs_90'].eq(6)
dat['mortality'] = np.where(died, 1, 0)

In [17]:
# favorable functional status outcome: 1 when mrs_90 is at most 2, otherwise 0

favorable = dat['mrs_90'].le(2)
dat['fav_mrs'] = np.where(favorable, 1, 0)

In [18]:
# death/severe disability outcome: 1 when mrs_90 is at least 4, otherwise 0

dead_or_severely_disabled = dat['mrs_90'].ge(4)
dat['dsd'] = np.where(dead_or_severely_disabled, 1, 0)

In [20]:
# create separate df with outcomes for analysis

outcomes = ['ich_type', 'ich_symptomatic', 'malig_infarct', 'mrs_90', 'fav_mrs', 'mortality', 'dsd']

dat_outcomes = dat[outcomes]

# to_pickle does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there)
os.makedirs('analysis', exist_ok = True)
dat_outcomes.to_pickle(os.path.join('analysis', 'dat_outcomes.pkl'))

In [21]:
# persist the wrangled frame so it can be reloaded later

pd.to_pickle(dat, 'dat_3523.pkl')

In [21]:
# function for splitting and saving data

def split_save_data(outcomes, test_size = 0.2):
    """Split the notebook-level ``dat`` frame into train/test sets and pickle them.

    For every ``(col_name, outcome)`` pair the same feature matrix (``dat`` without
    any outcome column) is split together with ``dat[col_name]`` as the target, and
    the four resulting objects are written to ``splits/<outcome>/`` as
    ``X_train_<outcome>.pkl``, ``X_test_<outcome>.pkl``, ``y_train_<outcome>.pkl``
    and ``y_test_<outcome>.pkl``.

    Parameters
    ----------
    outcomes : iterable of (str, str)
        ``col_name`` is the column of ``dat`` used as the target; ``outcome`` is the
        label used for the output directory, the file names and the progress message.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.

    Returns
    -------
    None
        The function only prints progress and writes files.
    """

    # all outcome columns are removed from the features, whichever one is the target
    outcome_columns = ['ich_type', 'ich_symptomatic', 'malig_infarct', 'mrs_90', 'fav_mrs', 'mortality', 'dsd']

    # the feature matrix does not depend on the outcome, so build it once
    X = dat.drop(outcome_columns, axis = 1)

    for col_name, outcome in outcomes:

        y = dat[col_name]

        # random_state is fixed so that re-running the notebook reproduces the split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 42)

        print('split data for outcome: {}'.format(outcome))
        print('there are {} training samples and {} testing samples\n\n'.format(X_train.shape[0], X_test.shape[0]))

        # to_pickle does not create missing directories; create the per-outcome
        # folder up front so a fresh checkout does not fail on the first write
        out_dir = os.path.join('splits', outcome)
        os.makedirs(out_dir, exist_ok = True)

        X_train.to_pickle(os.path.join(out_dir, 'X_train_' + outcome + '.pkl'))
        X_test.to_pickle(os.path.join(out_dir, 'X_test_' + outcome + '.pkl'))
        y_train.to_pickle(os.path.join(out_dir, 'y_train_' + outcome + '.pkl'))
        y_test.to_pickle(os.path.join(out_dir, 'y_test_' + outcome + '.pkl'))

In [25]:
# pair each target column with the label used for its output folder and files,
# then write the train/test splits for all of them

outcomes = [
    ('mrs_90', 'functional_status'),
    ('fav_mrs', 'fav_functional_status'),
    ('mortality', 'mortality'),
    ('dsd', 'dsd'),
]

split_save_data(outcomes, test_size = 0.2)

split data for outcome: functional_status
there are 285 training samples and 72 testing samples


split data for outcome: fav_functional_status
there are 285 training samples and 72 testing samples


split data for outcome: mortality
there are 285 training samples and 72 testing samples


