# MMI prediction

## 2. Data wrangling
- Remove MRNs (medical record numbers)
- Remove non-LVO (large vessel occlusion) cases
- Create time-interval variables
- Create train/test sets and save them as pickle (.pkl) files

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
# Show up to 500 columns when a DataFrame is displayed, so wide frames are not truncated.
pd.options.display.max_columns = 500

In [3]:
# Load the pickled dataset from the working directory and report its row count.
dat = pd.read_pickle('dat_42422.pkl')
print('There are {} total samples'.format(len(dat)))

There are 397 total samples


In [4]:
# Drop the 'mrn' identifier column so it is not carried into the later steps.
dat = dat.drop(columns=['mrn'])

In [5]:
# Tally occlusion sites before filtering. dropna=False also counts missing
# (NaN) entries, which value_counts() leaves out by default; such rows would
# not match the exclusion list applied afterwards, so they should be visible here.
dat['occ_site'].value_counts(dropna=False)

M1      241
M2      117
ICA      23
None     13
A2        1
M3        1
ACA       1
Name: occ_site, dtype: int64

In [6]:
# Remove non-LVO cases: keep only rows whose occlusion site is not one of the
# excluded locations. 'None' is matched as a literal string. Rows with a
# missing (NaN) occ_site do not match the exclusion list and are retained.

locations_to_exclude = ['ACA', 'A2', 'M3', 'None']

# `~mask` is the idiomatic element-wise negation (replaces `mask == False`).
is_excluded = dat['occ_site'].isin(locations_to_exclude)
dat = dat[~is_excluded]
print('There are {} total samples that fit inclusion criteria'.format(dat.shape[0]))

There are 381 total samples that fit inclusion criteria


In [7]:
# Keep a copy of the outcome columns in their own frame and pickle it for the
# analysis step. This snapshot is taken after the non-LVO filter and before
# any of the feature transformations below.

outcomes = ['mrs_90', 'ich_type', 'malig_infarct']

dat_outcomes = dat.loc[:, outcomes]
dat_outcomes.to_pickle(
    '/Users/haydnhoffman/ml/stroke/malig_infarct/analysis/dat_outcomes.pkl'
)

In [7]:
# Parse the raw timestamp columns into timezone-aware (UTC) datetimes.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.

time_columns = ['lkw', 'arr', 'skin_puncture', 'first_pass_time', 'reperf']
for time_col in time_columns:
    dat[time_col] = pd.to_datetime(dat[time_col], errors='coerce', utc=True)

In [8]:
# Derive elapsed-time features, expressed in hours, from pairs of timestamp
# columns. Each entry is (new column, start timestamp, end timestamp).
# Note that time_to_reperf is measured from 'lkw', not from the preceding step.

SECONDS_PER_HOUR = 3600
interval_definitions = [
    ('time_to_arr', 'lkw', 'arr'),
    ('time_to_puncture', 'arr', 'skin_puncture'),
    ('time_to_first_pass', 'skin_puncture', 'first_pass_time'),
    ('time_to_reperf', 'lkw', 'reperf'),
]

for interval_name, start_col, end_col in interval_definitions:
    elapsed = dat[end_col] - dat[start_col]
    dat[interval_name] = elapsed.dt.total_seconds() / SECONDS_PER_HOUR

In [9]:
# The raw timestamps are dropped here, after the interval features have been
# derived from them.

dat = dat.drop(
    columns=['arr', 'lkw', 'skin_puncture', 'first_pass_time', 'reperf'],
)

In [10]:
# A missing heparin value is treated as "no heparin used" and recorded as 0.

dat = dat.assign(heparin=dat['heparin'].fillna(0))

In [11]:
# Expand the free-text stent retriever field into one 'Y'/'N' indicator column
# per device (column name = lower-cased device name). Rows with a missing
# stent_ret_type are flagged 'N' for every device (na=False).

stent_retriever_types = ['Trevo', 'Solitaire', 'Embotrap', 'Capture']

stent_text = dat['stent_ret_type']
for stent in stent_retriever_types:
    mentions_device = stent_text.str.contains(stent, na=False)
    dat[stent.lower()] = np.where(mentions_device, 'Y', 'N')

# the source column is dropped once the indicator columns exist
dat = dat.drop(columns=['stent_ret_type'])

In [12]:
# These variables are categorical but are not parsed as such on load, so they
# are cast to object dtype explicitly.

categorical_vars = ['stroke_etiol', 'mrs_90', 'ich_type', 'pre_mrs', 'coll_score', 'hyperdense']
dat = dat.astype(dict.fromkeys(categorical_vars, object))

In [13]:
# Build the feature matrix X and the label vector y. All three outcome columns
# are removed from X; only 'malig_infarct' is used as the prediction target.

outcomes = ['mrs_90', 'ich_type', 'malig_infarct']

X = dat.drop(columns=outcomes)
y = dat['malig_infarct']

In [14]:
# Hold out 20% of the samples as a test set. random_state is fixed so the
# split is reproducible. No stratify argument is passed.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print('There are {} training samples and {} testing samples'.format(len(X_train), len(X_test)))

There are 304 training samples and 77 testing samples


In [15]:
# Persist the four splits as pickle files in the working directory for later use.

splits_to_save = [
    (X_train, 'X_train.pkl'),
    (X_test, 'X_test.pkl'),
    (y_train, 'y_train.pkl'),
    (y_test, 'y_test.pkl'),
]
for split, filename in splits_to_save:
    split.to_pickle(filename)