# MMI prediction

## 1. Load data
- Remove duplicate patients (i.e., repeat thrombectomies), keeping only the first record for each MRN
- Remove patients with a missing outcome label (`malig_infarct`)

In [1]:
import os

import pandas as pd

In [5]:
# Load each CSV file in `data/` as a pandas DataFrame, drop duplicate MRNs
# (keeping the first row seen for each MRN), and collect the frames in a list.
#
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order, and the merge step further down uses the first frame in
# `dfs` as its left-hand base; without sorting, the merged result could change
# from one machine to another.
# Entries that are not .csv files (e.g. .DS_Store or .ipynb_checkpoints) are
# skipped so that pd.read_csv is never called on them.

dfs = []

for file in sorted(os.listdir('data')):
    if not file.lower().endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join('data', file))
    print('There are {} patients before removing duplicates'.format(df.shape[0]))
    df = df.drop_duplicates(subset = 'mrn', keep = 'first')
    print('There are {} patients after removing duplicates'.format(df.shape[0]))
    dfs.append(df)

There are 413 patients before removing duplicates
There are 411 patients after removing duplicates
There are 413 patients before removing duplicates
There are 411 patients after removing duplicates
There are 413 patients before removing duplicates
There are 411 patients after removing duplicates
There are 413 patients before removing duplicates
There are 411 patients after removing duplicates
There are 413 patients before removing duplicates
There are 411 patients after removing duplicates
There are 413 patients before removing duplicates
There are 411 patients after removing duplicates


In [6]:
# Combine the per-file frames into a single table keyed on 'mrn'. The first
# frame in `dfs` is the base; every later frame is left-merged onto it, so only
# MRNs present in the first frame are retained.

for position, frame in enumerate(dfs):
    dat = frame if position == 0 else dat.merge(frame, how = 'left', on = 'mrn')

In [7]:
# Discard the two unnamed placeholder columns picked up from the CSV files.
# NOTE(review): assumed to be entirely empty, per the original comment — confirm.

dat = dat.drop(columns = ['Unnamed: 9', 'Unnamed: 21'])

In [8]:
# Keep only the rows that have a value in the outcome column 'malig_infarct',
# then report how many rows remain.

dat = dat.dropna(subset = ['malig_infarct'])
print('There are {} patients after removing those with missing outcomes'.format(len(dat)))

There are 397 patients after removing those with missing outcomes


In [9]:
# Sanity check: displays False when no remaining row lacks an outcome label.
# bool() keeps the displayed value a plain Python bool.

bool(dat['malig_infarct'].isna().any())

False

In [11]:
# Persist the cleaned table to disk so it can be reloaded later without
# repeating the steps above.

output_file = 'dat_42422.pkl'
dat.to_pickle(output_file)