In [1]:
import pandas as pd
import pickle
import numpy as np
from datetime import datetime, timedelta
from EM import EM

### TO DO
* Don't drop all the treatments before the first measurements. Depending on how far we want to look back (another thing to specify), keep the relevant ones

In [2]:
cdm_t = pd.read_pickle('../Data/cdm_t.pkl')
cdm_s = pd.read_pickle('../Data/cdm_s.pkl')

### Data Query and Preprocessing Parameters

In [3]:
signal_name = 'inr'

treatment_names = {}
treatment_names['nsaid'] = ['acetaminophen_dose','celecoxib_dose','diclofenac_dose','ibuprofen_dose','indomethacin_dose',
'ketorolac_dose','meloxicam_dose','naproxen_dose']
treatment_names['anticoagulant'] = ['warfarin_dose','heparin_dose','dabigatran_dose','edoxaban_dose','rivaroxaban_dose',
'apixaban_dose','enoxaparin_dose','dalteparin_dose','fondaparinux_dose']
treatment_names['transfusion_plasma'] = ['transfuse_plasma']
treatment_names['transfusion_platelets'] = ['transfuse_platelets']
treatment_names['aspirin'] = ['aspirin_dose']

# chronic dict is keyed on the keywords of the chronic conditions we care about
# the keywords are specified in chronic_keywords
# the value is a list of all the features in cdm_s that contain that keyword
chronic_keywords = ['liver_disease', 'sickle_cell']
chronic_names = {}

demographic_names = ['age']

# the least number of signal observation a patient needs to have to be included
cutoff = 5

# bin_size is a offset alias used by the resample method
bin_size = '18H'
bin_size_num = 18

# the maximum percent of missingness allowed in observations for each individual
max_missing_pct = .4

# number of past time points where treatment effects are considered
num_past_effects = 3

### Model Training Parameters

In [4]:
# EM Setting
training_pct = .8
single_effect = False

### Preprocessing

In [5]:
# fill in chronic name dict
all_chronic = cdm_s.loc[:, 'fid'].unique()
for name in chronic_keywords:
    chronic_names[name] = [s for s in all_chronic if name in s]

In [6]:
# put all the treatment names into a list to get the corresponding columns
treatment_list = []
for name in treatment_names.values():
    treatment_list.append(name)
# flatten the list
treatment_list = [item for sublist in treatment_list for item in sublist]

In [7]:
# df_t is part of the original dataframe that has all the ids who have measurements for the signal we are interested 
# in
signal = cdm_t.loc[cdm_t.loc[:, 'fid'] == signal_name, 'value']
ids = np.unique(cdm_t.loc[signal.index, 'enc_id'])
df_t = cdm_t.loc[cdm_t.loc[:, 'enc_id'].isin(ids), :]

In [8]:
# df_t is now part of the dataframe that contains only the rows with fid being either the signal or the treatments
df_t = df_t.loc[df_t.loc[:, 'fid'].isin(treatment_list + [signal_name]), :]

In [9]:
# convert tsp field to python datetime object
df_t.loc[:, 'tsp'] = df_t.loc[:, 'tsp'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S+%f'))
#df_t.loc[:, 'tsp'] = df_t.groupby('enc_id')['tsp'].apply(lambda x: x - x.iloc[0])

In [10]:
# for each id, adjust tsp so that time is zero for the first time the signal is measured
adjusted_time = df_t.groupby('enc_id').apply(lambda x: x.loc[:, 'tsp'] - x.loc[x.loc[:, 'fid'] == signal_name, 'tsp'].iloc[0])
# adjusted_time is multiindexed, need to drop one level before assigning it to column tsp
adjusted_time.index = adjusted_time.index.droplevel()
df_t.loc[:, 'tsp_adjusted'] = adjusted_time

In [59]:
# cut the dataframe based on a cutoff number on the number of signal measurement a patient has
keep_index = df_t.groupby('enc_id')['fid'].filter(lambda x: x.value_counts().loc['inr'] >= cutoff).index
df_t_cut = df_t.loc[keep_index].copy()

In [61]:
# delete rows whose adjusted time is less than the earliest treatment effect we wish to consider
df_t_cut = df_t_cut.loc[df_t_cut.loc[:, 'tsp_adjusted'] >= timedelta(hours=-num_past_effects*bin_size_num), :]

In [62]:
# delete rows before the first observation
#df_t_cut = df_t_cut.loc[df_t_cut.loc[:, 'tsp_adjusted'] >= timedelta(), :]

In [63]:
df_t_cut

Unnamed: 0,dataset_id,enc_id,tsp,fid,value,confidence,tsp_adjusted
84516,3,1020,2014-04-23 15:41:00,heparin_dose,"{""dose"": 5000.0, ""order_tsp"": ""2014-04-23 15:4...",4,-3 days +20:29:00
84575,3,1020,2014-04-23 17:55:00,heparin_dose,"{""dose"": 1000.0, ""order_tsp"": ""2014-04-23 17:0...",4,-3 days +22:43:00
84601,3,1020,2014-04-23 18:55:00,heparin_dose,"{""dose"": 1000.0, ""order_tsp"": ""2014-04-23 17:0...",4,-3 days +23:43:00
84619,3,1020,2014-04-23 19:55:00,heparin_dose,"{""dose"": 1000.0, ""order_tsp"": ""2014-04-23 17:0...",4,-2 days +00:43:00
84635,3,1020,2014-04-23 20:55:00,heparin_dose,"{""dose"": 1000.0, ""order_tsp"": ""2014-04-23 17:0...",4,-2 days +01:43:00
84644,3,1020,2014-04-23 21:30:00,aspirin_dose,"{""dose"": 325.0, ""order_tsp"": ""2014-04-23 17:42...",1,-2 days +02:18:00
84682,3,1020,2014-04-24 07:54:00,aspirin_dose,"{""dose"": 325.0, ""order_tsp"": ""2014-04-23 17:42...",1,-2 days +12:42:00
84712,3,1020,2014-04-24 14:41:00,heparin_dose,"{""dose"": 1100.0, ""order_tsp"": ""2014-04-23 17:0...",4,-2 days +19:29:00
84716,3,1020,2014-04-24 15:15:00,heparin_dose,"{""dose"": 1100.0, ""order_tsp"": ""2014-04-23 17:0...",4,-2 days +20:03:00
84737,3,1020,2014-04-24 23:02:00,heparin_dose,"{""dose"": 2000.0, ""order_tsp"": ""2014-04-23 17:4...",4,-1 days +03:50:00


In [64]:
#adjusted_time = df_t_cut.groupby('enc_id').apply(lambda x: x.loc[:, 'tsp_adjusted']-x.loc[:, 'tsp_adjusted'].iloc[0])
#adjusted_time.index = adjusted_time.index.droplevel()
#df_t_cut.loc[:, 'tsp_adjusted'] = adjusted_time

In [66]:
%%capture
# create column for the signal
df_t_cut.loc[:, signal_name] = df_t_cut.loc[df_t_cut.loc[:, 'fid'].isin([signal_name]), 'value']
df_t_cut.loc[:, signal_name] = df_t_cut.loc[:, signal_name].apply(lambda x: float(x))

In [68]:
%%capture
# create a column for each treatment category
# binarize
for category, names in treatment_names.items():
    df_t_cut.loc[:, category] = df_t_cut.loc[df_t_cut.loc[:, 'fid'].isin(names), 'value']
for treatment in treatment_names.keys():
    df_t_cut.loc[df_t_cut.loc[:, treatment].notna(), treatment] = 1
    df_t_cut.loc[:, treatment] = df_t_cut.loc[:, treatment].fillna(value = 0)

In [69]:
# for every patient, delete all rows after the last valid signal measurement 
df_t_cut = df_t_cut.groupby('enc_id').apply(lambda x: x.loc[x.index <= x.loc[:, 'inr'].last_valid_index(), :]).reset_index(drop=True)

In [271]:
# put signals in bins
binned_signal = df_t_cut.loc[:, ['enc_id', 'tsp_adjusted', signal_name]].dropna().groupby('enc_id').apply(lambda x: x.loc[:, ['tsp_adjusted', signal_name]].resample(bin_size, on='tsp_adjusted', base=0).mean())
binned_signal.reset_index(level='enc_id', inplace=True)
new_times = binned_signal.index.get_level_values('tsp_adjusted')
binned_signal.loc[:, 'time'] = new_times
binned_signal.reset_index(drop=True, inplace=True)

In [19]:
# put treatments in bins
binned_treatment = df_t_cut.loc[:, ['enc_id', 'tsp_adjusted']+ list(treatment_names.keys())].groupby('enc_id').apply(lambda x: x.resample(bin_size, on='tsp_adjusted').max())
# if nothing is recorded in a certain time bin, treatment will be nan. But that is the same as no treatment given
# so mark it as zero
binned_treatment.fillna(0, inplace=True)

In [256]:
# put the binned values into a new dataframe called df_binned
df_binned = pd.concat([binned_signal, binned_treatment.reset_index(drop=True).loc[:, list(treatment_names.keys())]], axis = 1)

In [257]:
df_binned

Unnamed: 0,enc_id,inr,time,nsaid,anticoagulant,transfusion_plasma,transfusion_platelets,aspirin
0,1020,1.000,0 days 00:00:00,0.0,1.0,0.0,0.0,1.0
1,1020,2.600,0 days 18:00:00,0.0,1.0,0.0,0.0,0.0
2,1020,,1 days 12:00:00,0.0,1.0,0.0,0.0,1.0
3,1020,6.500,2 days 06:00:00,0.0,0.0,0.0,0.0,1.0
4,1020,4.700,3 days 00:00:00,0.0,0.0,0.0,0.0,1.0
5,1020,2.400,3 days 18:00:00,0.0,1.0,0.0,0.0,0.0
6,1221,2.050,0 days 00:00:00,0.0,1.0,0.0,0.0,0.0
7,1221,,0 days 18:00:00,0.0,1.0,0.0,0.0,1.0
8,1221,4.200,1 days 12:00:00,0.0,0.0,0.0,0.0,1.0
9,1221,3.100,2 days 06:00:00,0.0,0.0,0.0,0.0,1.0


#### Resampling but consider treatments before the first observation

In [198]:
# seperate the rows with positive and negative time to different dataframe to do resampling
# combine them at the end

# every row with nonnegative time
df_t_pos = df_t_cut.loc[df_t_cut.loc[:, 'tsp_adjusted'] >= timedelta(), :]
# every row with nonpositive time
df_t_neg = df_t_cut.loc[df_t_cut.loc[:, 'tsp_adjusted'] <= timedelta(), :]

In [199]:
# make all the negative time positive, so the largest value is the time point farthest out
# necessary because resampling starts from the smallest time, we want it to be zero
df_t_neg.loc[:, 'tsp_adjusted'] = df_t_neg.loc[:, 'tsp_adjusted'].apply(lambda x: -x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [287]:
# binned treatments for time points prior to the first signal observation
binned_treatment_prev = df_t_neg.loc[:, ['enc_id', 'tsp_adjusted']+ list(treatment_names.keys())].groupby('enc_id').apply(lambda x: x.resample(bin_size, on='tsp_adjusted').max())
binned_treatment_prev.loc[:, 'enc_id'].fillna(method='bfill', inplace=True)
binned_treatment_prev.fillna(0, inplace=True)
new_times = binned_treatment_prev.index.get_level_values('tsp_adjusted')
binned_treatment_prev.reset_index(drop=True, inplace=True)
binned_treatment_prev.loc[:, 'time'] = new_times

In [288]:
# for some patient there are no treatments before the first observation
# but to allow the model to work consistently, we add rows with all zeros to those patients
def add_rows_treatment(x):
    last_time = x.loc[:, 'time'].iloc[-1]
    while last_time < timedelta(hours=(num_past_effects-1)*bin_size_num):
        last_row = x.iloc[-1, :]
        new_last_time = last_time + timedelta(hours=bin_size_num)
        last_row.loc['time'] = new_last_time
        last_row.loc[list(treatment_names.keys())] = 0
        last_time = new_last_time
        x = x.append(last_row)
    return x

binned_treatment_prev = binned_treatment_prev.groupby('enc_id').apply(lambda x: add_rows_treatment(x))
binned_treatment_prev.reset_index(drop=True, inplace=True)

In [289]:
# adjust time so that the farthest out time point is time zero
positive_time = binned_treatment_prev.groupby('enc_id').apply(lambda x: (x.loc[:, 'time'].iloc[-1]-x.loc[:, 'time']))
positive_time.reset_index(drop=True, inplace=True)
binned_treatment_prev.loc[:, 'time'] = positive_time

In [290]:
# flip row order so time is in increasing order
binned_treatment_prev = binned_treatment_prev.groupby('enc_id').apply(lambda x: x.iloc[::-1])
binned_treatment_prev.reset_index(drop=True, inplace=True)

In [292]:
# resampling for treatments after the first signal observation
# the first time point for each patient is num_past_effects*bin_size_num
binned_treatment_post = df_t_pos.loc[:, ['enc_id', 'tsp_adjusted']+ list(treatment_names.keys())].groupby('enc_id').apply(lambda x: x.resample(bin_size, on='tsp_adjusted').max())
# since the last id should not be nan, use bfill to fill the id column
# it's important for id column to be correct since we use it to combine prev and post binned treatments
binned_treatment_post.loc[:, 'enc_id'].fillna(method='bfill', inplace=True)
binned_treatment_post.fillna(0, inplace=True)
new_time = binned_treatment_post.index.get_level_values('tsp_adjusted') + timedelta(hours=num_past_effects*bin_size_num)
binned_treatment_post.loc[:, 'time'] = new_time
binned_treatment_post.reset_index(drop=True, inplace=True)

In [314]:
# put the binned treatment together
binned_treatment_list = []
for i in np.unique(binned_signal.loc[:, 'enc_id']):
    prev = binned_treatment_prev.loc[binned_treatment_prev.loc[:, 'enc_id']==i, :]
    post = binned_treatment_post.loc[binned_treatment_post.loc[:, 'enc_id']==i, :]
    binned_treatment_list.append(pd.concat([prev, post]))

binned_treatment_combined = pd.concat(binned_treatment_list)

In [316]:
binned_treatment_combined.reset_index(drop=True, inplace=True)

In [296]:
binned_signal.loc[:, 'time'] += timedelta(hours=num_past_effects*bin_size_num)

In [297]:
def add_rows_signal(x):
    x = x.iloc[::-1]
    first_time = x.loc[:, 'time'].iloc[-1]
    while first_time > timedelta():
        new_first_time = first_time - timedelta(hours=bin_size_num)
        row = x.iloc[0]
        row[signal_name] = np.nan
        row['time'] = new_first_time
        first_time = new_first_time
        x = x.append(row)
    return x.iloc[::-1]

In [298]:
binned_signal = binned_signal.groupby('enc_id').apply(lambda x: add_rows_signal(x)).reset_index(drop=True)

In [326]:
df_binned = pd.concat([binned_signal, binned_treatment_combined.loc[:, list(treatment_names.keys())]], axis = 1)

In [327]:
df_binned

Unnamed: 0,enc_id,inr,time,nsaid,anticoagulant,transfusion_plasma,transfusion_platelets,aspirin
0,1020,,0 days 00:00:00,0.0,1.0,0.0,0.0,1.0
1,1020,,0 days 18:00:00,0.0,1.0,0.0,0.0,1.0
2,1020,,1 days 12:00:00,0.0,1.0,0.0,0.0,1.0
3,1020,1.000,2 days 06:00:00,0.0,1.0,0.0,0.0,1.0
4,1020,2.600,3 days 00:00:00,0.0,1.0,0.0,0.0,0.0
5,1020,,3 days 18:00:00,0.0,1.0,0.0,0.0,1.0
6,1020,6.500,4 days 12:00:00,0.0,0.0,0.0,0.0,1.0
7,1020,4.700,5 days 06:00:00,0.0,0.0,0.0,0.0,1.0
8,1020,2.400,6 days 00:00:00,0.0,1.0,0.0,0.0,0.0
9,1221,,0 days 00:00:00,0.0,0.0,0.0,0.0,0.0


In [328]:
# remove individuals with observation missing pct larger than the threshold
df_binned = df_binned.groupby('enc_id').filter(lambda x: (np.where(x.loc[:, signal_name].isna())[0].shape[0]-num_past_effects) / x.shape[0] < max_missing_pct)
# the number of ids available for training
np.unique(df_binned.loc[:, 'enc_id']).shape

(2328,)

In [300]:
# take only part of cdm_s that has the patient ids which we use in df_binned
df_s = cdm_s.loc[cdm_s.loc[:, 'enc_id'].isin(df_binned.loc[:, 'enc_id'].unique()), :]

In [24]:
# create dataframe containing each chronic conditions, binarize
df_static = pd.DataFrame()
for chronic, names in chronic_names.items():
    col = df_s.groupby('enc_id').apply(lambda x: x.loc[x.loc[:, 'fid'].isin(names), 'value'].any())
    df_static.loc[:, chronic] = col # to make sure index is correct
    df_static.loc[:, chronic] = np.where(col == False, 0, 1)

In [25]:
# add demographic information to df_static
for demo in demographic_names:
    col = df_s.groupby('enc_id').apply(lambda x: int(x.loc[x.loc[:, 'fid'] == demo, 'value'].values[0]))
    df_static.loc[:, demo] = col

In [26]:
df_static

Unnamed: 0_level_0,liver_disease,sickle_cell,age
enc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1020,0,0,56
1221,0,0,89
1319,0,0,88
1330,0,0,90
1337,0,0,68
1575,0,0,82
1602,0,0,72
1793,0,0,80
1905,0,0,83
2072,0,0,60


In [329]:
# also remove the individuals with high missing pct from df_static
df_static = df_static.loc[np.unique(df_binned.loc[:, 'enc_id']), :]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


The number below is less than the max bin number in the np array preprocess version because the max bin number there was calculated **before** individuals with high missing percentage is removed.

In [331]:
# overall percentage of missing observations after removal
np.where(df_binned.loc[:, signal_name].isnull())[0].shape[0]/df_binned.shape[0]

0.4151474042947424

In [30]:
# maximum number of bins for an individual
# necessary if convert dataframe to np array
max_num_bins = int(max(df_binned.loc[:, 'time']) / timedelta(hours=bin_size_num)) + 1

In [31]:
# turn static features into np array
c_mtx = df_static.values

In [32]:
# create matrix storing the signal observations
# shape is (number of patient * max_num_bins)
y_list = []
df_binned.groupby('enc_id').apply(lambda x: y_list.append(x.loc[:, signal_name].values))
# for some reason the first group always got appended twice to the list, so delete one
y_list.pop(0)
y_mtx = np.full((len(y_list), max_num_bins), np.nan)
for i, y in enumerate(y_list):
    y_mtx[i, :y.shape[0]] = y

In [33]:
# create matrix storing treatment information
# shape is (number of patients * max_num_bins * number of treatment categories)
x_list = []
df_binned.groupby('enc_id').apply(lambda x: x_list.append(x.loc[:, list(treatment_names.keys())].values))
x_list.pop(0)
X_mtx = np.zeros((len(x_list), max_num_bins, len(treatment_names.keys())))
for i, x in enumerate(x_list):
    X_mtx[i, :x.shape[0], :] = x

In [34]:
#np.savez('../Data/'+signal_name+'_preprocessed_data', y_mtx=y_mtx, X_mtx=X_mtx, c_mtx=c_mtx)

### Model Training

In [35]:
em = EM(y_mtx, X_mtx, c_mtx, num_past_effects, 0, train_pct=training_pct, single_effect=single_effect)

### Future Improvements
* for now, signals and treatments are stored as np array whose shape is determined by the maximum number of bins an individual has. This is to accomodate the existing code in EM.py, but the resulting matrix has lots of extra nans(in the case of signal) and zeros(in the case of treatments), which could be address by changing the data structure used in the EM.py code