In [60]:
import pandas as pd
import pickle
import numpy as np
from datetime import datetime, timedelta

In [2]:
cdm_t = pd.read_pickle('../Data/cdm_t.pkl')
cdm_s = pd.read_pickle('../Data/cdm_s.pkl')

In [3]:
signal_name = 'inr'
treatment_names = {}
treatment_names['nsaid'] = ['acetaminophen_dose','celecoxib_dose','diclofenac_dose','ibuprofen_dose','indomethacin_dose',
'ketorolac_dose','meloxicam_dose','naproxen_dose']
treatment_names['anticoagulant'] = ['warfarin_dose','heparin_dose','dabigatran_dose','edoxaban_dose','rivaroxaban_dose',
'apixaban_dose','enoxaparin_dose','dalteparin_dose','fondaparinux_dose']
treatment_names['transfusion_plasma'] = ['transfuse_plasma']
treatment_names['transfusion_platelets'] = ['transfuse_platelets']
treatment_names['aspirin'] = ['aspirin_dose']
treatment_list = []
for name in treatment_names.values():
    treatment_list.append(name)
# flatten the list
treatment_list = [item for sublist in treatment_list for item in sublist]

In [4]:
# df_t is part of the original dataframe that has all the ids who have measurements for the signal we are interested 
# in
signal = cdm_t.loc[cdm_t.loc[:, 'fid'] == signal_name, 'value']
ids = np.unique(cdm_t.loc[signal.index, 'enc_id'])
df_t = cdm_t.loc[cdm_t.loc[:, 'enc_id'].isin(ids), :]

In [5]:
df_t

Unnamed: 0,dataset_id,enc_id,tsp,fid,value,confidence
460,3,8,2015-08-03 17:12:00+00,care_unit,HCGH LABOR & DELIVERY,1
461,3,8,2015-08-03 17:38:00+00,care_unit,HCGH LABOR & DELIVERY,1
462,3,8,2015-08-03 17:39:00+00,resp_rate,16.0,1
463,3,8,2015-08-03 17:39:00+00,temperature,99.2,1
464,3,8,2015-08-03 17:39:00+00,spo2,100.0,1
465,3,8,2015-08-03 17:39:00+00,nbp_dias,79.0,1
466,3,8,2015-08-03 17:39:00+00,heart_rate,84.0,1
467,3,8,2015-08-03 17:39:00+00,nbp_sys,122.0,1
468,3,8,2015-08-03 18:04:00+00,cbc_order,"{""name"": ""COMPLETE BLOOD COUNT (CBC) WITHOUT D...",2
469,3,8,2015-08-03 18:04:00+00,crystalloid_fluid_order,125,2


In [6]:
# convert tsp field to python datetime object
# make the time for each id to start from zero
df_t.loc[:, 'tsp'] = df_t.loc[:, 'tsp'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S+%f'))
#df_t.loc[:, 'tsp'] = df_t.groupby('enc_id')['tsp'].apply(lambda x: x - x.iloc[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [7]:
# df_t is now part of the dataframe that contains only the rows with fid being either the signal or the treatments
df_t = df_t.loc[df_t.loc[:, 'fid'].isin(treatment_list + [signal_name]), :]

In [8]:
# for each id, adjust tsp so that time is zero for the first time the signal is measured
adjusted_time = df_t.groupby('enc_id').apply(lambda x: x.loc[:, 'tsp'] - x.loc[x.loc[:, 'fid'] == signal_name, 'tsp'].iloc[0])
# adjusted_time is multiindexed, need to drop one level before assigning it to column tsp
adjusted_time.index = adjusted_time.index.droplevel()
df_t.loc[:, 'tsp_adjusted'] = adjusted_time

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [9]:
df_t

Unnamed: 0,dataset_id,enc_id,tsp,fid,value,confidence,tsp_adjusted
477,3,8,2015-08-03 19:12:00,inr,0.9,1,0 days 00:00:00
607,3,10,2015-03-02 02:24:00,ketorolac_dose,"{""dose"": 30.0, ""order_tsp"": ""2015-03-02 02:22:...",1,-2 days +22:00:00
639,3,10,2015-03-02 08:18:00,ketorolac_dose,"{""dose"": 30.0, ""order_tsp"": ""2015-03-02 02:22:...",1,-1 days +03:54:00
667,3,10,2015-03-02 14:08:00,ketorolac_dose,"{""dose"": 30.0, ""order_tsp"": ""2015-03-02 02:22:...",1,-1 days +09:44:00
694,3,10,2015-03-02 21:25:00,ketorolac_dose,"{""dose"": 30.0, ""order_tsp"": ""2015-03-02 02:22:...",1,-1 days +17:01:00
707,3,10,2015-03-03 02:41:00,ibuprofen_dose,"{""dose"": 600.0, ""order_tsp"": ""2015-03-02 02:22...",1,-1 days +22:17:00
723,3,10,2015-03-03 04:24:00,inr,0.9,1,0 days 00:00:00
729,3,10,2015-03-03 06:34:00,acetaminophen_dose,"{""dose"": 1000.0, ""order_tsp"": ""2015-03-02 07:4...",1,0 days 02:10:00
731,3,10,2015-03-03 09:12:00,ibuprofen_dose,"{""dose"": 600.0, ""order_tsp"": ""2015-03-02 02:22...",1,0 days 04:48:00
753,3,10,2015-03-03 15:00:00,ibuprofen_dose,"{""dose"": 600.0, ""order_tsp"": ""2015-03-02 02:22...",1,0 days 10:36:00


In [43]:
cutoff = 5
keep_index = df_t.groupby('enc_id')['fid'].filter(lambda x: x.value_counts().loc['inr'] >= cutoff).index

In [45]:
df_t_cut = df_t.loc[keep_index]

In [69]:
# delete rows whose adjusted time is less than zero
df_t_cut = df_t_cut.loc[df_t_cut.loc[:, 'tsp_adjusted'] >= timedelta(), :]

In [81]:
df_t_cut.loc[:, 'inr'] = df_t_cut.loc[df_t_cut.loc[:, 'fid'].isin([signal_name]), 'value']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [83]:
for category, names in treatment_names.items():
    df_t_cut.loc[:, category] = df_t_cut.loc[df_t_cut.loc[:, 'fid'].isin(names), 'value']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [85]:
for treatment in treatment_names.keys():
    df_t_cut.loc[df_t_cut.loc[:, treatment].notna(), treatment] = 1
    df_t_cut.loc[:, treatment] = df_t_cut.loc[:, treatment].fillna(value = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [86]:
df_t_cut

Unnamed: 0,dataset_id,enc_id,tsp,fid,value,confidence,tsp_adjusted,inr,nsaid,anticoagulant,transfusion,aspirin
84810,3,1020,2014-04-25 19:12:00,inr,1.0,1,0 days 00:00:00,1.0,0,0,0,0
84829,3,1020,2014-04-26 01:51:00,inr,1.0,1,0 days 06:39:00,1.0,0,0,0,0
84831,3,1020,2014-04-26 02:50:00,heparin_dose,"{""dose"": 19.0, ""order_tsp"": ""2014-04-25 15:39:...",4,0 days 07:38:00,,0,1,0,0
84848,3,1020,2014-04-26 07:58:00,aspirin_dose,"{""dose"": 325.0, ""order_tsp"": ""2014-04-23 17:42...",1,0 days 12:46:00,,0,0,0,1
84873,3,1020,2014-04-26 16:56:00,warfarin_dose,"{""dose"": 10.0, ""order_tsp"": ""2014-04-25 15:44:...",1,0 days 21:44:00,,0,1,0,0
84875,3,1020,2014-04-26 17:38:00,heparin_dose,"{""dose"": 19.0, ""order_tsp"": ""2014-04-25 15:39:...",4,0 days 22:26:00,,0,1,0,0
84907,3,1020,2014-04-27 06:34:00,inr,2.6,1,1 days 11:22:00,2.6,0,0,0,0
84916,3,1020,2014-04-27 08:20:00,heparin_dose,"{""dose"": 25000.0, ""order_tsp"": ""2014-04-27 08:...",4,1 days 13:08:00,,0,1,0,0
84918,3,1020,2014-04-27 08:22:00,aspirin_dose,"{""dose"": 325.0, ""order_tsp"": ""2014-04-23 17:42...",1,1 days 13:10:00,,0,0,0,1
84924,3,1020,2014-04-27 11:05:00,heparin_dose,"{""dose"": 0, ""order_tsp"": ""2014-04-25 15:39:00""...",1,1 days 15:53:00,,0,1,0,0


### TO DO (not yet in order)
* Bin observation and treatments using resampling
* Cast dataframe to matrix

In [72]:
np.unique(df_t_cut['fid'])

array(['acetaminophen_dose', 'apixaban_dose', 'aspirin_dose',
       'celecoxib_dose', 'dabigatran_dose', 'dalteparin_dose',
       'enoxaparin_dose', 'fondaparinux_dose', 'heparin_dose',
       'ibuprofen_dose', 'indomethacin_dose', 'inr', 'ketorolac_dose',
       'meloxicam_dose', 'naproxen_dose', 'rivaroxaban_dose',
       'transfuse_plasma', 'transfuse_platelets', 'warfarin_dose'],
      dtype=object)

In [71]:
df_t_cut.groupby('enc_id').apply(lambda x: x.set_index('tsp_adjusted').resample('2T').mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset_id,enc_id,confidence
enc_id,tsp_adjusted,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1020,0 days 00:00:00,3.0,1020.0,1.0
1020,0 days 00:02:00,,,
1020,0 days 00:04:00,,,
1020,0 days 00:06:00,,,
1020,0 days 00:08:00,,,
1020,0 days 00:10:00,,,
1020,0 days 00:12:00,,,
1020,0 days 00:14:00,,,
1020,0 days 00:16:00,,,
1020,0 days 00:18:00,,,
