In [1]:
import pandas as pd
import pickle
import numpy as np
from datetime import datetime

In [2]:
cdm_t = pd.read_pickle('../Data/cdm_t.pkl')
cdm_s = pd.read_pickle('../Data/cdm_s.pkl')

In [3]:
signal_name = 'inr'
treatment_names = {}
treatment_names['nsaid'] = ['acetaminophen_dose','celecoxib_dose','diclofenac_dose','ibuprofen_dose','indomethacin_dose',
'ketorolac_dose','meloxicam_dose','naproxen_dose']
treatment_names['anticoagulant'] = ['warfarin_dose','heparin_dose','dabigatran_dose','edoxaban_dose','rivaroxaban_dose',
'apixaban_dose','enoxaparin_dose','dalteparin_dose','fondaparinux_dose']
treatment_names['transfusion'] = ['transfuse_plasma', 'transfuse_platelets']
treatment_names['aspirin'] = ['aspirin_dose']
treatment_list = []
for name in treatment_names.values():
    treatment_list.append(name)
# flatten the list
treatment_list = [item for sublist in treatment_list for item in sublist]

In [4]:
# df_t is part of the original dataframe that has all the ids who have measurements for the signal we are interested 
# in
signal = cdm_t.loc[cdm_t.loc[:, 'fid'] == signal_name, 'value']
ids = np.unique(cdm_t.loc[signal.index, 'enc_id'])
df_t = cdm_t.loc[cdm_t.loc[:, 'enc_id'].isin(ids), :]

In [5]:
df_t

Unnamed: 0,dataset_id,enc_id,tsp,fid,value,confidence
460,3,8,2015-08-03 17:12:00+00,care_unit,HCGH LABOR & DELIVERY,1
461,3,8,2015-08-03 17:38:00+00,care_unit,HCGH LABOR & DELIVERY,1
462,3,8,2015-08-03 17:39:00+00,resp_rate,16.0,1
463,3,8,2015-08-03 17:39:00+00,temperature,99.2,1
464,3,8,2015-08-03 17:39:00+00,spo2,100.0,1
465,3,8,2015-08-03 17:39:00+00,nbp_dias,79.0,1
466,3,8,2015-08-03 17:39:00+00,heart_rate,84.0,1
467,3,8,2015-08-03 17:39:00+00,nbp_sys,122.0,1
468,3,8,2015-08-03 18:04:00+00,cbc_order,"{""name"": ""COMPLETE BLOOD COUNT (CBC) WITHOUT D...",2
469,3,8,2015-08-03 18:04:00+00,crystalloid_fluid_order,125,2


In [6]:
# convert tsp field to python datetime object
# make the time for each id to start from zero
df_t.loc[:, 'tsp'] = df_t.loc[:, 'tsp'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S+%f'))
df_t.loc[:, 'tsp'] = df_t.groupby('enc_id')['tsp'].apply(lambda x: x - x.iloc[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [7]:
# df_t is now part of the dataframe that contains only the rows with fid being either the signal or the treatments
df_t = df_t.loc[df_t.loc[:, 'fid'].isin(treatment_list + [signal_name]), :]

In [31]:
# for each id, adjust tsp so that time is zero for the first time the signal is measured
adjusted_time = df_t.groupby('enc_id').apply(lambda x: x.loc[:, 'tsp'] - x.loc[x.loc[:, 'fid'] == signal_name, 'tsp'].iloc[0])
# adjusted_time is multiindexed, need to drop one level before assigning it to column tsp
adjusted_time.index = adjusted_time.index.droplevel()
df_t.loc[:, 'tsp'] = adjusted_time

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [32]:
df_t

Unnamed: 0,dataset_id,enc_id,tsp,fid,value,confidence
477,3,8,0 days 00:00:00,inr,0.9,1
607,3,10,-2 days +22:00:00,ketorolac_dose,"{""dose"": 30.0, ""order_tsp"": ""2015-03-02 02:22:...",1
639,3,10,-1 days +03:54:00,ketorolac_dose,"{""dose"": 30.0, ""order_tsp"": ""2015-03-02 02:22:...",1
667,3,10,-1 days +09:44:00,ketorolac_dose,"{""dose"": 30.0, ""order_tsp"": ""2015-03-02 02:22:...",1
694,3,10,-1 days +17:01:00,ketorolac_dose,"{""dose"": 30.0, ""order_tsp"": ""2015-03-02 02:22:...",1
707,3,10,-1 days +22:17:00,ibuprofen_dose,"{""dose"": 600.0, ""order_tsp"": ""2015-03-02 02:22...",1
723,3,10,0 days 00:00:00,inr,0.9,1
729,3,10,0 days 02:10:00,acetaminophen_dose,"{""dose"": 1000.0, ""order_tsp"": ""2015-03-02 07:4...",1
731,3,10,0 days 04:48:00,ibuprofen_dose,"{""dose"": 600.0, ""order_tsp"": ""2015-03-02 02:22...",1
753,3,10,0 days 10:36:00,ibuprofen_dose,"{""dose"": 600.0, ""order_tsp"": ""2015-03-02 02:22...",1


### TO DO (not yet in order)
* Delete rows based on num of observation cutoff (maybe print out the number of data points for different cutoff before prompting for selection)
* Bin observation and treatments
* Group treatments by category
* Cast dataframe to matrix