In [1]:
import pandas as pd
import numpy as np

In [9]:
lab_sofa=pd.read_csv("Labs_withSOFA.csv")
vent_sofa=pd.read_csv("drugate_norm_updated.csv",usecols=['patientunitstayid','offset','SOFA_cardio'])
gcs_sofa=pd.read_csv("gcs_withSOFA.csv",usecols=['patientunitstayid','offset','SOFA_GCS'])
t_sus=pd.read_csv("ABX_BC_pid_offset_Clean_TsuspicionMax.csv")

In [10]:
patientIDs_IV = t_sus.patientunitstayid.unique().tolist()
patientIDs_IV_sub = pd.DataFrame(patientIDs_IV)
patientIDs_IV_sub.columns=['patientunitstayid']

<h3><center>Let's start the merging<center></h3>

In [28]:
labs_sus=pd.merge(lab_sofa,patientIDs_IV_sub,how="inner",on='patientunitstayid').drop_duplicates()
vent_sus=pd.merge(vent_sofa,patientIDs_IV_sub,how="inner",on='patientunitstayid').drop_duplicates()
gcs_sofa_sus=pd.merge(gcs_sofa,patientIDs_IV_sub,how="inner",on='patientunitstayid').drop_duplicates()
labs_vent=pd.merge(labs_sus,vent_sus,on=['patientunitstayid','offset'],how="outer").drop_duplicates()
final_sofa=pd.merge(labs_vent,gcs_sofa_sus,on=['patientunitstayid','offset'],how="outer").drop_duplicates()
final_sofa=final_sofa.fillna(0)


<h3><center>Calculating the Total SOFA score, difference between scores and the cumulative time, from admission</center></h3>

In [29]:
final_sofa=final_sofa.groupby(['patientunitstayid','offset'],as_index=False).max().drop_duplicates
final_sofa=final_sofa.groupby(['patientunitstayid'],as_index=False).apply(pd.DataFrame.sort_values,'offset').reset_index()
final_sofa=final_sofa.drop(columns=['level_0','level_1'])
final_sofa['Total_SOFA']=final_sofa['SOFA_Coagulation']+final_sofa['SOFA_Liver']+final_sofa['SOFA_Respiration']+final_sofa['SOFA_Renal']+final_sofa['SOFA_cardio']+final_sofa['SOFA_GCS']
del lab_sofa
del vent_sofa
del gcs_sofa

#We need a way to check whether the time when there is a difference of 2 or more in SOFA score, is less than or equal to 24 hrs
#Thus we either check (t_curr-t_min)<=24hrs or the better way,
#calculate the cumulative sum of diff of offsets 
final_sofa['diff_per_SOFA']=final_sofa.groupby(['patientunitstayid'])['Total_SOFA'].transform(lambda x: x.diff()).fillna(0)
final_sofa['diff_per_offset']=final_sofa.groupby(['patientunitstayid'])['offset'].transform(lambda x:x.diff()).fillna(0)
final_sofa['cumulative_time']=final_sofa.groupby(['patientunitstayid'])['diff_per_offset'].transform(lambda x:x.cumsum())

<h3><center>Filtering the SOFA table based on the (score diff >= 2 and cumulative time <= 24 hours) </center></h3>

In [58]:
for_24_hr=final_sofa.loc[(final_sofa['diff_per_SOFA']>=2) & (final_sofa['cumulative_time']<=(24*60))]

#The 72 hours one is just for the sake of calculation. Feel free to ignore it completely
for_72_hr=final_sofa.loc[(final_sofa['diff_per_SOFA']>=2) & (final_sofa['cumulative_time']<=(72*60))]

In [None]:
for_24_hr.head()

<h3><center>Clubbing the t_suspicion table with filtered SOFA table</center></h3>

In [63]:
t_sus=t_sus.rename(columns={'max':'tsus'})

for_24_hr_tsofa=for_24_hr.groupby(['patientunitstayid']).agg({'offset':'min'}).reset_index()
for_24_hr_tsofa=for_24_hr_tsofa.rename(columns={'offset':'tsofa'})

for_72_hr_tsofa=for_72_hr.groupby(['patientunitstayid']).agg({'offset':'min'}).reset_index()
for_72_hr_tsofa=for_72_hr_tsofa.rename(columns={'offset':'tsofa'})

In [104]:
for_24_hr_tsepsis=pd.merge(for_24_hr_tsofa,t_sus,on='patientunitstayid',how='inner').drop_duplicates()
for_72_hr_tsepsis=pd.merge(for_72_hr_tsofa,t_sus,on='patientunitstayid',how='inner')

<h3><center>flag==1 stands for cases, where as 0 for control.</center></h3>
<h3><center>Then we calculate the t_sepsis_onset time based on the required constraints</center></h3>

In [105]:
for_24_hr_tsepsis['flag']=0
for_72_hr_tsepsis['flag']=0
for_24_hr_tsepsis.loc[(for_24_hr_tsepsis['tsofa']>=(for_24_hr_tsepsis['tsus']-(24*60))) & (for_24_hr_tsepsis['tsofa']<=(for_24_hr_tsepsis['tsus']+(12*60))),'flag']=1

#the 72 hours seems wrong.....but for the sake of caclulation
for_72_hr_tsepsis.loc[(for_72_hr_tsepsis['tsofa']>=(for_72_hr_tsepsis['tsus']-(24*60))) & (for_72_hr_tsepsis['tsofa']<=(for_72_hr_tsepsis['tsus']+(12*60))),'flag']=1

In [112]:
for_24_hr_tsepsis.to_csv("24_hour_sepsis.csv",index=False)

In [106]:
for_24_hr_cases=for_24_hr_tsepsis[for_24_hr_tsepsis['flag']==1]
for_72_hr_cases=for_72_hr_tsepsis[for_72_hr_tsepsis['flag']==1]
for_24_hr_cases['tsepsis']=for_24_hr_cases[['tsus','tsofa']].min(axis=1)
for_72_hr_cases['tsepsis']=for_72_hr_cases[['tsus','tsofa']].min(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [None]:
for_24_hr_cases.head()

In [108]:
print("Total patient cases captured: ",len(for_24_hr_cases['patientunitstayid'].unique()))

Total patient cases captured:  14175


<h3><center>For cross checking purposes, compared with diagnoses table</center></h3>

In [98]:
diagnosis=pd.read_csv("diagnosis.csv")

In [None]:
diagnosis.head()

In [101]:
cases_diagnosed=diagnosis.loc[diagnosis['diagnosisstring'].str.contains("sepsis",case=False)]

In [102]:
len(cases_diagnosed['patientunitstayid'].unique())

23479

In [103]:
print("Fraction captured: ",(len(for_24_hr_cases['patientunitstayid'].unique())/len(cases_diagnosed['patientunitstayid'].unique())))   

Fraction captured:  0.6037309936539035
