In [1]:
import sys
import os

In [2]:
# Import src functions
# Set up paths
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_folder = os.path.join(project_root, 'src')
sys.path.insert(0, src_folder)
from s3_storage import *

In [37]:
# Set some display options
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [3]:
# Import data
item_lookup = pd.read_csv(os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'data', 'chart_lab_coalesce.csv')),index_col=0)
print('--> Importing done')
print(item_lookup.shape)

--> Importing done
(46, 3)


In [4]:
to_s3(df=item_lookup, bucket='mimic-jamesi', filename='item_lookup.csv')

In [5]:
in_file = from_s3(bucket='mimic-jamesi', filename='item_lookup.csv')
print('--> Importing done')
print(item_lookup.shape)

--> Importing done
(46, 3)


In [6]:
in_file

Unnamed: 0,new_id,name,itemid,description
0,9999001,Creatinine,50912,Using just from labevents due to high volume i...
1,9999002,Hematocrit,51221,Using just from labevents due to high volume i...
2,9999003,BUN,225624,Combining across sources due to similar distri...
3,9999003,BUN,1162,Combining across sources due to similar distri...
4,9999003,BUN,781,Combining across sources due to similar distri...
5,9999004,Chloride,50902,Using just from labevents due to high volume i...
6,9999005,White blood cells,51301,Using just from labevents due to high volume i...
7,9999006,Bicarbonate,50882,Using just from labevents due to high volume i...
8,9999007,Platelet Count,51265,Using just from labevents due to high volume i...
9,9999008,Sodium,50983,Using just from labevents due to high volume i...


In [12]:
# Import src functions
# Set up paths
import os
import sys
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_folder = os.path.join(project_root, 'src')
sys.path.insert(0, src_folder)
from s3_storage import *

admissions = from_s3(bucket='mimic-jamesi',
                     filename='admission_diagnosis_table.csv',
                     index_col=0)
admissions = admissions[['subject_id', 'hadm_id', 'diagnosis_name', 'diagnosis_icd9']]
admissions.head()

Unnamed: 0,subject_id,hadm_id,diagnosis_name,diagnosis_icd9
0,3572,140026,Abrasion forearm,9130
1,3572,140026,Phlbts sprfc vn up extrm,45182
2,3572,140026,Fx upper end fibula-clos,82301
3,3572,140026,Cl skull fx NEC-brf coma,80322
4,3572,140026,Open wound of scalp,8730


In [18]:
df = (admissions.groupby(['diagnosis_icd9', 'diagnosis_name'])
                .agg({'subject_id':'nunique'})
                .sort_values(by='subject_id', ascending=False)
                .reset_index())
df = df[df['subject_id']>1500]

In [20]:
df

Unnamed: 0,diagnosis_icd9,diagnosis_name,subject_id
0,4019,Hypertension NOS,17613
1,41401,Crnry athrscl natve vssl,10775
2,42731,Atrial fibrillation,10271
3,4280,CHF NOS,9843
4,5849,Acute kidney failure NOS,7687
5,2724,Hyperlipidemia NEC/NOS,7465
6,25000,DMII wo cmp nt st uncntr,7370
7,51881,Acute respiratry failure,6719
8,5990,Urin tract infection NOS,5779
9,V053,Need prphyl vc vrl hepat,5776


In [22]:
from generate_datasets import *

# Lab events
lab = get_data(query = "SELECT\
                            itemid\
                            ,COUNT(DISTINCT(hadm_id)) AS admission_count\
                        FROM\
                            mimiciii.labevents\
                        WHERE valuenum IS NOT null\
                        GROUP BY 1")

# Lab events
chart = get_data(query = "SELECT\
                            itemid\
                            ,COUNT(DISTINCT(hadm_id)) AS admission_count\
                        FROM\
                            mimiciii.chartevents\
                        WHERE valuenum IS NOT null\
                        GROUP BY 1")
df = lab.append(chart)

Unnamed: 0,itemid,admission_count,label,dbsource
0,50801,9884,,
1,50802,37356,,
2,50803,6976,,
3,50804,37353,,
4,50805,1725,,


In [32]:
# Lookup
chart_lookup = get_data(query = "SELECT DISTINCT\
                            itemid, label, dbsource\
                        FROM\
                            mimiciii.d_items")

lab_lookup = get_data(query = "SELECT DISTINCT\
                            itemid, label\
                        FROM\
                            mimiciii.d_labitems")

lookup = chart_lookup.append(lab_lookup)

# Merge on lookup so that identical concepts can be combined, identified through new_id
df_2 = pd.merge(df, lookup, how='left', left_on='itemid', right_on='itemid')
df_2['dbsource'].fillna(value='lab', inplace=True)
df_2.head()

Unnamed: 0,itemid,admission_count,dbsource,label
0,50801,9884,lab,Alveolar-arterial Gradient
1,50802,37356,lab,Base Excess
2,50803,6976,lab,"Calculated Bicarbonate, Whole Blood"
3,50804,37353,lab,Calculated Total CO2
4,50805,1725,lab,Carboxyhemoglobin


In [34]:
max_lab = df_2.loc[df_2['dbsource']=='lab', 'admission_count'].max()
max_metavision = df_2.loc[df_2['dbsource']=='metavision', 'admission_count'].max()
max_carevue = df_2.loc[df_2['dbsource']=='carevue', 'admission_count'].max()
print(max_lab)
print(max_metavision)
print(max_carevue)

57102
24549
34901


In [35]:
df_2['perc'] = np.where(df_2['dbsource']=='lab',
                       df_2['admission_count']/max_lab,
                       np.where(df_2['dbsource']=='metavision',
                               df_2['admission_count']/max_metavision,
                               df_2['admission_count']/max_carevue))

In [39]:
df_2 = df_2.sort_values(by='perc', ascending=False)

In [46]:
df_2['label'] = df_2['label'].str.lower()

In [49]:
df_2

Unnamed: 0,itemid,admission_count,dbsource,label,perc
561,211,34901,carevue,heart rate,1.000000
319,51221,57102,lab,hematocrit,1.000000
3015,226228,24549,metavision,gender,1.000000
3031,226543,24546,metavision,religion,0.999878
372,51301,56990,lab,white blood cells,0.998039
348,51265,56974,lab,platelet count,0.997758
320,51222,56949,lab,hemoglobin,0.997321
337,51249,56941,lab,mchc,0.997180
359,51279,56938,lab,red blood cells,0.997128
336,51248,56938,lab,mch,0.997128


In [62]:
df_2[df_2.label.str.contains('lactic')]

Unnamed: 0,itemid,admission_count,dbsource,label,perc
2944,225668,13766,metavision,lactic acid,0.560756
762,818,13365,carevue,lactic acid(0.5-2.0),0.38294
977,1531,11149,carevue,lactic acid,0.319446


In [59]:
df_2[df_2.itemid==220277]

Unnamed: 0,itemid,admission_count,dbsource,label,perc
2636,220277,21904,metavision,o2 saturation pulseoxymetry,0.892256


In [63]:
item_lookup = pd.read_csv('chart_lab_coalesce.csv')
to_s3(df=item_lookup, bucket='mimic-jamesi', filename='item_lookup.csv')

In [64]:
item_lookup = from_s3(bucket='mimic-jamesi', filename='item_lookup.csv', index_col=0)
test

Unnamed: 0,new_id,name,itemid,description
0,9999001,Creatinine,50912,Using just from labevents due to high volume i...
1,9999002,Hematocrit,51221,Using just from labevents due to high volume i...
2,9999003,BUN,225624,Combining across sources due to similar distri...
3,9999003,BUN,1162,Combining across sources due to similar distri...
4,9999003,BUN,781,Combining across sources due to similar distri...
5,9999004,Chloride,50902,Using just from labevents due to high volume i...
6,9999005,White blood cells,51301,Using just from labevents due to high volume i...
7,9999006,Bicarbonate,50882,Using just from labevents due to high volume i...
8,9999007,Platelet Count,51265,Using just from labevents due to high volume i...
9,9999008,Sodium,50983,Using just from labevents due to high volume i...
