In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

from tqdm import tqdm_notebook

In [2]:
pd.set_option('display.max_rows', 500)

In [3]:
file_path = 'C:/Users/Jacob/Documents/datasets/amsterdamumcdb-1.0.2/'
save_path = 'C:/Users/Jacob/Documents/datasets/amsterdamumcdb-1.0.2-processed/'
if not os.path.exists(save_path):
    os.mkdir(save_path)

In [4]:
adm = pd.read_csv(os.path.join(save_path, 'admissions.csv'))
adm.head()

Unnamed: 0.1,Unnamed: 0,patientid,admissionid,admissioncount,location,urgency,origin,admittedat,admissionyeargroup,dischargedat,...,destination,gender,agegroup,dateofdeath,weightgroup,weightsource,heightgroup,heightsource,specialty,icudeath
0,4,4,4,1,IC&MC,0,Verpleegafdeling zelfde ziekenhuis,0,2010-2016,180900000,...,19,Man,70-79,,70-79,Anamnestisch,170-179,Anamnestisch,Cardiochirurgie,False
1,5,5,5,1,IC,1,Eerste Hulp afdeling zelfde ziekenhuis,0,2010-2016,246420000,...,31,Man,50-59,,60-69,Geschat,160-169,Gemeten,Longziekte,False
2,9,9,9,1,IC,0,,0,2003-2009,242040000,...,15,Vrouw,70-79,,70-79,,159-,,Cardiochirurgie,False
3,12,12,12,1,IC,0,,0,2003-2009,627960000,...,29,Man,18-39,,90-99,,,,,False
4,15,14,15,1,IC,1,Eerste Hulp afdeling zelfde ziekenhuis,0,2010-2016,893220000,...,Overleden,Man,60-69,812280000.0,60-69,Geschat,160-169,Geschat,Cardiologie,True


## Numeric items data

In [5]:
adm = pd.read_csv(os.path.join(save_path, "admissions.csv"))
num_file = os.path.join(file_path, "numericitems.csv")
num_cols = ["admissionid", "item", "value", "unit", "measuredat"]
chunksize = 100000
num = []
# ***reading chartevents is slow***
# Documentation says `numericitems.csv` has 977625612 rows
for chunk in tqdm_notebook(pd.read_csv(num_file, usecols=num_cols, chunksize=chunksize, encoding='latin-1'), total=(977625612//chunksize)+1, smoothing=0.01):
    chunk = chunk.loc[chunk["admissionid"].isin(adm["admissionid"])]
    chunk = chunk.loc[0 < chunk["measuredat"]]
    num.append(chunk)
# N.B. - ignores "value", so only uses numeric lab values

HBox(children=(FloatProgress(value=0.0, max=9777.0), HTML(value='')))




In [7]:
num = pd.concat(num)
print(num.shape)
num.head()

(458030949, 5)


Unnamed: 0,admissionid,item,value,unit,measuredat
3657,4,Hartfrequentie,71.0,/min,5220000
3658,4,Hartfrequentie,71.0,/min,5280000
3659,4,Hartfrequentie,70.0,/min,5340000
3660,4,Hartfrequentie,71.0,/min,5400000
3661,4,Hartfrequentie,71.0,/min,5460000


In [None]:
num.to_csv(os.path.join(save_path, "numericitems.csv"))

In [5]:
num = pd.read_csv(os.path.join(save_path, "numericitems.csv"))
print(num.shape)
num.head()

(458030949, 6)

In [10]:
# Keep 100 most frequent items in listitems
num = num.loc[num["item"].isin(num["item"].value_counts().index.to_list()[:100])]
print(num.shape)

(449938056, 6)


In [11]:
num.groupby("item")["unit"].value_counts(dropna=False)

item                              unit    
ABP diastolisch                   mmHg        15186274
ABP gemiddeld                     mmHg        15184881
ABP systolisch                    mmHg        15186371
Act.HCO3 (bloed)                  mmol/l        234911
Adem Frequentie (Set)             /min         2309185
Ademfreq.                         /min         7655957
Ademfrequentie Monitor            None        11370918
B.E. (bloed)                      mmol/l        233949
Backup druk (Set)                 cmH2O        4673496
Barometer druk                    mbar         7573358
CO2 min prod                      ml/min       7194799
CO2 tidal productie               ml           7195880
CVD                               mmHg          197444
CVDm-gekoppeld                    mmHg         2609664
Cdyn                              ml/cmH2O     7403581
Eind exp. druk                    cmH2O        7661320
Eind insp. cyclus (Set)           Geen         5683422
End tidal CO2 concentr

In [17]:
stats = num.groupby("item")["value"].describe(percentiles=[0.05, 0.5, 0.95])
stats

Unnamed: 0_level_0,count,mean,std,min,5%,50%,95%,max
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ABP diastolisch,15186274.0,62.466858,31.965752,-32700.0,44.0,60.0,87.0,66108.0
ABP gemiddeld,15184881.0,83.61,259.488737,-32698.0,61.0,81.0,114.0,1007410.0
ABP systolisch,15186371.0,128.054486,91.702024,-32698.0,89.0,125.0,176.0,336170.0
Act.HCO3 (bloed),234911.0,25.208026,5.817447,-24.1,17.0,25.1,33.700001,351.0
Adem Frequentie (Set),2309185.0,21.24716,5.790121,0.0,13.0,20.0,30.0,493.0
Ademfreq.,7655957.0,21.785483,7.262204,0.0,12.0,21.0,35.0,712.0
Ademfrequentie Monitor,11370918.0,19.727445,7.94742,-1.0,9.0,19.0,33.0,5402.0
B.E. (bloed),233949.0,2.931547,4.700849,-166.0,-5.1,2.9,10.0,283.3
Backup druk (Set),4673496.0,13.851904,5.978676,0.0,5.0,14.0,20.0,200.0
Barometer druk,7573358.0,1008.133187,10.194125,0.0,990.0,1009.0,1024.0,1045.0


In [21]:
for item, lb, ub in tqdm_notebook(zip(stats.index, stats["5%"], stats["95%"]), total=100):
    num = num.drop(num.loc[(num["item"] == item) & ((num["value"] < lb) | (ub < num["value"]))].index)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [22]:
num.shape

(413365459, 6)

In [14]:
plt.figure(figsize=(20, 80))
for i, item in enumerate(sorted(num["item"].unique())):
    plt.subplot(20, 5, 1+i)
    num["value"][num["item"] == item].hist(bins=100)
    plt.title(item)

ValueError: num must be 1 <= num <= 50, not 51