In [1]:
import os
import pathlib

# Project root: the directory two levels above the current working directory,
# stored as a plain string so it can be concatenated with relative data paths.
notebook_dir = pathlib.Path.cwd()
p_project = str(notebook_dir.parents[1])

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import datetime
from datetime import timedelta

# Raise the pandas display limits so that long and wide tables are not truncated.
for option_name, option_value in (('display.max_rows', 150), ('display.max_columns', 300)):
    pd.set_option(option_name, option_value)

In [3]:
# Load the admissions table produced by the admissions preprocessing step.
admissions_path = p_project + "/data/mimic3/processed/admissions_processed.csv"
adm = pd.read_csv(admissions_path)

We now consider the LABEVENTS dataset. We keep only the lab events that belong to the admissions selected previously (i.e. the same selection criteria as above).

In [4]:
# Load the raw lab events table (gzip-compressed CSV).
lab = pd.read_csv(p_project + "/data/mimic3/LABEVENTS.csv.gz", compression='gzip')

# Keep only the lab events whose admission id is in the processed admissions table.
adm_ids = list(adm["HADM_ID"])
in_selected_admissions = lab["HADM_ID"].isin(adm_ids)
lab = lab.loc[in_selected_admissions]

# Report how many distinct patients are left after the restriction.
n_patients = lab["SUBJECT_ID"].nunique()
print("Number of patients remaining in the database: ")
print(n_patients)

Number of patients remaining in the database: 
24642


We load the D_LABITEMS table, which contains the label (name) of each ITEMID, and merge it with the lab events.

In [5]:
# Load the lab-item dictionary and keep only the ITEMID -> LABEL mapping.
item_id = pd.read_csv(p_project + "/data/mimic3/D_LABITEMS.csv.gz", compression='gzip')
item_id_1 = item_id[["ITEMID", "LABEL"]]

# Attach the human-readable label to every lab event (inner join on ITEMID).
# validate="many_to_one" makes pandas raise a MergeError if an ITEMID appears
# more than once in the dictionary, which would otherwise silently duplicate
# lab measurements in the merged table.
lab2 = pd.merge(lab, item_id_1, on="ITEMID", validate="many_to_one")

# Sanity check: the merge should not lose any patient.
print("Number of patients remaining in the database: ")
print(lab2["SUBJECT_ID"].nunique())

Number of patients remaining in the database: 
24642


In [6]:
# Lab tests retained for the analysis: the subset used in the paper
# (the only one missing from this list is INR(PT)).
# The strings must match the LABEL column exactly, spelling included.
subset = [
    "Albumin",
    "Alanine Aminotransferase (ALT)",
    "Alkaline Phosphatase",
    "Anion Gap",
    "Asparate Aminotransferase (AST)",
    "Base Excess",
    "Basophils",
    "Bicarbonate",
    "Bilirubin, Total",
    "Calcium, Total",
    "Calculated Total CO2",
    "Chloride",
    "Creatinine",
    "Eosinophils",
    "Glucose",
    "Hematocrit",
    "Hemoglobin",
    "Lactate",
    "Lymphocytes",
    "MCH",
    "MCHC",
    "MCV",
    "Magnesium",
    "Monocytes",
    "Neutrophils",
    "PT",
    "PTT",
    "Phosphate",
    "Platelet Count",
    "Potassium",
    "RDW",
    "Red Blood Cells",
    "Sodium",
    "Specific Gravity",
    "Urea Nitrogen",
    "White Blood Cells",
    "pCO2",
    "pH",
    "pO2",
]

# Independent copy so that later in-place edits do not touch lab2.
is_selected_label = lab2["LABEL"].isin(subset)
lab3 = lab2.loc[is_selected_label].copy()

n_patients = lab3["SUBJECT_ID"].nunique()
print("Number of patients remaining in the database: ")
print(n_patients)

Number of patients remaining in the database: 
24642


### Units Cleaning

#### 1) In amounts

In [7]:
# For every label, count how often each unit of measure occurs, to check
# whether a label is reported with more than one unit string.
units_per_label = lab3.groupby("LABEL")["VALUEUOM"].value_counts()
print(units_per_label)

LABEL                            VALUEUOM
Alanine Aminotransferase (ALT)   IU/L         58702
Albumin                          g/dL         38849
Alkaline Phosphatase             IU/L         56854
Anion Gap                        mEq/L       262969
Asparate Aminotransferase (AST)  IU/L         58640
Base Excess                      mEq/L       229451
Basophils                        %            40307
Bicarbonate                      mEq/L       269058
Bilirubin, Total                 mg/dL        58906
Calcium, Total                   mg/dL       206907
Calculated Total CO2             mEq/L       219549
                                 MEQ/L         9888
Chloride                         mEq/L       276398
Creatinine                       mg/dL       278623
Eosinophils                      %            40917
Glucose                          mg/dL       404050
Hematocrit                       %           314183
Hemoglobin                       g/dL        309533
Lactate               

In [8]:
# Overwrite the unit string so that each of these labels carries a single unit.
# Only the VALUEUOM text is changed; the numeric values are left untouched.
canonical_units = {
    "Calculated Total CO2": "mEq/L",
    "PT": "sec",
    "pCO2": "mm Hg",
    "pH": "units",
    "pO2": "mm Hg",
}
for label_name, unit in canonical_units.items():
    lab3.loc[lab3["LABEL"] == label_name, "VALUEUOM"] = unit

### Check for outliers

#### 1) In amounts

In [9]:
# Per-label summary statistics of the numeric values, used to spot extreme values.
valuenum_by_label = lab3.groupby("LABEL")["VALUENUM"]
valuenum_by_label.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alanine Aminotransferase (ALT),58697.0,215.854904,806.312662,0.0,21.0,40.0,96.0,25460.0
Albumin,38841.0,3.032422,0.692599,1.0,2.5,3.0,3.5,6.4
Alkaline Phosphatase,56854.0,140.733299,166.023159,0.0,66.0,94.0,151.0,4695.0
Anion Gap,262985.0,13.510965,3.62643,-6.0,11.0,13.0,15.0,67.0
Asparate Aminotransferase (AST),58640.0,238.205832,1034.628934,0.0,26.0,47.0,105.0,36400.0
Base Excess,229420.0,-0.036823,4.952865,-414.0,-2.0,0.0,2.0,162.0
Basophils,40307.0,0.29447,0.479617,0.0,0.0,0.2,0.4,40.0
Bicarbonate,269032.0,25.331971,4.684684,2.0,23.0,25.0,28.0,53.0
"Bilirubin, Total",58902.0,3.266241,6.579278,0.0,0.5,0.9,2.5,82.8
"Calcium, Total",206903.0,8.385096,0.788048,0.0,7.9,8.4,8.8,31.2


In [10]:
# Glucose: qualitative "NEG" results have no numeric value. Encode them as -1
# so that they are kept when rows without a numeric value are removed below.
glucose_neg = (
    (lab3["LABEL"] == "Glucose")
    & lab3["VALUENUM"].isnull()
    & (lab3["VALUE"] == "NEG")
)
lab3.loc[glucose_neg, "VALUENUM"] = -1

lab_label = lab3["LABEL"]
lab_value = lab3["VALUENUM"]

# Rows matching any of the conditions below are discarded. All conditions are
# evaluated row by row, so they are combined into one boolean mask and applied
# in a single pass instead of one drop-and-copy of the whole table per rule.
# Comparisons against NaN evaluate to False, so the range rules never match
# rows that have no numeric value.
rows_to_drop = (
    # No numeric value at all (this also covers the remaining non-numeric
    # Glucose entries and the rows where both VALUE and VALUENUM are missing).
    lab_value.isnull()
    # Negative anion gaps.
    | ((lab_label == "Anion Gap") & (lab_value < 0))
    # Base excess outside [-50, 50].
    | ((lab_label == "Base Excess") & ((lab_value < -50) | (lab_value > 50)))
    # Hemoglobin above 25.
    | ((lab_label == "Hemoglobin") & (lab_value > 25))
    # Glucose above 2000 for two specific admissions.
    | (
        (lab_label == "Glucose")
        & (lab_value > 2000)
        & lab3["HADM_ID"].isin([103500.0, 117066.0])
    )
    # Potassium above 30.
    | ((lab_label == "Potassium") & (lab_value > 30))
)

# Filter by boolean mask rather than by index label: this removes exactly the
# flagged rows even if the index were not unique.
lab3 = lab3.loc[~rows_to_drop].copy()



In [12]:
# Save the cleaned lab events; the DataFrame index is written as the first CSV column.
lab_output_path = p_project + "/data/mimic3/processed/lab_processed.csv"
lab3.to_csv(lab_output_path)