#### Data Curation and Population Extraction for emergency attendances with ECG screenings for prediction of acute cardiac diagnoses in hospital

In [1]:
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib.patches import Patch
from matplotlib.dates import DateFormatter
from datetime import timedelta, datetime
from tqdm import tqdm
import numpy as np
from scipy import stats, special
from tableone import TableOne

import os
import json
import re
import pprint
import missingno as msno
from statannotations.Annotator import Annotator
import warnings

pd.set_option('display.max_rows', None)

In [2]:
# Root of the core MIMIC-IV release; the hosp/ tables are read from here
mimic_path = '../../data/MIMIC-IV/mimiciv/3.1/'
# Root of the MIMIC-IV-ED release; the ed/ tables are read from here
mimic_ed_path = '../../data/MIMIC-IV/mimic-iv-ed/3.1/'
# Root of the MIMIC-IV-ECG release
mimic_ecg_path = '../../data/MIMIC-IV/mimic-iv-ecg/1.0/'
# Directory of previously extracted outputs (ehr_static.csv and events_ts.csv are read from here)
out_path = '../outputs/ext_data/'

# NOTE(review): presumably a local configuration directory; not used in the visible cells
path_to_local = '../../data/MIMIC-IV/config/'

In [3]:
### Helper-functions for extracting EHR data
def dataframe_from_csv(path, compression='gzip', header=0, index_col=0, chunksize=None):
    """Read a CSV file (gzip-compressed by default) with ``pd.read_csv``.

    Parameters
    ----------
    path : str or file-like
        Location of the CSV, forwarded to ``pd.read_csv``.
    compression : str or None
        Compression scheme forwarded to ``pd.read_csv`` (default ``'gzip'``).
    header : int
        Row number holding the column names (default 0).
    index_col : int or None
        Column used as the index (default 0, the first column).
    chunksize : int or None
        When given, number of rows per chunk; ``pd.read_csv`` then returns an
        iterator of DataFrames instead of one DataFrame.

    Returns
    -------
    pandas.DataFrame, or a chunk iterator when ``chunksize`` is set.
    """
    # Forward the caller's chunksize: it used to be hard-coded to None, so the
    # argument was silently ignored and the whole file was always loaded.
    return pd.read_csv(path, compression=compression, header=header, index_col=index_col, chunksize=chunksize)

In [4]:
# Load the previously extracted static EHR table and the event time series from out_path
test_EHR = pd.read_csv(out_path + 'ehr_static.csv')
test_ts = pd.read_csv(out_path + 'events_ts.csv')

In [5]:
# Shape of each extract next to its number of distinct patients (and admissions for the static table)
ehr_patient_count = test_EHR.subject_id.nunique()
ehr_admission_count = test_EHR.hadm_id.nunique()
print(test_EHR.shape, ehr_patient_count, ehr_admission_count)

ts_patient_count = test_ts.subject_id.nunique()
print(test_ts.shape, ts_patient_count)

(41376, 84) 41376 41376
(2388913, 7) 39020


In [None]:
# Number of missing values in each column of the static EHR table
print(test_EHR.isnull().sum())

subject_id                                 0
gender                                     0
dod                                    28761
anchor_age                                 0
yob                                        0
hadm_id                                    0
admittime                                  0
dischtime                                  0
deathtime                              38980
admission_location                         0
discharge_location                         0
insurance                                  0
marital_status                             0
race                                       0
edregtime                                  0
edouttime                                  0
los_days                                   0
ext_stay_7                                 0
ed_stay_id                                 0
ed_intime                                  0
arrival_transport                          0
disposition                                0
in_hosp_de

In [7]:
# Print the value distribution of every column whose name contains 'outcome'
for outcome in test_EHR.columns:
    if 'outcome' not in outcome:
        continue
    outcome_counts = test_EHR[outcome].value_counts()
    print(outcome, outcome_counts)

outcome_code outcome_code
{'outcome_HF'}                                          3310
{'outcome_AMI'}                                         1935
{'outcome_PE'}                                          1057
{'outcome_HF', 'outcome_AMI'}                            781
{'outcome_AD'}                                           369
{'outcome_myocarditis'}                                  199
{'outcome_HF', 'outcome_PE'}                              75
{'outcome_PE', 'outcome_AMI'}                             43
{'outcome_myocarditis', 'outcome_AMI'}                    40
{'outcome_AD', 'outcome_AMI'}                             27
{'outcome_PE', 'outcome_myocarditis'}                     27
{'outcome_AD', 'outcome_HF'}                              16
{'outcome_HF', 'outcome_PE', 'outcome_AMI'}               15
{'outcome_AcuteMI', 'outcome_HF', 'outcome_AMI'}          13
{'outcome_AcuteMI', 'outcome_AMI'}                        11
{'outcome_HF', 'outcome_myocarditis'}                     1

In [19]:
# Print the value distribution of every column whose name contains 'physltc'
for cond in test_EHR.columns:
    if 'physltc' not in cond:
        continue
    cond_counts = test_EHR[cond].value_counts()
    print(cond, cond_counts)

physltc_ischaemic_heart_disease physltc_ischaemic_heart_disease
0    27256
1    14120
Name: count, dtype: int64
physltc_hypertension physltc_hypertension
1    25718
0    15658
Name: count, dtype: int64
physltc_stroke physltc_stroke
0    40594
1      782
Name: count, dtype: int64
physltc_peripheral_arterial_disease physltc_peripheral_arterial_disease
0    36630
1     4746
Name: count, dtype: int64
physltc_acute_MI physltc_acute_MI
0    36268
1     5108
Name: count, dtype: int64
physltc_old_MI physltc_old_MI
0    35691
1     5685
Name: count, dtype: int64
physltc_cvd physltc_cvd
0    34500
1     6876
Name: count, dtype: int64
physltc_diabetes physltc_diabetes
0    28428
1    12948
Name: count, dtype: int64
physltc_chronic_kidney_disease physltc_chronic_kidney_disease
0    30426
1    10950
Name: count, dtype: int64
physltc_arthritis physltc_arthritis
0    30977
1    10399
Name: count, dtype: int64
physltc_hip_fracture physltc_hip_fracture
0    40044
1     1332
Name: count, dtype: int64
ph

In [17]:
# Distribution of the proc_PCI flag across rows of the static EHR table
test_EHR.proc_PCI.value_counts()

proc_PCI
0    41140
1      236
Name: count, dtype: int64

In [18]:
# Distribution of the proc_CABG flag (the saved output shows only the value 0)
test_EHR.proc_CABG.value_counts()

proc_CABG
0    41376
Name: count, dtype: int64

In [117]:
# Table shape plus the number of distinct hadm_id, subject_id and stay_id values
test_EHR.shape, test_EHR['hadm_id'].nunique(), test_EHR['subject_id'].nunique(), test_EHR['stay_id'].nunique()

((41376, 84), 41376, 41376, 11055)

In [113]:
# Table shape plus the number of distinct hadm_id and subject_id values
test_EHR.shape, test_EHR['hadm_id'].nunique(), test_EHR['subject_id'].nunique()

((41376, 84), 41376, 41376)

In [114]:
# Column names of the event time-series extract
test_ts.columns

Index(['subject_id', 'charttime', 'itemid', 'label', 'value', 'valueuom',
       'linksto'],
      dtype='object')

In [46]:
# Summary statistics of the recorded values for events labelled 'c-reactive_protein'
test_ts[test_ts.label=='c-reactive_protein']['value'].describe()

count     74.000000
mean      55.854054
std       69.157959
min        0.700000
25%        4.700000
50%       11.800000
75%       99.650000
max      262.300000
Name: value, dtype: float64

In [8]:
# Number of time-series records per measurement label
test_ts['label'].value_counts()

label
Heart rate                  422640
Diastolic blood pressure    418716
Systolic blood pressure     418716
Respiratory rate            415813
Oxygen saturation           399529
Temperature                 232590
wbc                           8682
hemoglobin                    6914
creatinine                    6772
urea_nitrogen                 6739
hematocrit                    6734
platelet_count                6705
potassium                     6579
sodium                        6578
chloride                      6575
bicarbonate                   6569
glucose                       6568
anion_gap                     6555
eGFR                          5987
troponin_t                    3231
ntprobnp                      1054
creatine_kinase               1014
creatine_kinase_mb             906
d-dimer                        302
c-reactive_protein             175
Name: count, dtype: int64

In [103]:
# Number of missing values in each column of the static EHR table
test_EHR.isnull().sum()

subject_id                                 0
gender                                     0
dod                                    28761
anchor_age                                 0
yob                                        0
hadm_id                                    0
admittime                                  0
dischtime                                  0
deathtime                              38980
admission_location                         0
discharge_location                         0
insurance                                  0
marital_status                             0
race                                       0
edregtime                                  0
edouttime                                  0
los_days                                   0
ext_stay_7                                 0
ed_stay_id                                 0
ed_intime                                  0
arrival_transport                          0
disposition                                0
in_hosp_de

In [93]:
# Column dtypes as parsed by pd.read_csv (timestamp columns appear as object in the saved output)
test_EHR.dtypes

subject_id                               int64
gender                                  object
dod                                     object
anchor_age                               int64
yob                                      int64
hadm_id                                  int64
admittime                               object
dischtime                               object
deathtime                               object
admission_location                      object
discharge_location                      object
insurance                               object
marital_status                          object
race                                    object
edregtime                               object
edouttime                               object
los_days                               float64
ext_stay_7                               int64
ed_stay_id                               int64
ed_intime                               object
arrival_transport                       object
disposition  

In [8]:
# Preview the first rows of the static EHR table
test_EHR.head()

Unnamed: 0,subject_id,gender,dod,anchor_age,yob,hadm_id,admittime,dischtime,deathtime,admission_location,...,outcome_AD,outcome_myocarditis,outcome_AMI,outcome_CMI,num_measures,n_presc_acei,n_presc_arb,n_presc_bb,n_presc_dapt,n_presc_aspirin
0,10000980,F,2193-08-26,73,2113,20897796,2193-08-15 01:01:00,2193-08-17 15:07:00,,WALK-IN/SELF REFERRAL,...,0,0,0,0,10,2289,2387,2226,2275,2296
1,10000980,F,2193-08-26,73,2113,20897796,2193-08-15 01:01:00,2193-08-17 15:07:00,,WALK-IN/SELF REFERRAL,...,0,0,0,0,10,2289,2387,2226,2275,91
2,10000980,F,2193-08-26,73,2113,20897796,2193-08-15 01:01:00,2193-08-17 15:07:00,,WALK-IN/SELF REFERRAL,...,0,0,0,0,10,2289,2387,2226,112,2296
3,10000980,F,2193-08-26,73,2113,20897796,2193-08-15 01:01:00,2193-08-17 15:07:00,,WALK-IN/SELF REFERRAL,...,0,0,0,0,10,2289,2387,2226,112,91
4,10000980,F,2193-08-26,73,2113,20897796,2193-08-15 01:01:00,2193-08-17 15:07:00,,WALK-IN/SELF REFERRAL,...,0,0,0,0,10,2289,2387,161,2275,2296


#### Test ED metadata

In [8]:
# Read only the first 1000 rows of the uncompressed labevents.csv as a sample; the first column becomes the index
labs = pd.read_csv(mimic_path + 'hosp/labevents.csv', header=0, index_col=0, nrows=1000)

In [10]:
# Column names of the lab-events sample
labs.columns

Index(['subject_id', 'hadm_id', 'specimen_id', 'itemid', 'order_provider_id',
       'charttime', 'storetime', 'value', 'valuenum', 'valueuom',
       'ref_range_lower', 'ref_range_upper', 'flag', 'priority', 'comments'],
      dtype='object')

In [11]:
# Column dtypes of the lab-events sample
labs.dtypes

subject_id             int64
hadm_id              float64
specimen_id            int64
itemid                 int64
order_provider_id     object
charttime             object
storetime             object
value                 object
valuenum             float64
valueuom              object
ref_range_lower      float64
ref_range_upper      float64
flag                  object
priority              object
comments              object
dtype: object

In [16]:
# Load the ICD procedures table; reset_index turns the column the helper read as index back into a regular column
proc = dataframe_from_csv(os.path.join(mimic_path, 'hosp/procedures_icd.csv.gz')).reset_index()

In [17]:
# Column dtypes of the ICD procedures table
proc.dtypes

subject_id      int64
hadm_id         int64
seq_num         int64
chartdate      object
icd_code       object
icd_version     int64
dtype: object

In [5]:
# Load the ED triage table; reset_index turns the column the helper read as index back into a regular column
ed_md = dataframe_from_csv(os.path.join(mimic_ed_path, 'ed/triage.csv.gz')).reset_index()

In [9]:
# Column dtypes of the ED triage table
ed_md.dtypes

subject_id          int64
stay_id             int64
temperature       float64
heartrate         float64
resprate          float64
o2sat             float64
sbp               float64
dbp               float64
pain               object
acuity            float64
chiefcomplaint     object
dtype: object

In [None]:
# Define a regex pattern to capture variations of chest pain, dyspnea, and palpitations.
# The alternatives sit in a non-capturing group (?:...): str.contains only tests for a match,
# and pandas emits a UserWarning when the pattern contains capture groups.
pattern = r'(?:chest[\s-]*pain|dyspnea|dysnea|shortness[\s-]*of[\s-]*breath|palpitation|palpitations|chest[\s-]*tightness|angina pectoris)'

# ED stays whose chief complaint matches the pattern (case-insensitive; missing complaints count as no match)
stay_ids = ed_md[ed_md['chiefcomplaint'].str.contains(pattern, case=False, na=False, regex=True)]['stay_id'].unique()
# NOTE(review): ecg_dataset is not defined in the visible part of the notebook - it must be loaded before this cell runs
ecg_dataset['suggestive_symptoms'] = ecg_dataset['ed_stay_id'].isin(stay_ids).astype(int)

#### Curate hospital stays across ED, ICU and Hospital modules

In [10]:
# Hospital admissions from the core MIMIC-IV release (hosp module)
admissions = dataframe_from_csv(os.path.join(mimic_path, 'hosp/admissions.csv.gz')).reset_index()
# Admissions that carry an ED registration time
admissions_ed = admissions[admissions['edregtime'].notna()]
# ICU stays are read from the same release root as hosp/ (mimic_path); this previously pointed at the
# ED release root (mimic_ed_path), which is otherwise only used for ed/ tables.
# NOTE(review): confirm against the local data layout.
icu_stays = dataframe_from_csv(os.path.join(mimic_path, 'icu/icustays.csv.gz')).reset_index()
# ED attendances from the MIMIC-IV-ED release
ed_attendances = dataframe_from_csv(os.path.join(mimic_ed_path, 'ed/edstays.csv.gz')).reset_index()

In [5]:
# Preview the first ED attendances
ed_attendances.head()

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition
0,10000032,22595853.0,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,AMBULANCE,ADMITTED
1,10000032,22841357.0,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,AMBULANCE,ADMITTED
2,10000032,25742920.0,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,AMBULANCE,ADMITTED
3,10000032,29079034.0,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,AMBULANCE,HOME
4,10000032,29079034.0,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,AMBULANCE,ADMITTED


In [11]:
# Preview the first ICU stays
icu_stays.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
0,10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.410266
1,10000690,25860671,37081114,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2150-11-02 19:37:00,2150-11-06 17:03:17,3.893252
2,10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.497535
3,10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.118032
4,10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.948113


In [12]:
# Preview the first hospital admissions
admissions.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P49AFC,TRANSFER FROM HOSPITAL,HOME,Medicaid,English,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P784FA,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P19UTS,EMERGENCY ROOM,HOSPICE,Medicaid,English,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P06OTX,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P39NWO,EMERGENCY ROOM,,,English,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0


In [6]:
# Shapes and distinct-patient counts of the ED-registered admissions, ICU stays and ED attendances
print(f'Admissions: {admissions_ed.shape}, ICU Stays: {icu_stays.shape}, ED Attendances: {ed_attendances.shape}')
print(f'Unique patients in Admissions: {admissions_ed["subject_id"].nunique()}, ICU Stays: {icu_stays["subject_id"].nunique()}, ED Attendances: {ed_attendances["subject_id"].nunique()}')

Admissions: (379240, 16), ICU Stays: (94458, 8), ED Attendances: (425087, 9)
Unique patients in Admissions: 182439, ICU Stays: 65366, ED Attendances: 205504


In [7]:
# Verify that every ICU stay references a hadm_id found in the admissions table
icu_has_known_admission = icu_stays['hadm_id'].isin(admissions['hadm_id'])
all_ids_present = icu_has_known_admission.all()
print("All hadm_id in icu_stays are present in admissions:", all_ids_present)

All hadm_id in icu_stays are present in admissions: True


In [8]:
# Number of ED attendances per disposition
ed_attendances['disposition'].value_counts()

disposition
HOME                           241632
ADMITTED                       158010
TRANSFER                         7025
LEFT WITHOUT BEING SEEN          6155
ELOPED                           5710
OTHER                            4297
LEFT AGAINST MEDICAL ADVICE      1881
EXPIRED                           377
Name: count, dtype: int64

In [9]:
# Number of ED attendances per mode of arrival
ed_attendances.arrival_transport.value_counts()

arrival_transport
WALK IN       251849
AMBULANCE     155752
UNKNOWN        15352
OTHER           1266
HELICOPTER       868
Name: count, dtype: int64

In [10]:
# Filter ED stays for 'ADMITTED' disposition
admitted_ed_stays = ed_attendances[ed_attendances['disposition'] == 'ADMITTED']

# Count how many 'ADMITTED' stays have an associated hadm_id
total_admitted_stays = admitted_ed_stays.shape[0]
admitted_with_hadm_id = admitted_ed_stays['hadm_id'].notna().sum()

# Unique hadm_ids on each side of the ED -> hospital admission link
hadm_ids_in_ed_stays = admitted_ed_stays['hadm_id'].dropna().unique()
hadm_ids_in_admissions = admissions_ed['hadm_id'].unique()

# hadm_ids present in both tables. Both inputs are already unique, so the size of the
# intersection equals the number of admitted-stay hadm_ids found in admissions. This replaces
# a per-element `in` test against a NumPy array, which rescanned the whole array for every id.
hadm_ed_set_both = set(hadm_ids_in_ed_stays).intersection(hadm_ids_in_admissions)
hadm_ids_in_both = len(hadm_ed_set_both)

# Output the results
print(f"Total 'ADMITTED' stays: {total_admitted_stays}")
print(f"'ADMITTED' stays with hadm_id: {admitted_with_hadm_id}")
print(f"hadm_ids from admitted stays that are in admissions: {hadm_ids_in_both}")

# Restrict both tables to the linked hadm_ids. The ED side can keep more rows than the
# admissions side when several ED stays share one hadm_id.
admissions_ed = admissions_ed[admissions_ed['hadm_id'].isin(hadm_ed_set_both)]
admitted_ed_stays = admitted_ed_stays[admitted_ed_stays['hadm_id'].isin(hadm_ed_set_both)]
print(f'Filtered Admissions: {admissions_ed.shape}, Filtered ADMITTED ED Stays: {admitted_ed_stays.shape}')

Total 'ADMITTED' stays: 158010
'ADMITTED' stays with hadm_id: 157626
hadm_ids from admitted stays that are in admissions: 157601
Filtered Admissions: (157601, 16), Filtered ADMITTED ED Stays: (157607, 9)


#### Link relevant lab tests

In [47]:
def read_d_labitems_table(mimic4_path):
    """Load the lab-item dictionary and keep only its identifying columns.

    Reads ``hosp/d_labitems.csv.gz`` below ``mimic4_path`` through
    ``dataframe_from_csv`` and returns a DataFrame with the columns
    ``itemid``, ``label`` and ``category``.
    """
    table_path = os.path.join(mimic4_path, 'hosp/d_labitems.csv.gz')
    # reset_index moves the column the helper read as index back into the frame
    lab_items = dataframe_from_csv(table_path).reset_index()
    wanted_columns = ['itemid', 'label', 'category']
    return lab_items[wanted_columns]

In [48]:
# Lab-item dictionary (itemid, label, category) from the core MIMIC-IV release
lab_ids = read_d_labitems_table(mimic_path)

In [49]:
# Number of lab items and columns in the dictionary
lab_ids.shape

(1650, 3)

In [50]:
# Number of lab items per category
lab_ids.category.value_counts()

category
Chemistry     800
Hematology    785
Blood Gas      65
Name: count, dtype: int64

In [51]:
# Broad substring search for 'trop'; it also matches unrelated labels (e.g. 'Neutrophils', 'Electrophoresis')
lab_ids[lab_ids['label'].str.contains('trop', case=False, na=False)]

Unnamed: 0,itemid,label,category
70,50872,Anti-Neutrophil Cytoplasmic Antibody,Chemistry
143,50946,Human Chorionic Gonadotropin,Chemistry
172,50975,Protein Electrophoresis,Chemistry
198,51002,Troponin I,Chemistry
199,51003,Troponin T,Chemistry
286,51098,"Prot. Electrophoresis, Urine",Chemistry
417,51232,Hypersegmented Neutrophils,Hematology
441,51256,Neutrophils,Hematology
795,51673,Immunoelectrophoresis,Chemistry
809,51687,Lipo Electrophoresis,Chemistry


In [27]:
# Lab items whose label contains 'gfr' (case-insensitive)
lab_ids[lab_ids['label'].str.contains('gfr', case=False, na=False)]

Unnamed: 0,itemid,label,category
118,50920,Estimated GFR (MDRD equation),Chemistry
886,51770,MDRDgfr,Chemistry
1116,52026,Estimated GFR (MDRD equation),Blood Gas
1620,53161,Estimated GFR (CKD- EPI Refit),Chemistry
1639,53180,Estimated GFR (CKD- EPI 2021),Blood Gas


In [37]:
def _lab_items_matching(term):
    """Rows of lab_ids whose label contains ``term`` (case-insensitive; missing labels never match)."""
    label_matches = lab_ids['label'].str.contains(term, case=False, na=False)
    return lab_ids[label_matches]

# Candidate lab items for each marker of interest
troponin_ids = _lab_items_matching('troponin')
ck_ids = _lab_items_matching('Creatine Kinase')
haem_ids = _lab_items_matching('hemoglobin')
egfr_ids = _lab_items_matching('gfr')

In [38]:
# Lab items matched by the 'gfr' search
egfr_ids

Unnamed: 0,itemid,label,category
118,50920,Estimated GFR (MDRD equation),Chemistry
886,51770,MDRDgfr,Chemistry
1116,52026,Estimated GFR (MDRD equation),Blood Gas
1620,53161,Estimated GFR (CKD- EPI Refit),Chemistry
1639,53180,Estimated GFR (CKD- EPI 2021),Blood Gas


In [34]:
# Lab items matched by the 'hemoglobin' search
haem_ids

Unnamed: 0,itemid,label,category
4,50805,Carboxyhemoglobin,Blood Gas
9,50811,Hemoglobin,Blood Gas
12,50814,Methemoglobin,Blood Gas
50,50852,% Hemoglobin A1c,Chemistry
53,50855,Absolute Hemoglobin,Chemistry
397,51212,Fetal Hemoglobin,Hematology
407,51222,Hemoglobin,Hematology
408,51223,Hemoglobin A2,Hematology
409,51224,Hemoglobin C,Hematology
410,51225,Hemoglobin F,Hematology


In [25]:
# Lab items matched by the 'Creatine Kinase' search
ck_ids

Unnamed: 0,itemid,label,category
108,50910,Creatine Kinase (CK),Chemistry
109,50911,"Creatine Kinase, MB Isoenzyme",Chemistry
717,51594,"Creatine Kinase, Isoenzyme BB",Chemistry
718,51595,"Creatine Kinase, Isoenzyme MB",Chemistry
719,51596,"Creatine Kinase, Isoenzyme MM",Chemistry


In [26]:
# Lab items matched by the 'troponin' search
troponin_ids

Unnamed: 0,itemid,label,category
198,51002,Troponin I,Chemistry
199,51003,Troponin T,Chemistry
1528,52642,Troponin I,Chemistry


#### Read medication administration (eMAR) table

In [6]:
# Load the eMAR table (hosp/emar.csv.gz) into presc_data; reset_index turns the column the helper read as index back into a regular column
presc_data = dataframe_from_csv(os.path.join(mimic_path, 'hosp/emar.csv.gz')).reset_index()

In [None]:
def get_generic_drugs(mapping, df):
    """Add a ``generic_drug_name`` column to ``df`` using the NDC product table.

    Parameters
    ----------
    mapping : pandas.DataFrame
        NDC product table with ``PRODUCTNDC`` and ``NONPROPRIETARYNAME`` columns.
    df : pandas.DataFrame
        Prescriptions frame with an ``ndc`` column of dash-separated NDC
        strings. Modified in place.

    Returns
    -------
    None. ``df['generic_drug_name']`` holds the generic name of the first
    ``mapping`` row whose ``PRODUCTNDC`` equals the product part of the NDC,
    or NaN when the NDC is missing or has no match (unmatched codes are
    printed).
    """
    # Build the PRODUCTNDC -> generic-name lookup once instead of scanning the
    # whole mapping frame for every prescription row; keep='first' preserves
    # the original "first matching row wins" behaviour.
    generic_by_product = (
        mapping.drop_duplicates(subset='PRODUCTNDC', keep='first')
        .set_index('PRODUCTNDC')['NONPROPRIETARYNAME']
        .to_dict()
    )

    def brand_to_generic(ndc):
        # A missing NDC cannot be looked up (it previously crashed in re.finditer)
        if pd.isna(ndc):
            return np.nan
        # We only want the first 2 sections of the NDC code: xxxx-xxxx-xx
        product_ndc = '-'.join(ndc.split('-')[:2])
        if product_ndc not in generic_by_product:
            # Report only genuinely unmatched codes, rather than hiding every
            # possible error behind a bare except.
            print("Error: ", product_ndc)
            return np.nan
        return generic_by_product[product_ndc]

    df['generic_drug_name'] = df['ndc'].apply(brand_to_generic)

In [7]:
# Size and column names of the loaded eMAR table
presc_data.shape, presc_data.columns

((42808593, 12),
 Index(['subject_id', 'hadm_id', 'emar_id', 'emar_seq', 'poe_id', 'pharmacy_id',
        'enter_provider_id', 'charttime', 'medication', 'event_txt',
        'scheduletime', 'storetime'],
       dtype='object'))

In [8]:
# Keep the aspirin administrations in a named frame and preview only the first rows.
# The table has ~42.8M rows and display.max_rows is set to None, so displaying every
# match is impractical (the saved run of this cell ended in a KeyboardInterrupt).
aspirin_emar = presc_data[presc_data.medication.str.contains('aspirin', case=False, na=False)]
aspirin_emar.head()

KeyboardInterrupt: 

In [45]:
# Number of rows per drug_type.
# NOTE(review): drug_type is not among the eMAR columns listed in the saved output, so this presumably
# ran while presc_data held a different table - confirm which table should be loaded here.
presc_data.drug_type.value_counts()

drug_type
MAIN        16791812
BASE         3471112
ADDITIVE       29687
Name: count, dtype: int64

In [None]:
# d_labitems.csv.gz is gzip-compressed, so rely on the helper's default compression='gzip';
# compression=None would make pandas parse the raw gzip bytes as text and fail.
# index_col=None keeps itemid as a regular column.
lab_ids = dataframe_from_csv(os.path.join(mimic_path, 'hosp/d_labitems.csv.gz'), header=0, index_col=None)
lab_ids.shape

In [None]:
#### HS Troponin cleaning and extraction

# Filter lab_ids for rows where the label contains 'troponin'
troponin_ids = lab_ids[lab_ids['label'].str.contains('troponin', case=False, na=False)]

# Extract the itemid values for these rows
troponin_itemids = troponin_ids['itemid'].unique()

# Keep only the lab events with itemid 51003 (Troponin T in d_labitems).
# .copy() gives an independent frame, so the .loc assignments below do not write into
# a slice of labs (avoids SettingWithCopyWarning and ambiguous write-through).
# NOTE(review): labs was loaded with nrows=1000, so this only covers that sample - confirm intended.
lab_troponin_t = labs[labs['itemid'] == 51003].copy()
print(lab_troponin_t.shape)

# Fill NaN values in valuenum where comments start with '<' or 'LESS' with 0.01
# (na=False: rows without a comment never match, keeping the mask strictly boolean)
lab_troponin_t.loc[
    lab_troponin_t['valuenum'].isna() & lab_troponin_t['comments'].str.startswith(('<', 'LESS'), na=False),
    'valuenum'
] = 0.01

# Fill NaN values in valuenum where comments start with '>' or 'GREATER' with 25
lab_troponin_t.loc[
    lab_troponin_t['valuenum'].isna() & lab_troponin_t['comments'].str.startswith(('>', 'GREATER'), na=False),
    'valuenum'
] = 25

# Check how many NaN values are left in valuenum
nans_left = lab_troponin_t['valuenum'].isna().sum()
print(f"Number of NaN values left in valuenum: {nans_left}")

# Drop rows where valuenum is still NaN
lab_troponin_t_cleaned = lab_troponin_t.dropna(subset=['valuenum'])

# Check the shape of the cleaned DataFrame to confirm
print(f"Shape of DataFrame after dropping NaNs: {lab_troponin_t_cleaned.shape}")

In [None]:
### Creatine Kinase

# Filter lab_ids for rows where the label contains 'Creatine Kinase'
ck_ids = lab_ids[lab_ids['label'].str.contains('Creatine Kinase', case=False, na=False)]

# Extract the itemid values for these rows
ck_itemids = ck_ids['itemid'].unique()

# Keep only the lab events with itemid 50910 (Creatine Kinase (CK) in d_labitems).
# Filters `labs`, the lab-events frame loaded earlier; the name `lab` used here before
# is not defined anywhere in the visible notebook.
lab_ck = labs[labs['itemid'] == 50910]
lab_ck.shape