#### Evaluate extraction quality for lab test features

In [1]:
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib.patches import Patch
from matplotlib.dates import DateFormatter
from datetime import timedelta, datetime
from tqdm import tqdm
import numpy as np
from scipy import stats, special
from tableone import TableOne

import os
import json
import re
import pprint
import missingno as msno
from statannotations.Annotator import Annotator
import warnings

pd.set_option('display.max_rows', None)

In [2]:
mimic_path = '../../data/MIMIC-IV/mimiciv/3.1/'
mimic_ed_path = '../../data/MIMIC-IV/mimic-iv-ed/3.1/'
mimic_ecg_path = '../../data/MIMIC-IV/mimic-iv-ecg/1.0/'
out_path = '../outputs/ext_data/'

path_to_local = '../../data/MIMIC-IV/config/'

In [3]:
### Helper-functions for extracting EHR data
def dataframe_from_csv(path, compression='gzip', header=0, index_col=0, chunksize=None):
    return pd.read_csv(path, compression=compression, header=header, index_col=index_col, chunksize=None)

In [4]:
test_EHR = pl.read_csv(out_path + 'ehr_static.csv')
test_ts = pl.read_csv(out_path + 'events_ts.csv')

In [5]:
test_EHR.shape, test_EHR['subject_id'].n_unique(), test_ts.shape, test_ts['subject_id'].n_unique()

((41376, 84), 41376, (2388913, 7), 39020)

In [None]:
for lbl in test_ts.to_pandas()['label'].unique():
    test_tst = test_ts.to_pandas()
    print(lbl)
    print(test_tst[test_tst['label']==lbl]['value'].describe())

In [182]:
test_ts.columns

['subject_id', 'charttime', 'itemid', 'label', 'value', 'valueuom', 'linksto']

In [157]:
def read_labevents_table(
    mimic4_path: str,
    admits_last: pl.DataFrame | pl.LazyFrame,
    include_items: None,
    items_path: str = "../config/lab_items.txt",
) -> pl.LazyFrame:
    """
    Read and preprocess the labevents table from MIMIC-IV.

    Args:
        mimic4_path (str): Path to directory containing MIMIC-IV module files.
        admits_last (pl.DataFrame | pl.LazyFrame): Last admissions table for lookup.
        include_items (str): Path to file listing lab item IDs to include.

    Returns:
        pl.LazyFrame: Labevents table in long format.
    """
    if isinstance(admits_last, pl.LazyFrame):
        admits_last = admits_last.collect()

    #  Load in csv using polars lazy API (requires table to be in csv format)
    labs_data = pl.scan_csv(
        os.path.join(mimic4_path, "hosp/labevents.csv"), try_parse_dates=True
    )
    d_items = (
        pl.read_csv(os.path.join(mimic4_path, "hosp/d_labitems.csv.gz"))
        .lazy()
        .select(["itemid", "label"])
    )
    # merge labitem id's with dict
    labs_data = labs_data.join(d_items, how="left", on="itemid")
    # select relevant columns
    labs_data = labs_data.select(
        ["subject_id", "hadm_id", "charttime", "itemid", "label", "value", "valueuom", "comments"]
    ).with_columns(
        charttime=pl.col("charttime").cast(pl.Datetime), linksto=pl.lit("labevents")
    )
    labs_data = labs_data.with_columns(pl.col("hadm_id").cast(pl.Int64))
    labs_data = labs_data.with_columns(pl.col("subject_id").cast(pl.Int64))

    # get eligible lab tests prior to current episode
    labs_data = labs_data.join(
        admits_last[["subject_id", "hadm_id", "edregtime"]]
        .lazy()
        .with_columns(
            edregtime=pl.col("edregtime").str.to_datetime(strict=False)
        ),
        how="left",
        on=["subject_id", "hadm_id"],
    )
    labs_data = labs_data.filter((pl.col("charttime") <= pl.col("edregtime") + pl.duration(hours=3)))
    labs_data = labs_data.drop(["edregtime"])
    # get most common items (top 50 itemids by label)
    if include_items is None:
        lab_items = labs_data.groupby("itemid").agg(pl.count().alias("count")).sort("count", descending=True).head(50)
        ### Export items to file
        #lab_items.collect().write_csv("../config/lab_items.csv")
    if include_items is not None and items_path is not None:
        # read txt file containing list of ids
        with open(items_path) as f:
            lab_items = list(f.read().splitlines())

    labs_data = labs_data.filter(
        pl.col("itemid").cast(pl.Utf8).is_in(set(lab_items))
    )
    labs_data = labs_data.collect(streaming=True)
    labs_data = labs_data.sort(by=["subject_id", "hadm_id", "charttime"])

    return labs_data

In [None]:
def clean_labevents(labs_data: pl.LazyFrame) -> pl.LazyFrame:
    """
    Clean lab events by removing non-integer values and outliers.

    Args:
        labs_data (pl.LazyFrame): Lab events data.

    Returns:
        pl.LazyFrame: Cleaned lab events.
    """
    lab_events = labs_data.with_columns(
        pl.col("label")
        .str.to_lowercase()
        .str.replace(" ", "_")
        .str.replace(",", "")
        .str.replace('"', "")
        .str.replace(" ", "_"),
        pl.col("charttime").cast(pl.Utf8).str.replace("T", " ").str.strip_chars(),
    )

    #lab_events = lab_events.collect(streaming=True)
    lab_events = lab_events.with_columns(
        value=pl.when(pl.col("value") == ".").then(None).otherwise(pl.col("value"))
    )
    lab_events = lab_events.with_columns(
        value=pl.when(pl.col("value").str.contains("_|<|ERROR"))
        .then(None)
        .otherwise(pl.col("value"))
        .cast(
            pl.Float64, strict=False
        )  # Attempt to cast to Float64, set invalid values to None
    )
    '''
    labs_data = labs_data.drop_nulls()

    # Remove outliers using 5 std from mean

    lab_events = lab_events.with_columns(
        mean=pl.col("value").mean().over(pl.count("label"))
    )
    lab_events = lab_events.with_columns(
        std=pl.col("value").std().over(pl.count("label"))
    )

    lab_events = lab_events.filter(
        (pl.col("value") <= pl.col("mean") + pl.col("std") * 100)
        & (pl.col("value") >= pl.col("mean") - pl.col("std") * 100)
    ).drop(["mean", "std"])
    '''

    #lab_events = lab_events.collect(streaming=True)

    # Cardiovascular-specific lab value cleaning
    lab_events = clean_specific_lab_values(lab_events)

    # Extract troponin T measures
    lab_events = extract_troponin_t_measures(lab_events)
    lab_events = lab_events.drop_nulls()

    # Replace overlapping lab labels
    lab_events = lab_events.with_columns(
        pl.when(pl.col("label").is_in(['white_blood_cells', 'wbc', 'wbc_count']))
        .then(pl.lit('wbc'))
        .otherwise(pl.col("label"))
        .alias("label")
    )
    lab_events = lab_events.with_columns(
        pl.when(pl.col("label").is_in(['estimated_gfr_(mdrd equation)', 'egfr', 'gfr', 'egfr_(ckd-epi)']))
        .then(pl.lit('eGFR'))
        .otherwise(pl.col("label"))
        .alias("label")
    )
    lab_events = lab_events.with_columns(
        pl.when(pl.col("label").is_in(['creatine_kinase_(ck)', 'ck', 'creatine_kinase']))
        .then(pl.lit('creatine_kinase'))
        .otherwise(pl.col("label"))
        .alias("label")
    )
    lab_events = lab_events.with_columns(
        pl.when(pl.col("label").is_in(['creatine_kinase_mb isoenzyme', 'ck-mb', 'ck_mb', 'creatine_kinase_mb']))
        .then(pl.lit('creatine_kinase_mb'))
        .otherwise(pl.col("label"))
        .alias("label")
    )

    return lab_events


def clean_specific_lab_values(labs_data: pl.LazyFrame) -> pl.LazyFrame:
    """
    Apply specific cleaning rules for lab values.

    Args:
        labs_data (pl.LazyFrame): Lab events data with 'label', 'value', and 'comments' columns.

    Returns:
        pl.LazyFrame: Lab data with cleaned creatine kinase, hemoglobin, and eGFR values.
    """

    ## Troponin T
    cleaned_data = labs_data.with_columns(
        pl.when(pl.col("label") == "troponin_t")
        .then(
            pl.when(
                (pl.col("value").is_null()) &
                (pl.col("comments").str.starts_with('<0.01.'))
            )
            .then(0.005)
            .when(
                (pl.col("value").is_null()) &
                (pl.col("comments").str.starts_with('cTropnT > 0.10') | pl.col("comments").str.starts_with('CTROPNT > 0.10'))
            )
            .then(0.10)
            .otherwise(pl.col("value"))
        )
        .otherwise(pl.col("value"))
        .alias("value")
    )

    # Creatinine specific cleaning
    cleaned_data = cleaned_data.with_columns(
        pl.when(pl.col("label").str.contains("creatinine"))
        .then(
            pl.when(pl.col("value") < 0.1)
            .then(None)  # Below physiological minimum
            .when(pl.col("value") > 20.0)
            .then(None)  # Above physiological maximum (normal: 0.6-1.2 mg/dL)
            .when((pl.col("value") >= 50) & (pl.col("value") <= 2000))
            .then(pl.col("value") / 88.4)  # Convert µmol/L to mg/dL
            .otherwise(pl.col("value"))
        )
        .otherwise(pl.col("value"))
        .alias("value")
    )

    # Urea Nitrogen (BUN) specific cleaning
    cleaned_data = cleaned_data.with_columns(
        pl.when(pl.col("label").str.contains("urea_nitrogen"))
        .then(
            pl.when(pl.col("value") < 1.0)
            .then(None)  # Below physiological minimum
            .when(pl.col("value") > 200.0)
            .then(None)  # Above physiological maximum (normal: 7-20 mg/dL)
            .when((pl.col("value") >= 1.8) & (pl.col("value") <= 71.4))
            .then(pl.col("value") / 0.357)  # Convert mmol/L to mg/dL
            .otherwise(pl.col("value"))
        )
        .otherwise(pl.col("value"))
        .alias("value")
    )

    # Creatine Kinase MB (CK-MB) specific cleaning
    cleaned_data = cleaned_data.with_columns(
        pl.when(pl.col("label").str.contains("(?i)CK-MB Index|Creatine Kinase, MB Isoenzyme|Creatine Kinase, Isoenzyme MB"))
        .then(
            pl.when(pl.col("value") < 0)
            .then(None)  # Below physiological minimum
            .when(pl.col("value") > 500.0)
            .then(None)  # Above physiological maximum (normal: 0-3 ng/mL or 0-25 U/L)
            .otherwise(pl.col("value"))
        )
        .otherwise(pl.col("value"))
        .alias("value")
    )

    # eGFR specific cleaning
    egfr_pattern = r"(?i)(?:eGFR|estimated\s+GFR)[^0-9><]*(?:=|is likely|is approximately|:)?\s*(?:between\s*)?([><]?\s*\d+(?:\.\d+)?)(?:\s*(?:and|-|to)\s*([><]?\s*\d+(?:\.\d+)?))?"

    cleaned_data = cleaned_data.with_columns(
        pl.when(pl.col("label").str.contains("(?i)gfr"))
        .then(
            pl.when(pl.col("value").is_null() & pl.col("comments").is_not_null())
            .then(
                # Extract eGFR value from comments using regex
                pl.col("comments").str.extract(egfr_pattern, 1).str.strip_chars().str.replace(">", "").str.replace("<", "").cast(pl.Float64, strict=False)
            )
            .when(pl.col("comments").str.contains("(?i)>60|greater.*60"))
            .then(90.0)  # Use midpoint for >60 reporting
            .when(pl.col("value") < 0)
            .then(None)  # Remove negative values
            .when(pl.col("value") > 200)
            .then(None)  # Remove extreme outliers (normal: 90-120 mL/min/1.73m²)
            .otherwise(pl.col("value"))
        )
        .otherwise(pl.col("value"))
        .alias("value")
    )

    return cleaned_data


def extract_troponin_t_measures(labs_data: pl.LazyFrame) -> pl.LazyFrame:
    """
    Extract first, second, and third troponin T measures per hospital admission (hadm_id)
    ordered by charttime, and calculate delta troponin T (difference between highest and lowest).

    Creates new rows in the labs_data with specific labels:
    - 'first_troponin_t': First troponin T measurement
    - 'second_troponin_t': Second troponin T measurement
    - 'third_troponin_t': Third troponin T measurement
    - 'troponin_t_delta': Difference between max and min values

    Args:
        labs_data (pl.LazyFrame): Lab events data with columns: hadm_id, charttime, label, value

    Returns:
        pl.LazyFrame: Original labs_data with additional troponin T feature rows appended
    """
    # Filter for troponin T measurements and ensure required columns exist
    troponin_data = labs_data.filter(
        pl.col("label").str.contains("troponin_t")
    ).filter(
        pl.col("value").is_not_null() &
        pl.col("hadm_id").is_not_null() &
        pl.col("charttime").is_not_null()
    )

    # Convert charttime to datetime if it's a string
    if troponin_data.schema["charttime"] == pl.Utf8:
        troponin_data = troponin_data.with_columns(
            pl.col("charttime").str.to_datetime(strict=False).alias("charttime")
        )

    # Sort by hadm_id and charttime to get chronological order
    troponin_sorted = troponin_data.sort(["hadm_id", "charttime"])

    # Add row number within each admission to identify 1st, 2nd, 3rd measurements
    troponin_numbered = troponin_sorted.with_columns(
        pl.int_range(pl.len()).over("hadm_id").alias("measurement_order")
    )

    # Create pivot-like structure for first, second, third measurements
    troponin_pivot = troponin_numbered.group_by("hadm_id").agg([
        pl.col("value").filter(pl.col("measurement_order") == 0).first().alias("first_troponin_t"),
        pl.col("value").filter(pl.col("measurement_order") == 1).first().alias("second_troponin_t"),
        pl.col("value").filter(pl.col("measurement_order") == 2).first().alias("third_troponin_t"),
        pl.col("value").min().alias("troponin_t_min"),
        pl.col("value").max().alias("troponin_t_max"),
        pl.col("value").len().alias("troponin_t_count"),
        pl.col("subject_id").first().alias("subject_id"),
        pl.col("charttime").first().alias("base_charttime")  # Use first charttime as reference
    ])

    # Calculate delta troponin T (difference between highest and lowest)
    troponin_features = troponin_pivot.with_columns(
        pl.when(pl.col("troponin_t_count") > 1)
        .then(pl.col("troponin_t_max") - pl.col("troponin_t_min"))
        .otherwise(None)
        .alias("troponin_t_delta")
    )

    # Create new rows for each troponin feature

    # Create base template with required columns filled from troponin_features
    base_template = troponin_features.select([
        "hadm_id", "subject_id", "base_charttime"
    ]).with_columns([
        pl.col("base_charttime").alias("charttime"),
        pl.lit(None).alias("comments")  # Add comments column if it exists in original
    ])

    # First troponin T
    first_troponin_rows = base_template.join(
        troponin_features.select(["hadm_id", "first_troponin_t"]),
        on="hadm_id", how="inner"
    ).filter(
        pl.col("first_troponin_t").is_not_null()
    ).with_columns([
        pl.lit("first_troponin_t").alias("label"),
        pl.col("first_troponin_t").alias("value")
    ]).drop("first_troponin_t")

    # Second troponin T
    second_troponin_rows = base_template.join(
        troponin_features.select(["hadm_id", "second_troponin_t"]),
        on="hadm_id", how="inner"
    ).filter(
        pl.col("second_troponin_t").is_not_null()
    ).with_columns([
        pl.lit("second_troponin_t").alias("label"),
        pl.col("second_troponin_t").alias("value")
    ]).drop("second_troponin_t")

    # Third troponin T
    third_troponin_rows = base_template.join(
        troponin_features.select(["hadm_id", "third_troponin_t"]),
        on="hadm_id", how="inner"
    ).filter(
        pl.col("third_troponin_t").is_not_null()
    ).with_columns([
        pl.lit("third_troponin_t").alias("label"),
        pl.col("third_troponin_t").alias("value")
    ]).drop("third_troponin_t")

    # Troponin T delta
    delta_troponin_rows = base_template.join(
        troponin_features.select(["hadm_id", "troponin_t_delta"]),
        on="hadm_id", how="inner"
    ).filter(
        pl.col("troponin_t_delta").is_not_null()
    ).with_columns([
        pl.lit("troponin_t_delta").alias("label"),
        pl.col("troponin_t_delta").alias("value")
    ]).drop("troponin_t_delta")

    # Combine all feature rows
    all_troponin_features = pl.concat([
        first_troponin_rows,
        second_troponin_rows,
        third_troponin_rows,
        delta_troponin_rows
    ], how="vertical")

    # Add any missing columns that exist in original labs_data but not in our feature rows
    original_cols = set(labs_data.columns)
    feature_cols = set(all_troponin_features.columns)
    missing_cols = original_cols - feature_cols

    if missing_cols:
        for col in missing_cols:
            all_troponin_features = all_troponin_features.with_columns(
                pl.lit(None).alias(col)
            )

    # Ensure column order matches original
    all_troponin_features = all_troponin_features.select(labs_data.columns)

    # Match charttime dtype with original labs_data
    original_charttime_dtype = labs_data.schema["charttime"]
    if original_charttime_dtype == pl.Utf8:
        all_troponin_features = all_troponin_features.with_columns(
            pl.col("charttime").cast(pl.Utf8).alias("charttime")
        )
    # If original is datetime, keep it as datetime (already converted earlier)

    # Append new troponin feature rows to original labs_data
    enhanced_labs_data = pl.concat([labs_data, all_troponin_features], how="vertical")

    return enhanced_labs_data


In [159]:
full_labs = read_labevents_table(mimic_path, test_EHR.lazy(), include_items=True)

  labs_data = labs_data.join(d_items, how="left", on="itemid")
  labs_data = labs_data.join(


In [169]:
lab_events = full_labs.with_columns(
    pl.col("label")
    .str.to_lowercase()
    .str.replace(" ", "_")
    .str.replace(",", "")
    .str.replace('"', "")
    .str.replace(" ", "_"),
    pl.col("charttime").cast(pl.Utf8).str.replace("T", " ").str.strip_chars(),
)
lab_events = lab_events.with_columns(
    value=pl.when(pl.col("value") == ".").then(None).otherwise(pl.col("value"))
)

lab_events = lab_events.with_columns(
    value=pl.when(pl.col("value").str.contains("_|<|ERROR"))
    .then(None)
    .otherwise(pl.col("value"))
    .cast(
        pl.Float64, strict=False
    )  # Attempt to cast to Float64, set invalid values to None
)

In [170]:
lab_events = clean_specific_lab_values(lab_events)

In [171]:
lab_events = extract_troponin_t_measures(lab_events)

In [172]:
list(lab_events['label'].unique())

['estimated_gfr_(mdrd equation)',
 'c-reactive_protein',
 'wbc',
 'potassium',
 'first_troponin_t',
 'urea_nitrogen',
 'glucose',
 'hematocrit',
 'creatine_kinase_(ck)',
 'd-dimer',
 'hemoglobin',
 'sodium',
 'bicarbonate',
 'platelet_count',
 'second_troponin_t',
 'white_blood_cells',
 'creatine_kinase_mb isoenzyme',
 'troponin_t',
 'anion_gap',
 'creatinine',
 'ntprobnp',
 'wbc_count',
 'chloride',
 'troponin_t_delta']

In [165]:
 # Filter for troponin T measurements and ensure required columns exist
troponin_data = lab_events.filter(
    pl.col("label").str.contains("troponin_t")
).filter(
    pl.col("value").is_not_null() &
    pl.col("hadm_id").is_not_null() &
    pl.col("charttime").is_not_null()
)

# Convert charttime to datetime if it's a string
if troponin_data.schema["charttime"] == pl.Utf8:
    troponin_data = troponin_data.with_columns(
        pl.col("charttime").str.to_datetime(strict=False).alias("charttime")
    )

# Sort by hadm_id and charttime to get chronological order
troponin_sorted = troponin_data.sort(["hadm_id", "charttime"])

# Add row number within each admission to identify 1st, 2nd, 3rd measurements
troponin_numbered = troponin_sorted.with_columns(
    pl.int_range(pl.len()).over("hadm_id").alias("measurement_order")
)

# Create pivot-like structure for first, second, third measurements
troponin_pivot = troponin_numbered.group_by("hadm_id").agg([
    pl.col("value").filter(pl.col("measurement_order") == 0).first().alias("first_troponin_t"),
    pl.col("value").filter(pl.col("measurement_order") == 1).first().alias("second_troponin_t"),
    pl.col("value").filter(pl.col("measurement_order") == 2).first().alias("third_troponin_t"),
    pl.col("value").min().alias("troponin_t_min"),
    pl.col("value").max().alias("troponin_t_max"),
    pl.col("value").len().alias("troponin_t_count"),
    pl.col("subject_id").first().alias("subject_id"),
    pl.col("charttime").first().alias("base_charttime")  # Use first charttime as reference
])

# Calculate delta troponin T (difference between highest and lowest)
troponin_features = troponin_pivot.with_columns(
    pl.when(pl.col("troponin_t_count") > 1)
    .then(pl.col("troponin_t_max") - pl.col("troponin_t_min"))
    .otherwise(None)
    .alias("troponin_t_delta")
)

# Create new rows for each troponin feature

# Create base template with required columns filled from troponin_features
base_template = troponin_features.select([
    "hadm_id", "subject_id", "base_charttime"
]).with_columns([
    pl.col("base_charttime").alias("charttime"),
    pl.lit(None).alias("comments")  # Add comments column if it exists in original
])

# First troponin T
first_troponin_rows = base_template.join(
    troponin_features.select(["hadm_id", "first_troponin_t"]),
    on="hadm_id", how="inner"
).filter(
    pl.col("first_troponin_t").is_not_null()
).with_columns([
    pl.lit("first_troponin_t").alias("label"),
    pl.col("first_troponin_t").alias("value")
]).drop("first_troponin_t")

# Second troponin T
second_troponin_rows = base_template.join(
    troponin_features.select(["hadm_id", "second_troponin_t"]),
    on="hadm_id", how="inner"
).filter(
    pl.col("second_troponin_t").is_not_null()
).with_columns([
    pl.lit("second_troponin_t").alias("label"),
    pl.col("second_troponin_t").alias("value")
]).drop("second_troponin_t")

# Third troponin T
third_troponin_rows = base_template.join(
    troponin_features.select(["hadm_id", "third_troponin_t"]),
    on="hadm_id", how="inner"
).filter(
    pl.col("third_troponin_t").is_not_null()
).with_columns([
    pl.lit("third_troponin_t").alias("label"),
    pl.col("third_troponin_t").alias("value")
]).drop("third_troponin_t")

# Troponin T delta
delta_troponin_rows = base_template.join(
    troponin_features.select(["hadm_id", "troponin_t_delta"]),
    on="hadm_id", how="inner"
).filter(
    pl.col("troponin_t_delta").is_not_null()
).with_columns([
    pl.lit("troponin_t_delta").alias("label"),
    pl.col("troponin_t_delta").alias("value")
]).drop("troponin_t_delta")

# Combine all feature rows
all_troponin_features = pl.concat([
    first_troponin_rows,
    second_troponin_rows,
    third_troponin_rows,
    delta_troponin_rows
], how="vertical")

# Add any missing columns that exist in original labs_data but not in our feature rows
original_cols = set(lab_events.columns)
feature_cols = set(all_troponin_features.columns)
missing_cols = original_cols - feature_cols

if missing_cols:
    for col in missing_cols:
        all_troponin_features = all_troponin_features.with_columns(
            pl.lit(None).alias(col)
        )

# Ensure column order matches original
all_troponin_features = all_troponin_features.select(lab_events.columns)

In [180]:
lab_events_trop = lab_events.to_pandas()[(lab_events.to_pandas()['label']=='third_troponin_t')]
lab_events_trop.value.value_counts()

Series([], Name: count, dtype: int64)

In [99]:
lab_events.shape

(94639, 9)

In [100]:
lab_events_trop = lab_events.to_pandas()[lab_events.to_pandas()['label'].str.contains('troponin')]
lab_events_trop.shape

(3231, 9)

In [102]:
lab_events_trop.value.value_counts()

Series([], Name: count, dtype: int64)

In [60]:
pattern = r"(?i)(?:eGFR|estimated\s+GFR)[^0-9><]*(?:=|is likely|is approximately|:)?\s*(?:between\s*)?([><]?\s*\d+(?:\.\d+)?)(?:\s*(?:and|-|to)\s*([><]?\s*\d+(?:\.\d+)?))?"

text = """Using this patient's age, gender, and serum creatinine value of 1.0,.  Estimated GFR = 52 if non African-American (mL/min/1.73 m2).  Estimated GFR = 63 if African-American (mL/min/1.73 m2).  For comparison, mean GFR for age group 70+ is 75 (mL/min/1.73 m2).  GFR<60 = Chronic Kidney Disease, GFR<15 = Kidney Failure."""

matches = re.findall(pattern, text)
egfr_values = [" ".join([m[0], m[1]]).strip() for m in matches if any(m)]
print(egfr_values[0])

52
