# Apply Regular Expression

In [3]:
import re
import pandas as pd

In [4]:
df = pd.read_parquet("latest_notes.parquet")

In [5]:
concept_patterns = {
    "cml_diagnosed": r"\b(chronic (myelogenous|myeloid) leukemia|cml)\b",
    "aml_diagnosed": r"\b(acute (myelogenous|myeloid) leukemia|aml)\b",
    "blast_phase_cml": r"\b(blast[- ]?(phase|crisis) (cml|chronic (myelogenous|myeloid) leukemia))\b",
    "acute_phase_cml": r"\b(acute[- ]?phase (cml|chronic (myelogenous|myeloid) leukemia))\b",
    "bmt_history": r"\b(bone marrow transplant|hematopoietic stem cell transplant|stem cell transplant|sct|bmt|hsct)\b",
    "imatinib_mentioned": r"\b(imatinib|gleevec|sti571)\b",
    "related_drugs_mentioned": r"\b(dasatinib|sprycel|nilotinib|tasigna|bosutinib|bosulif|ponatinib|iclusig)\b",
}

negation_regex = r"\b(no|denies|without|not on|never received|negative for|no history of)\b"


In [6]:
def extract_concepts(note, patterns, negation_pattern=None):
    results = {}
    for concept, pattern in patterns.items():
        matches = []
        for m in re.finditer(pattern, str(note), flags=re.IGNORECASE):
            span_start = max(0, m.start() - 50)
            context = note[span_start:m.start()]
            if negation_pattern and re.search(negation_pattern, context, flags=re.IGNORECASE):
                continue
            matches.append(m.group(0))
        results[concept] = bool(matches)
    return results


In [7]:
concept_cols = list(concept_patterns.keys())
for col in concept_cols:
    df[col] = False  # initialize columns

df[concept_cols] = df["note_text"].apply(
    lambda text: pd.Series(extract_concepts(text, concept_patterns, negation_regex))
)


# True/False Count for concepts

In [21]:
concept_cols = [col for col in df.columns if col != "patient_id"]

for concept in concept_cols:
    print(f"\nConcept: {concept}")
    print(df[concept].value_counts())



Concept: person_id
659         1
7841216     1
7879067     1
7873427     1
7867523     1
           ..
2736376     1
2734645     1
2731392     1
2722691     1
11356653    1
Name: person_id, Length: 1762, dtype: int64

Concept: start_date
2009-06-04    4
1992-04-09    3
2013-11-01    3
2010-07-03    3
2014-06-05    3
             ..
2004-04-09    1
1998-06-17    1
2010-12-28    1
2012-08-14    1
2012-06-25    1
Name: start_date, Length: 1566, dtype: int64

Concept: end_date
2025-03-03    1715
2025-04-18       3
2025-03-17       2
2025-05-18       1
2025-03-13       1
2025-03-27       1
2026-09-19       1
2040-06-03       1
2040-09-26       1
2025-03-10       1
2025-04-17       1
2040-05-22       1
2025-03-21       1
2025-09-23       1
2100-10-05       1
2113-12-11       1
2025-06-02       1
2027-06-04       1
2025-07-26       1
2025-07-07       1
2025-06-28       1
2025-07-02       1
2025-08-09       1
2025-07-08       1
2025-04-05       1
2151-03-25       1
2042-04-30       1
2025-12-

# Grabbing person_source_value for imatinib_mentioned - Emerse lookup

In [19]:
filtered_df = df[df['imatinib_mentioned'] == True]
person_source_value = filtered_df['person_source_value']
person_source_value.to_csv('reg_expression_pv.txt', sep='\t', index=False)

In [17]:
person_source_value

23      D385802200C9C2
31      D704504CB3C107
41      D5BE13A816EDD6
50      D1E855FFF0B02F
108     DE07C035B0C3C3
             ...      
1709    D97C20AB58FF6B
1713    D2B077B2EF1D92
1731    D05071833CAA95
1751    DE60297A4E36D1
1759    D9646D1413897D
Name: person_source_value, Length: 191, dtype: object