In [11]:
import pandas as pd
import os   
import numpy as np
import re, unicodedata
import unicodedata
from rapidfuzz import process, fuzz
from typing import List
from umlsparser import UMLSParser



## Preprocess and set up the medication column

In [None]:
# Columns pulled from both medication exports. Keeping the list in one place
# guarantees the two frames have identical columns when they are stacked.
MEDICATION_COLUMNS = [
    'record_id',
    'date_med_entered', 'med_name',
    'type_in_med', 'med_freq',
    'med_freq_oth', 'med_start', 'non_psych_med_baseline___1',
    'med_reason',
]

OPT_neuro_medications = pd.read_excel(
    'temp/raw/OPT N Meds Arm 6-8.xlsx', engine='openpyxl'
)[MEDICATION_COLUMNS]

# skiprows=1: the first line of this CSV is skipped, so the header is read
# from the second line.
OPT_parent_medication = pd.read_csv(
    'temp/raw/OPTIMUM_Medication_1.24.25.csv', skiprows=1
)[MEDICATION_COLUMNS]

# Stack both exports vertically into OPT_medications with a fresh 0..n-1 index.
OPT_medications = pd.concat(
    [OPT_neuro_medications, OPT_parent_medication],
    axis=0,
    ignore_index=True,
)

# "medication" = type_in_med where it is present, otherwise med_name
# (combine_first only falls back where the first series is missing), upper-cased.
OPT_medications['medication'] = (
    OPT_medications['type_in_med']
    .combine_first(OPT_medications['med_name'])
    .str.upper()
)

# Drop rows where no medication text is available. The explicit .copy() makes
# the result an independent frame, so the column assignments in later cells
# do not write into a slice of the concatenated frame (SettingWithCopyWarning).
OPT_medications = OPT_medications.loc[OPT_medications['medication'].notna()].copy()


In [None]:
import re, unicodedata
import pandas as pd

# ------- regex vocab -------
# Each *_WORDS / UNITS value is a single parenthesised alternation meant to be
# embedded in a larger pattern. The alternatives are written in lowercase while
# _nfkc_upper() upper-cases the text, so any pattern built from them has to be
# matched case-insensitively to hit anything.

# dose units
UNITS = r"(mcg|μg|ug|mg|g|gram|ml|mL|l|iu|units?|meq|mmol|%)"
# dosage forms
FORM_WORDS = r"(tab(let)?s?|cap(s|sules)?|susp(ension)?|sol(ution)?|syrup|drops?|spray|neb|cream|ointment|gel|patch|lozenge|supp(ository)?|inhaler|powder|granules?)"
# routes of administration
ROUTE_WORDS = r"(po|oral|iv|im|sc|subcut(an(eous)?)?|sl|subling(ual)?|top(ical)?|ophth(al)?|otic|nasal|intranasal|inhal(ed|ation)?|rectal|vaginal|transderm(al)?|buccal)"
# dosing-frequency abbreviations
FREQ_WORDS = r"(qd|od|qam|qpm|qhs|bid|tid|qid|q\d+h|q\d+hr|qod|prn|stat|hs|am|pm)"
# release-type abbreviations
REL_WORDS = r"(xr|sr|dr|er|cr|la|xl|ir)"
# salt / counter-ion words
SALT_WORDS = r"(hcl|hydrochloride|hydrobromide|succinate|tartrate|maleate|mesylate|besylate|acetate|phosphate|sulfate|lactate|nitrate|carbonate|sodium|potassium|calcium|magnesium)"

# Upper-case whole-word tokens that are discarded from a cleaned ingredient.
DROP_TOKENS = {
    # dosage forms
    "TAB", "TABS", "TABLET", "TABLETS",
    "CAP", "CAPS", "CAPSULE", "CAPSULES",
    "SOLUTION", "SUSP", "SUSPENSION", "SYRUP", "DROPS", "INJECTION", "TOPICAL",
    "PATCH", "CREAM", "OINTMENT", "GEL",
    # frequency
    "PRN", "BID", "TID", "QID", "QD", "OD", "QHS", "QAM", "QPM", "HS", "AM", "PM",
    # route / device
    "PO", "IV", "IM", "SC", "SL", "SUBLINGUAL", "INHALER", "NEB", "NASAL",
    # release type
    "XR", "SR", "ER", "CR", "LA", "XL", "IR",
}

# Upper-case descriptor words that are stripped only from the front of a token.
LEADING_MODIFIERS = {
    "EXTENDED", "RELEASE", "EXTENDED-RELEASE",
    "SUSTAINED", "CONTROLLED", "DELAYED", "IMMEDIATE", "PROLONGED",
    "ENTERIC", "COATED", "CHEWABLE",
    "LONG", "ACTING",
    "ORAL",  # sometimes appears as fluff
}

# Compiled patterns for multi-word names that should be preserved as a whole
# token when the full candidate string matches.
PROTECT_REGEXES = [
    re.compile(pattern)
    for pattern in (
        r"^VITAMIN\s+[A-Z0-9]+(\s+[A-Z0-9]+)?$",        # VITAMIN D, VITAMIN D3, VITAMIN B 12, etc.
        r"^FISH\s+OIL$",
        r"^[A-Z]+\s+OIL$",                               # e.g., CASTOR OIL (keeps both words)
        r"^POLYETHYLENE\s+GLYCOL(\s+\d+)?$",            # PEG / PEG 3350
        r"^[A-Z]+\s+(GLYCOL|ACID|PEROXIDE|ALCOHOL)$",   # generic 2-word chemicals
    )
]

def _nfkc_upper(s: str) -> str:
    s = "" if not isinstance(s, str) else s
    return unicodedata.normalize("NFKC", s).upper().strip()

def _drop_parentheticals(s: str) -> str:
    return re.sub(r"\([^)]*\)", " ", s)

def _drop_numbers_units(s: str) -> str:
    """Replace dose strengths, bare ratios and stray numbers in ``s`` with spaces.

    Three passes, most specific first:

    1. a number (or slash-separated numbers) followed by a unit from ``UNITS``,
       optionally continued by per-unit parts, e.g. ``500 MG``, ``5/325 MG``,
       ``100 MG/5 ML``, ``0.5%``;
    2. a bare ratio such as ``5/325``;
    3. any remaining standalone number.

    Digits glued to letters (``D3``, ``B12``) are left alone because every
    pattern starts at a word boundary.
    """
    # Built from plain pieces on purpose: a ``{m,n}`` quantifier written inside
    # an rf-string is evaluated as an f-string field (``{0,1}`` becomes the
    # text "(0, 1)"), which silently corrupts the pattern.
    number = r"\d+(?:\.\d+)?"
    dose = (
        rf"\b{number}(?:\s*/\s*{number})*"        # 500   or   5/325
        rf"\s*{UNITS}"                             # MG, ML, %, ...
        rf"(?:\s*/\s*(?:{number}\s*)?{UNITS})*"    # optional "/ML" or "/5 ML"
        r"(?!\w)"                                  # unit must end here; unlike \b this also works after "%"
    )
    # UNITS is written in lowercase but _tokenize_med upper-cases the text
    # before calling this function, so the unit match must ignore case.
    s = re.sub(dose, " ", s, flags=re.IGNORECASE)
    s = re.sub(rf"\b{number}\s*/\s*{number}\b", " ", s)
    s = re.sub(rf"\b{number}\b", " ", s)  # stray numbers
    return s

def _drop_keywords(s: str) -> str:
    """Replace dosage-form, route, frequency, release and salt words with spaces.

    Form, route, frequency and release words are removed wherever they occur
    as whole words. Salt words are removed only when they follow another word
    (``METOPROLOL SUCCINATE`` -> ``METOPROLOL``), so a name that starts with
    one (``CALCIUM``, ``POTASSIUM CHLORIDE``, ``MAGNESIUM OXIDE``) keeps it.
    """
    # The vocab patterns are lowercase while _tokenize_med upper-cases the text
    # before calling this function; without IGNORECASE nothing would match.
    for pat in (FORM_WORDS, ROUTE_WORDS, FREQ_WORDS, REL_WORDS):
        s = re.sub(rf"\b{pat}\b", " ", s, flags=re.IGNORECASE)

    # SALT_WORDS also lists sodium/potassium/calcium/magnesium, which are
    # ingredients in their own right when they lead the name. Requiring a
    # preceding word character plus whitespace restricts removal to the
    # trailing-qualifier position; a separator such as "/" is not a word
    # character, so each side of a combination is judged independently.
    s = re.sub(rf"(?<=\w)\s+{SALT_WORDS}\b", " ", s, flags=re.IGNORECASE)
    return s

def _normalize_separators(s: str) -> str:
    s = s.replace("+", " + ").replace("/", " / ").replace("&", " & ")
    s = re.sub(r"[^\w\s\+\/&-]", " ", s)  # allow hyphen in things like EXTENDED-RELEASE
    s = re.sub(r"\s+", " ", s)
    return s.strip()

def _tokenize_med(raw: str) -> List[str]:
    """Clean ``raw`` and split it into candidate ingredient strings.

    The text is normalised/upper-cased, then parentheticals, numbers/units and
    keyword noise are removed and separators normalised, in that order. The
    result is split on ``/``, ``&`` and ``+``. Returns ``[]`` when nothing is
    left before or after cleaning; individual pieces may still be empty
    strings.
    """
    text = _nfkc_upper(raw)
    if text:
        cleaning_steps = (
            _drop_parentheticals,
            _drop_numbers_units,
            _drop_keywords,
            _normalize_separators,
        )
        for step in cleaning_steps:
            text = step(text)
    if not text:
        return []
    return re.split(r"\s*(?:/|&|\+)\s*", text)

def _strip_leading_modifiers(words: List[str]) -> List[str]:
    """Return ``words`` without its leading run of modifier/noise tokens.

    Scanning stops at the first word that is in neither ``LEADING_MODIFIERS``
    nor ``DROP_TOKENS``; everything from that word onward is kept unchanged.
    """
    for position, word in enumerate(words):
        is_noise = word in LEADING_MODIFIERS or word in DROP_TOKENS
        if not is_noise:
            return words[position:]
    # every word was noise (or the list was empty)
    return words[len(words):]

def _is_protected_phrase(tok: str) -> bool:
    """Return True when ``tok`` matches at least one pattern in ``PROTECT_REGEXES``."""
    for protected in PROTECT_REGEXES:
        if protected.match(tok) is not None:
            return True
    return False

def clean_medication_string(raw: str) -> List[str]:
    """
    Return the cleaned ingredient-like phrases found in ``raw``.

    - ``_tokenize_med`` removes dose/form/route/release/salt noise and splits
      combinations on /, + and &
    - leading modifier tokens are stripped from each piece, and any remaining
      ``DROP_TOKENS`` words are removed wherever they occur
    - the whole remaining phrase is kept (not just its last word), so
      multi-word ingredients such as FISH OIL or POLYETHYLENE GLYCOL survive
    - duplicates are dropped, keeping first-seen order

    Returns an empty list when nothing usable remains (including non-string
    input, which ``_tokenize_med`` turns into ``[]``).
    """
    ingredients: List[str] = []
    seen = set()
    for piece in _tokenize_med(raw):
        # str.split() already discards empty strings and surrounding whitespace.
        words = _strip_leading_modifiers(piece.split())
        # remove any remaining standalone DROP_TOKENS in the middle
        words = [w for w in words if w not in DROP_TOKENS]
        if not words:
            continue

        # The full phrase is always kept, so protected multi-word patterns need
        # no special case here: the previous if/else produced the same value
        # in both branches.
        phrase = " ".join(words)
        if phrase not in seen:
            seen.add(phrase)
            ingredients.append(phrase)
    return ingredients
# ---- add the cleaned columns to OPT_medications ----
# clean_ingredients: list of cleaned ingredient phrases per row
OPT_medications["clean_ingredients"] = (
    OPT_medications["medication"].map(clean_medication_string)
)
# clean_medication: the phrases joined with " + ", or pd.NA when the list is empty
OPT_medications["clean_medication"] = (
    OPT_medications["clean_ingredients"]
    .apply(lambda ingredients: pd.NA if not ingredients else " + ".join(ingredients))
)

# optional: drop rows that failed to yield a cleaned token
# OPT_medications = OPT_medications[OPT_medications["clean_medication"].notna()]

# optional: inspect coverage
# print(OPT_medications["clean_medication"].notna().mean())
# print(OPT_medications["clean_medication"].nunique(dropna=True))

# optional: save distinct cleaned strings for a quick audit
# (OPT_medications["clean_medication"]
#    .dropna()
#    .drop_duplicates()
#    .to_csv("OPT_cleaned_medication_unique.csv", index=False))


In [None]:
# Same assignment as in the previous cell: recomputes the per-row ingredient
# lists from the "medication" column.
OPT_medications["clean_ingredients"] = (
    OPT_medications["medication"]
    .map(clean_medication_string)
)


### Open UMLS parser

In [None]:
from umlsparser import UMLSParser

umls = UMLSParser('/external/rprshnas01/netdata_kcni/dflab/data/BAARD/code/resources/umls/2025AA')

# Print the ICD10CM ids and the first English preferred name of every concept
# that carries an ICD10CM source id.
for cui, concept in umls.get_concepts().items():
    source_ids = concept.get_source_ids()
    if 'ICD10CM' not in source_ids.keys():
        continue
    icd10ids = source_ids.get('ICD10CM')
    print(icd10ids, concept.get_preferred_names_for_language('ENG')[0])

In [None]:
from umlsparser import UMLSParser
import collections

umls = UMLSParser('resources/umls/2025AA')

# Tally, for every source vocabulary, how many concepts carry an id from it.
sources_counter = collections.defaultdict(int)
for cui, concept in umls.get_concepts().items():
    for source in concept.get_source_ids().keys():
        sources_counter[source] += 1

# Emit the tally as a pipe-delimited table, largest count first.
print('|SOURCE|COUNT|\n|------|-----|')
ranked_sources = sorted(sources_counter.items(), key=lambda pair: pair[1], reverse=True)
for source, count in ranked_sources:
    print('|{}|{}|'.format(source, count))


In [12]:


# NOTE(review): the recorded run of this cell stopped inside the UMLSParser
# constructor with "UnicodeDecodeError: 'charmap' codec can't decode byte 0x81"
# while parsing MRCONSO.RRF, i.e. the file was decoded with a legacy code page
# rather than UTF-8. Presumably starting the kernel in Python's UTF-8 mode
# (environment variable PYTHONUTF8=1) avoids this — verify on the Windows machine.
umls = UMLSParser('C:\\Users\\Hassan\\Documents\\Projects\\baard\\resources\\umls\\2025AA')

# Fetched once instead of once per concept.
# Assumes get_semantic_types() returns the same lookup on every call — TODO confirm.
semantic_types = umls.get_semantic_types()

# Print one line per English name: CUI, name, TUI and semantic-type name.
for cui, concept in umls.get_concepts().items():
    tui = concept.get_tui()
    name_of_semantic_type = semantic_types[tui].get_name()
    for name in concept.get_names_for_language('ENG'):
        print(cui, name, tui, name_of_semantic_type)

INFO:umlsparser.UMLSParser:Initialising UMLSParser for basepath C:\Users\Hassan\Documents\Projects\baard\resources\umls\2025AA
INFO:umlsparser.UMLSParser:No language filtering applied.
Parsing UMLS concepts (MRCONSO.RRF): 0it [00:00, ?it/s]


UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 4668: character maps to <undefined>

In [None]:
# Show the kernel's current working directory; the relative paths used in this
# notebook ('temp/raw/...', 'resources/umls/2025AA') are resolved against it.
os.getcwd()