In [None]:
# Clean a file by removing problematic variables by name
import csv

input_file = "/sbgenomics/workspace/output/CHS_cleaned/pht001492.tsv"
output_file = "/sbgenomics/workspace/output/CHS_cleaned/CHS_v7_c1/pht001492.tsv"
columns_to_drop = [
    "phv00107747",
    "phv00107748",
    "phv00107749",
    "phv00108147",
    "phv00108148",
    "phv00108149",
    "phv00108150",
]

# Alternative configuration for a second file; uncomment to use it instead.
# input_file = "/sbgenomics/workspace/output/CHS_cleaned/pht001494.tsv"
# output_file = "/sbgenomics/workspace/output/CHS_cleaned/CHS_v7_c1/pht001494.tsv"
# columns_to_drop = ["phv00109384", ] 

# Copy the tab-separated input to the output row by row, leaving out the
# columns named in columns_to_drop. The output is opened only after the header
# has been read from the input.
with open(input_file, newline="", encoding="utf-8") as infile:
    reader = csv.DictReader(infile, delimiter="\t")
    fieldnames = [name for name in reader.fieldnames if name not in columns_to_drop]

    with open(output_file, "w", newline="", encoding="utf-8") as outfile:
        # extrasaction="ignore" makes the writer skip every key of a row that
        # is not listed in fieldnames, i.e. the dropped columns.
        writer = csv.DictWriter(
            outfile,
            fieldnames=fieldnames,
            delimiter="\t",
            extrasaction="ignore",
        )
        writer.writeheader()
        writer.writerows(reader)


In [None]:
from pathlib import Path
import re
import subprocess
from typing import Optional
import yaml

from linkml.validator.loaders import TsvLoader
from linkml.utils.schema_builder import SchemaBuilder

from linkml_runtime.linkml_model import SlotDefinition
from linkml_runtime import SchemaView

from linkml_map.session import Session
from linkml_map.transformer.object_transformer import ObjectTransformer

In [None]:
def fix_misindented_value_mappings(yaml_text):
    """Pull a ``value_mappings:`` key back to the level of its ``populated_from:``.

    When an indented ``populated_from: <value>`` line is directly followed by a
    ``value_mappings:`` line that is indented more deeply, the
    ``value_mappings:`` line is re-indented to the ``populated_from`` column.
    Every following line that is indented more deeply than the original
    ``value_mappings:`` line is re-indented to that column plus two spaces
    (all of them to the same column). All other lines pass through unchanged.

    Args:
        yaml_text: YAML-like text to repair.

    Returns:
        The repaired text, lines joined with ``"\\n"`` (a trailing newline of
        the input is not kept).
    """
    def _indent_of(text):
        return len(text) - len(text.lstrip())

    source = yaml_text.splitlines()
    total = len(source)
    result = []
    pos = 0

    while pos < total:
        current = source[pos]
        result.append(current)
        pos += 1

        # Only an indented populated_from that carries a value is a candidate.
        if not re.match(r'^\s+populated_from:\s+\S+', current):
            continue
        if pos >= total or not source[pos].lstrip().startswith("value_mappings:"):
            continue

        parent_indent = _indent_of(current)
        mapping_line = source[pos]
        mapping_indent = _indent_of(mapping_line)
        if mapping_indent <= parent_indent:
            # Already a sibling (or shallower): handled as an ordinary line.
            continue

        result.append(" " * parent_indent + mapping_line.lstrip())
        pos += 1

        # Children of the mapping: everything deeper than the old mapping column.
        while pos < total and _indent_of(source[pos]) > mapping_indent:
            result.append(" " * (parent_indent + 2) + source[pos].lstrip())
            pos += 1

    return "\n".join(result)


In [None]:
def fix_nested_expr_under_populated_from(yaml_text: str) -> str:
    """Replace a ``populated_from:`` / ``expr:`` line pair with the ``expr:`` line.

    Whenever a line whose content starts with ``populated_from:`` is directly
    followed by a line whose content starts with ``expr:``, the
    ``populated_from:`` line is dropped and the ``expr:`` line is emitted at the
    ``populated_from`` column. The lines after it that are indented more deeply
    than that column are emitted at that column plus two spaces.

    Note that the pair is matched on the two keys alone: neither the value of
    ``populated_from:`` nor the relative indentation of ``expr:`` is checked.

    Args:
        yaml_text: YAML-like text to repair.

    Returns:
        The repaired text, lines joined with ``"\\n"``.
    """
    def _leading(text: str) -> int:
        return len(text) - len(text.lstrip())

    rows = yaml_text.splitlines()
    count = len(rows)
    rebuilt = []
    cursor = 0

    while cursor < count:
        row = rows[cursor]
        follower = rows[cursor + 1] if cursor + 1 < count else ""
        is_pair = (
            row.lstrip().startswith("populated_from:")
            and follower.lstrip().startswith("expr:")
        )
        if not is_pair:
            rebuilt.append(row)
            cursor += 1
            continue

        base = _leading(row)
        rebuilt.append(" " * base + follower.lstrip())
        cursor += 2

        # Continuation lines of the expression: anything deeper than the key.
        while cursor < count and _leading(rows[cursor]) > base:
            rebuilt.append(" " * (base + 2) + rows[cursor].lstrip())
            cursor += 1

    return "\n".join(rebuilt)


In [None]:
def fix_phv_brackets(expr: str) -> str:
    """Add missing curly braces around ``phv`` + 8-digit identifiers.

    An opening ``{`` is inserted before an identifier that starts at a word
    boundary and is not already preceded by ``{``; a closing ``}`` is appended
    to an identifier that ends at a word boundary and is not already followed
    by ``}``.
    """
    with_opening = re.sub(
        r'(?<![{])\b(phv\d{8})',
        lambda found: "{" + found.group(1),
        expr,
    )
    return re.sub(
        r'(phv\d{8})\b(?![}])',
        lambda found: found.group(1) + "}",
        with_opening,
    )

def quote_curies(expr: str) -> str:
    """Wrap bare ``PREFIX:digits`` tokens (e.g. ``UO:0000123``) in double quotes.

    A token is an upper-case prefix, a colon and a run of digits, delimited by
    word boundaries. Tokens that already touch a single or double quote on
    either side are left alone, so an already-quoted token is not quoted a
    second time and applying the function twice gives the same result as
    applying it once.

    Args:
        expr: Expression text.

    Returns:
        The text with every bare token double-quoted.
    """
    return re.sub(r'(?<![\'"])\b([A-Z]+:\d+)\b(?![\'"])', r'"\1"', expr)

def strip_outer_quotes(expr: str) -> str:
    """Trim whitespace and any run of quote characters from both ends.

    Surrounding whitespace is removed first, then every leading and trailing
    ``'`` or ``"`` character, then whitespace again. The two ends are handled
    independently: quotes are removed whether or not they form a matching pair.
    """
    quote_chars = "'\""
    return expr.strip().strip(quote_chars).strip()

def normalize_expr_string(expr: str) -> str:
    """Clean an expression and return it as a double-quoted scalar.

    Steps, in order:
      1. drop a trailing ``# comment`` (a ``#`` preceded by whitespace, through
         the end of the text);
      2. strip surrounding whitespace and quote characters;
      3. add missing braces around ``phv`` identifiers;
      4. double-quote bare ``PREFIX:digits`` tokens;
      5. backslash-escape every double quote and wrap the result in double quotes.

    The comment is removed before the outer quotes are stripped. Doing it the
    other way round leaves the closing quote of ``"a"  # note`` in place,
    because at that point the text still ends with the comment, not the quote.

    Args:
        expr: Raw expression text as found after ``expr:``.

    Returns:
        The normalised expression, including the enclosing double quotes.
    """
    expr = re.sub(r'\s+#.*$', '', expr)
    expr = strip_outer_quotes(expr)
    expr = fix_phv_brackets(expr)
    expr = quote_curies(expr)
    expr = expr.replace('"', '\\"')  # Escape internal quotes
    return f'"{expr}"'

def fix_expr_strings(pre_yaml_text: str) -> str:
    """Rewrite every ``expr:`` entry as a single normalised, quoted line.

    For each line whose content starts with ``expr:``, the text after the key
    is collected together with all directly following lines that are indented
    more deeply than the key. The pieces are concatenated without a separator,
    passed through ``normalize_expr_string`` and emitted as one
    ``expr: <quoted>`` line at the original indentation. Other lines are kept
    as they are.

    Args:
        pre_yaml_text: YAML-like text to repair.

    Returns:
        The repaired text, lines joined with ``"\\n"``.
    """
    key = "expr:"
    rows = pre_yaml_text.splitlines()
    total = len(rows)
    rebuilt = []
    pos = 0

    while pos < total:
        row = rows[pos]
        content = row.lstrip()
        pos += 1

        if not content.startswith(key):
            rebuilt.append(row)
            continue

        margin = len(row) - len(content)
        pieces = [content[len(key):].lstrip()]

        # Continuation lines: everything indented deeper than the expr key.
        while pos < total:
            follower = rows[pos]
            if len(follower) - len(follower.lstrip()) <= margin:
                break
            pieces.append(follower.strip())
            pos += 1

        quoted = normalize_expr_string("".join(pieces))
        rebuilt.append(" " * margin + f"expr: {quoted}")

    return "\n".join(rebuilt)
    

In [None]:
def extract_quantity_subslots(slot_derivs, doc_index, parent_slot_name, populated_from, known_subslots):
    """Collect the sub-slots of one container slot into a ``Quantity`` derivation.

    For every name in ``known_subslots`` the first non-empty entry is taken
    from, in this order: the nested mapping ``slot_derivs[parent_slot_name]``,
    the flat key ``"<parent_slot_name>.<subslot>"``, and the bare top-level key
    ``"<subslot>"``. Entries that are looked at are removed from where they
    were found. Each entry is reduced to exactly one of ``expr``, ``value`` or
    ``populated_from``.

    ``slot_derivs`` is modified in place. The parent slot itself is removed
    only when at least one sub-slot was collected; a parent mapping that holds
    none of the known sub-slots stays where it is instead of being discarded.
    Keys of the parent mapping that are not in ``known_subslots`` are not
    carried over into the result.

    Args:
        slot_derivs: Mapping of slot name to slot derivation; mutated.
        doc_index: Index of the document, used in error messages only.
        parent_slot_name: Name of the container slot to process.
        populated_from: Value stored as ``populated_from`` of the new
            ``Quantity`` class derivation.
        known_subslots: Iterable of sub-slot names to look for.

    Returns:
        ``{"object_derivations": [{"class_derivations": {"Quantity": ...}}]}``
        when at least one sub-slot was found, otherwise ``None``.

    Raises:
        ValueError: If an entry is not a mapping, or carries none of a usable
            ``expr``, ``value`` or string ``populated_from``.
    """
    parent = slot_derivs.get(parent_slot_name)
    # Only a mapping can hold nested sub-slots, e.g. range_low: {value_decimal: ...}.
    nested = parent if isinstance(parent, dict) else {}

    quantity_subslots = {}
    for subkey in known_subslots:
        # Flat spelling of a nested sub-slot, e.g. value_quantity.unit
        flat_key = f"{parent_slot_name}.{subkey}"

        entry = (
            nested.pop(subkey, None)
            or slot_derivs.pop(flat_key, None)
            or slot_derivs.pop(subkey, None)  # e.g. value_decimal defined at top level
        )
        if not entry:
            continue
        if not isinstance(entry, dict):
            raise ValueError(f"[Doc {doc_index}] Unexpected format for `{subkey}` in `{parent_slot_name}`: {entry}")

        pf = entry.get("populated_from")
        expr = entry.get("expr")
        value = entry.get("value")
        # A literal such as 0 or False is a real value; only a missing or
        # empty-string value counts as absent.
        has_value = value is not None and value != ""

        if pf and isinstance(pf, dict) and "expr" in pf:
            quantity_subslots[subkey] = {"expr": pf["expr"]}
        elif expr:
            quantity_subslots[subkey] = {"expr": expr}
        elif has_value:
            quantity_subslots[subkey] = {"value": value}
        elif pf is None:
            raise ValueError(f"[Doc {doc_index}] `{subkey}` in `{parent_slot_name}` has empty `populated_from:` and no `expr:`")
        elif isinstance(pf, str):
            quantity_subslots[subkey] = {"populated_from": pf}
        else:
            raise ValueError(f"[Doc {doc_index}] Malformed `{subkey}` in `{parent_slot_name}`: {entry}")

    if not quantity_subslots:
        return None

    # The nested form is superseded by the returned derivation.
    if isinstance(parent, dict):
        slot_derivs.pop(parent_slot_name, None)

    return {
        "object_derivations": [{
            "class_derivations": {
                "Quantity": {
                    "populated_from": populated_from,
                    "slot_derivations": quantity_subslots
                }
            }
        }]
    }

def refactor_value_quantity(documents):
    """Turn the quantity-like slots of every class derivation into ``Quantity`` objects.

    For each class derivation of each document, the container slots
    ``value_quantity``, ``range_low`` and ``range_high`` are handed to
    ``extract_quantity_subslots`` one after the other; a non-empty result is
    stored back under the container slot's name. The documents are modified in
    place.

    Args:
        documents: Iterable of parsed documents (mappings).

    Returns:
        A new list holding the same (mutated) document objects, in order.
    """
    container_slots = ("value_quantity", "range_low", "range_high")
    quantity_subslots = ["value_decimal", "value_concept", "value_integer", "unit"]

    updated_docs = []
    for doc_index, doc in enumerate(documents):
        for cls_info in doc.get("class_derivations", {}).values():
            slot_derivs = cls_info.get("slot_derivations", {})
            populated_from = cls_info.get("populated_from")

            for target_slot in container_slots:
                result = extract_quantity_subslots(
                    slot_derivs,
                    doc_index,
                    target_slot,
                    populated_from,
                    quantity_subslots
                )
                if result:
                    slot_derivs[target_slot] = result

        updated_docs.append(doc)

    return updated_docs


In [None]:
# Earlier attempts at detecting expressions, kept for reference:
# EXPR_PATTERN = re.compile(r'\b(case)\b|[()+*<>]|(?<!\w)[/-](?=\w)')
# EXPR_PATTERN = re.compile(r'\bcase\b|[()+*<>]|(?<!\w)[/-](?=\w)|(?<=\w)\s*[/-]\s*(?=\w)')

# A braced variable reference: "{phv" + eight digits + "}".
PHV_PATTERN = re.compile(
    r'\{phv\d{8}\}'
)

# Two numbers (integer or decimal) joined by one of - + * /.
NUMERIC_ARITH_PATTERN = re.compile(
    r'\b\d+(\.\d+)?\s*[-+*/]\s*\d+(\.\d+)?\b'
)

# Literal strings that is_const_expr accepts as constants before it applies
# the two patterns above.
KNOWN_CONSTANTS = {
    "{#}/wk",
    "10*6/uL",
    "10*3/uL",
    "g/dL",
    "mg/L",
    "mg/g",
    "U/L",
    "[IU]/L",
    "1/hr",
    "mg/dL",
    "MESA CLASSIC EXAM 2-3",
}

# Slots whose plain-string derivation is turned into a string-valued constant.
SIMPLE_STRING_FIELDS = {
    "method_type",
    "observation_type",
    "vital_status",
}

def is_const_expr(expr: str) -> bool:
    """Tell whether an expression string is treated as a constant.

    The text is stripped of surrounding whitespace and then of surrounding
    quote characters. An empty result is not a constant. A member of
    ``KNOWN_CONSTANTS`` always is. Anything else is a constant unless it
    contains a braced ``phv`` reference (``PHV_PATTERN``) or arithmetic between
    two numbers (``NUMERIC_ARITH_PATTERN``).
    """
    bare = expr.strip().strip("'\"")
    if not bare:
        return False
    if bare in KNOWN_CONSTANTS:
        return True
    return not (PHV_PATTERN.search(bare) or NUMERIC_ARITH_PATTERN.search(bare))

def const_expr_to_value_in_slot(slot_name: str, slot_deriv: dict | str) -> dict:
    if isinstance(slot_deriv, str) and slot_name in SIMPLE_STRING_FIELDS:
        # print(f"🛠 Fixing simple string slot: {slot_name} → value: {slot_deriv}")
        return {"value": slot_deriv.strip("'\""), "range": "string"}

    if isinstance(slot_deriv, dict):
        expr = slot_deriv.get("expr")

        if isinstance(expr, str):
            if expr.strip().startswith("case(") and "case(" in expr.split("case(", 1)[1]:
                print(f"❌ Found malformed case expression: {expr}")
            if is_const_expr(expr):
                slot_deriv["value"] = expr.strip("'\"")
                slot_deriv.pop("expr", None)
                slot_deriv.setdefault("range", "string")

    return slot_deriv

def const_expr_to_value_in_class(class_deriv: dict) -> None:
    """Apply ``const_expr_to_value_in_slot`` to every slot of a class derivation.

    Each entry of ``slot_derivations`` is replaced by the converted slot. For
    slots that are mappings, the class derivations found under their
    ``object_derivations`` are processed recursively. The input is modified in
    place; nothing is returned.
    """
    slot_derivs = class_deriv.get("slot_derivations", {})

    # Iterate over a snapshot because entries are reassigned inside the loop.
    for slot_name, slot_deriv in list(slot_derivs.items()):
        slot_derivs[slot_name] = const_expr_to_value_in_slot(slot_name, slot_deriv)

        if not isinstance(slot_deriv, dict):
            continue

        for obj in slot_deriv.get("object_derivations", []):
            if not isinstance(obj, dict):
                continue
            for inner_class in obj.get("class_derivations", {}).values():
                if isinstance(inner_class, dict):
                    const_expr_to_value_in_class(inner_class)



def const_expr_to_value(parsed_docs: list[dict]) -> list[dict]:
    """Run ``const_expr_to_value_in_class`` over every class derivation.

    The documents are modified in place and the same list is returned.
    """
    class_derivs = (
        class_deriv
        for doc in parsed_docs
        for class_deriv in doc.get("class_derivations", {}).values()
    )
    for class_deriv in class_derivs:
        const_expr_to_value_in_class(class_deriv)
    return parsed_docs


In [None]:
def update_populated_from_with_pht(documents, phv_to_pht, placeholder="FHS"):
    """Replace placeholder class-level ``populated_from`` values using a phv lookup.

    A class derivation whose ``populated_from`` equals ``placeholder`` is
    resolved as follows:

    1. Take the first ``phv`` id mentioned by its slot derivations: a slot's
       string ``populated_from`` that starts with ``phv``, or else the first
       ``phv`` + 8 digits found in a slot's string ``expr``.
    2. If that id is a key of ``phv_to_pht``, use the mapped value.
    3. Otherwise use the value inherited from the enclosing class, if any.
    4. Otherwise print a warning and leave the class unchanged.

    Class derivations nested under ``object_derivations`` are processed
    recursively and inherit the resolved value. Because the inherited value is
    updated while looping over a mapping of classes, a value resolved for one
    class is also seen by the classes that follow it in the same mapping.

    Args:
        documents: Iterable of parsed documents (mappings); modified in place.
        phv_to_pht: Mapping used to look up the replacement for a ``phv`` id.
        placeholder: The ``populated_from`` value to replace. Defaults to
            ``"FHS"``, the value this function previously had hard-coded.

    Returns:
        ``documents`` itself.
    """
    def find_first_phv_in_slot(slot_derivations):
        for slot_value in slot_derivations.values():
            if not isinstance(slot_value, dict):
                continue
            pf = slot_value.get("populated_from")
            if isinstance(pf, str) and pf.startswith("phv"):
                return pf
            expr = slot_value.get("expr")
            if isinstance(expr, str):
                match = re.search(r"(phv\d{8})", expr)
                if match:
                    return match.group(1)
        return None

    def update_class_derivations(cls_derivations, doc_index, context="root", parent_pht=None):
        for cls_name, cls_info in cls_derivations.items():
            if not isinstance(cls_info, dict):
                # e.g. a class key with an empty body
                continue
            # An empty `slot_derivations:` key parses to None.
            slot_derivs = cls_info.get("slot_derivations") or {}

            if cls_info.get("populated_from") == placeholder:
                phv = find_first_phv_in_slot(slot_derivs)
                if phv and phv in phv_to_pht:
                    new_pf = phv_to_pht[phv]
                    cls_info["populated_from"] = new_pf
                    parent_pht = new_pf  # propagate to children
                elif parent_pht:
                    cls_info["populated_from"] = parent_pht
                else:
                    print(f"⚠️ Warning: No matching phv for {context}.{cls_name} in doc {doc_index}")

            # Recurse into nested object_derivations
            for slot_name, slot_value in slot_derivs.items():
                if not isinstance(slot_value, dict):
                    continue
                for obj in slot_value.get("object_derivations") or []:
                    inner_cls_derivs = obj.get("class_derivations") if isinstance(obj, dict) else None
                    if inner_cls_derivs:
                        update_class_derivations(
                            inner_cls_derivs,
                            doc_index,
                            context=f"{context}.{cls_name}.{slot_name}",
                            parent_pht=parent_pht,
                        )

    for doc_index, doc in enumerate(documents):
        update_class_derivations(doc.get("class_derivations") or {}, doc_index)

    return documents

def load_phv_to_pht_map(file_path):
    """Read a ``key: value`` text file into a dictionary.

    Each non-blank line is stripped and split at its first ``": "``; the left
    part becomes the key and the rest of the line the value, so a value may
    itself contain ``": "``. Judging by the function name, keys are presumably
    phv ids and values pht ids; the code does not check this.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        Dict mapping each key to its value; for a repeated key the last line wins.

    Raises:
        ValueError: If a non-blank line contains no ``": "`` separator. The
            message names the file, the line number and the offending text.
    """
    mapping = {}
    with open(file_path, encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            entry = raw_line.strip()
            if not entry:
                continue
            key, separator, value = entry.partition(": ")
            if not separator:
                raise ValueError(
                    f"{file_path}:{line_no}: expected '<key>: <value>', got {entry!r}"
                )
            mapping[key] = value
    return mapping


In [None]:
def extract_inline_comments(parsed_docs: list[dict]) -> list[dict]:
    """Move ``# comment`` tails of ``value`` / ``expr`` strings into ``comments``.

    For every slot derivation that is a mapping, a string ``value`` or ``expr``
    is split at the first ``#`` that opens a comment. The part before it
    (stripped) replaces the field; the part after it (stripped), when not
    empty, is appended to the slot's ``comments`` list.

    A ``#`` opens a comment only at the very start of the text or directly
    after whitespace. A ``#`` embedded in a token, as in the unit ``{#}/wk``,
    is part of the value and is left alone.

    Class derivations nested under a slot's ``object_derivations`` are
    processed as well. A message is printed for each slot derivation that is
    not a mapping.

    Args:
        parsed_docs: List of parsed documents (mappings); modified in place.

    Returns:
        ``parsed_docs`` itself.
    """
    comment_start = re.compile(r'(?:^|\s)#')

    def extract_comment_from_value(val: str) -> tuple[str, Optional[str]]:
        # Returns (value, None) when the text holds no comment marker at all.
        parts = comment_start.split(val, maxsplit=1)
        if len(parts) == 2:
            return parts[0].strip(), parts[1].strip()
        return val, None

    def fix_slot(slot: dict):
        if not isinstance(slot, dict):
            return
        for field in ("value", "expr"):
            val = slot.get(field)
            if not isinstance(val, str):
                continue
            clean_val, comment = extract_comment_from_value(val)
            if comment is None:
                continue
            slot[field] = clean_val
            if comment:
                slot.setdefault("comments", []).append(comment)

    def recurse_class(class_deriv: dict):
        for slot_deriv in (class_deriv.get("slot_derivations") or {}).values():
            fix_slot(slot_deriv)
            if not isinstance(slot_deriv, dict):
                print(f"⚠️ Warning: No matching slot_deriv\n {slot_deriv}\n for\n {class_deriv}\n is not a dict")
                continue
            for obj in slot_deriv.get("object_derivations") or []:
                if not isinstance(obj, dict):
                    continue
                # Each object derivation wraps its own class_derivations
                # mapping; descend into those classes, not the wrapper.
                for inner_class in (obj.get("class_derivations") or {}).values():
                    if isinstance(inner_class, dict):
                        recurse_class(inner_class)

    for doc in parsed_docs:
        for class_deriv in doc.get("class_derivations", {}).values():
            recurse_class(class_deriv)

    return parsed_docs


In [None]:
# spec_dir = Path("/sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/CHS")
# output_dir = Path("/sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/CHS-ingest")

# spec_dir = Path("/sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/HCHS")
# output_dir = Path("/sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/HCHS-ingest")

spec_dir = Path("/sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/MESA/")
output_dir = Path("/sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/MESA-ingest")

SKIP_BASES = {"bld_pressure", "pr_qrs_qt", } 
# RUN_BASES = {"chloride_bld", }

# Files with issues
#   tak_betablk  -- enum derivation
#   bld_pressure  -- MeasurementObservationSet

# Make sure the destination exists before the first file is written.
output_dir.mkdir(parents=True, exist_ok=True)

# Repair and re-serialise every spec file. sorted() gives a stable processing
# order, so the printed log is the same from run to run.
for yaml_file in sorted(spec_dir.glob("*.yaml")):
    base = yaml_file.stem  # Strip .yaml
    output_file = output_dir / f"{base}.yaml"
    
    # if base not in RUN_BASES:
    #     # print(f"⏭️ Skipping {base}")
    #     continue
    
    if base in SKIP_BASES:
        print(f"⏭️ Skipping {base}")
        continue
    
    print(base)
    # Drop trailing whitespace on every line (this also normalises line endings).
    raw = "\n".join(line.rstrip() for line in yaml_file.read_text(encoding="utf-8").splitlines())
    # Text-level repairs, applied before any YAML parsing:
    #   1. insert the missing space in "key:value"
    #   2. rename the key "populated from" to "populated_from"
    #   3. structural fixes for value_mappings, nested expr and expr quoting
    pre_yaml = re.sub(r'^(\s*[^#:\n]+?):(?=\S)', r'\1: ', raw, flags=re.MULTILINE)
    pre_yaml = re.sub(r'^(\s*)populated from:(.*)$', r'\1populated_from:\2', pre_yaml, flags=re.MULTILINE)
    pre_yaml = fix_misindented_value_mappings(pre_yaml)
    pre_yaml = fix_nested_expr_under_populated_from(pre_yaml)
    pre_yaml = fix_expr_strings(pre_yaml)

    # Split into list of class derivations if not already a YAML list
    if pre_yaml.lstrip().startswith("- class_derivations:"):
        parsed_yaml = yaml.safe_load(pre_yaml)
    else:
        pre_yaml_parts = re.split(r'(?<=\n)(?=^class_derivations:\s*)', pre_yaml, flags=re.MULTILINE)
        # A segment holding only comments or blank lines (for example a file
        # header) parses to None; such segments carry no derivations and would
        # break the dict-based passes below.
        parsed_yaml = [
            doc
            for doc in (yaml.safe_load(part) for part in pre_yaml_parts)
            if doc is not None
        ]

    # parsed_yaml = refactor_value_quantity(parsed_yaml)
    parsed_yaml = const_expr_to_value(parsed_yaml)
    
    parsed_yaml = extract_inline_comments(parsed_yaml)
    
    # Dump to YAML; allow_unicode writes non-ASCII characters verbatim, so the
    # file is opened as UTF-8 explicitly.
    with open(output_file, "w", encoding="utf-8") as f:
        yaml.dump(parsed_yaml, f, sort_keys=False, allow_unicode=True)
print("Success!!!")


In [None]:
# Debug aid: show the first parsed document and the repaired YAML text that
# are still bound to parsed_yaml / pre_yaml (the conversion loop assigns both
# on every iteration, so they belong to the file processed last).
first_doc_dump = yaml.dump(parsed_yaml[0])
print(first_doc_dump)
print(pre_yaml)