In [8]:
from pathlib import Path
import re
import subprocess
import yaml

from linkml.validator.loaders import TsvLoader
from linkml.utils.schema_builder import SchemaBuilder

from linkml_runtime.linkml_model import SlotDefinition
from linkml_runtime import SchemaView

from linkml_map.session import Session
from linkml_map.transformer.object_transformer import ObjectTransformer

In [2]:
# Fix malformed yaml
def quote_expr_values(yaml_text):
    def replacer(match):
        indent = match.group(1)
        value = match.group(2).strip()

        if value.startswith('"') or value.startswith("'"):
            return match.group(0)
        if re.match(r'^[\w{}\s\*\+\-/().]+$', value):
            return f'{indent}expr: "{value}"'
        return match.group(0)

    pattern = re.compile(r'^(\s*)expr:\s+(.*)', re.MULTILINE)
    return pattern.sub(replacer, yaml_text)

# raw = Path("NHLBI-BDC-DMC-HV/priority_variables_transform/FHS/bdy_hgt.yaml").read_text()
# quoted_fixed = quote_expr_values(raw)
# split_blocks = re.split(r'(?<=\n)(?=^\s*class_derivations:\s*)', quoted_fixed, flags=re.MULTILINE)
# parsed_docs = [yaml.safe_load(doc) for doc in split_blocks]
# print(yaml.dump(parsed_docs))

In [3]:
def refactor_value_quantity(documents):
    updated_docs = []

    for doc_index, doc in enumerate(documents):
        cd = doc.get("class_derivations", {})
        for cls_name, cls_info in cd.items():
            slot_derivs = cls_info.get("slot_derivations", {})
            populated_from = cls_info.get("populated_from")

            # Extract and clean up the value_decimal slot
            value_decimal_entry = slot_derivs.pop("value_decimal", None)
            cleaned_value_decimal = None
            if value_decimal_entry:
                if isinstance(value_decimal_entry, dict):
                    pf = value_decimal_entry.get("populated_from")
                    expr = value_decimal_entry.get("expr")

                    if pf and isinstance(pf, dict) and "expr" in pf:
                        cleaned_value_decimal = {"expr": pf["expr"]}
                    elif expr:
                        cleaned_value_decimal = {"expr": expr}
                    elif pf is None:
                        raise ValueError(f"[Doc {doc_index}] `value_decimal` has an empty `populated_from:` and no `expr:`")
                    elif isinstance(pf, str):
                        cleaned_value_decimal = {"populated_from": pf}
                    else:
                        raise ValueError(f"[Doc {doc_index}] Malformed `value_decimal`: {value_decimal_entry}")
                else:
                    raise ValueError(f"[Doc {doc_index}] Unexpected `value_decimal` format: {value_decimal_entry}")

            # Extract and clean up the value_quantity.unit slot
            value_unit_entry = slot_derivs.pop("value_quantity.unit", None)
            cleaned_unit = None
            if value_unit_entry:
                if isinstance(value_unit_entry, dict):
                    pf = value_unit_entry.get("populated_from")
                    expr = value_unit_entry.get("expr")

                    if pf and isinstance(pf, dict) and "expr" in pf:
                        cleaned_unit = {"expr": pf["expr"]}
                    elif expr:
                        cleaned_unit = {"expr": expr}
                    elif pf is None:
                        raise ValueError(f"[Doc {doc_index}] `value_quantity.unit` has an empty `populated_from:` and no `expr:`")
                    elif isinstance(pf, str):
                        cleaned_unit = {"populated_from": pf}
                    else:
                        raise ValueError(f"[Doc {doc_index}] Malformed `value_quantity.unit`: {value_unit_entry}")
                else:
                    raise ValueError(f"[Doc {doc_index}] Unexpected `value_quantity.unit` format: {value_unit_entry}")

            # If either was valid, repackage into a nested Quantity
            if cleaned_value_decimal or cleaned_unit:
                quantity_deriv = {
                    "class_derivations": {
                        "Quantity": {
                            "populated_from": populated_from,
                            "slot_derivations": {}
                        }
                    }
                }
                if cleaned_value_decimal:
                    quantity_deriv["class_derivations"]["Quantity"]["slot_derivations"]["value_decimal"] = cleaned_value_decimal
                if cleaned_unit:
                    quantity_deriv["class_derivations"]["Quantity"]["slot_derivations"]["unit"] = cleaned_unit

                slot_derivs["value_quantity"] = {
                    "object_derivations": [quantity_deriv]
                }

        updated_docs.append(doc)

    return updated_docs



# refactored_docs = refactor_value_quantity(parsed_docs)

# print(yaml.dump(refactored_docs))

In [26]:
def refactor_value_quantity(documents):
    updated_docs = []

    for doc_index, doc in enumerate(documents):
        cd = doc.get("class_derivations", {})
        for cls_name, cls_info in cd.items():
            slot_derivs = cls_info.get("slot_derivations", {})
            populated_from = cls_info.get("populated_from")

            # Extract and clean slots to move into Quantity
            quantity_subslots = {}
            for slot in ["value_decimal", "value_concept", "value_integer", "value_quantity.unit"]:
                # Support nested key for value_quantity.unit
                key_in_slot_derivs = slot if slot in slot_derivs else slot.split(".")[-1]
                entry = slot_derivs.pop(slot, None) or slot_derivs.pop(key_in_slot_derivs, None)
                if entry:
                    if isinstance(entry, dict):
                        pf = entry.get("populated_from")
                        expr = entry.get("expr")

                        if pf and isinstance(pf, dict) and "expr" in pf:
                            quantity_subslots[slot.split(".")[-1]] = {"expr": pf["expr"]}
                        elif expr:
                            quantity_subslots[slot.split(".")[-1]] = {"expr": expr}
                        elif pf is None:
                            raise ValueError(f"[Doc {doc_index}] `{slot}` has an empty `populated_from:` and no `expr:`")
                        elif isinstance(pf, str):
                            quantity_subslots[slot.split(".")[-1]] = {"populated_from": pf}
                        else:
                            raise ValueError(f"[Doc {doc_index}] Malformed `{slot}`: {entry}")
                    else:
                        raise ValueError(f"[Doc {doc_index}] Unexpected `{slot}` format: {entry}")

            # If any were found, create a nested Quantity class derivation
            if quantity_subslots:
                quantity_deriv = {
                    "class_derivations": {
                        "Quantity": {
                            "populated_from": populated_from,
                            "slot_derivations": quantity_subslots
                        }
                    }
                }

                slot_derivs["value_quantity"] = {
                    "object_derivations": [quantity_deriv]
                }

        updated_docs.append(doc)

    return updated_docs

In [4]:
def update_populated_from_with_pht(documents, phv_to_pht):
    import re

    def find_first_phv_in_slot(slot_derivations):
        for slot_value in slot_derivations.values():
            if isinstance(slot_value, dict):
                pf = slot_value.get("populated_from")
                expr = slot_value.get("expr")

                if isinstance(pf, str) and pf.startswith("phv"):
                    return pf
                if isinstance(expr, str):
                    match = re.search(r"(phv\d{8})", expr)
                    if match:
                        return match.group(1)
        return None

    def update_class_derivations(cls_derivations, doc_index, context="root"):
        for cls_name, cls_info in cls_derivations.items():
            slot_derivs = cls_info.get("slot_derivations", {})
            pf = cls_info.get("populated_from")

            if pf == "FHS":
                phv = find_first_phv_in_slot(slot_derivs)
                if phv and phv in phv_to_pht:
                    new_pf = phv_to_pht[phv]
                    cls_info["populated_from"] = new_pf
                    # print(f"✅ Updated {context}.{cls_name} populated_from: {phv} -> {new_pf}")
                else:
                    print(f"⚠️ Warning: No matching phv for {context}.{cls_name} in doc {doc_index}")

            # Recurse into nested object_derivations
            for slot_name, slot_value in slot_derivs.items():
                if isinstance(slot_value, dict) and "object_derivations" in slot_value:
                    for obj in slot_value["object_derivations"]:
                        inner_cls_derivs = obj.get("class_derivations")
                        if inner_cls_derivs:
                            update_class_derivations(inner_cls_derivs, doc_index, context=f"{context}.{cls_name}.{slot_name}")

    for doc_index, doc in enumerate(documents):
        top_cd = doc.get("class_derivations", {})
        update_class_derivations(top_cd, doc_index)

    return documents

def load_phv_to_pht_map(file_path):
    with open(file_path) as f:
        return dict(line.strip().split(": ") for line in f if line.strip())

# for phv in $(grep -ho 'phv[0-9]\{8\}' bdy_hgt.yaml | sort -u); do grep -l "$phv" ../../../output/FHS_v31_c1/*.tsv \
# | sed -E "s|.*/(pht[0-9]{6,}).tsv|$phv: \1|"; done > phv_to_pht.txt

# phv_to_pht = load_phv_to_pht_map("NHLBI-BDC-DMC-HV/priority_variables_transform/FHS/phv_to_pht.txt")

# phv_to_pht = {
#     "phv00000680": "pht000009",
#     "phv00001036": "pht000009",
#     "phv00001367": "pht000012",
#     "phv00001559": "pht000012",
#     "phv00002207": "pht000016",
#     "phv00002425": "pht000016",
# }

# pht006027


# pht_replace_docs = update_populated_from_with_pht(refactored_docs, phv_to_pht)

# Dump to YAML
# with open("NHLBI-BDC-DMC-HV/priority_variables_transform/FHS-ingest/" + "bdy_hgt" + ".yaml", "w") as f:
#     yaml.dump(pht_replace_docs, f, sort_keys=False, allow_unicode=True)

# print(yaml.dump(pht_replace_docs))
# print("Success!!!")

In [5]:
raw = Path("NHLBI-BDC-DMC-HV/priority_variables_transform/FHS/bdy_hgt.yaml").read_text()
quoted_fixed = quote_expr_values(raw)
split_blocks = re.split(r'(?<=\n)(?=^\s*class_derivations:\s*)', quoted_fixed, flags=re.MULTILINE)
parsed_docs = [yaml.safe_load(doc) for doc in split_blocks]

refactored_docs = refactor_value_quantity(parsed_docs)

phv_to_pht = load_phv_to_pht_map("NHLBI-BDC-DMC-HV/priority_variables_transform/FHS/phv_to_pht.txt")

pht_replace_docs = update_populated_from_with_pht(refactored_docs, phv_to_pht)

# Dump to YAML
with open("NHLBI-BDC-DMC-HV/priority_variables_transform/FHS-ingest/" + "bdy_hgt" + ".yaml", "w") as f:
    yaml.dump(pht_replace_docs, f, sort_keys=False, allow_unicode=True)

In [17]:
# Set base name
spec_dir = "NHLBI-BDC-DMC-HV/priority_variables_transform/FHS"
base = "afib"
data_dir = "output/FHS_v31_c1"

# Run the shell command to regenerate phv_to_pht.txt
subprocess.run(
    f"""for phv in $(grep -ho 'phv[0-9]\\{{8\\}}' {spec_dir}/{base}.yaml | sort -u); do \
    grep -l "$phv" {data_dir}/*.tsv | \
    sed -E "s|.*/(pht[0-9]{{6,}}).tsv|$phv: \\1|"; done > phv_to_pht.txt""",
    shell=True, check=True,
)

# Load and process the YAML
raw = Path(f"NHLBI-BDC-DMC-HV/priority_variables_transform/FHS/{base}.yaml").read_text()
quoted_fixed = quote_expr_values(raw)
split_blocks = re.split(r'(?<=\n)(?=^\s*class_derivations:\s*)', quoted_fixed, flags=re.MULTILINE)
parsed_docs = [yaml.safe_load(doc) for doc in split_blocks]

refactored_docs = refactor_value_quantity(parsed_docs)

phv_to_pht = load_phv_to_pht_map("NHLBI-BDC-DMC-HV/priority_variables_transform/FHS/phv_to_pht.txt")

pht_replace_docs = update_populated_from_with_pht(refactored_docs, phv_to_pht)

# Dump to YAML
with open(f"NHLBI-BDC-DMC-HV/priority_variables_transform/FHS-ingest/{base}.yaml", "w") as f:
    yaml.dump(pht_replace_docs, f, sort_keys=False, allow_unicode=True)


In [36]:
spec_dir = Path("NHLBI-BDC-DMC-HV/priority_variables_transform/FHS")
data_dir = "output/FHS_v31_c1"
output_dir = Path("NHLBI-BDC-DMC-HV/priority_variables_transform/FHS-ingest")

for yaml_file in spec_dir.glob("*.yaml"):
    base = yaml_file.stem  # Strip .yaml

    # Not completed: cig_smoke, creat_bld, edu_lvl, fam_income, fast_gluc_bld, glucose_bld, hdl, hip_circ
    #                hist_cor_angio, hist_cor_bypg, hist_my_inf, hrtrt, hypertension, insulin_in_blood
    start_at = "icam"
    if base < start_at:
        continue
    
    print(base)

    # Run the shell command to regenerate phv_to_pht.txt
    result = subprocess.run(
        f"""for phv in $(grep -ho 'phv[0-9]\\{{8\\}}' {spec_dir}/{base}.yaml | sort -u); do \
        grep -l "$phv" {data_dir}/*.tsv | \
        sed -E "s|.*/(pht[0-9]{{6,}}).tsv|$phv: \\1|"; done""",
        shell=True, check=True, capture_output=True, text=True,
    )
    phv_to_pht = dict(line.split(": ") for line in result.stdout.strip().splitlines())

    # Load and process the YAML
    raw = yaml_file.read_text()
    quoted_fixed = quote_expr_values(raw)
    split_blocks = re.split(r'(?<=\n)(?=^\s*class_derivations:\s*)', quoted_fixed, flags=re.MULTILINE)
    parsed_docs = [yaml.safe_load(doc) for doc in split_blocks]

    refactored_docs = refactor_value_quantity(parsed_docs)

    # phv_to_pht = load_phv_to_pht_map("NHLBI-BDC-DMC-HV/priority_variables_transform/FHS/phv_to_pht.txt")

    pht_replace_docs = update_populated_from_with_pht(refactored_docs, phv_to_pht)

    # Dump to YAML
    output_file = f"{output_dir}/{base}.yaml"
    with open(output_file, "w") as f:
        yaml.dump(pht_replace_docs, f, sort_keys=False, allow_unicode=True)


icam
insulin_in_blood


ScannerError: while scanning an alias
  in "<unicode string>", line 6, column 25:
            populated_from: ***
                            ^
expected alphabetic or numeric character, but found '*'
  in "<unicode string>", line 6, column 26:
            populated_from: ***
                             ^

In [5]:
# Person - top level class
person_yaml = yaml.safe_load("""
class_derivations:
  Person:
    populated_from: pht000009
    slot_derivations:
      species:
        expr: "'Homo Sapiens'"
      identity:
        populated_from: dbGaP_Subject_ID
""")

# # Dump to YAML
# with open(var_dir + "person" + ".yaml", "w") as f:
#     yaml.dump(person_yaml, f, sort_keys=False, allow_unicode=True)

In [4]:
source_sv = SchemaView("/sbgenomics/workspace/output/Schema_FHS_v31_c1/schema-automator-data/Schema_FHS_v31_c1.yaml")
source_schema = source_sv.schema

target_sv = SchemaView("NHLBI-BDC-DMC-HM/src/bdchm/schema/bdchm.yaml")
target_schema = target_sv.schema

var_dir = "NHLBI-BDC-DMC-HV/priority_variables_transform/FHS/"

In [6]:
transform_spec = person_yaml

data_loader = TsvLoader("/sbgenomics/workspace/output/FHS_v31_c1/pht000009.tsv")
data_rows = data_loader.iter_instances()

first_row = next(data_rows)
cur_row = first_row

# Create ObjectTransformer and apply transformation
transformer = ObjectTransformer(unrestricted_eval=True)
transformer.source_schemaview = SchemaView(source_schema)
transformer.target_schemaview = SchemaView(target_schema)
transformer.create_transformer_specification(transform_spec)

result = transformer.map_object(cur_row, source_type="pht000009")

print(result)
print("Transformation Successful!")

{'species': 'Homo Sapiens', 'identity': [16956]}
Transformation Successful!


In [7]:
# Participant - top level class for study data
participant_yaml = yaml.safe_load("""
class_derivations:
  Participant:
    populated_from: pht000009
    slot_derivations:
      # associated_participant: 
      #   populated_from: phv00007675
      identity:
        populated_from: dbGaP_Subject_ID
      member_of_research_study:
        expr: "'FHS'"
""")

# # Dump to YAML
# with open(var_dir + "participant" + ".yaml", "w") as f:
#     yaml.dump(participant_yaml, f, sort_keys=False, allow_unicode=True)

In [8]:
# Load MeasurementObservation class derivations
bdy_hgt = refactored_docs
# bdy_hgt = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "bdy_hgt" + ".yaml")))
# bdy_wgt = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "bdy_wgt" + ".yaml")))
# bmi = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "bmi" + ".yaml")))
# bp_diastolic = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "bp_diastolic" + ".yaml")))
# bp_systolic = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "bp_systolic" + ".yaml")))
# fev1 = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "fev1" + ".yaml")))
# fev1_fvc = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "fev1_fvc" + ".yaml")))
# fvc = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "fvc" + ".yaml")))
# hrt_rt = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "hrt_rt" + ".yaml")))
# spo2 = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "spo2" + ".yaml")))

# Get the demography slot on Participants class
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_exposures_slot = participant_cls.setdefault("slot_derivations", {}).setdefault("exposures", {})

# Add the Demography object_derivation to the demography slot
participant_exposures_slot.setdefault("object_derivations", bdy_hgt)
# participant_exposures_slot.setdefault("object_derivations", [
#     bdy_hgt,
#     tak_betablk,
#     tak_adrenergics,
#     tak_cort_steroid_resp,
#     tak_cort_steroid_oral,
#     tak_anabolic_steroid,
# ])

[{'class_derivations': {'MeasurementObservation': {'populated_from': 'pht000009',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00001036'},
     'associated_visit': {'expr': "'FHS ORIGINAL EXAM 4'"},
     'observation_type': {'expr': "'OBA:VT0001253'"},
     'value_quantity': {'object_derivations': [{'class_derivations': {'Quantity': {'populated_from': 'pht000009',
          'slot_derivations': {'value_decimal': {'expr': '{phv00000680} * 2.54'},
           'unit': {'expr': "'cm'"}}}}}]}}}}},
 {'class_derivations': {'MeasurementObservation': {'populated_from': 'pht000009',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00001036'},
     'associated_visit': {'expr': "'FHS ORIGINAL EXAM 1'"},
     'observation_type': {'expr': "'OBA:VT0001253'"},
     'value_quantity': {'object_derivations': [{'class_derivations': {'Quantity': {'populated_from': 'pht000009',
          'slot_derivations': {'value_decimal': {'expr': '{phv00000539} * 2.54'

In [9]:
transform_spec = participant_yaml

input_data = cur_row

# Create ObjectTransformer and apply transformation
transformer = ObjectTransformer(unrestricted_eval=True)
transformer.source_schemaview = SchemaView(source_schema)
transformer.target_schemaview = SchemaView(target_schema)
transformer.create_transformer_specification(transform_spec)

result = transformer.map_object(input_data, source_type="pht000009")

# print(result)
print("Transformation Successful!")

{'identity': [16956], 'member_of_research_study': 'FHS', 'exposures': [{'associated_participant': 1, 'associated_visit': 'FHS ORIGINAL EXAM 4', 'observation_type': 'OBA:VT0001253', 'value_quantity': {'value_decimal': 162.56, 'unit': 'cm'}}, {'associated_participant': 1, 'associated_visit': 'FHS ORIGINAL EXAM 1', 'observation_type': 'OBA:VT0001253', 'value_quantity': {'value_decimal': 162.56, 'unit': 'cm'}}, {'associated_participant': 1, 'associated_visit': 'FHS ORIGINAL EXAM 5', 'observation_type': 'OBA:VT0001253', 'value_quantity': {'value_decimal': 162.56, 'unit': 'cm'}}]}
Transformation Successful!


In [8]:
# Let's start with one simple condition
angina = yaml.safe_load(open(str(var_dir + "angina" + ".yaml")))
# print(yaml.dump(angina))

# Get the conditions slot on Participants class
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_exposures_slot = participant_cls.setdefault("slot_derivations", {}).setdefault("conditions", {})

# Add the conditions object_derivation to the demography slot
participant_exposures_slot.setdefault("object_derivations", [
    angina,
    # asthma,
    # copd,
    # diabetes,
    # hist_hrt_failure,
    # hist_my_inf,
    # hyperten,
    # pad,
    # slp_ap,
    # stroke,
    # stroke_isch_atk,
])

[{'class_derivations': {'Condition': {'populated_from': 'pht000030',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00056635'},
     'associated_visit': {'expr': "'FHS OFFSPRING BASELINE'"},
     'condition_concept': {'expr': "'HP:0001681'"},
     'condition_status': {'populated_from': 'phv00055298',
      'value_mappings': {'0': 'ABSENT', '1': 'PRESENT', '8': 'UNKNOWN'}},
     'condition_provenance': {'expr': "'CLINICAL_DIAGNOSIS'"},
     'relationship_to_participant': {'expr': "'ONESELF'"}}}}}]

In [10]:
transform_spec = participant_yaml

input_data = cur_row

# Create ObjectTransformer and apply transformation
transformer = ObjectTransformer(unrestricted_eval=True)
transformer.source_schemaview = SchemaView(source_schema)
transformer.target_schemaview = SchemaView(target_schema)
transformer.create_transformer_specification(transform_spec)

result = transformer.map_object(input_data, source_type="pht000030")

print(result)
print("Transformation Successful!")

{'identity': [16957], 'member_of_research_study': 'FHS', 'conditions': [{'associated_participant': None, 'associated_visit': 'FHS OFFSPRING BASELINE', 'condition_concept': 'HP:0001681', 'condition_status': None, 'condition_provenance': 'CLINICAL_DIAGNOSIS', 'relationship_to_participant': 'ONESELF'}]}
Transformation Successful!


In [None]:
transform_spec = refactored_docs


data_loader = TsvLoader("/sbgenomics/workspace/output/FHS_v31_c1/pht000009.tsv")
data_rows = data_loader.iter_instances()

first_row = next(data_rows)
cur_row = first_row

input_data = cur_row

# Create ObjectTransformer and apply transformation
transformer = ObjectTransformer(unrestricted_eval=True)
transformer.source_schemaview = SchemaView(source_schema)
transformer.target_schemaview = SchemaView(target_schema)
transformer.create_transformer_specification(transform_spec)

result = transformer.map_object(input_data, source_type="pht000009")

print(result)
print("Transformation Successful!")

In [None]:
# Turn sequential class_derivations (malformed yaml) into list of class derivations.

# Function to fix unquoted expr: text (malformed yaml)
def quote_expr_values(yaml_text):
    def replacer(match):
        indent = match.group(1)
        value = match.group(2).strip()

        # Don't quote if already quoted OR looks like a quoted literal
        if value.startswith('"') or value.startswith("'"):
            return match.group(0)

        # Don't quote if it's a simple scalar (e.g., a single variable)
        if re.match(r'^[\w{}\s\*\+\-/().]+$', value):
            return f'{indent}expr: "{value}"'

        # Otherwise, leave it as-is
        return match.group(0)

    pattern = re.compile(r'^(\s*)expr:\s+(.*)', re.MULTILINE)
    return pattern.sub(replacer, yaml_text)

# Read the raw YAML as text
raw = Path("NHLBI-BDC-DMC-HV/priority_variables_transform/FHS/bdy_hgt_test.yaml").read_text()
# print(raw)

quoted_fixed = quote_expr_values(raw)
# Split while *keeping* 'class_derivations:' in each result, skip first line
split_blocks = re.split(r'(?<=\n)(?=^\s*class_derivations:\s*)', quoted_fixed, flags=re.MULTILINE)
# print(split_blocks)

# print(split_blocks)
parsed_docs = [yaml.safe_load(doc) for doc in split_blocks]
print(yaml.dump(parsed_docs))


raw2 = Path("NHLBI-BDC-DMC-HV/priority_variables_transform/FHS/bdy_hgt_test2.yaml").read_text()
# print(raw2)

# Split while *keeping* 'class_derivations:' in each result, skip first line
split_blocks2 = re.split(r'(?<=\n)(?=^\s*class_derivations:\s*)', raw2, flags=re.MULTILINE)
# print(split_blocks2)
print(split_blocks == split_blocks2)
# quoted_fixed2 = quote_expr_values(raw2)
# print(quoted_fixed2)
parsed_docs2 = [yaml.safe_load(doc) for doc in split_blocks2]
# print(yaml.dump(parsed_docs2))

# parsed_docs = [yaml.safe_load(doc) for doc in quoted_fixed]

# Use these if we can't guarantee the first class_derivation is first line of file.
# # Split while *keeping* 'class_derivations:' in each result
# split_blocks = re.split(r'(?=^\s*class_derivations:\s*)', raw, flags=re.MULTILINE)
# # First block is likely empty or comments/header — skip it
# blocks = [b for b in split_blocks[1:]]
# parsed_docs = [yaml.safe_load(doc) for doc in blocks]

# print(yaml.dump(quoted_fixed))

In [None]:
# Turn sequential class_derivations (malformed yaml) into list of class derivations.

# Read the raw YAML as text
raw = Path("NHLBI-BDC-DMC-HV/priority_variables_transform/FHS/apnea_hypop_index.yaml").read_text()
print(raw)

# Split while *keeping* 'class_derivations:' in each result, skip first line
split_blocks = re.split(r'(?<=\n)(?=^\s*class_derivations:\s*)', raw, flags=re.MULTILINE)
parsed_docs = [yaml.safe_load(doc) for doc in split_blocks]

# Use these if we can't guarantee the first class_derivation is first line of file.
# # Split while *keeping* 'class_derivations:' in each result
# split_blocks = re.split(r'(?=^\s*class_derivations:\s*)', raw, flags=re.MULTILINE)
# # First block is likely empty or comments/header — skip it
# blocks = [b for b in split_blocks[1:]]
# parsed_docs = [yaml.safe_load(doc) for doc in blocks]

print(yaml.dump(parsed_docs))

In [None]:
import csv
from pathlib import Path

class LazySubjectDict(dict):
    """
    Lazily loads per-pht data for a single subject on demand.
    """
    def __init__(self, subject_id, data_dir):
        super().__init__()
        self.subject_id = subject_id
        self.data_dir = Path(data_dir)
        self._cache = {}

    def __getitem__(self, pht_id):
        if pht_id in self._cache:
            return self._cache[pht_id]

        file_path = self.data_dir / f"{pht_id}.tsv"
        if not file_path.exists():
            raise KeyError(f"No such file: {file_path}")

        with open(file_path, newline='') as f:
            reader = csv.DictReader(f, delimiter="\t")
            for row in reader:
                if row.get("dbGaP_Subject_ID") == self.subject_id:
                    self._cache[pht_id] = row
                    return row

        raise KeyError(f"Subject {self.subject_id} not found in {pht_id}")

In [None]:
source_sv = SchemaView("/sbgenomics/workspace/output/Schema_FHS_v31_c1/schema-automator-data/Schema_FHS_v31_c1.yaml")
source_schema = source_sv.schema

target_sv = SchemaView("NHLBI-BDC-DMC-HM/src/bdchm/schema/bdchm.yaml")
target_schema = target_sv.schema

data_loader = TsvLoader("/sbgenomics/workspace/output/FHS_v31_c1/pht000030.tsv")
data_rows = data_loader.iter_instances()

first_row = next(data_rows)
cur_row = first_row

var_dir = "NHLBI-BDC-DMC-HV/priority_variables_transform/FHS/"
print(cur_row)

In [None]:
# Person - top level class
person_yaml = yaml.safe_load("""
class_derivations:
  Person:
    populated_from: pht000030
    slot_derivations:
      species:
        expr: "'Homo Sapiens'"
      identity:
        populated_from: dbGaP_Subject_ID
""")

# # Dump to YAML
# with open(var_dir + "person" + ".yaml", "w") as f:
#     yaml.dump(person_yaml, f, sort_keys=False, allow_unicode=True)

In [None]:
# Participant - top level class for study data
participant_yaml = yaml.safe_load("""
class_derivations:
  Participant:
    populated_from: pht000030
    slot_derivations:
      # associated_participant: 
      #   populated_from: phv00007675
      identity:
        populated_from: dbGaP_Subject_ID
      member_of_research_study:
        expr: "'FHS'"
""")

# # Dump to YAML
# with open(var_dir + "participant" + ".yaml", "w") as f:
#     yaml.dump(participant_yaml, f, sort_keys=False, allow_unicode=True)

In [None]:
transform_spec = participant_yaml

input_data = cur_row

# Create ObjectTransformer and apply transformation
transformer = ObjectTransformer(unrestricted_eval=True)
transformer.source_schemaview = SchemaView(source_schema)
transformer.target_schemaview = SchemaView(target_schema)
transformer.create_transformer_specification(transform_spec)

result = transformer.map_object(input_data, source_type="pht000030")

print(result)
print("Transformation Successful!")

In [None]:
# Get the participants slot
person_class = person_yaml.setdefault("class_derivations", {}).setdefault("Person", {})
person_participants_slot = person_class.setdefault("slot_derivations", {}).setdefault("participants", {})

# Add the Participant object_derivation to the participants slot
person_participants_slot.setdefault("object_derivations", [ participant_yaml ])

# print(yaml.dump(person_yaml, sort_keys=False))

In [None]:
transform_spec = person_yaml

input_data = cur_row

# Create ObjectTransformer and apply transformation
transformer = ObjectTransformer(unrestricted_eval=True)
transformer.source_schemaview = SchemaView(source_schema)
transformer.target_schemaview = SchemaView(target_schema)
transformer.create_transformer_specification(transform_spec)

result = transformer.map_object(input_data, source_type="pht000030")

print(result)
print("Transformation Successful!")

In [None]:
# Let's start with one simple condition
angina = yaml.safe_load(open(str(var_dir + "angina" + ".yaml")))
# print(yaml.dump(angina))

# Get the conditions slot on Participants class
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_exposures_slot = participant_cls.setdefault("slot_derivations", {}).setdefault("conditions", {})

# Add the conditions object_derivation to the demography slot
participant_exposures_slot.setdefault("object_derivations", [
    angina,
    # asthma,
    # copd,
    # diabetes,
    # hist_hrt_failure,
    # hist_my_inf,
    # hyperten,
    # pad,
    # slp_ap,
    # stroke,
    # stroke_isch_atk,
])

In [None]:
print(yaml.dump(person_yaml))

In [None]:
transform_spec = person_yaml

data_loader = TsvLoader("/sbgenomics/workspace/output/FHS_v31_c1/pht000030.tsv")
data_rows = data_loader.iter_instances()

first_row = next(data_rows)
cur_row = first_row

other_data_loader = TsvLoader("/sbgenomics/workspace/output/FHS_v31_c1/pht000395.tsv")
other_data_rows = other_data_loader.iter_instances()

other_first_row = next(other_data_rows)
other_cur_row = other_first_row

input_data = {
    "pht000030": cur_row,
    "pht000395": other_cur_row
}


# Create ObjectTransformer and apply transformation
transformer = ObjectTransformer(unrestricted_eval=True)
transformer.source_schemaview = SchemaView(source_schema)
transformer.target_schemaview = SchemaView(target_schema)
transformer.create_transformer_specification(transform_spec)

result = transformer.map_object(input_data, source_type="FHS")

print(result)
print("Transformation Successful!")

In [None]:
# Load Condition class derivations
angina = yaml.safe_load(open(str(var_dir + "condition/" + "angina" + ".yaml")))
asthma = yaml.safe_load(open(str(var_dir + "condition/" + "asthma" + ".yaml")))
copd = yaml.safe_load(open(str(var_dir + "condition/" + "copd" + ".yaml")))
diabetes = yaml.safe_load(open(str(var_dir + "condition/" + "diabetes" + ".yaml")))
hist_hrt_failure = yaml.safe_load(open(str(var_dir + "condition/" + "hist_hrt_failure" + ".yaml")))
hist_my_inf = yaml.safe_load(open(str(var_dir + "condition/" + "hist_my_inf" + ".yaml")))
hyperten = yaml.safe_load(open(str(var_dir + "condition/" + "hyperten" + ".yaml")))
pad = yaml.safe_load(open(str(var_dir + "condition/" + "pad" + ".yaml")))
slp_ap = yaml.safe_load(open(str(var_dir + "condition/" + "slp_ap" + ".yaml")))
stroke = yaml.safe_load(open(str(var_dir + "condition/" + "stroke" + ".yaml")))
stroke_isch_atk = yaml.safe_load(open(str(var_dir + "condition/" + "stroke_isch_atk" + ".yaml")))

# Get the conditions slot on Participants class
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_exposures_slot = participant_cls.setdefault("slot_derivations", {}).setdefault("conditions", {})

# Add the conditions object_derivation to the demography slot
participant_exposures_slot.setdefault("object_derivations", [
    angina,
    asthma,
    copd,
    diabetes,
    hist_hrt_failure,
    hist_my_inf,
    hyperten,
    pad,
    slp_ap,
    stroke,
    stroke_isch_atk,
])

In [None]:
demography_yaml = yaml.safe_load("""
class_derivations:
  Demography:
    populated_from: COPDGene
    slot_derivations:
      associated_participant:
        populated_from: phv00159568
      sex:
        populated_from: phv00159571
        value_mappings:
          '1': OMOP:8507  # MALE
          '2': OMOP:8532  # FEMALE
      ethnicity:
        populated_from: phv00159573
        value_mappings:
          '1': HISPANIC_OR_LATINO
          '2': NOT_HISPANIC_OR_LATINO
      race:
        populated_from: phv00159572
        value_mappings:
          '1': OMOP:8527
          '2': OMOP:8516
          '3': OMOP:8515
          '4': OMOP:8557
          '5': OMOP:8657
          '6': OMOP:45880900
          '7': OMOP:8552
""")

# Dump to YAML
with open(var_dir + "demography" + ".yaml", "w") as f:
    yaml.dump(demography_yaml, f, sort_keys=False, allow_unicode=True)

In [None]:
# Get the demography slot on Participants class
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_demography_slot = participant_cls.setdefault("slot_derivations", {}).setdefault("demography", {})

# Add the Demography object_derivation to the demography slot
participant_demography_slot.setdefault("object_derivations", [ demography_yaml ])

In [None]:
# Load DrugExposure class derivations
tak_betablk_resp = yaml.safe_load(open(str(var_dir + "exposure/" + "tak_betablk_resp" + ".yaml")))
tak_betablk = yaml.safe_load(open(str(var_dir + "exposure/" + "tak_betablk" + ".yaml")))
tak_adrenergics = yaml.safe_load(open(str(var_dir + "exposure/" + "tak_adrenergics" + ".yaml")))
tak_cort_steroid_resp = yaml.safe_load(open(str(var_dir + "exposure/" + "tak_cort_steroid_resp" + ".yaml")))
tak_cort_steroid_oral = yaml.safe_load(open(str(var_dir + "exposure/" + "tak_cort_steroid_oral" + ".yaml")))
tak_anabolic_steroid = yaml.safe_load(open(str(var_dir + "exposure/" + "tak_anabolic_steroid" + ".yaml")))

# Get the demography slot on Participants class
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_exposures_slot = participant_cls.setdefault("slot_derivations", {}).setdefault("exposures", {})

# Add the Demography object_derivation to the demography slot
participant_exposures_slot.setdefault("object_derivations", [
    tak_betablk_resp,
    tak_betablk,
    tak_adrenergics,
    tak_cort_steroid_resp,
    tak_cort_steroid_oral,
    tak_anabolic_steroid,
])

In [None]:
# Load MeasurementObservation class derivations
bdy_hgt = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "bdy_hgt" + ".yaml")))
bdy_wgt = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "bdy_wgt" + ".yaml")))
bmi = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "bmi" + ".yaml")))
bp_diastolic = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "bp_diastolic" + ".yaml")))
bp_systolic = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "bp_systolic" + ".yaml")))
fev1 = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "fev1" + ".yaml")))
fev1_fvc = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "fev1_fvc" + ".yaml")))
fvc = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "fvc" + ".yaml")))
hrt_rt = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "hrt_rt" + ".yaml")))
spo2 = yaml.safe_load(open(str(var_dir + "measurement_observation/" + "spo2" + ".yaml")))

# Get the demography slot on Participants class
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_exposures_slot = participant_cls.setdefault("slot_derivations", {}).setdefault("exposures", {})

# Add the Demography object_derivation to the demography slot
participant_exposures_slot.setdefault("object_derivations", [
    bdy_hgt,
    tak_betablk,
    tak_adrenergics,
    tak_cort_steroid_resp,
    tak_cort_steroid_oral,
    tak_anabolic_steroid,
])

In [None]:
observation_yaml = yaml.safe_load("""
class_derivations:
  Observation:
    populated_from: COPDGene
    slot_derivations:
      associated_participant:
        populated_from: phv00159568
      observation_type:
        expr: "'OMOP:4282779'"  # Cigarette smoking status
      value_enum:
        expr: "'OMOP:40766945' if {phv00159749} == 1 else 'OMOP:45883458' if {phv00159747} == 1 else 'OMOP:45883537'"
""")

# Dump to YAML
with open(var_dir + "observation" + ".yaml", "w") as f:
    yaml.dump(observation_yaml, f, sort_keys=False, allow_unicode=True)

In [None]:
# Get the observations slot on Participants class
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_observations_slot = participant_cls.setdefault("slot_derivations", {}).setdefault("observations", {})

# Add the Demography object_derivation to the demography slot
participant_observations_slot.setdefault("object_derivations", [ observation_yaml ])

# print(yaml.dump(person_yaml))

In [None]:
# hist_cor_angio
hist_cor_angio = yaml.safe_load("""
class_derivations:
  Procedure:
    populated_from: COPDGene
    slot_derivations:
      associated_participant:
        populated_from: phv00159568
      procedure_concept:
        expr: "'OMOP:4184832'"  # Coronary angioplasty
      procedure_status:
        populated_from: phv00159632
        value_mappings:
          '0': ABSENT
          '1': PRESENT
""")

# Dump to YAML
with open(var_dir + "procedure/" + "hist_cor_angio" + ".yaml", "w") as f:
    yaml.dump(hist_cor_angio, f, sort_keys=False, allow_unicode=True)

In [None]:
transform_spec = hist_cor_angio

input_data = cur_row

# Create ObjectTransformer and apply transformation
transformer = ObjectTransformer(unrestricted_eval=True)
transformer.source_schemaview = SchemaView(source_schema)
transformer.target_schemaview = SchemaView(target_schema)
transformer.create_transformer_specification(transform_spec)

result = transformer.map_object(input_data, source_type="COPDGene")

print(result)
print("Transformation Successful!")

In [None]:
# hist_cor_bypg
hist_cor_bypg = yaml.safe_load("""
class_derivations:
  Procedure:
    populated_from: COPDGene
    slot_derivations:
      associated_participant:
        populated_from: phv00159568
      procedure_concept:
        expr: "'OMOP:4336464'"  #coronary bypass graft
      procedure_status:
        populated_from: phv00159631
        value_mappings:
          '0': ABSENT
          '1': PRESENT
""")

# Dump to YAML
with open(var_dir + "procedure/" + "hist_cor_bypg" + ".yaml", "w") as f:
    yaml.dump(hist_cor_bypg, f, sort_keys=False, allow_unicode=True)

In [None]:
# Get the observations slot on Participants class
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_procedures_slot = participant_cls.setdefault("slot_derivations", {}).setdefault("procedures", {})

# Add the Demography object_derivation to the demography slot
participant_procedures_slot.setdefault("object_derivations", [ hist_cor_angio, hist_cor_bypg ])

In [None]:
# edu_lvl
edu_lvl = yaml.safe_load("""
class_derivations:
  SdohObservation:
    populated_from: COPDGene
    slot_derivations:
      associated_participant:
        populated_from: phv00159568
      category:
        expr: "'EDUCATIONAL_ATTAINMENT'"
      value_enum:
        populated_from: phv00159773
        value_mappings:
          '1': 8TH_GRADE_OR_LESS
          '2': HIGH_SCHOOL_NO_DIPLOMA
          '3': HIGH_SCHOOL_GRADUATE_GED
          '4': SOME_COLLEGE_OR_TECH_NO_DEGREE
          '5': COLLEGE_OR_TECH_WITH_DEGREE
          '6': MASTERS_OR_DOCTORAL_DEGREE
""")

# Dump to YAML
with open(var_dir + "sdoh_observation/" + "edu_lvl" + ".yaml", "w") as f:
    yaml.dump(edu_lvl, f, sort_keys=False, allow_unicode=True)

In [None]:
# Get the observations slot on Participants class
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_sdoh_observations_slot = participant_cls.setdefault("slot_derivations", {}).setdefault("sdoh_observations", {})

# Add the Demography object_derivation to the demography slot
participant_sdoh_observations_slot.setdefault("object_derivations", [ edu_lvl ])

In [None]:
transform_spec = person_yaml

input_data = cur_row

# Create ObjectTransformer and apply transformation
transformer = ObjectTransformer(unrestricted_eval=True)
transformer.source_schemaview = SchemaView(source_schema)
transformer.target_schemaview = SchemaView(target_schema)
transformer.create_transformer_specification(transform_spec)

# Transform all rows
output_data = []
for row in data_rows:
    result = transformer.map_object(row, source_type="COPDGene")
    if result:  # Avoid None or empty dicts
        output_data.append(result)

# Final wrapped structure (key should match the collection slot, or be schema-compatible)
wrapped_output = {
    "persons": output_data
}

# Dump to YAML
with open("transformed_person_data_DS_CS.yaml", "w") as f:
    yaml.dump(wrapped_output, f, sort_keys=False, allow_unicode=True)

print("Transformation Successful!")

In [None]:
# Dump final Person class to YAML
with open(var_dir + "person" + ".yaml", "w") as f:
    yaml.dump(person_yaml, f, sort_keys=False, allow_unicode=True)