In [1]:
from itertools import islice
import os
from pathlib import Path
import subprocess
import time
import yaml

from linkml.validator.loaders import TsvLoader
from linkml_runtime import SchemaView
from linkml_map.transformer.object_transformer import ObjectTransformer

In [2]:
class DataLoader:
    """Dictionary-style access to per-table TSV files stored in one directory.

    ``loader[pht_id]`` returns the row iterator produced by
    ``TsvLoader(...).iter_instances()`` for ``<base_path>/<pht_id>.tsv``;
    ``pht_id in loader`` reports whether that file exists.
    """

    def __init__(self, base_path):
        # Directory that holds the ``<pht_id>.tsv`` files.
        self.base_path = base_path

    def _tsv_path(self, pht_id):
        """Build the path of the TSV file that belongs to ``pht_id``."""
        return os.path.join(self.base_path, f"{pht_id}.tsv")

    def __contains__(self, pht_id):
        """Return True when a TSV file for ``pht_id`` exists under the base path."""
        return os.path.exists(self._tsv_path(pht_id))

    def __getitem__(self, pht_id):
        """Return an iterator over the rows of the TSV file for ``pht_id``.

        Raises:
            FileNotFoundError: if no ``<pht_id>.tsv`` exists under the base path.
        """
        file_path = self._tsv_path(pht_id)
        if os.path.exists(file_path):
            return TsvLoader(file_path).iter_instances()
        raise FileNotFoundError(f"No TSV file found for {pht_id} at {file_path}")

In [3]:
def get_spec_files(directory, search_string):
    """
    Find YAML files under ``directory`` whose content matches ``search_string``.

    The search is delegated to ``grep -rl``, so ``search_string`` is interpreted
    as a grep pattern and the directory is searched recursively.

    Args:
        directory: Directory to search (str or Path).
        search_string: Pattern to look for, e.g. ``" Condition:"``.

    Returns:
        A list of ``Path`` objects for matching ``.yaml``/``.yml`` files, sorted
        by file stem. The list is empty when nothing matches.

    Raises:
        subprocess.CalledProcessError: if grep itself fails (exit status other
            than 0 or 1), e.g. because the directory does not exist.
    """
    directory = Path(directory)

    # Equivalent shell command:  grep -rl -e " Condition:" <directory>
    # ``-e`` keeps a pattern that starts with "-" from being read as an option.
    result = subprocess.run(
        ['grep', '-rl', '-e', search_string, str(directory)],
        stdout=subprocess.PIPE,
        text=True,
        check=False,
    )

    # grep exits with 1 when no line matched; that is a valid "nothing found"
    # result, not an error, so it must not abort the caller.
    if result.returncode == 1:
        return []
    if result.returncode != 0:
        raise subprocess.CalledProcessError(
            result.returncode, result.args, output=result.stdout
        )

    # One path per output line; keep only YAML files.
    file_paths = [
        Path(line) for line in result.stdout.splitlines()
        if line.endswith(('.yaml', '.yml'))
    ]
    return sorted(file_paths, key=lambda p: p.stem)

In [4]:
def multi_spec_transform(data_loader, spec_files, source_schema, target_schema):
    """Apply every transformation spec in ``spec_files`` and yield the mapped rows.

    Each spec file is loaded with ``yaml.safe_load`` and iterated as a sequence
    of blocks. For every entry in a block's ``class_derivations``, the rows of
    the table named by ``populated_from`` are fetched from ``data_loader`` and
    passed one by one through an ``ObjectTransformer`` configured with the block.

    Progress is printed to stdout: the file stem, then one "." per block.

    Args:
        data_loader: Mapping-like object; ``data_loader[pht_id]`` must return an
            iterable of source rows.
        spec_files: Iterable of ``Path`` objects pointing at YAML spec files.
        source_schema: Schema used to build the source ``SchemaView``.
        target_schema: Schema used to build the target ``SchemaView``.

    Yields:
        The value returned by ``transformer.map_object`` for each source row.

    Raises:
        Exception: any error raised while processing a file is reported on
            stdout (with the block being processed, if any) and re-raised.
    """
    import traceback

    for file in spec_files:
        print(f"{file.stem}", end='', flush=True)
        # Reset per file so the error report never shows a block left over from
        # a previous file and never fails when no block has been read yet.
        block = None
        try:
            with open(file) as f:
                specs = yaml.safe_load(f)
            for block in specs:
                derivation = block["class_derivations"]
                print(".", end='', flush=True)
                for class_name, class_spec in derivation.items():
                    pht_id = class_spec["populated_from"]
                    rows = data_loader[pht_id]

                    # NOTE(review): unrestricted_eval=True presumably lets spec
                    # expressions run arbitrary Python; only use trusted specs.
                    transformer = ObjectTransformer(unrestricted_eval=True)
                    transformer.source_schemaview = SchemaView(source_schema)
                    transformer.target_schemaview = SchemaView(target_schema)
                    transformer.create_transformer_specification(block)

                    for row in rows:
                        mapped = transformer.map_object(row, source_type=pht_id)
                        yield mapped
            print('')
        except Exception as e:
            print(f"\n⚠️  Error processing {file}: {e.__class__.__name__} - {e}")
            if block is not None:
                print(block)
            traceback.print_exc()
            raise

In [5]:
def batched(iterable, batch_size):
    """Yield consecutive lists of up to ``batch_size`` items from ``iterable``.

    The final list may be shorter; an empty iterable yields nothing.
    """
    source = iter(iterable)
    while True:
        try:
            head = next(source)
        except StopIteration:
            return
        # Start the chunk with the item just pulled, then top it up lazily.
        chunk = [head]
        chunk.extend(islice(source, batch_size - 1))
        yield chunk

In [6]:
# Source (study) schema. Exactly one SchemaView assignment is active; the
# commented-out lines are the alternatives for the other studies.
# source_sv = SchemaView("/sbgenomics/workspace/output/CHS/Schema_CHS_v7_c1/Schema_CHS_v7_c1.yaml")
# source_sv = SchemaView("/sbgenomics/workspace/output/HCHS_SOL_cleaned/Schema_HCHS_SOL_v1_c1.yaml")
# source_sv = SchemaView("/sbgenomics/workspace/output/MESA/Schema_MESA_v13_c1/Schema_MESA_v13_c1.yaml")
source_sv = SchemaView("/sbgenomics/workspace/output/CHS/Schema_CHS_v7_c1/Schema_CHS_v7_c1.yaml")
source_schema = source_sv.schema

# Target schema that the source rows are mapped into.
target_sv = SchemaView("/sbgenomics/workspace/NHLBI-BDC-DMC-HM/src/bdchm/schema/bdchm.yaml")
target_schema = target_sv.schema

# Directory that holds the YAML transformation specs for the selected study.
# NOTE(review): keep this in step with the source schema chosen above.
# var_dir = "/sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/CHS-ingest/"
# var_dir = "/sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/HCHS-ingest/"
# var_dir = "/sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/MESA-ingest/"
var_dir = "/sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/CHS-ingest/"

In [9]:
# --- Run configuration -------------------------------------------------------
output_base = "sbgenomics/output-files"
study_dir = "CHS-ingest"  # alternative: "WHI-ingest"
data_version = "CHS-v7-c4"  # alternative: "WHI-v12-c2"
consent_label = "DS-CVD-IRB-NPU-MDS"

# Make sure the per-study output directory exists before anything is written.
os.makedirs(f"/{output_base}/{study_dir}/", exist_ok=True)

# Cleaned source tables for the selected data version.
data_loader = DataLoader("/sbgenomics/workspace/output/CHS_cleaned/"+ data_version +"/")

# Target classes to generate; uncomment the ones to include in this run.
entities = [
    # "Condition",
    "Demography",
    # "DrugExposure",
    # "MeasurementObservation",
    # "Observation",
    # "Person",
    # "Procedure"
]

# --- Transform each entity and write one YAML file per entity ----------------
start = time.perf_counter()
for entity in entities:
    print(f"Starting {entity}")
    spec_files = get_spec_files(var_dir, f" {entity}:")
    output_path = f"/{output_base}/{study_dir}/{data_version}-{entity}-{consent_label}-data.yaml"

    # Narrow ``subset`` to debug a single spec or to resume part-way, e.g.
    #   subset = [p for p in spec_files if p.stem == "bdy_hgt"]
    #   subset = [p for p in spec_files if p.stem >= "stroke"]
    #   subset = [p for p in spec_files if p.stem > "afib"]
    subset = spec_files

    # All mapped records are gathered in memory so they can be written under a
    # single top-level key. (A streaming alternative would be to write each
    # batch with yaml.dump_all(batch, f, explicit_start=True).)
    all_results = []
    for batch in batched(multi_spec_transform(data_loader, subset, source_schema, target_schema), batch_size=100):
        all_results += batch

    # Top-level key is the lower-cased entity name with an "s" appended.
    key_name = f"{entity.lower()}s"
    wrapped_output = {key_name: all_results}

    with open(output_path, "w") as f:
        yaml.dump(wrapped_output, f)

    print(f"{entity} Complete")

end = time.perf_counter()
print(f"Time: {end - start:.2f} seconds")

Starting Demography
demography..........
⚠️  Error processing /sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/CHS-ingest/demography.yaml: ValueError - Could not find what to derive from a source phv00197400_enum
{'class_derivations': {'Demography': {'populated_from': 'pht003699', 'slot_derivations': {'associated_participant': {'populated_from': 'phv00197394'}, 'sex': {'populated_from': 'phv00197400', 'value_mappings': {'M': "'OMOP:8507'", 'F': "'OMOP:8532'"}}}}}}


Traceback (most recent call last):
  File "/tmp/ipykernel_29266/1288493276.py", line 20, in multi_spec_transform
    mapped = transformer.map_object(row, source_type=pht_id)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sbgenomics/workspace/linkml-map/src/linkml_map/transformer/object_transformer.py", line 305, in map_object
    v = self.map_object(v, source_class_slot_range, target_range)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sbgenomics/workspace/linkml-map/src/linkml_map/transformer/object_transformer.py", line 193, in map_object
    return self.transform_enum(source_obj, source_type, source_obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sbgenomics/workspace/linkml-map/src/linkml_map/transformer/object_transformer.py", line 512, in transform_enum
    enum_deriv = self._get_enum_derivation(enum_name)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sbgenomics/workspace/linkml-

ValueError: Could not find what to derive from a source phv00197400_enum