In [None]:
from pathlib import Path
import os
# import re
import subprocess
import yaml

from linkml.validator.loaders import TsvLoader
# from linkml.utils.schema_builder import SchemaBuilder

# from linkml_runtime.linkml_model import SlotDefinition
from linkml_runtime import SchemaView

# from linkml_map.session import Session
from linkml_map.transformer.object_transformer import ObjectTransformer

In [None]:
# Source schema for the study being ingested and the target schema (bdchm.yaml).
# To run HCHS/SOL instead of MESA, point source_sv at
#   "/sbgenomics/workspace/output/HCHS_SOL_cleaned/Schema_HCHS_SOL_v1_c1.yaml"
source_sv = SchemaView(
    "/sbgenomics/workspace/output/MESA/Schema_MESA_v13_c1/Schema_MESA_v13_c1.yaml"
)
target_sv = SchemaView(
    "/sbgenomics/workspace/NHLBI-BDC-DMC-HM/src/bdchm/schema/bdchm.yaml"
)

# The underlying schema objects are what the transform cells pass around.
source_schema, target_schema = source_sv.schema, target_sv.schema

# Directory holding the transform-specification YAML files for this study.
# HCHS/SOL alternative:
#   "/sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/HCHS-ingest/"
var_dir = (
    "/sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/MESA-ingest/"
)

In [None]:
class LazyTsvLoader:
    """Mapping-style, read-on-demand access to ``<base_path>/<pht_id>.tsv`` files.

    ``loader[pht_id]`` returns the file's instances as a list; the file is read
    with ``TsvLoader`` on first access and the list is cached for later lookups.
    ``pht_id in loader`` only checks that the TSV file exists on disk.
    """

    def __init__(self, base_path):
        self.base_path = base_path
        # pht_id -> list of instances already read from disk
        self.cache = {}

    def _tsv_path(self, pht_id):
        """Build the path of the TSV file that backs ``pht_id``."""
        return os.path.join(self.base_path, f"{pht_id}.tsv")

    def __getitem__(self, pht_id):
        """Return the cached rows for ``pht_id``, loading the TSV if needed.

        Raises:
            FileNotFoundError: if no TSV file exists for ``pht_id``.
        """
        if pht_id in self.cache:
            return self.cache[pht_id]

        file_path = self._tsv_path(pht_id)
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"No TSV file found for {pht_id} at {file_path}")

        rows = list(TsvLoader(file_path).iter_instances())
        self.cache[pht_id] = rows
        return rows

    def __contains__(self, pht_id):
        """Report whether a TSV file for ``pht_id`` exists (cache is not consulted)."""
        return os.path.exists(self._tsv_path(pht_id))

# Rows are read from the cleaned MESA export, one TSV per pht table.
# HCHS/SOL alternative base path:
#   "/sbgenomics/workspace/output/HCHS_SOL_cleaned/HCHS-SOL-v1-c1/"
lazy_loader = LazyTsvLoader(
    base_path="/sbgenomics/workspace/output/MESA_cleaned/MESA-v13-c1",
)

In [None]:
# Single-file run: apply every transform block in afib.yaml to its source
# table and write all mapped objects to output.yaml in the working directory.
# The spec file is opened in a `with` block so the handle is closed promptly.
with open(os.path.join(var_dir, "afib.yaml")) as spec_file:
    transform_yaml = yaml.safe_load(spec_file)

all_results = []
for block in transform_yaml:
    derivation = block["class_derivations"]
    for class_name, class_spec in derivation.items():
        # `populated_from` names the source table; it doubles as the TSV file
        # stem for the loader and as the source type passed to map_object.
        pht_id = class_spec["populated_from"]
        print(f"Processing {pht_id} for class {class_name}")

        rows = lazy_loader[pht_id]

        transformer = ObjectTransformer(unrestricted_eval=True)
        transformer.source_schemaview = SchemaView(source_schema)
        transformer.target_schemaview = SchemaView(target_schema)
        transformer.create_transformer_specification(block)

        for row in rows:
            mapped = transformer.map_object(row, source_type=pht_id)
            all_results.append(mapped)

with open("output.yaml", "w") as f:
    yaml.dump(all_results, f)

In [None]:
def load_yaml_lists_matching_string_fileblocks(directory, search_string):
    """
    Find YAML files under ``directory`` whose text matches ``search_string``
    and load their contents as a flat list of (filename, yaml_block) pairs.

    The search is delegated to ``grep -rl``, so ``search_string`` is passed to
    grep as its pattern. Only paths ending in ``.yaml`` or ``.yml`` are loaded.
    A file whose top-level document is a list contributes one pair per list
    item; any other document contributes a single pair.

    Args:
        directory: Root directory to search recursively.
        search_string: Pattern handed to grep.

    Returns:
        list of tuples: [(filename, block), ...] sorted by file stem. Empty
        when no file matches.

    Raises:
        subprocess.CalledProcessError: if grep itself fails (exit status > 1).
    """
    directory = Path(directory)

    # grep exits with 1 when nothing matches, which is not an error here, so
    # the exit status is checked by hand instead of using check=True.
    # `-e` keeps a pattern that starts with "-" from being read as an option.
    result = subprocess.run(
        ['grep', '-rl', '-e', search_string, str(directory)],
        stdout=subprocess.PIPE,
        text=True,
        check=False
    )
    if result.returncode not in (0, 1):
        raise subprocess.CalledProcessError(
            result.returncode, result.args, output=result.stdout
        )

    combined = []
    for file_path in result.stdout.splitlines():
        if not file_path.endswith(('.yaml', '.yml')):
            continue
        with open(file_path, 'r') as f:
            data = yaml.safe_load(f)
        if isinstance(data, list):
            combined.extend((file_path, block) for block in data)
        else:
            combined.append((file_path, data))

    # Order by base name without directory or extension; the sort is stable,
    # so blocks from the same file keep their in-file order.
    combined.sort(key=lambda item: Path(item[0]).stem)
    return combined


In [None]:
def transform_all_blocks_fileblocks(lazy_loader, file_block_pairs, source_schema, target_schema):
    """Apply every transform block to its source rows and collect the results.

    For each (filename, block) pair, every entry of ``block["class_derivations"]``
    names a source table via ``populated_from``. The table's rows are fetched
    from ``lazy_loader`` and each row is mapped with an ``ObjectTransformer``
    configured from ``block``.

    Progress is printed as the file name (once per run of consecutive blocks
    from the same file) followed by one "." per block.

    Args:
        lazy_loader: Object indexable by table id, returning that table's rows.
        file_block_pairs: Iterable of (filename, block) pairs.
        source_schema: Schema used to build the transformer's source SchemaView.
        target_schema: Schema used to build the transformer's target SchemaView.

    Returns:
        list of mapped objects, in processing order.

    Raises:
        Exception: any error raised while processing a block is reported
            (file name, block, traceback) and then re-raised.
    """
    mapped_objects = []
    last_announced = ""
    for filename, block in file_block_pairs:
        # Announce a file only when it differs from the previous pair's file.
        if filename != last_announced:
            print(filename)
            last_announced = filename

        try:
            class_derivations = block["class_derivations"]
            print(".", end='', flush=True)
            for _class_name, spec in class_derivations.items():
                table_id = spec["populated_from"]
                source_rows = lazy_loader[table_id]

                # A fresh transformer (with fresh schema views) per derivation.
                object_transformer = ObjectTransformer(unrestricted_eval=True)
                object_transformer.source_schemaview = SchemaView(source_schema)
                object_transformer.target_schemaview = SchemaView(target_schema)
                object_transformer.create_transformer_specification(block)

                for source_row in source_rows:
                    mapped_objects.append(
                        object_transformer.map_object(source_row, source_type=table_id)
                    )
        except Exception as e:
            # Show which file/block failed before propagating the error.
            print(f"\n⚠️  Error processing {filename}: {e.__class__.__name__} - {e}")
            print(block)
            import traceback
            traceback.print_exc()
            raise

    return mapped_objects


In [None]:
def base(pair):
    """Return the file stem (no directory, no extension) of a (filename, block) pair."""
    return Path(pair[0]).stem

In [None]:
# Ad-hoc run for a single entity. `file_block_pairs` is assigned here so this
# cell works on a fresh kernel (Restart & Run All) instead of depending on a
# value left over from an earlier manual run.
# Choices: "Condition", "DrugExposure", "MeasurementObservation",
#          "Observation", "Person", "Procedure"
selected_entity = "Condition"
file_block_pairs = load_yaml_lists_matching_string_fileblocks(var_dir, f" {selected_entity}:")

# Optionally narrow the run to particular spec files (compared by file stem):
subset = file_block_pairs
# subset = [b for b in file_block_pairs if base(b) == "chloride_bld"]
# subset = [b for b in file_block_pairs if base(b) >= "fam_income"]
# subset = [b for b in file_block_pairs if base(b) > "hdl"]

transform_results = transform_all_blocks_fileblocks(lazy_loader, subset, source_schema, target_schema)
print("Transform Complete")


# with open("output.yaml", "w") as f:
#     yaml.dump(all_results, f)
# with open("/sbgenomics/output-files/CHS-ingest/CHS-ingest-"+ "v7-c1-" + "Procedure" + "-HMB-IRB-MDS-data.yaml", "w") as f:
#     yaml.dump(transform_results, f)

In [None]:
# Full export: transform every listed entity and write one YAML file each.
data_version = "MESA-v13-c1"
lazy_loader = LazyTsvLoader("/sbgenomics/workspace/output/MESA_cleaned/"+ data_version +"/")

entities = [ "Condition", "DrugExposure", "MeasurementObservation", "Observation", "Person", ] # "Procedure" ]

# Create the output directory up front so a missing directory cannot fail
# the run only after the first entity's transform has already completed.
output_dir = "/sbgenomics/output-files/MESA-ingest"
os.makedirs(output_dir, exist_ok=True)

for entity in entities:
    # Spec files are selected by the presence of " <Entity>:" in their text.
    file_block_pairs = load_yaml_lists_matching_string_fileblocks(var_dir, f" {entity}:")
    transform_results = transform_all_blocks_fileblocks(lazy_loader, file_block_pairs, source_schema, target_schema)
    print(f"{entity} Complete")
    output_path = f"{output_dir}/{data_version}-{entity}-HMB-data.yaml"
    with open(output_path, "w") as f:
        yaml.dump(transform_results, f)
