In [20]:
from itertools import islice
import os
from pathlib import Path
import subprocess
import time
import yaml

from linkml.validator.loaders import TsvLoader
from linkml_runtime import SchemaView
from linkml_map.transformer.object_transformer import ObjectTransformer

import pandas as pd
import pprint

In [2]:
class DataLoader:
    def __init__(self, base_path):
        self.base_path = base_path

    def __getitem__(self, pht_id):
        file_path = os.path.join(self.base_path, f"{entity}.tsv")
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"No TSV file found for {entity} at {file_path}")
        return TsvLoader(os.path.join(self.base_path, f"{entity}.tsv")).iter_instances()

    def __contains__(self, pht_id):
        return os.path.exists(os.path.join(self.base_path, f"{entity}.tsv"))

In [3]:
# def get_spec_files(directory, search_string):
#     """
#     Find YAML files in the directory that contain the search_string.
#     Returns a sorted list of matching file paths.
#     """
#     directory = Path(directory)

#     result = subprocess.run(
#         ['grep', '-rl', search_string, str(directory)],
#         stdout=subprocess.PIPE,
#         text=True,
#         check=True
#     )

#     file_paths = [
#         Path(p.strip()) for p in result.stdout.strip().split('\n')
#         if p.strip().endswith(('.yaml', '.yml'))
#     ]
#     return sorted(file_paths, key=lambda p: p.stem)

In [4]:
# def multi_spec_transform(data_loader, spec_files, source_schema, target_schema):
#     print(f'\n** spec_files: {spec_files}')
#     for file in spec_files:
#         print(f"{file.stem}", end='', flush=True)
#         try:
#             with open(file) as f:
#                 specs = yaml.safe_load(f)
#                 print(f'\n** specs: {specs}')
#             for block in specs:
#                 derivation = block["class_derivations"]
#                 print(".", end='', flush=True)
#                 for class_name, class_spec in derivation.items():
#                     pht_id = class_spec["populated_from"]
#                     rows = data_loader[pht_id]

#                     transformer = ObjectTransformer(unrestricted_eval=True)
#                     transformer.source_schemaview = SchemaView(source_schema)
#                     transformer.target_schemaview = SchemaView(target_schema)
#                     transformer.create_transformer_specification(block)

#                     for row in rows:
#                         mapped = transformer.map_object(row, source_type=pht_id)
#                         yield mapped
#             print('')
#         except Exception as e:
#             print(f"\n⚠️  Error processing {file}: {e.__class__.__name__} - {e}")
#             print(block)
#             import traceback
#             traceback.print_exc()
#             raise

In [36]:
######################
#  FOR INCLUDE DATA  #
######################


def single_spec_transform(tsv_file, spec_file, source_schema, target_schema, target_class="Participant"):
    # Load your mapping spec
    with open(spec_file) as f:
        spec = yaml.safe_load(f)
#         print(f'\n** spec: {spec}')
        
#     # If spec is a list, merge all dicts
#     if isinstance(spec, list):
#         merged_spec = {}
#         for block in spec:
#             merged_spec.update(block)
#     else:
#         merged_spec = spec
    
    
#     # Find all blocks that map to your target class
#     class_blocks = [
#         block["class_derivations"][target_class]
#         for block in spec
#         if target_class in block.get("class_derivations", {})
#     ]
#     if not class_blocks:
#         raise ValueError(f"{target_class} not found in any class_derivations in {spec_file}")
#     print(f'\n** class_blocks: {class_blocks}')
    
#     class_spec = class_blocks[0] if class_blocks else None
#     print(f'\n** class_spec: {class_spec}')
    
    
    # Load your TSV data
    data = pd.read_csv(tsv_file, sep="\t").to_dict(orient="records")
    
    # Cast field values to strings
    fields_to_cast = ['gender', 'race', 'ethnicity']
    data = [
        {k: (str(v) if k in fields_to_cast and v is not None else v) for k, v in row.items()}
        for row in data
    ]
    print(f'\n** data: {data[:3]}')
    

    

    # Another option to fix value_mappings - Set up the transformer once
    transformer = ObjectTransformer(unrestricted_eval=True)
    transformer.source_schemaview = SchemaView(source_schema)
    transformer.target_schemaview = SchemaView(target_schema)

    results = []
    for block in spec:
        if 'class_derivations' in block and target_class in block['class_derivations']:
            class_spec = block['class_derivations'][target_class]
            # Load this class mapping spec
            transformer.create_transformer_specification(block)
            for row in data:
                mapped = transformer.map_object(row, source_type=class_spec["populated_from"])
                results.append(mapped)
            break  # Only process the first matching block

    return results
    
    
    
    
    
    
#     # Set up the transformer
#     transformer = ObjectTransformer(unrestricted_eval=True)
#     transformer.source_schemaview = SchemaView(source_schema)
#     transformer.target_schemaview = SchemaView(target_schema)
#     transformer.create_transformer_specification(merged_spec)
    
    
    # DEBUG
#     print("\n--- Mapping spec for sex field ---")
#     pprint.pprint(class_spec['slot_derivations'].get('sex'))
#     print("\n--- Entire Mapping spec ---")
#     pprint.pprint(merged_spec)

    
    # Map each row and yield the result
#     row_count = 0
#     for row in data:
#         if row_count < 5:
#             print(f'\n** row: {row}')
#         row_count += 1
#         mapped = transformer.map_object(row, source_type=class_spec["populated_from"])
#         yield mapped

# Example usage:
results = list(single_spec_transform(
    tsv_file="../data/BrainPower-STUDY/raw_data/TSV/demographics.tsv",
    spec_file="../data/BrainPower-STUDY/model_transformation/brain_power_transformation_PARTICIPANT-ONLY.yaml",
    source_schema="../data/BrainPower-STUDY/study_specific_model/BrainPower_INCLUDE_SCHEMA.yaml",
    target_schema="../data/BrainPower-STUDY/include_schema/include_schema.yaml",
    target_class="Participant"  # or whatever your class is called
))


with open("demographics_transformed.yaml", "w") as f:
    yaml.safe_dump(results, f, sort_keys=False, allow_unicode=True)



** data: [{'id': 1301, 'timepoint': 1, 'gender': '2', 'level_support': 1, 'race': '5', 'ethnicity': '2'}, {'id': 1302, 'timepoint': 1, 'gender': '2', 'level_support': 1, 'race': '6', 'ethnicity': '2'}, {'id': 1303, 'timepoint': 1, 'gender': '2', 'level_support': 1, 'race': '5', 'ethnicity': '2'}]


In [24]:
import linkml_map
print(linkml_map.__version__)


AttributeError: module 'linkml_map' has no attribute '__version__'

In [None]:
# def batched(iterable, batch_size):
#     iterator = iter(iterable)
#     for first in iterator:
#         batch = [first, *islice(iterator, batch_size - 1)]
#         yield batch

In [None]:
# source_sv = SchemaView("/sbgenomics/workspace/output/WHI/Schema_WHI_v12_c1/Schema_WHI_v12_c1.yaml")
source_sv = SchemaView("../data/BrainPower-STUDY/study_specific_model/BrainPower_INCLUDE_SCHEMA.yaml")
source_schema = source_sv.schema

target_sv = SchemaView("../data/BrainPower-STUDY/include_schema/include_schema.yaml")
target_schema = target_sv.schema

#var_dir = "/sbgenomics/workspace/NHLBI-BDC-DMC-HV/priority_variables_transform/WHI-ingest/"
var_dir = "../data/BrainPower-STUDY/raw_data/TSV/"

In [None]:
output_base = "../data"
study_dir = "BrainPower-TEST"
os.makedirs(f"{output_base}/{study_dir}/", exist_ok=True)

data_loader = DataLoader("../data/BrainPower-STUDY/raw_data/TSV/")

entities = [
    #"Study",
    "Participant",
    #"Condition"
]

start = time.perf_counter()

for entity in entities:
    print(f"Starting {entity}")
    #spec_files = get_spec_files(var_dir, f" {entity}:")
    spec_files = [Path("../data/BrainPower-STUDY/model_transformation/brain_power_transformation.yaml")]
    output_path = f"{output_base}/{study_dir}/{entity}-data.yaml"

    # subset = spec_files
#     subset = [p for p in spec_files if p.stem == "demo"]
    # subset = [p for p in spec_files if p.stem >= "stroke"]
    # subset = [p for p in spec_files if p.stem > "afib"]
    subset = spec_files

    
    with open(output_path, "w") as f:
        for batch in batched(multi_spec_transform(data_loader, subset, source_schema, target_schema), batch_size=100):
            yaml.dump_all(batch, f, explicit_start=True)
#         for batch in batched(multi_spec_transform(data_loader, mapping_spec_file, source_schema, target_schema), batch_size=100):
#                     yaml.dump_all(batch, f, explicit_start=True)
    
        print(f"{entity} Complete")

end = time.perf_counter()
print(f"Time: {end - start:.2f} seconds")

In [None]:
# # Step 1: Read your input data file (demographics.tsv)
# df = pd.read_csv("../data/BrainPower-STUDY/raw_data/TSV/demographics.tsv", sep="\t")
# data_records = df.to_dict(orient="records")
# display(df.head())

# # Step 2: Set up your transformation config
# source_schema = "../data/BrainPower-STUDY/study_specific_model/BrainPower_INCLUDE_SCHEMA.yaml"
# map_yaml = "../data/BrainPower-STUDY/model_transformation/brain_power_transformation.yaml"
# target_schema = "../data/BrainPower-STUDY/include_schema/include_schema.yaml"


# # Step 3: Initialize the transformer
# ot = ObjectTransformer(
#     source_schema_path=source_schema,
#     map_path=map_yaml,
#     target_schema_path=target_schema,
#     output_path="../data/BrainPower-STUDY/transformed_data/demographics_transformed.yaml",  # or .json as desired
#     input_format="tsv",
#     target_class="Participant"
# )


# # Transform and write output
# transformed = ot.transform_file("../data/BrainPower-STUDY/raw_data/TSV/demographics.tsv")


# with open("demographics_transformed.yaml", "w") as f:
#     yaml.safe_dump(transformed, f, sort_keys=False, allow_unicode=True)

# print("Transformed data written to demographics_transformed.yaml")



