In [1]:
import yaml

from linkml.validator.loaders import TsvLoader
from linkml.utils.schema_builder import SchemaBuilder

from linkml_runtime.linkml_model import SlotDefinition
from linkml_runtime import SchemaView

from linkml_map.session import Session
from linkml_map.transformer.object_transformer import ObjectTransformer

In [23]:
# Dataset selection: keep exactly one consent-group block active.

# HMB consent group (inactive)
# source_sv = SchemaView("/sbgenomics/project-files/COPDGene_HMB_Schema.yaml")
# data_loader = TsvLoader("/sbgenomics/project-files/phs000179.v6.pht002239.v4.p2.c1.COPDGene_Subject_Phenotypes.HMB_phv_clean.tsv")
# output_filename = "transformed_person_data_HMB.yaml"

# DS-CS consent group (active)
source_sv = SchemaView("/sbgenomics/project-files/COPDGene_DS_CS_Schema.yaml")
data_loader = TsvLoader(
    "/sbgenomics/project-files/phs000179.v6.pht002239.v4.p2.c2.COPDGene_Subject_Phenotypes.DS-CS_phv_clean.tsv"
)
output_filename = "transformed_person_data_DS_CS.yaml"

# Directory of per-variable transformation specs. It ends with "/" because
# later cells build file names from it by plain string concatenation.
var_dir = "NHLBI-BDC-DMC-HV/priority_variables_transform/copdgene-linkml-map/var_files/"

# Schema definitions handed to the transformers in later cells.
source_schema = source_sv.schema
target_schema = SchemaView("NHLBI-BDC-DMC-HM/src/bdchm/schema/bdchm.yaml").schema

# Pull one sample instance for the single-row trial transformations below.
# NOTE: next() advances `data_rows`, so anything that later iterates
# `data_rows` starts at the second instance, not the first.
data_rows = data_loader.iter_instances()
cur_row = first_row = next(data_rows)

In [3]:
# Person - top level class
# `species` is a constant expression; `identity` is copied from the
# dbGaP_Subject_ID field of the COPDGene source class.
person_spec_text = """
class_derivations:
  Person:
    populated_from: COPDGene
    slot_derivations:
      species:
        expr: "'Homo Sapiens'"
      identity:
        populated_from: dbGaP_Subject_ID
"""
person_yaml = yaml.safe_load(person_spec_text)

# Optional: uncomment to write this spec into var_dir as person.yaml.
# with open(var_dir + "person" + ".yaml", "w") as f:
#     yaml.dump(person_yaml, f, sort_keys=False, allow_unicode=True)

In [4]:
# Participant - top level class for study data
# The originating_site and study_arm derivations are present but disabled
# (commented out inside the YAML text itself).
participant_spec_text = """
class_derivations:
  Participant:
    populated_from: COPDGene
    slot_derivations:
      associated_participant: 
        populated_from: phv00159568
      identity:
        populated_from: dbGaP_Subject_ID
      member_of_research_study:
        expr: "'COPDGene'"
      # originating_site:
      #   populated_from: phv00159569
      # study_arm:
      #   populated_from: phv00278174
"""
participant_yaml = yaml.safe_load(participant_spec_text)

# Optional: uncomment to write this spec into var_dir as participant.yaml.
# with open(var_dir + "participant" + ".yaml", "w") as f:
#     yaml.dump(participant_yaml, f, sort_keys=False, allow_unicode=True)

In [5]:
# Trial run: apply the bare Person spec to the single sample row.
transform_spec, input_data = person_yaml, cur_row

# Build an ObjectTransformer wired to the source and target schemas, then
# register the derivation spec with it.
transformer = ObjectTransformer(unrestricted_eval=True)
transformer.source_schemaview = SchemaView(source_schema)
transformer.target_schemaview = SchemaView(target_schema)
transformer.create_transformer_specification(transform_spec)

# Map the sample row. The message below is printed unconditionally, so it only
# signals that no exception was raised, not that `result` was inspected.
result = transformer.map_object(input_data, source_type="COPDGene")

print("Transformation Successful!")

Transformation Successful!


In [6]:
# Walk (creating where absent) down to Person -> slot_derivations -> participants.
person_class_derivations = person_yaml.setdefault("class_derivations", {})
person_class = person_class_derivations.setdefault("Person", {})
person_slot_derivations = person_class.setdefault("slot_derivations", {})
person_participants_slot = person_slot_derivations.setdefault("participants", {})

# Nest the Participant spec under the participants slot. The dict is stored by
# reference, so later in-place additions to `participant_yaml` also appear
# inside `person_yaml`. setdefault leaves an existing entry untouched.
person_participants_slot.setdefault("object_derivations", [participant_yaml])

# Debug aid: print(yaml.dump(person_yaml, sort_keys=False))

[{'class_derivations': {'Participant': {'populated_from': 'COPDGene',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00159568'},
     'identity': {'populated_from': 'dbGaP_Subject_ID'},
     'member_of_research_study': {'expr': "'COPDGene'"}}}}}]

In [7]:
# Trial run: Person spec (now carrying the nested Participant derivation)
# applied to the single sample row.
input_data = cur_row
transform_spec = person_yaml

# Fresh transformer so the updated spec is the one that gets registered.
transformer = ObjectTransformer(unrestricted_eval=True)
transformer.source_schemaview = SchemaView(source_schema)
transformer.target_schemaview = SchemaView(target_schema)
transformer.create_transformer_specification(transform_spec)

result = transformer.map_object(input_data, source_type="COPDGene")

# Show the mapped object, then the completion message.
print(result)
print("Transformation Successful!")

{'species': 'Homo Sapiens', 'identity': [432557], 'participants': [{'associated_participant': 'COPDGene_G90453', 'identity': [432557], 'member_of_research_study': 'COPDGene'}]}
Transformation Successful!


In [8]:
def load_condition(name):
    """Load one Condition class-derivation spec from the var_files tree.

    Args:
        name: File stem inside ``<var_dir>condition/`` (without ``.yaml``).

    Returns:
        The parsed YAML document as returned by ``yaml.safe_load``.

    Raises:
        OSError: If the spec file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # The context manager closes the file as soon as it has been parsed; the
    # earlier bare open() calls left every handle open.
    with open(var_dir + "condition/" + name + ".yaml", encoding="utf-8") as f:
        return yaml.safe_load(f)


# Load Condition class derivations
angina = load_condition("angina")
asthma = load_condition("asthma")
copd = load_condition("copd")
diabetes = load_condition("diabetes")
hist_hrt_failure = load_condition("hist_hrt_failure")
hist_my_inf = load_condition("hist_my_inf")
hyperten = load_condition("hyperten")
pad = load_condition("pad")
slp_ap = load_condition("slp_ap")
stroke = load_condition("stroke")
stroke_isch_atk = load_condition("stroke_isch_atk")

# Get the conditions slot on the Participant class derivation
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_conditions_slot = participant_cls.setdefault("slot_derivations", {}).setdefault("conditions", {})

# Add the Condition object_derivations to the conditions slot
# (setdefault leaves an existing list untouched, so re-running adds no duplicates)
participant_conditions_slot.setdefault("object_derivations", [
    angina,
    asthma,
    copd,
    diabetes,
    hist_hrt_failure,
    hist_my_inf,
    hyperten,
    pad,
    slp_ap,
    stroke,
    stroke_isch_atk,
])

[{'class_derivations': {'Condition': {'populated_from': 'COPDGene',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00159568'},
     'condition_concept': {'expr': "'HP:0001681'"},
     'condition_provenance': {'expr': "'PATIENT_SELF-REPORTED_CONDITION'"},
     'relationship_to_participant': {'expr': "'SELF'"},
     'condition_status': {'populated_from': 'phv00159608',
      'value_mappings': {'0': 'ABSENT', '1': 'PRESENT'}}}}}},
 {'class_derivations': {'Condition': {'populated_from': 'COPDGene',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00159568'},
     'condition_concept': {'expr': "'MONDO:0004979'"},
     'condition_status': {'expr': "case( ({phv00159701} == 0, 'ABSENT'), ({phv00159701} == 1, case( ({phv00159705} == 0, 'HISTORICAL'), ({phv00159705} == 1, 'PRESENT'), ({phv00159705} == 3, case( ({phv00159704} == 3, 'UNKNOWN'), (True, 'HISTORICAL') )) )), ({phv00159701} == 3, case( ({phv00159705} == 0, case( ({phv00159704} == 1, 

In [9]:
# Demography derivation: sex, ethnicity and race are each read from a coded
# source field and translated through a value_mappings table.
demography_spec_text = """
class_derivations:
  Demography:
    populated_from: COPDGene
    slot_derivations:
      associated_participant:
        populated_from: phv00159568
      sex:
        populated_from: phv00159571
        value_mappings:
          '1': OMOP:8507  # MALE
          '2': OMOP:8532  # FEMALE
      ethnicity:
        populated_from: phv00159573
        value_mappings:
          '1': HISPANIC_OR_LATINO
          '2': NOT_HISPANIC_OR_LATINO
      race:
        populated_from: phv00159572
        value_mappings:
          '1': OMOP:8527
          '2': OMOP:8516
          '3': OMOP:8515
          '4': OMOP:8557
          '5': OMOP:8657
          '6': OMOP:45880900
          '7': OMOP:8552
"""
demography_yaml = yaml.safe_load(demography_spec_text)

# Optional: uncomment to write this spec into var_dir as demography.yaml.
# with open(var_dir + "demography" + ".yaml", "w") as f:
#     yaml.dump(demography_yaml, f, sort_keys=False, allow_unicode=True)

In [10]:
# Locate (creating if absent) the demography slot of the Participant derivation
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_slot_derivations = participant_cls.setdefault("slot_derivations", {})
participant_demography_slot = participant_slot_derivations.setdefault("demography", {})

# Register the Demography spec; setdefault leaves an existing list untouched,
# so re-running the cell does not add duplicates.
participant_demography_slot.setdefault("object_derivations", [demography_yaml])

[{'class_derivations': {'Demography': {'populated_from': 'COPDGene',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00159568'},
     'sex': {'populated_from': 'phv00159571',
      'value_mappings': {'1': 'OMOP:8507', '2': 'OMOP:8532'}},
     'ethnicity': {'populated_from': 'phv00159573',
      'value_mappings': {'1': 'HISPANIC_OR_LATINO',
       '2': 'NOT_HISPANIC_OR_LATINO'}},
     'race': {'populated_from': 'phv00159572',
      'value_mappings': {'1': 'OMOP:8527',
       '2': 'OMOP:8516',
       '3': 'OMOP:8515',
       '4': 'OMOP:8557',
       '5': 'OMOP:8657',
       '6': 'OMOP:45880900',
       '7': 'OMOP:8552'}}}}}}]

In [11]:
def load_exposure(name):
    """Load one DrugExposure class-derivation spec from the var_files tree.

    Args:
        name: File stem inside ``<var_dir>exposure/`` (without ``.yaml``).

    Returns:
        The parsed YAML document as returned by ``yaml.safe_load``.

    Raises:
        OSError: If the spec file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # The context manager closes the file as soon as it has been parsed; the
    # earlier bare open() calls left every handle open.
    with open(var_dir + "exposure/" + name + ".yaml", encoding="utf-8") as f:
        return yaml.safe_load(f)


# Load DrugExposure class derivations
# NOTE(review): the recorded output of this cell shows the loaded specs using
# the key 'expsoure_provenance', which looks like a misspelling of
# 'exposure_provenance' -- confirm against the target schema and correct the
# YAML files if so.
tak_betablk_resp = load_exposure("tak_betablk_resp")
tak_betablk = load_exposure("tak_betablk")
tak_adrenergics = load_exposure("tak_adrenergics")
tak_cort_steroid_resp = load_exposure("tak_cort_steroid_resp")
tak_cort_steroid_oral = load_exposure("tak_cort_steroid_oral")
tak_anabolic_steroid = load_exposure("tak_anabolic_steroid")

# Get the exposures slot on the Participant class derivation
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_exposures_slot = participant_cls.setdefault("slot_derivations", {}).setdefault("exposures", {})

# Add the DrugExposure object_derivations to the exposures slot
# (setdefault leaves an existing list untouched, so re-running adds no duplicates)
participant_exposures_slot.setdefault("object_derivations", [
    tak_betablk_resp,
    tak_betablk,
    tak_adrenergics,
    tak_cort_steroid_resp,
    tak_cort_steroid_oral,
    tak_anabolic_steroid,
])

[{'class_derivations': {'DrugExposure': {'populated_from': 'COPDGene',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00159568'},
     'drug_concept': {'expr': "'RXNORM:C07A'"},
     'expsoure_provenance': {'expr': "'PATIENT SELF-REPORTED MEDICATION'"},
     'exposure_status': {'populated_from': 'phv00159642',
      'value_mappings': {'0': 'ABSENT', '1': 'PRESENT'}},
     'route_concept': {'expr': "'RESPIRATORY TRACT'"}}}}},
 {'class_derivations': {'DrugExposure': {'populated_from': 'COPDGene',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00159568'},
     'drug_concept': {'expr': "'RXNORM:C07A'"},
     'expsoure_provenance': {'expr': "'PATIENT SELF-REPORTED MEDICATION'"},
     'exposure_status': {'populated_from': 'phv00159643',
      'value_mappings': {'0': 'ABSENT', '1': 'PRESENT'}}}}}},
 {'class_derivations': {'DrugExposure': {'populated_from': 'COPDGene',
    'slot_derivations': {'associated_participant': {'populated_from': 'p

In [12]:
def load_measurement(name):
    """Load one MeasurementObservation class-derivation spec from the var_files tree.

    Args:
        name: File stem inside ``<var_dir>measurement_observation/``
            (without ``.yaml``).

    Returns:
        The parsed YAML document as returned by ``yaml.safe_load``.

    Raises:
        OSError: If the spec file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # The context manager closes the file as soon as it has been parsed; the
    # earlier bare open() calls left every handle open.
    with open(var_dir + "measurement_observation/" + name + ".yaml", encoding="utf-8") as f:
        return yaml.safe_load(f)


# Load MeasurementObservation class derivations
bdy_hgt = load_measurement("bdy_hgt")
bdy_wgt = load_measurement("bdy_wgt")
bmi = load_measurement("bmi")
bp_diastolic = load_measurement("bp_diastolic")
bp_systolic = load_measurement("bp_systolic")
fev1 = load_measurement("fev1")
fev1_fvc = load_measurement("fev1_fvc")
fvc = load_measurement("fvc")
hrt_rt = load_measurement("hrt_rt")
spo2 = load_measurement("spo2")

# Get the measurements slot on the Participant class derivation
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_measurements_slot = participant_cls.setdefault("slot_derivations", {}).setdefault("measurements", {})

# Add the MeasurementObservation object_derivations to the measurements slot
# (setdefault leaves an existing list untouched, so re-running adds no duplicates)
participant_measurements_slot.setdefault("object_derivations", [
    bdy_hgt,
    bdy_wgt,
    bmi,
    bp_diastolic,
    bp_systolic,
    fev1,
    fev1_fvc,
    fvc,
    hrt_rt,
    spo2,
])

[{'class_derivations': {'MeasurementObservation': {'populated_from': 'COPDGene',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00159568'},
     'observation_type': {'expr': "'OBA:VT0001253'"},
     'value_quantity': {'object_derivations': [{'class_derivations': {'Quantity': {'populated_from': 'COPDGene',
          'slot_derivations': {'value_decimal': {'populated_from': 'phv00159592'},
           'unit': {'expr': "'cm'"}}}}}]}}}}},
 {'class_derivations': {'MeasurementObservation': {'populated_from': 'COPDGene',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00159568'},
     'observation_type': {'expr': "'OBA:VT0001259'"},
     'value_quantity': {'object_derivations': [{'class_derivations': {'Quantity': {'populated_from': 'COPDGene',
          'slot_derivations': {'value_decimal': {'populated_from': 'phv00159591'},
           'unit': {'expr': "'kg'"}}}}}]}}}}},
 {'class_derivations': {'MeasurementObservation': {'populated_from': 'CO

In [13]:
# Observation derivation for cigarette smoking status. `value_enum` is a
# chained conditional: the first code if phv00159749 == 1, otherwise the second
# code if phv00159747 == 1, otherwise the third code.
observation_spec_text = """
class_derivations:
  Observation:
    populated_from: COPDGene
    slot_derivations:
      associated_participant:
        populated_from: phv00159568
      observation_type:
        expr: "'OMOP:4282779'"  # Cigarette smoking status
      value_enum:
        expr: "'OMOP:40766945' if {phv00159749} == 1 else 'OMOP:45883458' if {phv00159747} == 1 else 'OMOP:45883537'"
"""
observation_yaml = yaml.safe_load(observation_spec_text)

# Optional: uncomment to write this spec into var_dir as observation.yaml.
# with open(var_dir + "observation" + ".yaml", "w") as f:
#     yaml.dump(observation_yaml, f, sort_keys=False, allow_unicode=True)

In [14]:
# Locate (creating if absent) the observations slot of the Participant derivation
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_slot_derivations = participant_cls.setdefault("slot_derivations", {})
participant_observations_slot = participant_slot_derivations.setdefault("observations", {})

# Register the Observation spec under the observations slot; an existing
# object_derivations list is left as it is.
participant_observations_slot.setdefault("object_derivations", [observation_yaml])

# Debug aid: print(yaml.dump(person_yaml))

[{'class_derivations': {'Observation': {'populated_from': 'COPDGene',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00159568'},
     'observation_type': {'expr': "'OMOP:4282779'"},
     'value_enum': {'expr': "'OMOP:40766945' if {phv00159749} == 1 else 'OMOP:45883458' if {phv00159747} == 1 else 'OMOP:45883537'"}}}}}]

In [15]:
# hist_cor_angio
# Procedure derivation: constant procedure concept, with the status read from
# phv00159632 and mapped 0 -> ABSENT, 1 -> PRESENT.
hist_cor_angio_spec_text = """
class_derivations:
  Procedure:
    populated_from: COPDGene
    slot_derivations:
      associated_participant:
        populated_from: phv00159568
      procedure_concept:
        expr: "'OMOP:4184832'"  # Coronary angioplasty
      procedure_status:
        populated_from: phv00159632
        value_mappings:
          '0': ABSENT
          '1': PRESENT
"""
hist_cor_angio = yaml.safe_load(hist_cor_angio_spec_text)

# Optional: uncomment to write this spec into var_dir under procedure/.
# with open(var_dir + "procedure/" + "hist_cor_angio" + ".yaml", "w") as f:
#     yaml.dump(hist_cor_angio, f, sort_keys=False, allow_unicode=True)

In [16]:
# Trial run: apply the hist_cor_angio Procedure spec on its own to the sample row.
input_data = cur_row
transform_spec = hist_cor_angio

# Fresh transformer bound to the source/target schemas and this spec.
transformer = ObjectTransformer(unrestricted_eval=True)
transformer.source_schemaview = SchemaView(source_schema)
transformer.target_schemaview = SchemaView(target_schema)
transformer.create_transformer_specification(transform_spec)

result = transformer.map_object(input_data, source_type="COPDGene")

# Show the mapped Procedure object, then the completion message.
print(result)
print("Transformation Successful!")

{'associated_participant': 'COPDGene_G90453', 'procedure_concept': 'OMOP:4184832', 'procedure_status': 'PRESENT'}
Transformation Successful!


In [17]:
# hist_cor_bypg
# Procedure derivation: constant procedure concept, with the status read from
# phv00159631 and mapped 0 -> ABSENT, 1 -> PRESENT.
hist_cor_bypg_spec_text = """
class_derivations:
  Procedure:
    populated_from: COPDGene
    slot_derivations:
      associated_participant:
        populated_from: phv00159568
      procedure_concept:
        expr: "'OMOP:4336464'"  #coronary bypass graft
      procedure_status:
        populated_from: phv00159631
        value_mappings:
          '0': ABSENT
          '1': PRESENT
"""
hist_cor_bypg = yaml.safe_load(hist_cor_bypg_spec_text)

# Optional: uncomment to write this spec into var_dir under procedure/.
# with open(var_dir + "procedure/" + "hist_cor_bypg" + ".yaml", "w") as f:
#     yaml.dump(hist_cor_bypg, f, sort_keys=False, allow_unicode=True)

In [18]:
# Locate (creating if absent) the procedures slot of the Participant derivation
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_slot_derivations = participant_cls.setdefault("slot_derivations", {})
participant_procedures_slot = participant_slot_derivations.setdefault("procedures", {})

# Register both Procedure specs under the procedures slot; an existing
# object_derivations list is left as it is.
procedure_derivations = [hist_cor_angio, hist_cor_bypg]
participant_procedures_slot.setdefault("object_derivations", procedure_derivations)

[{'class_derivations': {'Procedure': {'populated_from': 'COPDGene',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00159568'},
     'procedure_concept': {'expr': "'OMOP:4184832'"},
     'procedure_status': {'populated_from': 'phv00159632',
      'value_mappings': {'0': 'ABSENT', '1': 'PRESENT'}}}}}},
 {'class_derivations': {'Procedure': {'populated_from': 'COPDGene',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00159568'},
     'procedure_concept': {'expr': "'OMOP:4336464'"},
     'procedure_status': {'populated_from': 'phv00159631',
      'value_mappings': {'0': 'ABSENT', '1': 'PRESENT'}}}}}}]

In [19]:
# edu_lvl
# SdohObservation derivation: constant category, with the value read from
# phv00159773 and translated through the six-entry value_mappings table.
edu_lvl_spec_text = """
class_derivations:
  SdohObservation:
    populated_from: COPDGene
    slot_derivations:
      associated_participant:
        populated_from: phv00159568
      category:
        expr: "'EDUCATIONAL_ATTAINMENT'"
      value_enum:
        populated_from: phv00159773
        value_mappings:
          '1': 8TH_GRADE_OR_LESS
          '2': HIGH_SCHOOL_NO_DIPLOMA
          '3': HIGH_SCHOOL_GRADUATE_GED
          '4': SOME_COLLEGE_OR_TECH_NO_DEGREE
          '5': COLLEGE_OR_TECH_WITH_DEGREE
          '6': MASTERS_OR_DOCTORAL_DEGREE
"""
edu_lvl = yaml.safe_load(edu_lvl_spec_text)

# Write the spec to disk (this dump is active, so the file is rewritten on
# every run of the cell).
edu_lvl_path = var_dir + "sdoh_observation/" + "edu_lvl" + ".yaml"
with open(edu_lvl_path, "w") as f:
    yaml.dump(edu_lvl, f, sort_keys=False, allow_unicode=True)

In [20]:
# Locate (creating if absent) the sdoh_observations slot of the Participant derivation
participant_cls = participant_yaml.setdefault("class_derivations", {}).setdefault("Participant", {})
participant_slot_derivations = participant_cls.setdefault("slot_derivations", {})
participant_sdoh_observations_slot = participant_slot_derivations.setdefault("sdoh_observations", {})

# Register the SdohObservation spec under the sdoh_observations slot; an
# existing object_derivations list is left as it is.
participant_sdoh_observations_slot.setdefault("object_derivations", [edu_lvl])

[{'class_derivations': {'SdohObservation': {'populated_from': 'COPDGene',
    'slot_derivations': {'associated_participant': {'populated_from': 'phv00159568'},
     'category': {'expr': "'EDUCATIONAL_ATTAINMENT'"},
     'value_enum': {'populated_from': 'phv00159773',
      'value_mappings': {'1': '8TH_GRADE_OR_LESS',
       '2': 'HIGH_SCHOOL_NO_DIPLOMA',
       '3': 'HIGH_SCHOOL_GRADUATE_GED',
       '4': 'SOME_COLLEGE_OR_TECH_NO_DEGREE',
       '5': 'COLLEGE_OR_TECH_WITH_DEGREE',
       '6': 'MASTERS_OR_DOCTORAL_DEGREE'}}}}}}]

In [24]:
# Full run: apply the assembled Person spec to every row of the dataset and
# write the results to `output_filename`.
transform_spec = person_yaml

# Create ObjectTransformer and register the transformation spec
transformer = ObjectTransformer(unrestricted_eval=True)
transformer.source_schemaview = SchemaView(source_schema)
transformer.target_schemaview = SchemaView(target_schema)
transformer.create_transformer_specification(transform_spec)

# Transform all rows.
# Iterate a fresh iterator instead of `data_rows`: that iterator already had
# its first row consumed by next() in the setup cell (so the first subject was
# silently missing from the output), and it is exhausted after one pass (so
# re-running this cell produced an empty list).
# Assumes each iter_instances() call starts again from the first row.
output_data = []
for row in data_loader.iter_instances():
    result = transformer.map_object(row, source_type="COPDGene")
    if result:  # Avoid None or empty dicts
        output_data.append(result)

# Final wrapped structure (key should match the collection slot, or be schema-compatible)
wrapped_output = {
    "persons": output_data
}

# Dump to YAML. allow_unicode=True writes non-ASCII characters unescaped, so
# the file encoding is pinned to UTF-8 rather than left to the platform default.
with open(output_filename, "w", encoding="utf-8") as f:
    yaml.dump(wrapped_output, f, sort_keys=False, allow_unicode=True)

print("Transformation Successful!")

Transformation Successful!


In [22]:
# Dump final Person class to YAML, one directory above var_dir.
# allow_unicode=True writes non-ASCII characters unescaped, so the file
# encoding is pinned to UTF-8 rather than left to the platform default.
with open(var_dir + "../person" + ".yaml", "w", encoding="utf-8") as f:
    yaml.dump(person_yaml, f, sort_keys=False, allow_unicode=True)