## Generate UNOS Mappings and then extract the content from the response_json

In [None]:
# !curategpt make-unos-mapping-logic --prefix HP: --path ../stagedb -c ont_hp -d ../data/THORACIC_DATA_parsed_data_dict.json -o /data/unos_mapping_output.yaml

In [12]:
import yaml

def parse_and_categorize_yaml(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    parsed_data = []
    failed_data = []
    buffer = []
    recording = False

    for line in lines:
        line = line.strip()
        if "content: '```yaml" in line:
            recording = True
            buffer = []
        elif recording:
            if "```'" in line:
                recording = False
                yaml_block = "\n".join(buffer)
                if yaml_block:
                    try:
                        data = yaml.safe_load(yaml_block)
                        if data:
                            parsed_data.append(data)
                        else:
                            failed_data.append(yaml_block)
                    except yaml.YAMLError:
                        failed_data.append(yaml_block)
            else:
                buffer.append(line)

    return parsed_data, failed_data

In [26]:
file_path = '../../data/unos_mapping_output.yaml'
parsed_yaml, failed_yaml = parse_and_categorize_yaml(file_path)
# print("Parsed YAML:", parsed_yaml[:5])
# print("Failed YAML:", failed_yaml[:5])

print("Parsed YAML Count:", len(parsed_yaml))
print("Failed YAML Count:", len(failed_yaml))

Parsed YAML Count: 202
Failed YAML Count: 24


In [27]:
len(parsed_yaml) 

202

In [28]:
import json
import pandas as pd

# Load the JSON data
with open('../../data/THORACIC_DATA_snippet_parsed_data_dict.json', 'r') as file:
    thoracic_data = json.load(file)

# Convert JSON data where each key is the variable name
data_items = [{'Variable_name': key, **value} for key, value in thoracic_data.items()]

# Convert both lists of dictionaries to DataFrames
df_thoracic = pd.DataFrame(data_items)
df_yaml = pd.DataFrame(parsed_yaml)

# Merge on 'Variable_name'
df_merged = pd.merge(df_thoracic, df_yaml, on='Variable_name', how='outer')

# Specify the desired column order
column_order = [
    'Variable_name', 'description', 'form', 'var_start_date', 'var_end_date', 
    'form_section', 'data_type', 'sas_analysis_format', 'comment', 
    'HPO_term', 'HPO_label', 'function', 'observed_values', 'valid_values'
]

# Reorder the columns, placing missing columns with NaN if they don't exist
df_merged = df_merged.reindex(columns=column_order)

# Output the merged DataFrame to a TSV file
df_merged.to_csv('../../data/THORACIC_DATA_merged_data_dict_and_mappings.tsv', sep='\t', index=False)
df_merged

Unnamed: 0,Variable_name,description,form,var_start_date,var_end_date,form_section,data_type,sas_analysis_format,comment,HPO_term,HPO_label,function,observed_values,valid_values
0,ABN_CONGEN_DON,DDR:Structural Abnormalities //Congenital:,DDR,2004-06-30 00:00:00,,ORGAN RECOVERY,CHAR(1),,,HP:0001710,Congenital abnormality of the great vessels,x == ''Y'',[],
1,ABN_LVH_DON,DDR:Structural Abnormalities //LVH:,DDR,2004-06-30 00:00:00,,ORGAN RECOVERY,CHAR(1),,,HP:0001712,Left ventricular hypertrophy,x == ''Y'',[],
2,ABN_VALVES_DON,DDR:Structural Abnormalities //Valves:,DDR,2004-06-30 00:00:00,,ORGAN RECOVERY,CHAR(1),,,HP:0031653,Abnormal heart valve physiology,"x == ""Y""",[],
3,ABO,RECIPIENT BLOOD GROUP @ REGISTRATION,TCR,1987-10-01 00:00:00,,CLINICAL INFORMATION,C,ABO,,,,,"[O, A1, B, AB, A]","{'Null or Missing': 'Not Reported', 'A': 'A', ..."
4,ABO_DON,DONOR BLOOD TYPE,DDR/LDR,1987-10-01 00:00:00,,DONOR INFORMATION,C,ABO,,,,,"[, O, A1, B, AB, A, A2]","{'Null or Missing': 'Not Reported', 'A': 'A', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526,WL_ID_CODE,ENCRYPTED REGISTRATION IDENTIFIER,CALCULATED,,,,NUM,,,,,,"[297495, 245835, 651475, 626441, 115094, 65782...",
527,WL_ORG,ORGAN LISTED FOR,WL DATA,1987-10-01 00:00:00,,,CHAR(4),,,,,,"[LU, HL, HR]",
528,WORK_INCOME_TCR,WORK FOR INCOME AT REGISTRATION?,TCR,2004-06-30 00:00:00,,CANDIDATE INFORMATION,CHAR(1),,,HP:0000001,All,x == ''Y'',"[, N, U]",
529,WORK_INCOME_TRR,RECIPIENT WORK FOR INCOME AT TRANSPLANT?,TRR,2004-06-30 00:00:00,,PATIENT STATUS,CHAR(1),,,,,,"[, N]",
