# RECOVER Adult PIC-SURE Data Dictionary 
This notebook creates the RECOVER Adult Cohort Data Dictionary using the RECOVER data available via the PIC-SURE API through the development environment.

In [None]:
# Import the libraries used throughout this notebook
import sys
import pandas as pd
import matplotlib.pyplot as plt
import pyarrow.feather as feather
import re
import numpy as np

## Step 1: Gather data from PIC-SURE API (or load existing data)
In this step, we get:
1. PIC-SURE Dictionary for RECOVER Adult cohort
2. Participant-level data for RECOVER Adult cohort

In [None]:
# Install and import the PIC-SURE API client libraries
!{sys.executable} -m pip install --upgrade --force-reinstall git+https://github.com/hms-dbmi/pic-sure-python-client.git
!{sys.executable} -m pip install --upgrade --force-reinstall git+https://github.com/hms-dbmi/pic-sure-python-adapter-hpds.git
!{sys.executable} -m pip install --upgrade --force-reinstall git+https://github.com/hms-dbmi/pic-sure-biodatacatalyst-python-adapter-hpds.git
import PicSureClient
import PicSureBdcAdapter

In [None]:
# Connect to PIC-SURE API
PICSURE_network_URL = "https://picsure.biodatacatalyst.nhlbi.nih.gov/picsure"
token_file = "token.txt"

# Read the personal access token from disk. Surrounding whitespace is stripped
# because text editors commonly append a trailing newline to the file, and that
# newline is not part of the token.
with open(token_file, "r") as f:
    my_token = f.read().strip()

# Adapter object used by the following cells to reach the dictionary and query services
bdc = PicSureBdcAdapter.Adapter(PICSURE_network_URL, my_token)

In [None]:
# Look up the RECOVER Adult study (phs003463) in the PIC-SURE Dictionary.
# `find` is a text search, so the hits are filtered afterwards to keep only the
# rows whose studyId is exactly phs003463.
dictionary = bdc.useDictionary().dictionary()
all_variables = dictionary.find("phs003463")
search_vars = all_variables.dataframe()
is_recover_adult = search_vars["studyId"] == "phs003463"
recover_vars = search_vars[is_recover_adult]

In [None]:
# Split the variable list into two halves so each half can be exported separately
halfway = len(recover_vars) // 2
first_half = recover_vars[:halfway]
second_half = recover_vars[halfway:]

In [None]:
# Get RECOVER Adult data
# Note: The RECOVER data is currently too large to export in one query, which is why two queries are created and executed
authPicSure = bdc.useAuthPicSure()
# One query object per half of the variable list
test_query1 = authPicSure.query()
test_query2 = authPicSure.query()
# Add each half's HPDS paths to the "any of" section of its query
# NOTE(review): presumably each query returns participants having a value for at
# least one listed variable - confirm against the PIC-SURE client documentation
test_query1.anyof().add(first_half.HPDS_PATH)
test_query2.anyof().add(second_half.HPDS_PATH)
# Run both exports; each result is returned as a pandas DataFrame
recover_results1 = test_query1.getResultsDataFrame(low_memory = False)
recover_results2 = test_query2.getResultsDataFrame(low_memory = False)

In [None]:
# Merge the results of the two queries on the participant identifier columns
# that both exports share.
# An outer join is used so that a participant who appears in only one of the two
# exports is kept (with missing values for the other half's variables); the
# default inner join would silently drop such participants.
recover_results = recover_results1.merge(
    recover_results2,
    on=[
        "Patient ID",
        "\\_Parent Study Accession with Subject ID\\",
        "\\_Topmed Study Accession with Subject ID\\",
        "\\_consents\\",
    ],
    how="outer",
)

In [None]:
# Save as files to not rely on dev environment
# Participant-level export (merged result of the two queries)
feather.write_feather(recover_results, 'recover_export')
# Variable-level metadata from the PIC-SURE Dictionary
feather.write_feather(recover_vars, 'recover_variable_info')

In [None]:
# Load previously saved files
# These two reads replace the API cells above when the feather files already exist
recover_vars = feather.read_feather('recover_variable_info')
recover_results = feather.read_feather('recover_export')

## Step 2: Create Data Dictionary - Define Functions
Set up the functions to review the data and create the data dictionary.

Dataframe  / Data Dictionary plan

| Variable / Field Name | Field Label | Dataset | Data Type | Field Attributes | Mapped Instrument | Number of participants |
| ------ | ----- | ----- | ----- | ----- | ----- | ----- |
| Encoded variable name | Decoded Variable Description | Dataset Name | Continuous or Categorical | Continuous: min/max/mean, Categorical: Date, ID, or # top 6 values | RedCap Instrument mapping | Number of participants with values |
| PIC-SURE name | PIC-SURE decoded description | Ordered as follows: the enrollment dataset first, then enrollment demographics, then the other enrollment datasets, then the follow-up datasets, then biospecimens, then Fitbit | PIC-SURE data type | Continuous: values rounded to 2 decimal places; Categorical: top 6 values, ranked by number of participants with each value | Based on document shared | Number of participants with values |



In [None]:
# Keep only the dictionary columns that feed the RECOVER Adult data dictionary
dictionary_columns = [
    'columnmeta_var_id',
    'columnmeta_description',
    'columnmeta_var_group_id',
    'columnmeta_var_group_description',
    'columnmeta_data_type',
    'columnmeta_min',
    'columnmeta_max',
    'values',
    'HPDS_PATH',
]
simplified = recover_vars[dictionary_columns]

In [None]:
# Start with identifying the datasets used to map to instruments from RedCap form
# remove extra info in dataset, such as "enrollment_", "followup_XX_", and "miscellaneous_form_"
# The pattern is a raw string so that "\d" reaches the regex engine unchanged
# instead of being parsed as an (invalid) string escape sequence.
dataset_affixes = re.compile(
    r"enrollment_|followup_\d{1,2}_|miscellaneous_form_|_baseline_arm_\d{1,2}|_followup_\d{1,2}_arm_\d{1,2}"
)
picsure_dt = simplified['columnmeta_var_group_id'].unique()

# Collect each distinct instrument name once, in order of first appearance
instruments = []
for dt in picsure_dt:
    instrum = dataset_affixes.sub('', dt)
    if instrum not in instruments:
        instruments.append(instrum)

# Print the names so they can be mapped by hand in the next cell
for i in instruments:
    print(i)

In [None]:
# Map to RedCap instruments manually with human-readable name.
# Keys are dataset names with the visit prefix/suffix removed; values are the
# labels shown in the "Mapped Instrument" column. One entry per line so that
# duplicates are easy to spot ("disability" was previously listed twice).
instrument_mapping = {
    "alcohol_and_tobacco": "Alcohol and Tobacco",
    "assessment_scores": "Assessment Scores",
    "covid_treatment": "COVID Treatment",
    "demographics": "Demographics",
    "disability": "Disability",
    "enrollment": "Enrollment",
    "long_covid_treatment_trial": "Long COVID Treatment",
    "pasc_symptoms": "PASC Symptoms",
    "pregnancy": "Pregnancy",
    "recent_covid_treatment": "Recent COVID Treatment",
    "social_determinants_of_health": "Social Determinants of Health",
    "tier_12_consent_tracking": "Tier 1-2 consent tracking",
    "withdrawal": "Withdrawal",
    "alcohol_and_tobacco_followup": "Alcohol and Tobacco (Followup)",
    "new_covid_infection": "New COVID Infection",
    "pregnancy_followup": "Pregnancy (Followup)",
    "visit_form": "Visit form",
    "end_of_participation": "End Of Participation",
    "study_termination": "Study termination",
    "psg_quality_summary_form": "PSG Quality Summary Form",
    "facility_sleep_study": "Facility Sleep Study",
    "neonatal_delivery_and_outcome_form": "Neonatal Delivery and Outcome Form",
    "oral_glucose_test": "Oral Glucose Test",
    "colonoscopy": "Colonoscopy",
    "fibroscan": "Fibroscan",
    "cardiac_mri_reading_center": "Cardiac MRI Reading Center",
    "clinical_labs": "Clinical Labs",
    "research_labs": "Research Labs",
    "tier_1_office_visit": "Tier 1 Office Visit",
    "echocardiogram_with_strain": "Echocardiogram",
    "pft_reading_center": "PFT Reading Center",
    "acth_and_cortisol_test": "ACTH and Cortisol Test",
    "home_sleep_assessment": "Home Sleep Assessment",
    "biospecimens": "Biospecimens",
    "sleep_reading_center": "Sleep Reading Center",
    "adult_delivery_and_outcome_form": "Adult Delivery and Outcome Form",
    "formal_neuropsychological_testing": "Full neurocognitive testing",
    "comprehensive_audiometry": "Comprehensive Audiometry",
    "nih_toolbox": "NIH Toolbox",
    "brain_mri_with_gadolinium": "Brain MRI",
    "brain_mri_quality_confirmation": "Brain MRI Quality Confirmation",
    "neuropathy_examination": "Neuropathy Examination",
    "six_minute_walk_test": "Six Minute Walk Test",
    "electrocardiogram": "Electrocardiogram",
    "social_determinants_of_health_followup": "Social Determinants of Health (Followup)",
    "endopat_testing": "Endopat Testing",
    "chest_ct_reading_center": "Chest CT Reading Center",
    "comorbidities": "Comorbidities",
    "pcl5": "PCL5",
    "chest_ct": "Chest CT",
    "cardiac_mri": "Cardiac MRI",
    "renal_ultrasound": "Renal Ultrasound",
    "cardiopulmonary_exercise_testing": "Cardiopulmonary Exercise Testing",
    "pulmonary_function_tests": "Pulmonary Function Tests",
    "rehabilitation_testing": "Rehabilitation Testing",
    "full_ent_examination": "Full ENT Examination",
    "gastric_emptying_study": "Gastric Emptying Study",
    "hepatitis_tests": "Hepatitis Tests",
    "home_polysomnography_with_ess_and_isi": "Home Polysomnography With ESS and ISI",
    "serum_b12_and_methylmalonic_acid": "Serum B12 And Methylmalonic Acid",
    "mini": "MINI",
    "mini_prequestionnaire": "MINI Prequestionnaire",
    "pg13r": "PG13r",
    "upsit_smell_test": "UPSIT Smell Test",
    "vaccine_status": "Vaccine Status",
    "vision_testing": "Vision Testing",
    "change_in_symptoms_since_infection": "Change In Symptoms Since Infection",
    "cpet_reading_center": "CPET Reading Center",
    "medication_changes": "Medication Changes",
    "mhp_data": "MHP Data",
    "medications": "Medications",
    "wearable_data": "Wearable Data",
    "audiometry_survey": "Audiometry Survey",
    "cardiovagal_innervation_testing": "Cardiovagal Innervation Testing",
    "electromyography": "Electromyography",
    "nerve_conduction_study": "Nerve Conduction Study",
    "plasma_catecholamine_testing": "Plasma Catecholamine Testing",
    "skin_biopsy": "Skin Biopsy",
    "tilt_table_test": "Tilt Table Test",
    "drc_data": "DRC Data",
    "facility_sleep_questionnaire_morning_after": "Facility Sleep Questionnaire Morning After",
    "facility_sleep_questionnaire_night_before": "Facility Sleep Questionnaire Night Before",
    "biostats_derived_core_proc": "Biostats Derived Core",
    "biostats_derived_symptoms": "Biostats Derived Symptoms",
    "biostats_derived_visits": "Biostats Derived Visits",
}

# visit_data, fitbit, miscellaneous_form_visit_data, biospecimens do not have redcap forms
# NOTE(review): "biospecimens" is nevertheless mapped above - confirm which is intended
# biostats_ datasets are Derived data sets, these are split by visit "visits_baseline_arm_1" - take off "arm_1" as well


In [None]:
# Map PIC-SURE dataset column via these mappings to create a new column.
# The pattern is a raw string so that "\d" is passed to the regex engine as-is.
dataset_affixes = re.compile(
    r"enrollment_|followup_\d{1,2}_|miscellaneous_form_|_baseline_arm_\d{1,2}|_followup_\d{1,2}_arm_\d{1,2}"
)

for_df_instrum = []
for dt in simplified['columnmeta_var_group_id']:
    instrum = dataset_affixes.sub('', dt)
    # Datasets without a RedCap instrument get an empty label
    for_df_instrum.append(instrument_mapping.get(instrum, ''))

# Build the new column on the same index as `simplified`. `simplified` comes
# from a filtered DataFrame, so its index is not guaranteed to be 0..n-1, and
# concatenating against a fresh RangeIndex would pair labels with the wrong rows.
df_instrum = pd.DataFrame({"mapped_instrument": for_df_instrum}, index=simplified.index)
#df_instrum.head()
simplified_with_instrum = pd.concat([simplified, df_instrum], axis=1)
simplified_with_instrum.head()

In [None]:
# Select the columns used in the data dictionary and give them readable headers.
# Keys are the PIC-SURE column names (in output order); values are the headers.
column_labels = {
    'columnmeta_var_id': "Variable Name",
    'columnmeta_description': "Variable Description",
    'columnmeta_var_group_id': "Dataset",
    'columnmeta_var_group_description': "Dataset Description",
    'columnmeta_data_type': "Data Type",
    'mapped_instrument': "Mapped Instrument",
    'HPDS_PATH': 'HPDS_PATH',
    'values': "values",
}
mvp = simplified_with_instrum[list(column_labels)].rename(columns=column_labels)

In [None]:
# Create the data dictionary functions

def continuous(hpds_path, recover_results):
    """Summarise a continuous variable as its min, max and mean.

    Args:
        hpds_path: Column label of the variable in ``recover_results``.
        recover_results: Participant-level DataFrame holding the variable.

    Returns:
        dict with keys ``"min"``, ``"max"`` and ``"mean"``, each a built-in
        ``float`` rounded to 2 decimal places.
    """
    column = recover_results[hpds_path]
    # Selecting a label that occurs more than once yields a DataFrame rather
    # than a Series; in that case summarise the first matching column.
    if isinstance(column, pd.DataFrame):
        column = column.iloc[:, 0]

    # Converting to built-in float handles integer columns as well as float
    # ones and keeps the stored values plain numbers rather than NumPy scalars.
    return {
        "min": round(float(column.min()), 2),
        "max": round(float(column.max()), 2),
        "mean": round(float(column.mean()), 2),
    }

def categorical(hpds_path, recover_results, data_dict):
    """Describe a categorical variable for the data dictionary.

    Args:
        hpds_path: HPDS path of the variable; used both to find its row in
            ``data_dict`` and as the column label in ``recover_results``.
        recover_results: Participant-level DataFrame holding the variable.
        data_dict: DataFrame with ``HPDS_PATH``, ``Variable Name`` and
            ``values`` columns (``values`` holds the variable's categories).

    Returns:
        ``"ID"`` when the variable name contains ``participant_id``;
        ``"Date"`` when the first listed category looks like ``YYYY-MM-DD``;
        otherwise a dict mapping each category to the number of rows holding
        it. When there are 6 or more categories, only the 6 most frequent are
        kept, sorted by descending count. An empty dict is returned when the
        variable lists no categories.

    Raises:
        IndexError: if ``hpds_path`` has no row in ``data_dict``.
    """
    date_pattern = r'^\d{4}-\d{2}-\d{2}$'
    row = data_dict[data_dict.HPDS_PATH == hpds_path]

    if "participant_id" in row["Variable Name"].iloc[0]:
        return "ID"

    # Check the category list itself for emptiness, not the number of
    # dictionary rows, so an empty list cannot cause an IndexError below.
    raw_values = row["values"].iloc[0]
    row_values = [] if raw_values is None else list(raw_values)
    if not row_values:
        return {}

    first_value = row_values[0]
    if isinstance(first_value, str) and re.match(date_pattern, first_value):
        return "Date"

    column = recover_results[hpds_path]
    # A label that occurs more than once selects a DataFrame; use its first column.
    if isinstance(column, pd.DataFrame):
        column = column.iloc[:, 0]

    field_attributes = {}
    for cat in row_values:
        if cat == "false":
            key, target = False, False
        elif cat == "true":
            key, target = True, True
        else:
            key = cat
            # Numeric-looking categories are compared as numbers, all others as text
            try:
                target = float(cat)
            except (TypeError, ValueError):
                target = cat
        # Vectorised count; int() keeps the stored value a built-in int
        field_attributes[key] = int((column == target).sum())

    if len(field_attributes) > 5:
        most_common = sorted(field_attributes.items(), key=lambda x: x[1], reverse=True)[:6]
        field_attributes = dict(most_common)
    return field_attributes

# Code below to test functions
#x = categorical("\\phs003463\\enrollment_covid_treatment\\rx_carelevel_enrollment_covid_treatment\\", 
# recover_results, mvp)
#print(x)


In [None]:
# Cases for testing categorical options
# True/False spcmn:st_whl_bldt1_3m_a \\phs003463\\biospecimens\\SPCMN:ST_WHL_BLDT1_3M_A\\
# Multiple: alco_rxdrugspre_enrollment_alcohol_and_tobacco
# More than 6: rx_carelevel_enrollment_covid_treatment \\phs003463\\enrollment_covid_treatment\\rx_carelevel_enrollment_covid_treatment\\
# Participant ID: \\phs003463\\biospecimens\\participant_id\\
# Date: alcofu_colldt_followup_4_alcohol_and_tobacco_followup  \\phs003463\\followup_4_alcohol_and_tobacco_followup\\alcofu_colldt_followup_4_alcohol_and_tobacco_followup\\

In [None]:
# Create data dictionary using above functions
# Note - this chunk of code takes a while - about 15 mins

# One entry per row of `mvp`, in row order
num_participants_final = []
field_attr_final = []

for i, row in mvp.iterrows():
    # Read from the row yielded by iterrows rather than looking it up again by label
    hpds_path = row["HPDS_PATH"]
    d_type = row["Data Type"]

    # count() returns the number of non-missing values in the column
    num_participant = recover_results[hpds_path].count()
    num_participants_final.append(num_participant)

    if d_type == "continuous":
        field_attr = continuous(hpds_path, recover_results)
    elif d_type == "categorical":
        field_attr = categorical(hpds_path, recover_results, mvp)
    else:
        # Any other data type gets empty attributes instead of silently
        # reusing the attributes computed for the previous variable
        field_attr = {}

    print(hpds_path)
    print(field_attr)

    field_attr_final.append(field_attr)


In [None]:
# Add information to dataframe.
# The two lists were built in `mvp` row order, so they are assigned
# positionally as new columns. Concatenating separate DataFrames with a fresh
# 0..n-1 index would misalign rows whenever `mvp` does not have that exact index.
final_data_dict = mvp.copy()
final_data_dict["Number Participants"] = num_participants_final
final_data_dict["Field Attributes"] = field_attr_final
final_data_dict = final_data_dict[['Variable Name', 'Variable Description', 'Dataset', 'Dataset Description', 'Data Type', 'Mapped Instrument', 'Number Participants', 'Field Attributes']]
final_data_dict.tail()


In [None]:
# List the distinct dataset names so they can be put in the desired order:
# the enrollment dataset first, then enrollment demographics, then the other
# enrollment datasets, then the follow-up datasets, then biospecimens, then fitbit
dataset_names = final_data_dict['Dataset'].unique()
comp = list(dataset_names)
print(comp)

In [None]:
# After using a different tool to order the list, read in from text file and save as python list
with open("ordered_list.txt", "r") as instrum_list:
    lines = instrum_list.readlines()

# One dataset name per line. Surrounding whitespace (including "\r" from
# Windows line endings) is removed and blank lines are skipped, so the list
# only ever contains real dataset names.
ordered_list = [name for name in (line.strip() for line in lines) if name]

print(ordered_list)

In [None]:
# Check to make sure none are missing, in both directions
datasets_in_data = set(comp)
datasets_in_list = set(ordered_list)

# Names in the ordered list that do not occur in the data (e.g. typos)
for item in ordered_list:
    if item not in datasets_in_data:
        print("In ordered_list but not in data:", item)

# Datasets in the data that the ordered list does not mention; these have no
# defined position in the requested ordering
for item in comp:
    if item not in datasets_in_list:
        print("In data but not in ordered_list:", item)

In [None]:
# Reorder the data dictionary rows so datasets follow the order in ordered_list

# Start from ordered_list with repeated names removed (categories must be unique)
dataset_order = list(dict.fromkeys(ordered_list))

# Append any dataset present in the data but absent from ordered_list, so that
# converting to the categorical type below does not turn its name into NaN
listed = set(dataset_order)
present = final_data_dict['Dataset'].dropna().astype(str).unique()
dataset_order.extend(name for name in present if name not in listed)

# Create a categorical data type with the desired order
cat_dtype = pd.CategoricalDtype(categories=dataset_order, ordered=True)

# Convert the "Dataset" column to categorical with the desired order
final_data_dict['Dataset'] = final_data_dict['Dataset'].astype(cat_dtype)

# Sort the DataFrame based on the "Dataset" column. mergesort is stable, so
# variables keep their existing relative order within each dataset.
df_sorted = final_data_dict.sort_values(by='Dataset', kind='mergesort')

# Renumber the rows 0..n-1 in the new order
df_sorted.reset_index(drop=True, inplace=True)

# Now df_sorted contains the DataFrame with rows ordered according to the ordered_list
df_sorted.head()

In [None]:
# Write the ordered data dictionary to a CSV file in the working directory
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default - confirm that is wanted, or pass index=False
df_sorted.to_csv("RECOVER_Adult_PIC-SURE_Data_Dictionary_with_values.csv")