In [None]:
def getICDcodes(cidlist):
    """Return the ICD9CM/ICD10CM concepts mapped from a set of concepts.

    The query collects every ``descendant_concept_id`` that
    ``concept_ancestor`` lists under the given ancestor concept IDs, then
    follows each 'Mapped from' relationship of those concepts to a concept
    whose vocabulary is ICD9CM or ICD10CM.

    Args:
        cidlist: Iterable of ancestor concept IDs, e.g. ``[380395, 4208199]``.
            Each element is rendered with ``str()`` and interpolated directly
            into the SQL text, so only pass trusted numeric IDs.

    Returns:
        pandas.DataFrame with one row per (concept, ICD concept) mapping and
        the columns ``SNOMED_conceptID``, ``SNOMED_name``,
        ``relationship_id``, ``ICD_conceptID``, ``ICD_source``, ``ICD_name``
        and ``ICD_code``.

    Raises:
        ValueError: If ``cidlist`` is empty.
    """
    cid = ','.join(map(str, cidlist))
    if not cid:
        # An empty list would be rendered as "IN ()", which is not valid SQL;
        # fail here with a clear message instead of after a round trip.
        raise ValueError("cidlist must contain at least one concept ID")

    # CDR is the dataset identifier defined elsewhere in the notebook.
    query = f'''

    SELECT
        cr.concept_id_1 AS SNOMED_conceptID ,
        c1.concept_name AS SNOMED_name,
        cr.relationship_id,
        c2.concept_id AS ICD_conceptID,
        c2.vocabulary_id AS ICD_source,
        c2.concept_name AS ICD_name,
        c2.concept_code AS ICD_code

    FROM
        `{CDR}.concept_relationship` cr
        JOIN `{CDR}.concept` c2 on cr.concept_id_2 = c2.concept_id    
        JOIN `{CDR}.concept` c1 on cr.concept_id_1 = c1.concept_id

    WHERE
        #concept_id_1 IN (380395) #Retinal Dystrophy
        concept_id_1 IN (
            SELECT descendant_concept_id
            FROM `{CDR}.concept_ancestor`
            WHERE ancestor_concept_id IN ({cid}) 
        )
        AND
            cr.relationship_id = 'Mapped from'
        AND 
            c2.vocabulary_id IN ('ICD9CM', 'ICD10CM')
    '''

    # pd.read_gbq is the same function as pd.io.gbq.read_gbq; the short form
    # matches the other query helpers in this notebook.
    return pd.read_gbq(query, progress_bar_type="tqdm_notebook")
def reconstructICDcodes(cidlist):
    """Rebuild an ICD9CM/ICD10CM code set starting from source concept IDs.

    The query works in three steps:

    1. Follow 'Maps to' relationships from the given concept IDs
       (``concept_id_1``) to their target concepts (``concept_id_2``).
    2. Collect every ``descendant_concept_id`` that ``concept_ancestor``
       lists under those targets.
    3. Follow each 'Mapped from' relationship of those descendants to a
       concept whose vocabulary is ICD9CM or ICD10CM.

    Args:
        cidlist: Iterable of concept IDs to start from. Each element is
            rendered with ``str()`` and interpolated directly into the SQL
            text, so only pass trusted numeric IDs.

    Returns:
        pandas.DataFrame with the columns ``SNOMED_conceptID``,
        ``SNOMED_name``, ``relationship_id``, ``ICD_conceptID``,
        ``ICD_source``, ``ICD_name`` and ``ICD_code``.

    Raises:
        ValueError: If ``cidlist`` is empty.
    """
    cid = ','.join(map(str, cidlist))
    if not cid:
        # An empty list would be rendered as "IN ()", which is not valid SQL;
        # fail here with a clear message instead of after a round trip.
        raise ValueError("cidlist must contain at least one concept ID")

    # CDR is the dataset identifier defined elsewhere in the notebook.
    query = f'''
    SELECT 
        cr.concept_id_1 AS SNOMED_conceptID ,
        c1.concept_name AS SNOMED_name,
        cr.relationship_id,
        c2.concept_id AS ICD_conceptID,
        c2.vocabulary_id AS ICD_source,
        c2.concept_name AS ICD_name,
        c2.concept_code AS ICD_code

    FROM
        `{CDR}.concept_relationship` cr
        JOIN `{CDR}.concept` c2 ON cr.concept_id_2 = c2.concept_id    
        JOIN `{CDR}.concept` c1 ON cr.concept_id_1 = c1.concept_id
    WHERE
        -- domain_id = "Condition"
        cr.concept_id_1 IN (
            SELECT descendant_concept_id
            FROM `{CDR}.concept_ancestor`
            WHERE ancestor_concept_id IN (
                SELECT cr2.concept_id_2
                FROM `{CDR}.concept_relationship` cr2
                WHERE cr2.relationship_id = "Maps to"
                    AND cr2.concept_id_1 IN ({cid})
            )
        )
        AND
            cr.relationship_id = 'Mapped from'
        AND 
            c2.vocabulary_id IN ('ICD9CM', 'ICD10CM')
   
    '''

    return pd.read_gbq(query, progress_bar_type="tqdm_notebook")
def getICD_personid(ICDcodes_list):
    """Fetch condition occurrences for a set of source concepts, per person.

    Two queries are run:

    1. ``condition_occurrence`` rows whose ``condition_source_concept_id`` is
       in ``ICDcodes_list``, joined to ``person`` (demographics), ``concept``
       (name/vocabulary of the source concept) and ``cb_search_person``.
       The last join keeps only persons with
       ``has_whole_genome_variant = 1``.
    2. For the persons found, the latest ``observation_date`` among
       observations whose source concept descends from the concept named
       'Consent PII' (class 'Module'). This becomes ``primary_consent_date``.

    Args:
        ICDcodes_list: Iterable of source concept IDs (e.g. the
            ``ICD_conceptID`` column produced by ``getICDcodes``). Elements
            are rendered with ``str()`` and interpolated into the SQL text,
            so only pass trusted numeric IDs.

    Returns:
        pandas.DataFrame with one row per matching condition occurrence, the
        columns selected by the first query, plus ``primary_consent_date``
        localized to UTC (``NaT`` where no consent observation was found).
        If no person matches, the frame is empty but still carries the
        ``primary_consent_date`` column.

    Raises:
        ValueError: If ``ICDcodes_list`` is empty.
    """
    ICDcodes_query = ','.join(map(str, ICDcodes_list))
    if not ICDcodes_query:
        # An empty list would be rendered as "IN ()", which is not valid SQL.
        raise ValueError("ICDcodes_list must contain at least one concept ID")

    # CDR is the dataset identifier defined elsewhere in the notebook.
    ICD_person = pd.read_gbq(f'''

    SELECT
        #co.*,
        #co.condition_occurrence_id,
        co.person_id,
        #co.concept_id,
        co.condition_start_datetime,
        #co.condition_end_datetime,
        #co.visit_occurrence_id,
        co.condition_source_value,
        co.condition_source_concept_id,
        c.concept_name,
        c.vocabulary_id,
        c.concept_id,
        #p.person_id,
        p.birth_datetime,
        p.gender_source_value,
        p.sex_at_birth_source_value,
        p.race_source_value,
        p.ethnicity_source_value
        #p.*
    FROM
        `{CDR}.condition_occurrence` co
        JOIN `{CDR}.person` p on co.person_id = p.person_id
        JOIN `{CDR}.concept` c on co.condition_source_concept_id = c.concept_id
        JOIN `{CDR}.cb_search_person` sp 
          ON p.person_id = sp.person_id 
          AND sp.has_whole_genome_variant = 1 

    WHERE

        condition_source_concept_id IN ({ICDcodes_query}) #retinal dystrophy & choroidal dystrophy

    ''',
        progress_bar_type="tqdm_notebook",
        dialect='standard',
    )

    # Distinct persons returned by the first query; they scope the consent
    # lookup below.
    person_ids = ICD_person['person_id'].unique().tolist()

    if person_ids:
        person_ids_query = ','.join(map(str, person_ids))

        # Latest consent-module observation date per person.
        consent_query = f'''
    SELECT DISTINCT person_id, MAX(observation_date) AS primary_consent_date
    FROM `{CDR}.concept`
    JOIN `{CDR}.concept_ancestor` ON concept_id = ancestor_concept_id
    JOIN `{CDR}.observation` ON descendant_concept_id = observation_source_concept_id
    WHERE concept_name = 'Consent PII' 
      AND concept_class_id = 'Module'
      AND person_id IN ({person_ids_query})
    GROUP BY person_id
    '''
        consent_dates = pd.read_gbq(consent_query, progress_bar_type="tqdm_notebook")

        # Left merge keeps every condition row even when a person has no
        # consent observation.
        final_result = ICD_person.merge(consent_dates, on='person_id', how='left')
    else:
        # No matching persons: the consent query would contain
        # "person_id IN ()" and fail, so skip it and return the empty frame
        # with the same extra column callers expect.
        final_result = ICD_person.copy()
        final_result['primary_consent_date'] = pd.NaT

    # Normalize the consent date to a timezone-aware (UTC) datetime column.
    final_result['primary_consent_date'] = pd.to_datetime(final_result['primary_consent_date'])
    final_result['primary_consent_date'] = final_result['primary_consent_date'].dt.tz_localize('UTC')

    return final_result
def calcAgeAtVisit(ICD_person_table):
    """Add age-at-visit columns to ``ICD_person_table`` and return it.

    The frame is modified in place. Two columns are written:

    * ``AgeAtVisit`` - ``condition_start_datetime`` minus ``birth_datetime``.
    * ``AgeAtVisit_Years`` - the whole-day part of that difference divided
      by 365.25.

    Args:
        ICD_person_table: DataFrame holding ``condition_start_datetime`` and
            ``birth_datetime`` columns.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    visit_time = ICD_person_table["condition_start_datetime"]
    birth_time = ICD_person_table["birth_datetime"]

    # Elapsed time between birth and the condition start.
    ICD_person_table["AgeAtVisit"] = visit_time - birth_time

    # Express the elapsed time in years; 365.25 averages in leap years.
    elapsed_days = ICD_person_table["AgeAtVisit"].dt.days
    ICD_person_table["AgeAtVisit_Years"] = elapsed_days / 365.25

    return ICD_person_table
def calcAgeAtConsent(ICD_person_table):
    """Add age-at-consent columns to ``ICD_person_table`` and return it.

    The frame is modified in place. Two columns are written:

    * ``AgeAtConsent`` - ``primary_consent_date`` minus ``birth_datetime``.
    * ``AgeAtConsent_Years`` - the whole-day part of that difference divided
      by 365.25.

    Args:
        ICD_person_table: DataFrame holding ``primary_consent_date`` and
            ``birth_datetime`` columns.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    consent_time = ICD_person_table["primary_consent_date"]
    birth_time = ICD_person_table["birth_datetime"]

    # Elapsed time between birth and the consent date.
    ICD_person_table["AgeAtConsent"] = consent_time - birth_time

    # Express the elapsed time in years; 365.25 averages in leap years.
    elapsed_days = ICD_person_table["AgeAtConsent"].dt.days
    ICD_person_table["AgeAtConsent_Years"] = elapsed_days / 365.25

    return ICD_person_table
def keepYoungestVisit(ICD_person_age_table):
    """Keep, for each person, the row with the smallest ``AgeAtVisit``.

    Args:
        ICD_person_age_table: DataFrame with ``person_id`` and
            ``AgeAtVisit`` columns.

    Returns:
        A new DataFrame containing the selected rows, re-indexed from 0.
    """
    per_person = ICD_person_age_table.groupby('person_id')
    # Index label of the minimum-age row within each person's group.
    youngest_labels = per_person['AgeAtVisit'].idxmin()

    selected = ICD_person_age_table.loc[youngest_labels]
    return selected.reset_index(drop=True)
class TqdmFileWrapper:
    """File-like proxy that reports bytes read to a progress bar.

    ``read`` is intercepted so the progress bar advances by the length of
    each chunk returned; every other attribute is looked up on the wrapped
    file object.
    """

    def __init__(self, file_obj, pbar):
        # file_obj: the underlying readable object.
        # pbar: any object exposing ``update(n)``.
        self.file_obj, self.pbar = file_obj, pbar

    def read(self, size=-1):
        """Read up to ``size`` units from the wrapped file and advance the bar."""
        chunk = self.file_obj.read(size)
        self.pbar.update(len(chunk))
        return chunk

    def __getattr__(self, attr):
        # Only invoked for names not found on the wrapper itself; forward
        # them to the wrapped file object.
        return getattr(self.file_obj, attr)

def saveToBucket(df, df_filename, data_folder = "personID_concept"):
    """Write ``df`` to a local TSV file and upload that file to the bucket.

    The file is written to ``df_filename`` in the current working directory
    (tab-separated, no index column) and then uploaded to
    ``data/<data_folder>/<df_filename>`` inside the bucket named by the
    global ``my_bucket``. A tqdm progress bar tracks the upload.

    Args:
        df: DataFrame to save.
        df_filename: Local file name, also used as the final path component
            of the destination blob.
        data_folder: Folder under ``data/`` in the bucket.
    """
    # Materialize the frame on local disk; the upload streams this file.
    df.to_csv(df_filename, sep="\t", index=False)

    # ``my_bucket`` comes from the surrounding notebook scope. The storage
    # client is given the bare bucket name, so drop any "gs://" scheme.
    has_scheme = my_bucket.startswith("gs://")
    bucket_name = my_bucket.replace("gs://", "") if has_scheme else my_bucket

    gcs_bucket = storage.Client().bucket(bucket_name)

    # Destination object path inside the bucket.
    destination_blob_name = "/".join(["data", data_folder, df_filename])

    # Total byte count sizes the progress bar.
    file_size = os.path.getsize(df_filename)

    blob = gcs_bucket.blob(destination_blob_name)

    with open(df_filename, 'rb') as local_file:
        with tqdm.tqdm(total=file_size, unit='B', unit_scale=True, desc="Uploading") as pbar:
            # Each read() issued by the upload advances the bar.
            blob.upload_from_file(TqdmFileWrapper(local_file, pbar))

    print("Upload complete!")
    
def getTable(table_name, folder = "personID_concept"):
    """Download a tab-separated table from the workspace bucket and load it.

    Copies ``<WORKSPACE_BUCKET>/data/<folder>/<table_name>`` into the current
    working directory with ``gsutil cp`` and reads the local copy with
    pandas.

    Args:
        table_name: File name of the table inside the bucket folder; also the
            name of the local copy that is read.
        folder: Folder under ``data/`` in the bucket.

    Returns:
        pandas.DataFrame parsed from the downloaded tab-separated file.

    Raises:
        RuntimeError: If the ``WORKSPACE_BUCKET`` environment variable is
            not set.
        subprocess.CalledProcessError: If ``gsutil cp`` exits with a non-zero
            status.
    """
    # Imported locally so this function does not depend on a file-level import.
    import subprocess

    my_bucket = os.getenv('WORKSPACE_BUCKET')
    if not my_bucket:
        raise RuntimeError("WORKSPACE_BUCKET environment variable is not set")

    # Copy the file from the bucket to the current working directory. An
    # argument list (no shell) keeps names with quotes or spaces intact, and
    # check=True surfaces a failed copy instead of reporting success and then
    # reading a missing or stale local file.
    source_uri = f"{my_bucket}/data/{folder}/{table_name}"
    subprocess.run(["gsutil", "cp", source_uri, "."], check=True)

    print(f'[INFO] {table_name} is successfully downloaded into your working space')

    # Load the downloaded tab-separated file.
    table_read = pd.read_csv(table_name, sep="\t")
    return table_read
def getFile(table_name, folder):
    """Download a file from the workspace bucket into the working directory.

    Copies ``<WORKSPACE_BUCKET>/data/<folder>/<table_name>`` to the current
    working directory with ``gsutil cp``.

    Args:
        table_name: File name inside the bucket folder.
        folder: Folder under ``data/`` in the bucket.

    Raises:
        RuntimeError: If the ``WORKSPACE_BUCKET`` environment variable is
            not set.
        subprocess.CalledProcessError: If ``gsutil cp`` exits with a non-zero
            status.
    """
    # Imported locally so this function does not depend on a file-level import.
    import subprocess

    my_bucket = os.getenv('WORKSPACE_BUCKET')
    if not my_bucket:
        raise RuntimeError("WORKSPACE_BUCKET environment variable is not set")

    # An argument list (no shell) keeps names with quotes or spaces intact,
    # and check=True prevents the success message below from being printed
    # after a failed copy.
    source_uri = f"{my_bucket}/data/{folder}/{table_name}"
    subprocess.run(["gsutil", "cp", source_uri, "."], check=True)

    print(f'[INFO] {table_name} is successfully downloaded into your working space')
def makeMultiSQLqeury(queryList):
    """Join the items of ``queryList`` into a comma-separated string.

    Each item is converted with ``str()``; the result can be pasted into an
    SQL ``IN (...)`` list.

    Args:
        queryList: Iterable of values to join.

    Returns:
        The comma-separated string (empty when ``queryList`` is empty).
    """
    return ','.join(str(item) for item in queryList)

In [None]:
# Starting example: get the ICD9CM/ICD10CM codes for hereditary retinal and
# choroidal dystrophy. The two numbers are the ancestor concept IDs passed
# to getICDcodes.
retchordys_ICD = getICDcodes([380395, 4208199])
# Write the code set to a local TSV and upload it to the
# "Concept_Sets_ICDcodes" folder of the bucket.
saveToBucket(retchordys_ICD, "retchordys_ICD_codes.tsv", "Concept_Sets_ICDcodes")

In [None]:
# Example pipeline for one code set - repeat for the other sets.
# 1. Take the ICD concept IDs of the set.
# 2. Get the condition occurrences (with person data) mapped to them.
# 3. Add age at visit and age at consent.
# 4. Keep one row per person: the visit with the smallest age.
retchordys_ICD_query = retchordys_ICD["ICD_conceptID"].tolist()
retchordys_ICD_person = getICD_personid(retchordys_ICD_query)
retchordys_ICD_person_age = calcAgeAtVisit(retchordys_ICD_person)
retchordys_ICD_person_age = calcAgeAtConsent(retchordys_ICD_person_age)
retchordys_ICD_person_age_youngest = keepYoungestVisit(retchordys_ICD_person_age)

# Repeat for all other code sets. For nested sets (retdegen, screening), use
# the supplementary tables and plug them into getICD_personid.
# Each call below passes the ancestor concept ID(s) for the set named by the
# variable.
myopia_ICD = getICDcodes([379805])
hypermetropia_ICD = getICDcodes([376415])
pucker_ICD = getICDcodes([379010])
retdegen_ICD = getICDcodes([4318985])
AMD_specific_ICD = getICDcodes([374028])
ExudativeAMD_ICD = getICDcodes([376966])
NonexudativeAMD_ICD = getICDcodes([372629])
CME_ICD = getICDcodes([4105178,376115])

