In [None]:
# Step 1 - query the AoU database for personIDs with linked EHRs that have variants of interest

# initialize

import pandas as pd
import os
import subprocess
from datetime import datetime
from matplotlib import pyplot
import matplotlib.pyplot as plt
import seaborn as sns

# Get the workspace bucket name.
# Index os.environ directly (as done for WORKSPACE_CDR below) so a missing variable
# fails here with a KeyError instead of the string "None" being silently
# interpolated into the gsutil paths built later in the notebook.
my_bucket = os.environ['WORKSPACE_BUCKET']

# Get the BigQuery curated dataset for the current workspace context.
CDR = os.environ['WORKSPACE_CDR']

from google.cloud import bigquery
# Instantiate a BigQuery client
client = bigquery.Client()
#!pip install upsetplot #if necessary

from tqdm import tqdm

# Use the full option names: abbreviated keys only work through partial matching
# and can break if pandas adds another option with a similar name.
pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows', 7000)

In [None]:
# Load in your table of variants.
# In this example it is df_IRD_concat and the variants are in the column "variant_id";
# make sure to replace these with your own dataframe and column name as needed.

# De-duplicate (and drop missing values) so the list is actually unique, as its name says.
full_vid_list_unique = df_IRD_concat["variant_id"].dropna().unique().tolist()
if not full_vid_list_unique:
    # An empty list would previously have produced invalid SQL (`WHERE vid IN ()`).
    raise ValueError("No variant IDs found in df_IRD_concat['variant_id']; nothing to query.")

# Quoted, comma-separated form of the list. The query below no longer needs it
# (the IDs are passed as a query parameter), but the name is kept in case other
# cells still reference it.
vid_list_str = ', '.join([f"'{vid}'" for vid in full_vid_list_unique])

# The variant IDs are bound as an array parameter (@vids) rather than pasted into
# the SQL text, so no quoting/escaping of the values is needed.
# Table names cannot be parameterized, so {CDR} is still interpolated.
variant_person_query = f'''

SELECT
    vt.vid AS variant_id,
    p.person_id,
    p.year_of_birth,
    p.birth_datetime,
    p.gender_source_value,
    p.sex_at_birth_source_value,
    p.race_source_value,
    p.ethnicity_source_value

FROM
     -- Select persons filtered by variants
    (
        SELECT DISTINCT person_id, vid
        FROM `{CDR}.cb_variant_to_person`
        CROSS JOIN UNNEST(person_ids) AS person_id
        WHERE vid IN UNNEST(@vids)

    ) vt
    JOIN `{CDR}.cb_search_person` sp
      ON vt.person_id = sp.person_id
      AND sp.has_ehr_data = 1
    LEFT JOIN `{CDR}.person` p ON vt.person_id = p.person_id


'''

variant_person_job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter(
            "vids", "STRING", [str(vid) for vid in full_vid_list_unique]
        )
    ]
)

# Run the query with the BigQuery client created in the setup cell and load the
# result into a DataFrame (one row per variant_id / person_id pair).
full_pid_vid_table = client.query(
    variant_person_query, job_config=variant_person_job_config
).to_dataframe()

# it is critical that sp.has_ehr_data = 1 to ensure that the person has linked EHR data
# as 25% of AoU participants with srWGS do not have linked EHR data

# NOTE(review): the inheritance-subsetting cells further down filter
# full_pid_vid_table on a "GeneID_EG" column, which this query does not return.
# Presumably gene annotations need to be merged in (e.g. from df_IRD_concat on
# "variant_id") before those cells are run - confirm.

In [None]:
# To enable separation into categories by inheritance mode if desired,
# load in the OMIM gene inheritances table
# (here it is Gene_Inheritance_Disease.txt in the workspace bucket's data/uploads folder)
# and, optionally, annotate it with Entrez IDs.

# Download the gene inheritance file from the workspace bucket.
# The file name gets its own variable so `gene_inheritances` only ever refers to
# the DataFrame.
gene_inheritances_filename = "Gene_Inheritance_Disease.txt"

# `gsutil ls` output ends with a newline; strip it, otherwise the newline ends up
# inside the object path handed to `gsutil cp` and the copy fails.
gene_inheritances_file_path = subprocess.check_output(
    ["gsutil", "ls", f"{my_bucket}/data/uploads/{gene_inheritances_filename}"]
).decode('utf-8').strip()

# check=True raises CalledProcessError if the copy fails, instead of silently
# continuing to read_csv on a file that was never downloaded.
subprocess.run(["gsutil", "cp", gene_inheritances_file_path, "."], check=True)

# The copy above places the file in the current directory under its original name.
gene_inheritances = pd.read_csv(gene_inheritances_filename)

# Annotate with Entrez IDs.
# If mygene is not installed, uncomment the following line to install it
#!pip install mygene
import mygene
mg = mygene.MyGeneInfo()

# Look up each gene symbol; the result is returned as a DataFrame
# (as_dataframe=True).
mgout = mg.querymany(gene_inheritances["Gene"].tolist(), scopes="symbol",
                     fields = "symbol,entrezgene", species = "human", as_dataframe=True)

# Keep only rows where a symbol was returned, then turn the index into a regular
# column (the merge below joins on a column named 'query').
mgout = mgout[mgout.symbol.notna()]
mgout = mgout.reset_index()
# NOTE(review): drops the row at positional label 33 after the reset. The reason is
# not visible here (presumably a duplicate or unwanted hit for one symbol), and the
# label depends on the exact input file and mygene response, so this will remove a
# different row - or raise KeyError - if either changes. Confirm and replace with
# an explicit condition.
mgout = mgout.drop(33, axis = 0)

# Inner merge: genes with no remaining mygene row are dropped from gene_inheritances.
gene_inheritances = pd.merge(gene_inheritances[['Gene', 'OMIM Inheritance', 'OMIM disease']],
                             mgout[['query', 'entrezgene']], left_on="Gene", right_on ="query")

In [None]:
# Example code to extract AD genes
inheritance_pattern = "^AD"  # regex: OMIM inheritance string starts with "AD"
genes_to_extract = gene_inheritances[
    gene_inheritances["OMIM Inheritance"].str.contains(inheritance_pattern, na=False)
]

# Convert the Entrez IDs to ints. Missing or non-numeric IDs are coerced to NaN and
# dropped first, because int() on a NaN would raise and abort the whole cell.
gene_list_extract = (
    pd.to_numeric(genes_to_extract["entrezgene"], errors="coerce")
    .dropna()
    .astype(int)
    .tolist()
)

# NOTE(review): "GeneID_EG" is not among the columns selected by the BigQuery query
# that builds full_pid_vid_table; presumably it is added by a merge with the variant
# annotation table before this cell is run - confirm.
AD_subset = full_pid_vid_table[full_pid_vid_table["GeneID_EG"].isin(gene_list_extract)]

In [None]:
# Example code to extract XL males

inheritance_pattern = "XL"  # matches any OMIM inheritance string containing "XL"
genes_to_extract = gene_inheritances[
    gene_inheritances["OMIM Inheritance"].str.contains(inheritance_pattern, na=False)
]

# Convert the Entrez IDs to ints. Missing or non-numeric IDs are coerced to NaN and
# dropped first, because int() on a NaN would raise and abort the whole cell.
gene_list_extract = (
    pd.to_numeric(genes_to_extract["entrezgene"], errors="coerce")
    .dropna()
    .astype(int)
    .tolist()
)

# NOTE(review): "GeneID_EG" is not among the columns selected by the BigQuery query
# that builds full_pid_vid_table; presumably it is added by a merge with the variant
# annotation table before this cell is run - confirm.
XL_subset = full_pid_vid_table[full_pid_vid_table["GeneID_EG"].isin(gene_list_extract)]

# na=False treats a missing sex-at-birth value as "not male"; without it the mask
# contains NaN and the boolean indexing raises.
XL_subset_male = XL_subset[
    XL_subset["sex_at_birth_source_value"].str.contains("SexAtBirth_Male", na=False)
]