In [1]:
# Import libraries
import numpy as np
import os

# Imports for using and authenticating BigQuery
from google.colab import auth

In [2]:
# Authenticate this Colab session (the import comment above says `auth` is used for BigQuery access).
auth.authenticate_user()

In [3]:
# Replace the value of project_id with your own Google Cloud Project ID.
# It is exported as GOOGLE_CLOUD_PROJECT; the BigQuery client further down is created
# without an explicit project, so it presumably reads this variable (TODO confirm).
project_id = 'amplified-asset-330216'
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id


In [4]:
import ipywidgets as widgets

print("Variables for Region (Type 1) Queries")

# Selector for the gnomAD release to query.
gnomad_version_widget_region = widgets.Dropdown(
    description='gnomAD version:',
    options=['v2_1_1_exomes', 'v2_1_1_genomes', 'v3_genomes'],
    value='v3_genomes',
    style={'description_width': 'initial'},
    disabled=False,
)

# Selector for the chromosome: chr1..chr22 followed by chrX and chrY.
chromosome_widget_region = widgets.Dropdown(
    description='Chromosome:',
    options=['chr{}'.format(number) for number in range(1, 23)] + ['chrX', 'chrY'],
    value='chr17',
    style={'description_width': 'initial'},
    disabled=False,
)

# Free-text entry for the gene symbol.
gene_symbol_widget_region = widgets.Text(
    description='Gene Symbol:',
    placeholder='gene_symbol',
    value='BRCA1',
    style={'description_width': 'initial'},
    disabled=False,
)

# Render the three controls in the same order as they were created.
display(gnomad_version_widget_region,
        chromosome_widget_region,
        gene_symbol_widget_region)

Variables for Region (Type 1) Queries


Dropdown(description='gnomAD version:', index=2, options=('v2_1_1_exomes', 'v2_1_1_genomes', 'v3_genomes'), st…

Dropdown(description='Chromosome:', index=16, options=('chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7',…

Text(value='BRCA1', description='Gene Symbol:', placeholder='gene_symbol', style=DescriptionStyle(description_…

In [5]:
# Capture the current widget selections as plain variables for the Type 1 queries.
gnomad_version_region = gnomad_version_widget_region.value
chromosome_region = chromosome_widget_region.value
gene_symbol_region = gene_symbol_widget_region.value

print(
    'Running Region (Type 1) queries on gnomAD version: {}, chromosome: {}, gene symbol: {}'.format(
        gnomad_version_region, chromosome_region, gene_symbol_region))

# Variant type (snv, indel, multi-snv, multi-indel, or mixed) is stored under
# different columns in V2 and V3, and the extra vep columns only exist in V2.
if not gnomad_version_region.startswith('v3'):
  variant_type_col, extra_columns = (
      'alternate_bases. allele_type',
      'vep.STRAND AS STRAND, vep.Protein_position AS Protein_pos,',
  )
else:
  variant_type_col, extra_columns = 'variant_type', ''


Running Region (Type 1) queries on gnomAD version: v3_genomes, chromosome: chr17, gene symbol: BRCA1


In [6]:
from google.cloud import bigquery

# Shared BigQuery client used by run_query() below. No project is passed explicitly;
# presumably it is taken from GOOGLE_CLOUD_PROJECT set earlier — TODO confirm.
client = bigquery.Client()

def run_query(query, verbose=True):
    """Run a SQL string through the module-level BigQuery client and return a DataFrame.

    Args:
        query: SQL text handed to ``client.query``.
        verbose: when true, download with the ``tqdm_notebook`` progress bar and
            print how much data the job was billed for; when false, download
            silently.

    Returns:
        Whatever ``query_job.to_dataframe(...)`` returns for the submitted job.
    """
    job = client.query(query)

    # Quiet path: just materialise the result.
    if not verbose:
      return job.to_dataframe()

    frame = job.to_dataframe(progress_bar_type='tqdm_notebook')
    # Bytes -> GB, then GB as a percentage of 1024 GB (the 1 TB quota in the message).
    billed_gb = job.total_bytes_billed / 1024 ** 3
    quota_pct = round(billed_gb / 1024 * 100, 4)
    print('This query processed {} GB of data which is {}% of your 1 TB monthly free quota.'.format(billed_gb, quota_pct))
    return frame


In [10]:

# Spot-check: distinct variants within +/- example_flank bp of a single polyA site.
# chr19:58345197 is the A1BG.4 site listed in the polyadb BED table.
example_chrom = "chr19"
example_pas_pos = 58345197
example_flank = 150

query_template = """
SELECT DISTINCT 
       start_position,
       reference_bases,
       alternate_bases.alt,
       {VAR_TYPE_COL} AS variant_type,
FROM `bigquery-public-data.gnomAD.{GNOMAD_VER}__{CHROM}` AS main_table,
     main_table.alternate_bases AS alternate_bases
WHERE start_position >= {X} AND start_position <= {Y}
ORDER BY 1,2
"""
# Use the version-dependent column name chosen alongside gnomad_version_region, so the
# query stays valid when a V2 table is selected (the column differs between V2 and V3).
query = query_template.format(GNOMAD_VER=gnomad_version_region,
                              CHROM=example_chrom,
                              VAR_TYPE_COL=variant_type_col,
                              X=example_pas_pos - example_flank,
                              Y=example_pas_pos + example_flank)

pas_var_df = run_query(query, verbose=False)
pas_var_df.head()


Unnamed: 0,start_position,reference_bases,alt,variant_type
0,58345054,G,A,snv
1,58345055,C,T,snv
2,58345057,T,G,snv
3,58345059,A,C,snv
4,58345063,T,C,snv


In [7]:
import pandas as pd


In [8]:

# Load the polyA-site coordinates; the file is read as tab-separated with no header
# row, so the column names are supplied explicitly.
polyadb_bed = pd.read_csv(
    "../content/polyadb_coordinates_utr3_hg38.bed",
    sep='\t',
    header=None,
    names=['chrom', 'pas_pos', 'end', 'gene', 'gene_id', 'strand'],
)


In [9]:

# Display the loaded polyA-site table (bare last expression, so the notebook renders it).
polyadb_bed


Unnamed: 0,chrom,pas_pos,end,gene,gene_id,strand
0,chr19,58346883,58346884,A1BG,A1BG.1,-
1,chr19,58345849,58345850,A1BG,A1BG.2,-
2,chr19,58345799,58345800,A1BG,A1BG.3,-
3,chr19,58345197,58345198,A1BG,A1BG.4,-
4,chr19,58343947,58343948,A1BG,A1BG.5,-
...,...,...,...,...,...,...
100095,chr1,77562501,77562502,ZZZ3,ZZZ3.6,-
100096,chr1,77562439,77562440,ZZZ3,ZZZ3.7,-
100097,chr1,77562106,77562107,ZZZ3,ZZZ3.8,-
100098,chr1,77559986,77559987,ZZZ3,ZZZ3.9,-


In [11]:

# Chromosomes handled by this run. The commented-out lists are the other batches;
# swap one in to process it.
#chrs = ["chr22"]
#chrs = ["chr1", "chr2", "chr3", "chr4", "chr5"]
#chrs = ["chr6", "chr7", "chr8", "chr9", "chr10"]
#chrs = ["chrX", "chrY"]
#chrs = ["chr16", "chr17", "chr18", "chr19", "chr20", "chr21"]
chrs = ["chr11", "chr12", "chr13", "chr14", "chr15"]

# Padding (bp) added on each side of a gene's outermost polyA sites.
flank_bp = 150
# Progress and billing details are printed for every log_every-th gene only.
log_every = 100
# Existing output files are tagged "v3"; keep that exact name for v3_genomes and use the
# full version string otherwise so results from other versions are not mislabelled.
version_tag = "v3" if gnomad_version_region == "v3_genomes" else gnomad_version_region

# SQL template for the SNVs inside one interval. It does not change between genes,
# so it is defined once instead of on every loop iteration.
query_template = """
    SELECT reference_name AS CHROM, 
          start_position AS POS,
          reference_bases AS REF,
          alternate_bases.alt AS ALT,
          AN,
          alternate_bases.AC AS AC,
          alternate_bases.AF AS AF,
          {VAR_TYPE_COL} AS VTYPE,
    FROM `bigquery-public-data.gnomAD.{GNOMAD_VER}__{CHROM}` AS main_table,
        main_table.alternate_bases AS alternate_bases
    WHERE start_position >= {X} AND start_position <= {Y} AND {VAR_TYPE_COL} = 'snv'
    ORDER BY 1,2
    """

# `chrom` rather than `chr`, so the builtin chr() is not shadowed for the rest of the session.
for chrom in chrs:

  print("chromosome = " + str(chrom))

  # One row per gene: the span from its lowest to its highest polyA-site position
  # on this chromosome, padded by flank_bp on both sides.
  gene_intervals = (
      polyadb_bed[polyadb_bed["chrom"] == chrom]
      .groupby("gene", as_index=False)
      .agg(start=("pas_pos", "min"), end=("pas_pos", "max"))
  )
  gene_intervals["start"] -= flank_bp
  gene_intervals["end"] += flank_bp

  print("n genes = " + str(len(gene_intervals)))

  # NOTE: one query is issued per gene. Intervals of different genes may overlap, in
  # which case the same variant appears more than once in the concatenated output.
  dfs_chr = []
  for i, interval in enumerate(gene_intervals.itertuples(index=False)):

    gene = interval.gene
    # Plain ints keep the printed interval and the formatted SQL free of numpy reprs.
    start, end = int(interval.start), int(interval.end)
    log_this_gene = (i % log_every == 0)

    if log_this_gene:
      print("[" + str(i) + "] Running query for gene = " + str(gene))
      print("[" + str(i) + "]  - interval = " + str([start, end]))

    # variant_type_col is the version-dependent column name chosen together with
    # gnomad_version_region, so the filter stays valid for V2 tables as well.
    query = query_template.format(GNOMAD_VER=gnomad_version_region,
                                  CHROM=chrom,
                                  VAR_TYPE_COL=variant_type_col, X=start, Y=end)

    pas_var_df = run_query(query, verbose=log_this_gene)

    if log_this_gene:
      print("[" + str(i) + "]  - n vars = " + str(len(pas_var_df)))

    dfs_chr.append(pas_var_df)

  # pd.concat raises on an empty list, so skip chromosomes without any annotated gene.
  if not dfs_chr:
    print("no genes found for " + str(chrom) + "; nothing written")
    continue

  final_df_chr = pd.concat(dfs_chr).reset_index(drop=True)

  #Save chromosome dataframe (tab-separated despite the .csv extension)
  print("n vars (total for chr) = " + str(len(final_df_chr)))

  final_df_chr.to_csv("polyadb_gnomad_" + version_tag + "_" + str(chrom) + ".csv", sep='\t', index=False)


chromosome = chr11
n genes = 979
[0] Running query for gene = AAMDC
[0]  - interval = [77905849, 77906149]


Downloading:   0%|          | 0/40 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[0]  - n vars = 40
[100] Running query for gene = C11orf1
[100]  - interval = [111883897, 111886106]


Downloading:   0%|          | 0/515 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[100]  - n vars = 515
[200] Running query for gene = CNTN5
[200]  - interval = [100358705, 100359005]


Downloading:   0%|          | 0/53 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[200]  - n vars = 53
[300] Running query for gene = FAM111B
[300]  - interval = [59124172, 59130941]


Downloading:   0%|          | 0/1224 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[300]  - n vars = 1224
[400] Running query for gene = IFT46
[400]  - interval = [118541451, 118544898]


Downloading:   0%|          | 0/734 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[400]  - n vars = 734
[500] Running query for gene = MICAL2
[500]  - interval = [12263613, 12273469]


Downloading:   0%|          | 0/2005 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[500]  - n vars = 2005
[600] Running query for gene = PACS1
[600]  - interval = [66244569, 66256886]


Downloading:   0%|          | 0/2461 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[600]  - n vars = 2461
[700] Running query for gene = RCE1
[700]  - interval = [66846352, 66846652]


Downloading:   0%|          | 0/86 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[700]  - n vars = 86
[800] Running query for gene = SPA17
[800]  - interval = [124694398, 124698440]


Downloading:   0%|          | 0/695 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[800]  - n vars = 695
[900] Running query for gene = TRIM44
[900]  - interval = [35806669, 35822988]


Downloading:   0%|          | 0/2843 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[900]  - n vars = 2843
n vars (total for chr) = 635392
chromosome = chr12
n genes = 895
[0] Running query for gene = A2M
[0]  - interval = [9067292, 9067883]


Downloading:   0%|          | 0/86 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[0]  - n vars = 86
[100] Running query for gene = C12orf43
[100]  - interval = [120999733, 121004023]


Downloading:   0%|          | 0/890 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[100]  - n vars = 890
[200] Running query for gene = CSRP2
[200]  - interval = [76858582, 76858882]


Downloading:   0%|          | 0/49 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[200]  - n vars = 49
[300] Running query for gene = GCN1
[300]  - interval = [120127088, 120127936]


Downloading:   0%|          | 0/156 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[300]  - n vars = 156
[400] Running query for gene = KLRC4-KLRK1
[400]  - interval = [10361468, 10372529]


Downloading:   0%|          | 0/2283 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[400]  - n vars = 2283
[500] Running query for gene = MYL6B
[500]  - interval = [56157807, 56158107]


Downloading:   0%|          | 0/82 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[500]  - n vars = 82
[600] Running query for gene = PPFIA2
[600]  - interval = [81258137, 81259606]


Downloading:   0%|          | 0/250 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[600]  - n vars = 250
[700] Running query for gene = SELPLG
[700]  - interval = [108620282, 108622447]


Downloading:   0%|          | 0/415 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[700]  - n vars = 415
[800] Running query for gene = TMEM106C
[800]  - interval = [47968276, 47970002]


Downloading:   0%|          | 0/307 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[800]  - n vars = 307
n vars (total for chr) = 636331
chromosome = chr13
n genes = 282
[0] Running query for gene = ABCC4
[0]  - interval = [95019708, 95020625]


Downloading:   0%|          | 0/145 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[0]  - n vars = 145
[100] Running query for gene = HSPH1
[100]  - interval = [31131691, 31137141]


Downloading:   0%|          | 0/1091 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[100]  - n vars = 1091
[200] Running query for gene = RPL21P28
[200]  - interval = [27256392, 27261006]


Downloading:   0%|          | 0/1079 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[200]  - n vars = 1079
n vars (total for chr) = 183621
chromosome = chr14
n genes = 523
[0] Running query for gene = ABCD4
[0]  - interval = [74285145, 74286280]


Downloading:   0%|          | 0/209 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[0]  - n vars = 209
[100] Running query for gene = CKB
[100]  - interval = [103519540, 103520092]


Downloading:   0%|          | 0/108 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[100]  - n vars = 108
[200] Running query for gene = GSTZ1
[200]  - interval = [77331310, 77338965]


Downloading:   0%|          | 0/1415 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[200]  - n vars = 1415
[300] Running query for gene = NUMB
[300]  - interval = [73265664, 73276725]


Downloading:   0%|          | 0/2058 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[300]  - n vars = 2058
[400] Running query for gene = SETD3
[400]  - interval = [99394939, 99398703]


Downloading:   0%|          | 0/750 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[400]  - n vars = 750
[500] Running query for gene = VTI1B
[500]  - interval = [67644128, 67651463]


Downloading:   0%|          | 0/1180 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[500]  - n vars = 1180
n vars (total for chr) = 379346
chromosome = chr15
n genes = 531
[0] Running query for gene = AAGAB
[0]  - interval = [67182148, 67202947]


Downloading:   0%|          | 0/4045 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[0]  - n vars = 4045
[100] Running query for gene = COMMD4
[100]  - interval = [75342616, 75345418]


Downloading:   0%|          | 0/551 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[100]  - n vars = 551
[200] Running query for gene = HCN4
[200]  - interval = [73319727, 73322488]


Downloading:   0%|          | 0/499 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[200]  - n vars = 499
[300] Running query for gene = MYO5C
[300]  - interval = [52192190, 52193097]


Downloading:   0%|          | 0/188 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[300]  - n vars = 188
[400] Running query for gene = SCAPER
[400]  - interval = [76347775, 76348465]


Downloading:   0%|          | 0/106 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[400]  - n vars = 106
[500] Running query for gene = ULK3
[500]  - interval = [74835985, 74836430]


Downloading:   0%|          | 0/86 [00:00<?, ?rows/s]

This query processed 0.009765625 GB of data which is 0.001% of your 1 TB monthly free quota.
[500]  - n vars = 86
n vars (total for chr) = 429147


In [None]:
# Change the working directory (no target given; the recorded output shows it ends up in /root).
%cd

/root


In [None]:
# List the files in the current working directory.
!ls

polyadb_gnomad_v3_chr22.csv


In [None]:
# Show the gnomAD version string currently held in gnomad_version_region.
gnomad_version_region

'v3_genomes'