### Do the imports

In [1]:
from google.colab import auth
from google.cloud import bigquery
from google.auth import default
import pytz
import os
from datetime import date, datetime
import gspread
from google.colab import data_table
import matplotlib.pyplot as plt
from collections import defaultdict
import plotly.graph_objects as go
from IPython.display import display



### Setup and a Useful Function:

In [2]:
# Turn on Colab's data_table formatter for DataFrames
# (presumably makes DataFrame outputs render as interactive tables -- confirm in the Colab docs).
data_table.enable_dataframe_formatter()

def run_query(bq_client, query):
  """Run a SQL string through the given client and return the result as a DataFrame.

  Args:
    bq_client: object exposing query(sql); the returned job must offer to_dataframe().
    query: SQL text to execute.

  Returns:
    The value of to_dataframe() on the submitted query job.
  """
  return bq_client.query(query).to_dataframe()


### Specify the project ID to be charged and create a BQ Client:

In [3]:
# Google Cloud project that the BigQuery client is created for; change this to your own project.
google_project_to_charge = "new-june2018-project"

# Shared client used by every query in the cells below.
bq_client = bigquery.Client(project=google_project_to_charge)


### Authenticate using Google Colab auth package:

In [4]:
# Interactive Colab sign-in, tied to the project selected above.
auth.authenticate_user(project_id=google_project_to_charge)
# google.auth.default() yields a two-element tuple; only the first element is kept.
# NOTE(review): `creds` is not referenced again in the visible cells -- confirm it is still needed.
creds, _ = default()

### Show Available TCGA Projects

This table shows the datasets you can select from:

In [5]:
# List every row of the projects table, sorted by disease code and then project key.
project_query = '''
  SELECT * FROM `isb-cgc-bq.ISB_Regulome_Explorer.projects_tcga`
  ORDER BY disease_code, key
'''
# `df` holds the project listing; the dataset-selection cell below indexes into it by row number.
df = run_query(bq_client, project_query)
# Last expression of the cell: shows the listing as a paged table (50 rows per page) with the row index.
data_table.DataTable(df, include_index=True, num_rows_per_page=50)


Unnamed: 0,disease_code,description,key
0,ACC,TCGA Adrenocortical Carcinoma (ACC) Manuscript,acc_11aug15
1,BLCA,TCGA Bladder Cancer (BLCA) Manuscript,blca_20may13_manuscript_tumor_only
2,BRCA,TCGA TCGA Breast Invasive Carcinoma (BRCA) Feb...,brca_03feb13
3,BRCA,TCGA Breast Invasive Carcinoma (BRCA) Manuscript,brca_manuscript
4,COADREAD,"TCGA Colorectal Adenocarcinoma (COAD, READ) Ma...",coadread_12apr12
5,ESCA_STAD,TCGA Gastroesophageal Cancer (ESCA) Manuscript,esca_stad_08feb16
6,GBM,TCGA Glioblastoma Multiforme (GBM) February 20...,gbm_06feb14_all
7,GBM,TCGA Glioblastoma Multiforme (GBM) February 20...,gbm_06feb14_classical
8,GBM,TCGA Glioblastoma Multiforme (GBM) February 20...,gbm_06feb14_mesenchymal
9,GBM,TCGA Glioblastoma Multiforme (GBM) February 20...,gbm_06feb14_neural


### Select the Project Number (0-24) From the Above Table You Wish to Explore

The BQ tables are clustered by project key!

In [6]:
# Row number (index column of the project table above) of the dataset to explore.
dataset_index = 15

# Positional lookup of that row's project key; every later query filters on this value.
key = df['key'].iloc[dataset_index]

### Specify the two feature types you want to compare:

In [7]:
#
# Feature type -> four-letter code:
#
#  "Clinical": "CLIN",
#  "Somatic Copy Number": "CNVR",
#  "Gene Expression": "GEXP",
#  "Somatic Mutation": "GNAB",
#  "DNA Methylation": "METH",
#  "MicroRNA Expression": "MIRN",
#  "Protein Level - RPPA": "RPPA",
#
# Each value below is substituted into the analysis query as the `source`
# filter on the features table: feature_1 constrains the first member of each
# association pair, feature_2 the second.

feature_1 = "GEXP"
feature_2 = "GEXP"

### Set the parameters for what associations you want to look at, and the maximum number of results:


In [23]:
# Inclusive lower bound applied to the `logged_pvalue_bonf` column
# (presumably a -log10 Bonferroni-corrected p-value, so larger is more significant -- confirm).
min_logged_pvalue_bonf = 20
# Inclusive correlation window: lower <= correlation <= upper.
lower_correlation_value = 0.65
upper_correlation_value = 0.75
# Used as the LIMIT of the analysis query, i.e. the maximum number of association rows returned.
max_results = 200

### Run the analysis:

In [24]:
# Query layout:
#   a1 -- ids of features whose source is feature_1 in the chosen dataset
#   b1 -- ids of features whose source is feature_2 in the chosen dataset
#   c1 -- associations of the chosen dataset inside the correlation window and
#         at or above the logged_pvalue_bonf threshold
# The two inner joins keep only associations whose first feature id is in a1 and
# whose second feature id is in b1. Rows are ordered by descending
# ABS(correlation) and capped at max_results.
analysis_query = '''
    WITH a1 as (SELECT id FROM `isb-cgc-bq.ISB_Regulome_Explorer.features_tcga`
                  WHERE source = "{feature_1}" AND dataset = "{dataset}"),
         b1 as (SELECT id FROM `isb-cgc-bq.ISB_Regulome_Explorer.features_tcga`
                  WHERE source = "{feature_2}" AND dataset = "{dataset}"),
         c1 as (SELECT * from `isb-cgc-bq.ISB_Regulome_Explorer.feature_associations_tcga`
                  WHERE (correlation >= {low_corr}) AND (correlation <= {high_corr})
                  AND logged_pvalue_bonf >= {pval}
                  AND dataset = "{dataset}")
    SELECT feature_1_id, feature_1, feature_2_id, feature_2, distance, logged_pvalue_bonf, logged_pvalue, correlation, num_samples,	dataset FROM c1
      INNER JOIN a1 on a1.id = c1.feature_1_id
      INNER JOIN b1 on b1.id = c1.feature_2_id
      ORDER BY ABS(correlation) desc LIMIT {lim}
    '''.format(feature_1=feature_1, feature_2=feature_2, dataset=key, low_corr=lower_correlation_value,
               high_corr=upper_correlation_value, pval=min_logged_pvalue_bonf, lim=max_results)
# NOTE(review): this rebinds `df`, which previously held the project listing.
# Re-running the dataset-selection cell (df['key']) after this cell will fail
# until the project-listing cell is run again.
df = run_query(bq_client, analysis_query)
# Last expression of the cell: display the association table.
df


Unnamed: 0,feature_1_id,feature_1,feature_2_id,feature_2,distance,logged_pvalue_bonf,logged_pvalue,correlation,num_samples,dataset
0,18803,N:GEXP:IGFBP7:chr4:57897237:57976551:-:3490,38043,N:GEXP:THY1:chr11:119288655:119294246:-:7070,,27.1,35.2,0.75,193,lihc_11oct16
1,18768,N:GEXP:IFI16:chr1:158969758:159024945:+:3428,36777,N:GEXP:PTPLAD2:chr9:21003623:21031635:-:401494,,26.7,34.8,0.75,193,lihc_11oct16
2,18494,N:GEXP:HDAC7:chr12:48176507:48213763:-:51564,18634,N:GEXP:HOMER3:chr19:19017769:19052041:-:9454,,26.9,35.0,0.75,193,lihc_11oct16
3,17148,N:GEXP:DENND3:chr8:142138720:142205906:+:22898,18462,N:GEXP:HAPLN3:chr15:89420519:89438770:-:145864,,27.0,35.1,0.75,193,lihc_11oct16
4,35669,N:GEXP:MLL3:chr7:151832010:152133090:-:58508,37060,N:GEXP:RNF168:chr3:196195657:196230639:-:165918,,27.1,35.3,0.75,193,lihc_11oct16
...,...,...,...,...,...,...,...,...,...,...
195,19218,N:GEXP:KLHL28:chr14:45393527:45431179:-:54813,21567,N:GEXP:REST:chr4:57774042:57802010:+:5978,,26.8,35.0,0.75,193,lihc_11oct16
196,33154,N:GEXP:CDK12:chr17:37617739:37690800:+:51755,38797,N:GEXP:ZFR:chr5:32354456:32444844:-:51663,,27.1,35.2,0.75,193,lihc_11oct16
197,20007,N:GEXP:MRC2:chr17:60704762:60770952:+:9902,20795,N:GEXP:PDZD4:chrX:153067621:153096003:-:57595,,27.3,35.4,0.75,193,lihc_11oct16
198,16490,N:GEXP:CDC7:chr1:91966404:91991321:+:8317,33181,N:GEXP:CENPI:chrX:100353178:100417978:+:2491,,26.9,35.0,0.75,193,lihc_11oct16


### Select the Association Row Above You Want to Plot:


In [25]:
# Positional row of the association table above to plot; it must be smaller
# than the number of rows the analysis query returned.
plot_row = 118


### Pull the Plot Points Out:

Note that we are using the ":"-separated lists of data values from the features table, which is convenient for plotting. For numerical analysis, use the tuple tables instead.

In [26]:
def pull_plot_points(bq_client, dataset, id):
  """Fetch the patient_values entry of one feature in one dataset.

  Args:
    bq_client: client passed straight through to run_query.
    dataset: dataset key; inserted into the WHERE clause as a quoted string.
    id: feature id; inserted into the WHERE clause unquoted.

  Returns:
    The result of run_query for the generated SQL (a single selected
    column, patient_values).
  """
  template = '''
    SELECT patient_values FROM `isb-cgc-bq.ISB_Regulome_Explorer.features_tcga`
      WHERE dataset = "{dataset}" AND id = {id}
    '''
  return run_query(bq_client, template.format(dataset=dataset, id=id))

def pull_patient_barcodes(bq_client, dataset):
  """Fetch the barcodes entry stored for one dataset.

  Args:
    bq_client: client passed straight through to run_query.
    dataset: dataset key; inserted into the WHERE clause as a quoted string.

  Returns:
    The result of run_query for the generated SQL (a single selected
    column, barcodes).
  """
  template = '''
    SELECT barcodes FROM `isb-cgc-bq.ISB_Regulome_Explorer.barcodes_tcga`
      WHERE dataset = "{dataset}"
    '''
  return run_query(bq_client, template.format(dataset=dataset))

def _first_value(frame, column, description):
  """Return the first entry of `column`, failing with a clear message if the frame has no rows."""
  values = frame[column].tolist()
  if not values:
    raise IndexError("Query returned no rows for {}".format(description))
  return values[0]


# Fail early, with a readable message, when plot_row points past the end of
# the association table (e.g. after tightening the query parameters).
# Negative values keep their usual "count from the end" meaning.
if not -len(df) <= plot_row < len(df):
  raise IndexError(
      "plot_row={} is out of range: the association table has {} rows".format(plot_row, len(df)))

# Ids, labels and dataset of the selected association row.
fea_1 = df['feature_1_id'].tolist()[plot_row]
label1 = df['feature_1'].tolist()[plot_row]
the_set = df['dataset'].tolist()[plot_row]
fea_2 = df['feature_2_id'].tolist()[plot_row]
label2 = df['feature_2'].tolist()[plot_row]

df2 = pull_plot_points(bq_client, the_set, fea_1)
points_x = _first_value(df2, 'patient_values', 'feature {} in dataset {}'.format(fea_1, the_set))

df3 = pull_plot_points(bq_client, the_set, fea_2)
points_y = _first_value(df3, 'patient_values', 'feature {} in dataset {}'.format(fea_2, the_set))

# Use the dataset recorded on the selected row (not the notebook-level `key`),
# so the barcodes always belong to the same dataset as the two value lists even
# if `key` was changed after the analysis query ran.
dfp = pull_patient_barcodes(bq_client, the_set)
barcodes = _first_value(dfp, 'barcodes', 'barcodes of dataset {}'.format(the_set))

# Splitting on the ":" into arrays of values:
point_x_raw_array = points_x.split(':')
point_y_raw_array = points_y.split(':')
barcodes_array = barcodes.split(':')

# The three lists are matched up by position. A list shorter than the x values
# means some points have no partner value or barcode, so stop here instead of
# failing part-way through the loop. Entries beyond the x length are ignored.
num_points = len(point_x_raw_array)
if len(point_y_raw_array) < num_points or len(barcodes_array) < num_points:
  raise ValueError(
      "Mismatched lengths: {} x values, {} y values, {} barcodes".format(
          num_points, len(point_y_raw_array), len(barcodes_array)))

# Only want to include points that are not "NA"!
x_num = []
y_num = []
bar_nums = []
for x_raw, y_raw, barcode in zip(point_x_raw_array, point_y_raw_array, barcodes_array):
  if x_raw != "NA" and y_raw != "NA":
    x_num.append(float(x_raw))
    y_num.append(float(y_raw))
    bar_nums.append(barcode)


### Plot the Association!

With the data points pulled out, do the plot. <br>
**NOTE: If two points are on top of each other, only one will show up in the hover tooltip!**

In [27]:
# Axis titles are the full feature labels of the selected association row.
layout = go.Layout(
    xaxis=go.layout.XAxis(title=label1),
    yaxis=go.layout.YAxis(title=label2),
)

# Faint blue fill with a purple outline.
marker_style = dict(
    symbol='circle',
    color='rgba(135, 206, 250, 0.1)',
    size=8,
    line=dict(color='MediumPurple', width=2),
)

# customdata carries one barcode per point so the hover text can show it;
# see https://stackoverflow.com/questions/69278251/plotly-including-additional-data-in-hovertemplate
scatter_trace = go.Scatter(
    x=x_num,
    y=y_num,
    mode='markers',
    customdata=bar_nums,
    hovertemplate='patient ID: %{customdata}',
    marker=marker_style,
)

fig = go.Figure(data=scatter_trace, layout=layout)
fig.show()