# BigQuery Exploration

In [None]:
# Attach the bigrquery and repr packages used throughout this notebook.
library(bigrquery)
library(repr)

# Project ID handed to every bq_project_query() call in the query cells.
project <- "isb-cgc-training-001"

# Authenticate with the credentials file stored at ~/key-file.
bigrquery::bq_auth(path = "~/key-file")

## SQL queries
SQL queries are fundamentally similar to subsetting R data frames. The syntax for subsetting an R data frame is as follows:

> *dataframe*[ *conditional*, *column* ]

The equivalent operation in SQL would look like this:

> SELECT *column* FROM *table* WHERE *conditional*



In [None]:
# Subset the clinical table: keep three columns and only the rows whose
# demo__age_at_index lies between 30 and 80 (both bounds inclusive).
case_sql <- "
SELECT
  submitter_id,
  demo__age_at_index,
  demo__vital_status
FROM `isb-cgc-bq.TCGA_versioned.clinical_gdc_r31`
WHERE demo__age_at_index >= 30 AND demo__age_at_index <= 80
"

# Submit the query under `project`, then download the result and coerce it
# to a plain data.frame.
case_query <- bq_project_query(project, query = case_sql)
case_df <- as.data.frame(
  bq_table_download(case_query)
)

# Show the first rows of the result.
head(case_df)

In [None]:
# Aggregate inside BigQuery: average fpkm_uq_unstranded for gene CCNB1,
# computed separately for each sample_type_name.
expr_sql <- "
SELECT
  sample_type_name,
  avg(fpkm_uq_unstranded) avg_fpkm
FROM `isb-cgc-bq.TCGA_versioned.RNAseq_hg38_gdc_r35`
WHERE gene_name = 'CCNB1'
GROUP BY sample_type_name
"

# Submit the query under `project` and download the per-group averages.
expr_query <- bq_project_query(project, query = expr_sql)
expr_df <- bq_table_download(expr_query)

# Show the first rows of the result.
head(expr_df)

In [None]:
# Join data from two different tables: somatic mutation calls (mut) and
# RNA-seq expression (rna), restricted to project TCGA-BRCA.
# Rows are matched on two keys:
#   * rna.sample_barcode  = mut.sample_barcode_tumor
#   * rna.Ensembl_gene_id = mut.Gene
# A bare JOIN is an inner join, so only rows with a match in both tables
# are returned.
sql <- "
SELECT
    rna.case_barcode, rna.sample_type_name, rna.gene_name,
    mut.HGVSc, rna.fpkm_uq_unstranded
FROM `isb-cgc-bq.TCGA_versioned.masked_somatic_mutation_hg38_gdc_r35` mut
JOIN `isb-cgc-bq.TCGA_versioned.RNAseq_hg38_gdc_r35` rna
    ON rna.sample_barcode = mut.sample_barcode_tumor
    AND rna.Ensembl_gene_id = mut.Gene
WHERE mut.project_short_name = 'TCGA-BRCA'"

# Submit the query under `project` and download the joined rows. The SQL has
# no LIMIT, so the full result is downloaded; head() only trims the display.
query <- bq_project_query(project, query = sql)
df <- bq_table_download(query)
head(df)