In [None]:
# bigrquery is the BigQuery client used for every query in this notebook.
library(bigrquery)
# repr is loaded for the repr.plot.* options set before the plots below.
library(repr)
# Authenticate using the credentials file at ~/key-file.
bigrquery::bq_auth(path = "~/key-file")

## Clinical and cohort queries
How old are the patients for each case in a given project?

In [None]:
# Select the case barcode, age at index and vital status of every case in the
# TCGA-BRCA project from the GDC clinical table.
case_sql <- "select
  submitter_id case_barcode,
  demo__age_at_index age,
  demo__vital_status vital
from `isb-cgc-bq.TCGA.clinical_gdc_current`
where proj__project_id = 'TCGA-BRCA'
"
# NOTE(review): bq_project_query() is documented as taking the billing project
# as its first argument; none is passed here -- confirm this call runs as-is.
case_query <- bq_project_query(query=case_sql)
# Download the result and convert it to a plain data.frame; later cells index
# it positionally with [, j].
case_df <- as.data.frame(bq_table_download(case_query))
# Preview the first rows.
head(case_df)

In [None]:
# Histogram of age at index over all downloaded cases.
options(repr.plot.width = 7, repr.plot.height = 4)

# Use the `age` alias assigned in the clinical query instead of the column
# position, so the plot does not silently break if the select list changes.
ages <- case_df$age

# Number of distinct ages; half of it is passed to hist() as the suggested
# number of bins. `case_num` is reused by the Alive/Dead histogram cell.
case_num <- length(unique(ages))
hist(ages, breaks = case_num / 2, main = "age at index", xlab = "age (years)")

In [None]:
# Overlay the age distributions of living and deceased patients.
# Columns are addressed by the aliases from the clinical query (`age`,
# `vital`). which() drops cases whose vital status is NA instead of turning
# them into NA ages.
alive_ages <- case_df$age[which(case_df$vital == "Alive")]
dead_ages <- case_df$age[which(case_df$vital == "Dead")]

# `case_num` comes from the previous histogram cell (number of distinct ages).
hist(alive_ages,
     breaks = case_num / 2, xlim = c(25, 90),
     col = "steelblue1", main = "", xlab = "age (years)")
# add = TRUE draws on top of the existing histogram.
hist(dead_ages,
     breaks = case_num / 2, add = TRUE, col = "orange")
legend("topright", bty = "n", legend = c("Alive", "Dead"),
       fill = c("steelblue1", "orange"))

## Gene expression
How can we retrieve the RNA-seq expression (FPKM-UQ) of a specific gene for all cases, across all available tissue types?

Table: *isb-cgc-bq.TCGA.RNAseq_hg38_gdc_current*

In [None]:
# Select the FPKM-UQ expression value of CCNB1 for every TCGA-BRCA aliquot,
# together with its case barcode and sample type.
expr_sql <- "select
  case_barcode,
  aliquot_barcode,
  sample_type_name,
  HTSeq__FPKM_UQ
from `isb-cgc-bq.TCGA.RNAseq_hg38_gdc_current`
where project_short_name = 'TCGA-BRCA'
and gene_name = 'CCNB1'"

# NOTE(review): bq_project_query() is documented as taking the billing project
# as its first argument; none is passed here -- confirm this call runs as-is.
expr_query <- bq_project_query(query=expr_sql)
# Unlike case_df above, the download result is not converted with
# as.data.frame() here.
expr_df <- bq_table_download(expr_query)
# Preview the first rows.
head(expr_df)

In [None]:
# Compare CCNB1 expression between sample types.
# Subset the expression vector by named column rather than indexing rows of
# the downloaded table with a one-column comparison result; this yields plain
# numeric vectors directly and does not depend on the column order.
fpkm <- expr_df$HTSeq__FPKM_UQ
sample_type <- expr_df$sample_type_name

# which() ignores rows whose sample type is NA.
tumor <- fpkm[which(sample_type == "Primary Tumor")]
normal <- fpkm[which(sample_type == "Solid Tissue Normal")]
metastatic <- fpkm[which(sample_type == "Metastatic")]

boxplot(list(normal, tumor, metastatic),
        names = c("normal", "tumor", "metastatic"))

We can also generate summary statistics directly in BigQuery.

In [None]:
# Compute mean, standard deviation, maximum and minimum of the CCNB1 FPKM-UQ
# values per sample type inside BigQuery, for the TCGA-BRCA project.
summ_sql <- "select
  sample_type_name,
  avg(HTSeq__FPKM_UQ) avg_fpkm,
  stddev(HTSeq__FPKM_UQ) sdv_fpkm,
  max(HTSeq__FPKM_UQ) max_fpkm,
  min(HTSeq__FPKM_UQ) min_fpkm
from `isb-cgc-bq.TCGA.RNAseq_hg38_gdc_current`
where project_short_name = 'TCGA-BRCA'
and gene_name = 'CCNB1'
group by sample_type_name"

# NOTE(review): bq_project_query() is documented as taking the billing project
# as its first argument; none is passed here -- confirm this call runs as-is.
summ_query <- bq_project_query(query=summ_sql)
summ_df <- bq_table_download(summ_query)
# Preview the first rows (one row per sample type).
head(summ_df)

## Genome annotation
We can retrieve GENCODE annotation data for specific genes from BigQuery tables.

Table: *isb-cgc-bq.GENCODE_versioned.annotation_gtf_hg38_v22*

In [None]:
# Fetch every annotation row (all columns) for transcript CCNB1-001 of gene
# CCNB1 from the GENCODE v22 table.
annot_sql <- "select * from `isb-cgc-bq.GENCODE_versioned.annotation_gtf_hg38_v22` where gene_name = 'CCNB1' and transcript_name = 'CCNB1-001'"

# NOTE(review): bq_project_query() is documented as taking the billing project
# as its first argument; none is passed here -- confirm this call runs as-is.
annot_query <- bq_project_query(query=annot_sql)
# annot_df is reused by the mutation plot cell, which filters it on `feature`.
annot_df <- bq_table_download(annot_query)
# Preview the first rows.
head(annot_df)

## Mutation data
Given the gene annotation data above, we can query for mutations within the gene.

Table: *isb-cgc-bq.TCGA.somatic_mutation_hg38_gdc_current*

In [None]:
# Select somatic mutations on chr5 that start after 69167135 and end before
# 69177358 (both bounds exclusive).
# NOTE(review): the bounds are presumably the CCNB1 gene coordinates taken
# from the annotation query -- confirm.
mut_sql <- "select case_barcode, 
    Chromosome, 
    Start_Position, 
    End_Position, 
    Reference_Allele, 
    Tumor_Seq_Allele1, 
    Tumor_Seq_Allele2
from `isb-cgc-bq.TCGA.somatic_mutation_hg38_gdc_current`
where Chromosome = 'chr5'
    and Start_Position > 69167135
    and End_Position < 69177358"

# NOTE(review): bq_project_query() is documented as taking the billing project
# as its first argument; none is passed here -- confirm this call runs as-is.
mut_query <- bq_project_query(query=mut_sql)
# mut_df is reused by the plotting cell that follows.
mut_df <- bq_table_download(mut_query)
# Preview the first rows.
head(mut_df)

In [None]:
# Plot the number of mutations observed at each start position as vertical
# bars, with the exons of the annotated transcript drawn as green boxes below
# the zero line.
options(repr.plot.width = 15, repr.plot.height = 4)

# Fail with a clear message instead of an obscure plotting error when the
# mutation query returned nothing.
if (nrow(mut_df) == 0) {
  stop("mut_df is empty: no mutations to plot", call. = FALSE)
}

# Start_Position is the third column selected by the mutation query; refer to
# it by name rather than by position.
mut_pos <- mut_df$Start_Position
mut_count <- table(mut_pos)
max_count <- max(mut_count)

plot(NA,
     xlim = c(min(mut_pos) - 1000, max(mut_pos) + 1000),
     ylim = c(-0.3, max_count),
     yaxt = "n", bty = "n", main = "",
     xlab = "Chr5 position", ylab = "occurrence count")

# Exon track. Columns 4 and 5 are used as the left/right x coordinates, as in
# the original positional access (presumably the GTF start/end columns --
# confirm). rect() is vectorised, so no row-wise apply() over the data frame
# (which would coerce every value to character) is needed.
exons <- annot_df[which(annot_df$feature == "exon"), ]
if (nrow(exons) > 0) {
  rect(as.numeric(exons[[4]]), -0.2, as.numeric(exons[[5]]), -0.1,
       col = "green")
}

# Tick every integer count up to the observed maximum, matching ylim, instead
# of a fixed 0..3 range.
axis(2, at = seq(0, max_count, by = 1))

# One vertical segment per distinct start position, as tall as its count.
bar_x <- as.numeric(names(mut_count))
segments(bar_x, 0, bar_x, as.vector(mut_count))

In [None]:
# Select mutations in the 100 kb regions flanking the gene interval used in
# the previous mutation query (chr5: 69167135-69177358):
#   upstream:   69067135 < Start_Position < 69167135
#   downstream: 69177358 < End_Position   < 69277358
# Two fixes relative to the earlier version of this query:
#   * the upstream bounds had their comparison operators inverted
#     (Start_Position < 69067135 and Start_Position > 69167135), which can
#     never be true, so that branch matched no rows;
#   * the chromosome filter was 'chr9' although every coordinate here is
#     derived from the chr5 interval queried above.
sql <- "select case_barcode,
    Chromosome,
    Start_Position,
    End_Position,
    Reference_Allele,
    Tumor_Seq_Allele1,
    Tumor_Seq_Allele2
from `isb-cgc-bq.TCGA.somatic_mutation_hg38_gdc_current`
where
    Chromosome = 'chr5'
    and (
        (Start_Position > 69067135 and Start_Position < 69167135)
        or (End_Position > 69177358 and End_Position < 69277358)
        )"

# NOTE(review): bq_project_query() is documented as taking the billing project
# as its first argument; none is passed here -- confirm this call runs as-is.
query <- bq_project_query(query = sql)
df <- bq_table_download(query)
# Preview the first rows.
head(df)

## Table joins
How do we join clinical and mutation data?

Tables:
*   *isb-cgc-bq.TCGA.clinical_gdc_current*
*   *isb-cgc-bq.TCGA.somatic_mutation_hg38_gdc_current*

In [None]:
# Join the clinical table to the somatic mutation table on case identifier
# (clinical submitter_id = mutation case_barcode) and return each TCGA-BRCA
# mutation's position and alleles alongside the patient's vital status.
sql <- "select 
    mut.Chromosome, 
    mut.Start_Position, 
    mut.End_Position, 
    mut.Reference_Allele, 
    mut.Tumor_Seq_Allele1, 
    mut.Tumor_Seq_Allele2, 
    clin.demo__vital_status 
from `isb-cgc-bq.TCGA.clinical_gdc_current` clin
join `isb-cgc-bq.TCGA.somatic_mutation_hg38_gdc_current` mut 
    on clin.submitter_id = mut.case_barcode
where mut.project_short_name = 'TCGA-BRCA'"

# NOTE(review): bq_project_query() is documented as taking the billing project
# as its first argument; none is passed here -- confirm this call runs as-is.
query <- bq_project_query(query=sql)
# Overwrites the `df` produced by the previous cell.
df <- bq_table_download(query)
# Preview the first rows.
head(df)