In [None]:
#Load Libraries and authorize with BigQuery
library(bigrquery)
library(tidyverse)
library(survival)
library(ggfortify)
# Google Cloud project that the BigQuery jobs below are run under.
project <- "isb-cgc-outreach"

# Authenticate bigrquery with the credentials stored in the local key file.
bigrquery::bq_auth(path = "~/key-file")

# Survival Analysis and BigQuery

This notebook demonstrates how to use BigQuery to gather data for a survival analysis. We will use the mutation status of the BRCA1 and BRCA2 genes to compare survival between patients with a BRCA mutation and patients without one.



# Gather Data

## Retrieve Patient BRCA Mutation Status from BigQuery
The first component of our data set is the status of the BRCA mutation for the patient. 

Table: *isb-cgc-bq.TCGA.somatic_mutation_hg38_gdc_current*

In [None]:
# SQL that labels every TCGA-BRCA case by BRCA mutation status.
# The CTE `t` pulls (case_id, Hugo_Symbol) for all somatic mutation rows of
# project TCGA-BRCA; the outer SELECT maps each mutated gene to 'brca1',
# 'brca2' or 'none' and returns the DISTINCT (case_id, brca) pairs.
# Because the label is computed per mutation row, a single case_id can come
# back on several rows (e.g. both 'brca1' and 'none').
cohort_query <- "WITH t AS (
            SELECT case_id, Hugo_Symbol
            FROM `isb-cgc-bq.TCGA.somatic_mutation_hg38_gdc_current`
            WHERE
              project_short_name = 'TCGA-BRCA')
            SELECT DISTINCT case_id,
              CASE
                WHEN Hugo_Symbol = 'BRCA1' THEN 'brca1'
                WHEN Hugo_Symbol = 'BRCA2' THEN 'brca2'
                ELSE 'none'
              END
              AS brca
            FROM t
            ORDER BY brca"

In [None]:
# Submit the cohort query and download its result table into a data frame
# in a single pipeline.
cohort <- bq_project_query(project, cohort_query, quiet = TRUE) %>%
  bq_table_download(quiet = TRUE)

# Print a per-column summary of the downloaded cohort.
summary(cohort)

## Retrieve Clinical Data From BigQuery

The other important component of our data set is the patient's vital status, together with either the days to death or the days to last follow-up.

Table: *isb-cgc-bq.TCGA.clinical_gdc_current*

In [None]:
# Build the clinical-data query for the cases found in the cohort.
# `cohort` holds one row per (case_id, brca) pair, so the same case_id can
# appear several times; unique() keeps the generated IN (...) list free of
# repeated ids without changing which rows the query matches.
# Rows without a vital status are excluded by the query itself.
survival_query <- str_c("
  SELECT 
    case_id,
    demo__vital_status,
    demo__days_to_death,
    diag__days_to_last_follow_up
  FROM `isb-cgc-bq.TCGA.clinical_gdc_current`
  WHERE
    case_id IN ('", str_c(unique(cohort$case_id), collapse = "', '"), "') AND
    demo__vital_status IS NOT NULL")

In [None]:
# Run the clinical query and download the result into a data frame.
survival_request <- bq_project_query(project, survival_query)
survival_data <- bq_table_download(survival_request)

# Attach the BRCA label(s) to each clinical row. The join column must be
# passed as `by`; left_join() has no `key` argument.
# A case with several labels in `cohort` yields several rows here.
survival <- left_join(survival_data, cohort, by = "case_id")

# Clean Data

We want to remove duplicates and rows with missing data, and to combine days to death and days to last follow-up into a single time column.

In [None]:
# Build a single time column: for patients who are alive and have no days to
# death recorded, use days to last follow-up instead.
# which() turns the mask into row positions, so an empty data frame or an NA
# vital status is simply skipped rather than breaking a row-by-row `if`.
needs_follow_up <- which(
  survival$demo__vital_status == "Alive" &
    is.na(survival$demo__days_to_death)
)
survival$demo__days_to_death[needs_follow_up] <-
  survival$diag__days_to_last_follow_up[needs_follow_up]

# Keep one row per case, preferring a mutation label: sorting on brca places
# "brca1" and "brca2" ahead of "none", and distinct() keeps the first row
# seen for each case_id.
survival <- survival %>%
  arrange(brca) %>%
  distinct(case_id, .keep_all = TRUE)

# Drop rows without a usable time: missing values (dead with no days to
# death, or alive with no follow-up) and times below one day, which also
# removes negative values. Missing values are tested with is.na(); comparing
# against the string "NA" never matches a real NA.
survival <- filter(
  survival,
  !is.na(demo__days_to_death),
  demo__days_to_death >= 1
)

# Convert the vital status to numbers: 0 for "Alive", 1 for any other status.
survival$demo__vital_status <- ifelse(
  survival$demo__vital_status == "Alive", 0, 1
)

# Analyze Data

Finally, we can create the survival analysis and plot the results.

In [None]:
# Fit one survival curve per BRCA group and draw the curves with ggfortify's
# autoplot(); the legend title is hidden and the axes are labelled by hand.
autoplot(
  survfit(
    Surv(demo__days_to_death, demo__vital_status) ~ brca,
    data = survival
  )
) +
  ggtitle("Survival Curve") +
  xlab("Days") +
  ylab("Percent Survival") +
  theme(legend.title = element_blank())

In [None]:
# Analyze the differences between groups with a Log-Rank Test.
# Uses the same time/status columns and the same brca grouping as the
# survival curve plot, so the test refers to the curves drawn there.
# The result is printed as the cell output.
survdiff(Surv(demo__days_to_death, demo__vital_status) ~ brca, data = survival)

While the curves appear to differ in the plot, the log-rank test in `survdiff` does not show a statistically significant difference between them, so this analysis provides no evidence that outcome differs based on the mutation of either BRCA gene.