# libraries and files

In [2]:
library(data.table)
library(GenomicDataCommons)
library(TCGAutils)
library(TCGAbiolinks)

In [3]:
# non-oncogenic gene sets
library(msigdbr)
library(ggplot2)
library(stats)
library(sets)
library(biomaRt)
library(clusterProfiler)
library(data.table)
library(stringi)
library(pheatmap)
library(tidyr)
library(ggpubr)
library(biomaRt)
human <- readRDS("/data/timonaj/biomart_orthologs/human.RDS")

# go
go_gene_sets <- msigdbr(species = "Homo sapiens", category = "C5")
unique_go_genes <- unique(go_gene_sets$gene_symbol)

# kegg
kegg_gene_sets <- msigdbr(species = "Homo sapiens", category = "C2", subcategory = "KEGG")
unique_kegg_genes <- unique(kegg_gene_sets$gene_symbol)

# oncogenic

## hallmark
H_gene_sets <- msigdbr(species = "Homo sapiens", category = "H")
unique_H_genes <- unique(H_gene_sets$gene_symbol)

## C2
c2_gene_sets <- msigdbr(species = "Homo sapiens", category = "C2")
unique_c2_genes <- unique(c2_gene_sets$gene_symbol)


## C4 CM
c4_gene_sets <- msigdbr(species = "Homo sapiens", category = "C4", subcategory = "CM")
unique_c4_genes <- unique(c4_gene_sets$gene_symbol)

## cosmic
cosmic_set <- fread("cancer_gene_census.csv")
unique_cosmic_genes <- unique(cosmic_set$`Gene Symbol`)

# Background set of genes
background_set <- fread("background_set.txt")

pathways <- readRDS("pathways.rds")

Registered S3 method overwritten by 'sets':
  method        from   
  print.element ggplot2


Attaching package: ‘sets’


The following object is masked from ‘package:msigdbr’:

    %>%


The following object is masked from ‘package:magrittr’:

    %>%


The following object is masked from ‘package:data.table’:

    set




Registered S3 method overwritten by 'enrichplot':
  method               from
  fortify.enrichResult DOSE

clusterProfiler v3.14.3  For help: https://guangchuangyu.github.io/software/clusterProfiler

If you use clusterProfiler in published research, please cite:
Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.


Attaching package: ‘tidyr’


The following object is masked from ‘package:sets’:

    %>%


The following object is masked from ‘package:GenomicDataCommons’:

    expand


The following object is masked from ‘pa

# data download

In [4]:
### files to be downloaded

# read in the .txt file for tissue type as well as the larger gene expression file\
ptm <- proc.time()
gene_tpm <- fread(file = "tcga_RSEM_gene_tpm")
gene_tpm_copy <- as.data.frame(gene_tpm)
print("download completed in")
print(proc.time() - ptm)

[1] "download completed in"
   user  system elapsed 
 35.022  11.421  52.976 


In [5]:
# length of the numerical tpm values since the first 2 columns are characters
tpm_length <- 2:length(gene_tpm_copy)

In [6]:
dim(gene_tpm_copy)

In [7]:
head(gene_tpm_copy)

Unnamed: 0_level_0,sample,TCGA-19-1787-01,TCGA-S9-A7J2-01,TCGA-G3-A3CH-11,TCGA-EK-A2RE-01,TCGA-44-6778-01,TCGA-F4-6854-01,TCGA-AB-2863-03,TCGA-C8-A1HL-01,TCGA-EW-A2FS-01,⋯,TCGA-DJ-A2QC-01,TCGA-A8-A09K-01,TCGA-61-1907-01,TCGA-IB-7885-01,TCGA-95-7947-01,TCGA-VQ-AA6F-01,TCGA-BR-8588-01,TCGA-24-2254-01,TCGA-DD-A115-01,TCGA-FV-A3I0-11
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,ENSG00000242268.2,-9.9658,0.2998,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,⋯,-1.3921,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-1.9379,-9.9658,-9.9658
2,ENSG00000259041.1,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,⋯,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658
3,ENSG00000270112.3,-3.816,-3.0469,-9.9658,-9.9658,-5.5735,-9.9658,-3.458,-9.9658,-5.0116,⋯,-6.5064,-9.9658,-9.9658,-9.9658,-6.5064,-9.9658,-9.9658,-5.5735,-9.9658,-9.9658
4,ENSG00000167578.16,5.2998,4.8881,3.5572,4.2563,5.3162,4.5161,3.6242,4.9782,5.7035,⋯,5.4591,5.5364,4.7798,5.0514,6.1607,5.2814,3.9599,5.076,4.026,3.0876
5,ENSG00000278814.1,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,⋯,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658
6,ENSG00000078237.5,3.5086,2.3019,0.099,3.6184,3.5633,2.9109,2.1638,2.6624,1.2696,⋯,1.5998,2.8219,2.5876,2.6232,2.9525,3.0774,3.4384,2.2082,2.1541,-0.5756


# tcga query

In [8]:
tcga_projects <- TCGAbiolinks:::getGDCprojects()$project_id[grep('TCGA',TCGAbiolinks:::getGDCprojects()$project_id)]
query<-GDCquery(tcga_projects,
         data.category = "Transcriptome Profiling",
         data.type = "Gene Expression Quantification")

--------------------------------------

o GDCquery: Searching in GDC database

--------------------------------------

Genome of reference: hg38

--------------------------------------------

oo Accessing GDC. This might take a while...

--------------------------------------------

ooo Project: TCGA-BRCA

ooo Project: TCGA-MESO

ooo Project: TCGA-CESC

ooo Project: TCGA-BLCA

ooo Project: TCGA-CHOL

ooo Project: TCGA-ACC

ooo Project: TCGA-DLBC

ooo Project: TCGA-HNSC

ooo Project: TCGA-COAD

ooo Project: TCGA-ESCA

ooo Project: TCGA-LAML

ooo Project: TCGA-KIRP

ooo Project: TCGA-KIRC

ooo Project: TCGA-GBM

ooo Project: TCGA-KICH

ooo Project: TCGA-READ

ooo Project: TCGA-PAAD

ooo Project: TCGA-LUAD

ooo Project: TCGA-OV

ooo Project: TCGA-LIHC

ooo Project: TCGA-LUSC

ooo Project: TCGA-LGG

ooo Project: TCGA-UCEC

ooo Project: TCGA-PRAD

ooo Project: TCGA-STAD

ooo Project: TCGA-THCA

ooo Project: TCGA-THYM

ooo Project: TCGA-UCS

ooo Project: TCGA-UVM

ooo Project: TCGA-PCPG

ooo

In [9]:
TCGAbiolinks:::getGDCprojects()$project_id[grep('TCGA',TCGAbiolinks:::getGDCprojects()$project_id)]

In [10]:
queryDF<- getResults(query)
project_types <- character(length(gene_tpm_copy)-1)

for(i in tpm_length) {
    sample <- colnames(gene_tpm_copy)[i]
    sample <- substr(sample,1,12)
    project_types[i-1] <- queryDF[queryDF$cases.submitter_id ==  sample,]$project[1]
}

In [11]:
summary(as.factor(project_types))

In [12]:
rownames(gene_tpm_copy)[i]

# functions

In [13]:
#### functions
# function to retrieve the mean zscore for a tissue type
get.most.expressed <- function(tissue_type) {
    # calculate mean z_score
    soi <- colnames(gene_tpm_copy[,tpm_length])[project_types == tissue_type & !is.na(project_types)]

    temp_samples <- gene_tpm_copy[,tpm_length][,soi]
    
    mean_zscore <- rowMeans(temp_samples, na.rm = F)
    
    # extract the columns for the tissues of interest, in this case we want Lung, Liver, Esophagus, Bladder, and Intestine
    tissue_mean_zscore <- data.frame(cbind("gene" = gene_tpm_copy[,1], "mean_Zscore" = mean_zscore))
    
    # select the top 1000 most highly expressed genes
    tissue_most_expressed <- tissue_mean_zscore[with(tissue_mean_zscore, order(mean_Zscore, decreasing = T)),]
    print(head(tissue_most_expressed))
    tissue_most_expressed <- tissue_most_expressed[1:1000,]
    
    current_hgnc <- getLDS(attributes=c("ensembl_gene_id"), filters="ensembl_gene_id",
                             values=gsub("\\..*","", tissue_most_expressed$gene),
                             mart=human,attributesL=c("hgnc_symbol"),
                             martL=human)
    
    return(current_hgnc$HGNC.symbol)
}

compute_enrichment <- function(foreground_genes,all_genes,background_genes=NULL,pathways=NULL) {
    if (is.null(pathways)) { 
        pathways <- load_pathways()
        pathways <- lapply( pathways, function(pathway_genes) {return(pathway_genes[pathway_genes %in% all_genes])})
    }
    fisher_enrichment_dt <- data.table(pathway=names(pathways),p_value=-1,odds_ratio=-1)
    if (is.null(background_genes)) {
        background_genes <- setdiff(all_genes,foreground_genes)
        #background_genes <- unique(unlist(pathways))
    }

    for (pathway in names(pathways)) {
        pathway_genes <- pathways[[pathway]]
        non_pathway_genes <- setdiff(all_genes,pathway_genes)

        num_in_pathway_and_foreground <- intersect(pathway_genes,foreground_genes) %>% length
        num_in_pathway_and_not_foreground <- intersect(pathway_genes,background_genes) %>% length
        num_not_in_pathway_and_foreground <- intersect(non_pathway_genes,foreground_genes) %>% length
        num_not_in_pathway_and_not_foreground <- intersect(non_pathway_genes,background_genes) %>% length
        fisher_mat <- matrix(c(num_in_pathway_and_foreground,num_in_pathway_and_not_foreground,
                              num_not_in_pathway_and_foreground,num_not_in_pathway_and_not_foreground),
                             nrow=2,ncol=2,byrow=T)
        
        test_res <- fisher.test(fisher_mat,alternative="g")
        pathway_ <- pathway
        fisher_enrichment_dt[pathway==pathway_,`:=`(p_value=test_res$p.value, odds_ratio=test_res$estimate,
        num_p_fg=num_in_pathway_and_foreground,num_p_bg=num_in_pathway_and_not_foreground,
        num_not_p_fg=num_not_in_pathway_and_foreground,num_not_p_bg=num_not_in_pathway_and_not_foreground)]
    }
    fisher_enrichment_dt[,q_value:=p.adjust(p_value)]

    return(fisher_enrichment_dt)
}

get_enrichment_data <- function(current_files, current_pathways){
    total_enrichment_pathways <- names(current_pathways)
    pathwaysDF <- data.frame("pathways" = total_enrichment_pathways)
    pathwaysDF_odds <- data.frame("pathways" = total_enrichment_pathways)
    for(i in 1:length(current_files)) {
        spec_exptype <- names(current_files)[i]
        
        current_hgnc <- getLDS(attributes=c("entrezgene_id"), filters="entrezgene_id",
                             values=current_files[[spec_exptype]],
                             mart=human,attributesL=c("hgnc_symbol"),
                             martL=human)
        total_genes <- current_hgnc$HGNC.symbol
        
        
        enrichment_test <- compute_enrichment(foreground_genes = total_genes,
                                               all_genes = background_set$gene,
                                               pathways = current_pathways)
        subset_enrichment_test  <- enrichment_test[,c("q_value")]
        subset_enrichment_test_odds  <- enrichment_test[,c("odds_ratio")]
        rownames(subset_enrichment_test) <-  enrichment_test$pathway
        rownames(subset_enrichment_test_odds) <-  enrichment_test$pathway

        pathwaysDF <- cbind(pathwaysDF, subset_enrichment_test)
        pathwaysDF_odds <- cbind(pathwaysDF_odds, subset_enrichment_test_odds)
        #saveRDS(subset_enrichment_test, file = new_file_name)
    }
    
    all_qvals <- pathwaysDF[,2:length(pathwaysDF)]
    rownames(all_qvals) <- total_enrichment_pathways
    colnames(all_qvals) <- names(current_files)
    
    all_odds <- pathwaysDF_odds[,2:length(pathwaysDF_odds)]
    rownames(all_odds) <- total_enrichment_pathways
    colnames(all_odds) <- names(current_files)
    
    enrichment_heatmap <- pheatmap(as.matrix(log(all_qvals)), fontsize = 8)
    return(list("matrix" = all_qvals, "odds_matrix" = all_odds, "heatmap" = enrichment_heatmap))
}
makeColorRampPalette <- function(colors, cutoff.fraction, num.colors.in.palette)
{
  stopifnot(length(colors) == 4)
  ramp1 <- colorRampPalette(colors[1:2])(num.colors.in.palette * cutoff.fraction)
  ramp2 <- colorRampPalette(colors[3:4])(num.colors.in.palette * (1 - cutoff.fraction))
  return(c(ramp1, ramp2))
}
change_matrix <- function(fdr_mat, odds_mat, threshhold) {
    new_mat <- matrix(nrow = nrow(fdr_mat), ncol = ncol(fdr_mat))
    
    for(i in 1:nrow(fdr_mat)) {
        for(j in 1:ncol(fdr_mat)) {
            
            if(fdr_mat[i,j] < .05) {
                new_mat[i,j] <- odds_mat[i,j]
            }else {
                new_mat[i,j] <- 0.001
            }
        }
    }
    
    rownames(new_mat) <- rownames(fdr_mat)
    colnames(new_mat) <- colnames(fdr_mat)
    
    cutoff.distance <- 0.002  
    cols <- makeColorRampPalette(c("white", "white",    # distances 0 to 0.002 colored from white to red
                                   "red", "blue"), # distances 0.002 to max(distmat) colored from green to black
                                 cutoff.distance / max(new_mat),
                                 100)
    freq_mat <- apply(new_mat, 1, function(x) {sum(x > threshhold)/nrow(new_mat)})
    freq_dat <- data.frame("cancertype" = names(freq_mat),
                     "fraction" = freq_mat)
    
    # Barplot
    bar_plot<- ggplot(freq_dat, aes(x=cancertype, y=fraction)) + 
    geom_bar(stat = "identity") +
    coord_flip()

    new_heatmap <- pheatmap(new_mat,
                            color = cols,
                            fontsize = 8)
    
    return(list("matrix"=new_mat, "heatmap" = new_heatmap, "barplot" = bar_plot))
    
}

# top 1000 project specific genes

In [14]:
cancer_top_1000 <- list()
for(i in 1:length(tcga_projects)) {
    top_1000 <- get.most.expressed(tcga_projects[i])
    cancer_top_1000[[tcga_projects[i]]] <- top_1000
}

                    gene      mean_Zscore
43217  ENSG00000237550.5 9.98670019265914
41096 ENSG00000135404.11  9.9574028825187
51704 ENSG00000167658.15 9.93701361989129
33801 ENSG00000100219.16  9.9286447847754
56772 ENSG00000135486.17 9.91615700196278
57481  ENSG00000163041.9 9.91544389706719
                    gene      mean_Zscore
25793 ENSG00000070756.13  9.9820023255814
23303  ENSG00000166794.4 9.97715697674419
39006 ENSG00000172270.18 9.91845930232558
49811  ENSG00000220842.6 9.90539651162791
53177 ENSG00000104529.17 9.90226279069767
3208  ENSG00000185201.16 9.88411395348837
                    gene      mean_Zscore
18879 ENSG00000170315.13 9.99025177993527
57481  ENSG00000163041.9 9.98092556634304
16025 ENSG00000067225.17 9.97515372168285
1938  ENSG00000163682.15 9.97483980582524
40962 ENSG00000123416.15 9.95027993527508
56772 ENSG00000135486.17 9.94450938511327
                    gene      mean_Zscore
10313  ENSG00000234745.9 9.91515414740577
2795  ENSG00000074800.13 9.8925894

In [15]:
summary(cancer_top_1000)

          Length Class  Mode     
TCGA-BRCA 1000   -none- character
TCGA-MESO  999   -none- character
TCGA-CESC 1000   -none- character
TCGA-BLCA 1000   -none- character
TCGA-CHOL 1000   -none- character
TCGA-ACC  1000   -none- character
TCGA-DLBC 1000   -none- character
TCGA-HNSC 1000   -none- character
TCGA-COAD 1000   -none- character
TCGA-ESCA 1000   -none- character
TCGA-LAML  999   -none- character
TCGA-KIRP 1000   -none- character
TCGA-KIRC 1000   -none- character
TCGA-GBM  1000   -none- character
TCGA-KICH 1000   -none- character
TCGA-READ 1000   -none- character
TCGA-PAAD 1000   -none- character
TCGA-LUAD 1000   -none- character
TCGA-OV    999   -none- character
TCGA-LIHC  999   -none- character
TCGA-LUSC 1000   -none- character
TCGA-LGG  1000   -none- character
TCGA-UCEC 1000   -none- character
TCGA-PRAD  998   -none- character
TCGA-STAD 1000   -none- character
TCGA-THCA 1000   -none- character
TCGA-THYM 1000   -none- character
TCGA-UCS  1000   -none- character
TCGA-UVM  1000

In [16]:
wrs_list <- readRDS("wrs_list.rds")
recurrent_wrs_list <- readRDS("recurrent_wrs_list.rds")

# Cancer enrichment Plots

In [17]:
summary(reccurent_wrs_list[["upregulated"]])

ERROR: Error in summary(reccurent_wrs_list[["upregulated"]]): object 'reccurent_wrs_list' not found


In [None]:
cancer_type_up <- get_enrichment_data(wrs_list[["upregulated"]],cancer_top_1000)
cancer_type_up_new <- change_matrix(cancer_type_up$matrix, cancer_type_up$odds_matrix,2.5)
cancer_type_up_new$barplot

In [None]:
cancer_type_down <- get_enrichment_data(wrs_list[["downregulated"]],cancer_top_1000)
cancer_type_down_new <- change_matrix(cancer_type_down$matrix, cancer_type_down$odds_matrix,4)
cancer_type_down_new$barplot

In [None]:
cancer_type_up_reccurent <- get_enrichment_data(recurrent_wrs_list[["upregulated"]],cancer_top_1000)
cancer_type_up_reccurent_new <- change_matrix(cancer_type_up_reccurent$matrix, cancer_type_up_reccurent$odds_matrix,2.5)
cancer_type_up_reccurent_new$barplot

In [None]:
summary(recurrent_wrs_list[["upregulated"]])

In [None]:
cancer_type_down_reccurent <- get_enrichment_data(recurrent_wrs_list[["downregulated"]],cancer_top_1000)
cancer_type_down_reccurent_new <- change_matrix(cancer_type_down_reccurent$matrix, cancer_type_down_reccurent$odds_matrix,5)
cancer_type_down_reccurent_new$barplot