In [None]:
library(rhdf5)
library(DESeq2)
library(biomaRt)
library(RColorBrewer)
library(VennDiagram)
library(dplyr)
library(ggrepel)
suppressPackageStartupMessages(library(ComplexHeatmap))
library(ggplot2)
library(fgsea)
library(msigdbr)
library(patchwork)
library(tidyverse)
library(viridis)
library(cowplot)
library(tximport)
library(circlize)


### Gene lists from literature

In [None]:
# Exploratory lookup: load every mouse gene set from msigdbr and keep the rows
# whose gene-set name contains "GSE23321". `msig_df` is re-assigned further
# down with the C7 collection only.
msig_df <- msigdbr(species = "Mus musculus")
is_gse23321 <- grepl("GSE23321", msig_df$gs_name)
test <- msig_df[is_gse23321, ]



In [None]:
# qstem score used in https://www.nature.com/articles/s41590-022-01171-9#MOESM4
# (genes with positive and negative association are kept in separate files)
gene_list_dir <- "/net/shendure/vol8/projects/scifate_abadiek/Tcf7_genomics/Tcf7_RNA_analysis/gene_lists"

# Read a whitespace-delimited text file from `gene_list_dir` and flatten its
# contents into a plain vector.
read_gene_list <- function(file_name) {
  as.vector(as.matrix(read.table(file.path(gene_list_dir, file_name))))
}

qstem_p <- read_gene_list("qstem_pos.txt")
qstem_n <- read_gene_list("qstem_neg.txt")
gs_names <- read_gene_list("msigdbr_gs_name.txt")
gs_names_gsea <- read_gene_list("msigdbr_gs_name_gsea.txt")

# msigdbr: fetch the C7 collection for mouse and keep only the gene sets whose
# names are listed in msigdbr_gs_name.txt
msig_df <- msigdbr(species = "Mus musculus", category = "C7")
msig_df <- msig_df[msig_df$gs_name %in% gs_names, ]
# msig_df <- msig_df[grep("UP", msig_df$gs_name ), ]

# Rows of `msig_df` whose gene-set name matches `pattern`.
pick_gene_set <- function(pattern) {
  msig_df[grepl(pattern, msig_df$gs_name), ]
}

# Goldrath
msig_df_GR_EFFvMEM_UP <- pick_gene_set("GOLDRATH_EFF_VS_MEMORY_CD8_TCELL_UP")
msig_df_GR_EFFvMEM_DN <- pick_gene_set("GOLDRATH_EFF_VS_MEMORY_CD8_TCELL_DN")
# Kaech D8
msig_df_SK_D8_EFFvMEM_UP <- pick_gene_set("KAECH_DAY8_EFF_VS_MEMORY_CD8_TCELL_UP")
msig_df_SK_D8_EFFvMEM_DN <- pick_gene_set("KAECH_DAY8_EFF_VS_MEMORY_CD8_TCELL_DN")
# Kaech D15
msig_df_SK_D15_EFFvMEM_UP <- pick_gene_set("KAECH_DAY15_EFF_VS_MEMORY_CD8_TCELL_UP")
msig_df_SK_D15_EFFvMEM_DN <- pick_gene_set("KAECH_DAY15_EFF_VS_MEMORY_CD8_TCELL_DN")




In [None]:
# Combined effector-vs-memory signatures: de-duplicated gene symbols from the
# Goldrath and Kaech day-8 sets (UP and DN kept separately).
EffvMem_UP <- base::union(
  msig_df_GR_EFFvMEM_UP$gene_symbol,
  msig_df_SK_D8_EFFvMEM_UP$gene_symbol
)
EffvMem_DN <- base::union(
  msig_df_GR_EFFvMEM_DN$gene_symbol,
  msig_df_SK_D8_EFFvMEM_DN$gene_symbol
)

# Names of the gene sets to run GSEA on (re-read here so this cell can be run
# on its own).
gsea_names_file <- file.path(
  "/net/shendure/vol8/projects/scifate_abadiek/Tcf7_genomics/Tcf7_RNA_analysis/gene_lists",
  "msigdbr_gs_name_gsea.txt"
)
gs_names_gsea <- as.vector(as.matrix(read.table(gsea_names_file)))


### Import kallisto abundances for DESeq2 analysis using tximport

In [None]:
# Project root and kallisto output locations. Each entry under
# <main_dir>/outs is treated as one sample and is expected to hold an
# abundance.h5 file.
RNA_path <- "/net/shendure/vol8/projects/scifate_abadiek/Tcf7_genomics/Tcf7_RNA_analysis"
main_dir <- file.path(RNA_path, "kallisto")
sample_id <- list.files(file.path(main_dir, "outs"))
sample_dir <- file.path(main_dir, "outs", sample_id)
files <- file.path(sample_dir, "abundance.h5")

In [None]:
# Remove the low_low (LL) samples.
# Guard against an empty match: x[-integer(0)] selects nothing, so
# unconditional negative indexing would silently drop every sample when no
# path contains "low_low".
LL_ind <- grep("low_low", files)
if (length(LL_ind) > 0) {
  files <- files[-LL_ind]
  sample_id <- sample_id[-LL_ind]
}

# Load the transcript-to-gene map generated from the reference transcriptome.
# Column V1 is used as the transcript id and column V3 as the gene label.
# ensemble2gene_raw <- read.table(file.path(main_dir, "transcripts_to_genes.txt"))
ensemble2gene <- read.table(file.path(main_dir, "transcripts_to_genes.txt"))
transcript_id <- ensemble2gene$V1
gene <- ensemble2gene$V3
ensemble2gene <- data.frame(transcript_id, gene)

# Gene-level counts matrix from the kallisto abundance files
txi <- tximport(files, type = "kallisto", tx2gene = ensemble2gene) # transcripts missing from tx2gene: 1673


# Build the sample metadata from the sample directory names: the text before
# the first "_" is the mouse, everything after it is the condition.
s2c <- data.frame(sample = sample_id)
s2c <- s2c %>%
  separate(sample, into = c("mouse", "cond"), sep = "_", extra = "merge", remove = FALSE)
s2c$cond <- factor(s2c$cond, levels = c("D3", "Naive", "Ag", "low_hi", "hi_hi")) # all comparisons will be done to D3
# A condition outside the expected levels would become NA and later break the
# design; fail early with a clear message instead.
if (anyNA(s2c$cond)) {
  stop("Unrecognised condition in sample names: ",
       paste(s2c$sample[is.na(s2c$cond)], collapse = ", "),
       call. = FALSE)
}

# Simplified time-point grouping: D9 = low_hi/hi_hi, D0 = Naive/Ag, else D3.
# Assigning through a logical mask is also safe when a group has no samples.
s2c <- s2c %>% add_column(cond_simp = "D3", .after = "cond")
s2c$cond_simp[s2c$cond %in% c("low_hi", "hi_hi")] <- "D9"
s2c$cond_simp[s2c$cond %in% c("Naive", "Ag")] <- "D0"
s2c$cond_simp <- factor(s2c$cond_simp, levels = c("D9", "D3", "D0"))

# Quick inspection of the imported object
names(txi)
names(txi$infReps)
colnames(txi$counts)

### Run DESeq

In [None]:
# Build the DESeq2 data set from the tximport result, modelling counts by
# condition.
dds <- DESeq2::DESeqDataSetFromTximport(
  txi = txi,
  colData = s2c,
  design = ~ cond
)

# Drop genes with fewer than 10 reads in total across all samples
gene_totals <- rowSums(counts(dds))
keep <- gene_totals >= 10
dds <- dds[keep, ]

# D3 is the reference level for the fitted model
dds$cond <- relevel(dds$cond, ref = "D3")

# Fit the model (this runs unconditionally; a previously saved fit can be
# loaded with readRDS in the next cell instead)
dds1 <- DESeq2::DESeq(dds)



### Save dds or load if already saved

In [None]:
# Persist the fitted DESeq2 object under the project root so the fit does not
# have to be repeated.
dds_rds_path <- file.path(RNA_path, "Tcf7_RNA_kallisto_dds_0LL.RDS")
saveRDS(dds1, file = dds_rds_path)

# To reuse a saved fit instead of refitting:
# dds1 <- readRDS(file.path(RNA_path, "Tcf7_RNA_kallisto_dds_0LL.RDS"))



### Comparisons
Comparisons involving the low_low condition are excluded.


In [None]:
# Collect DESeq2 results for every pairwise contrast into one wide table.
# For each contrast "<A>_<B>" the table gains the six result columns prefixed
# with "<A>_<B>." plus a 0/1 column "<A>_<B>.padj_pass_filter".
dat1 <- data.frame(gene = rownames(dds1)) # results table

# Contrasts are read row-wise: pair1[i] (numerator) vs pair2[i] (denominator)
pair1 <- c("low_hi",
           "low_hi", "hi_hi",
           "low_hi", "hi_hi",
           "low_hi", "hi_hi",
           "Naive", "Naive", "Ag")
pair2 <- c("hi_hi",
           "Naive", "Naive",
           "Ag", "Ag",
           "D3", "D3",
           "Ag", "D3", "D3")
condtable <- matrix(c(pair1, pair2), ncol = 2)

z <- nrow(condtable)

# Significance filter: FDR < 0.05 and fold change > 2 (|log2FC| > 1).
# NOTE(review): the previous code tested the first results column, which is
# baseMean, not log2FoldChange. The threshold below follows the documented
# intent "FC > 2"; set fc_cutoff to 4 if |log2FC| > 2 was meant.
fc_cutoff <- 2
padj_cutoff <- 0.05

for (i in seq_len(z)) {
  print(i)

  label <- paste0(condtable[i, 1], "_", condtable[i, 2])
  print(label)
  res <- results(dds1, cooksCutoff = FALSE,
                 contrast = c("cond", condtable[i, 1], condtable[i, 2]))
  res.m <- as.data.frame(res)

  # Columns are addressed by name so the filter cannot pick the wrong one.
  # Genes with NA fold change or NA adjusted p-value count as not passing (0)
  # rather than NA. Use the adjusted p-value here, not the raw p-value.
  passes <- !is.na(res.m$log2FoldChange) & !is.na(res.m$padj) &
    abs(res.m$log2FoldChange) > log2(fc_cutoff) &
    res.m$padj < padj_cutoff
  pass_filter <- as.numeric(passes)

  colnames(res.m) <- paste(label, colnames(res.m), sep = ".")
  dat1 <- cbind(dat1, res.m, pass_filter)
  colnames(dat1)[ncol(dat1)] <- paste0(label, ".padj_pass_filter") # FDR < 0.05; FC > 2
}

### p value visualization

In [None]:
# Draw one raw p-value histogram (100 bins) per contrast. The title and x-axis
# label are set explicitly to the text hist() would derive from a
# `dat1$<column>` argument.
pval_comparisons <- c(
  "low_hi_hi_hi",
  "low_hi_Naive", "low_hi_Ag", "low_hi_D3",
  "hi_hi_Naive", "hi_hi_Ag", "hi_hi_D3",
  "Naive_Ag", "Naive_D3", "Ag_D3"
)
for (comparison in pval_comparisons) {
  pval_col <- paste0(comparison, ".pvalue")
  expr_label <- paste0("dat1$", pval_col)
  hist(dat1[[pval_col]], breaks = 100,
       main = paste("Histogram of", expr_label), xlab = expr_label)
}

### Generate heatmap

In [None]:
# Genes that pass the filter between LH (low_hi) and HH (hi_hi).
# which() drops NA entries; plain logical indexing with NA would create NA
# rows in the data frame.
lh_hh_rows <- which(dat1$low_hi_hi_hi.padj_pass_filter == 1)
dat1_LH_HH <- dat1[lh_hh_rows, ]
dds1_LH_HH <- dds1[lh_hh_rows, ]

# Aggregate genes that pass the filter between each D9 sample group and each
# control group. Row indices are concatenated contrast by contrast and then
# de-duplicated, so the order follows first appearance.
d9_vs_control <- c("low_hi_Naive", "low_hi_Ag", "low_hi_D3",
                   "hi_hi_Naive", "hi_hi_Ag", "hi_hi_D3")
select <- unique(unlist(lapply(
  paste0(d9_vs_control, ".padj_pass_filter"),
  function(pass_col) which(dat1[[pass_col]] == 1)
)))
dat1_D9_c <- dat1[select, ]
dds1_D9_c <- dds1[select, ]

# rlog transform of the selected genes
# rld_LH_HH <- rlog(dds1_LH_HH)
rld_D9_c <- rlog(dds1_D9_c)

# select data for heatmap
rld_hm <- rld_D9_c
dat1_hm <- dat1_D9_c
dds1_hm <- dds1_D9_c

# Rank genes by their smallest adjusted p-value over all tested contrasts.
# The padj columns are those containing "padj" minus the pass_filter flags.
padj_cols <- grep("padj", colnames(dat1_hm))
padj_filt_cols <- grep("padj_pass_filter", colnames(dat1_hm))
padj_cols <- setdiff(padj_cols, padj_filt_cols)
# pmin(..., na.rm = TRUE) ignores contrasts where padj is NA, so a gene that
# is highly significant in one contrast is not pushed to the bottom of the
# ranking just because it has no adjusted p-value in another.
dat1_hm$padj_min <- do.call(
  pmin,
  c(unname(as.list(dat1_hm[, padj_cols, drop = FALSE])), list(na.rm = TRUE))
)

# Keep the n genes with the lowest minimum adjusted p-value
top_n <- head(order(dat1_hm$padj_min), n = 500)
dat1_hm <- dat1_hm[top_n, ]
dds1_hm <- dds1_hm[top_n, ]
rld_hm <- rld_hm[top_n, ]

# Heatmap matrix prep: take the rlog values, name the columns by sample and
# z-score each gene (row) across samples.
mat <- assay(rld_hm)
colnames(mat) <- sample_id
mat_scale <- t(scale(t(mat)))

# Genes to label: Goldrath/Kaech effector signature.
# Names are taken from rownames(mat_scale)[idx]; indexing the matrix first
# (mat_scale[idx, ]) drops to a plain vector when exactly one gene matches and
# would return NULL row names.
genes_to_label_pos_4 <- which(rownames(mat_scale) %in% EffvMem_UP)
genes_to_label_4 <- rownames(mat_scale)[genes_to_label_pos_4]

# Genes to label: Goldrath/Kaech memory signature
genes_to_label_pos_5 <- which(rownames(mat_scale) %in% EffvMem_DN)
genes_to_label_5 <- rownames(mat_scale)[genes_to_label_pos_5]

# Column annotation: one group label per sample, given in the column order of
# the heatmap matrix. The factor levels set the legend order.
labels <- factor(
  c("Act", "nm", "nem", "Act", "nm", "nem", "Mem", "N", "Mem", "N"),
  levels = c("Act", "N", "Mem", "nm", "nem")
)
ann <- data.frame(Sample = labels)
colors_anno <- list(
  Sample = c(
    Act = "#A50F15",
    N = "green2",
    Mem = "darkgreen",
    nem = "deepskyblue1",
    nm = "blue2"
  )
)
colAnn <- HeatmapAnnotation(
  df = ann,
  which = "col",
  col = colors_anno,
  annotation_width = unit(c(1, 4), "cm"),
  gap = unit(1, "mm"),
  annotation_legend_param = list(Sample = list(direction = "horizontal"))
)

# Row annotation: mark the signature genes next to the heatmap, effector genes
# in red (a4) and memory genes in dark green (a5).
mark_genes <- function(at, gene_names, colour) {
  anno_mark(at = at, labels = gene_names, which = "row",
            labels_gp = gpar(col = colour, fontsize = 10))
}
ha <- rowAnnotation(
  a4 = mark_genes(genes_to_label_pos_4, genes_to_label_4, "#A50F15"),
  a5 = mark_genes(genes_to_label_pos_5, genes_to_label_5, "darkgreen")
)

# Complex heatmap

# Group the genes into 6 clusters by cutting a hierarchical clustering
# (Euclidean distance, hclust default linkage) of the scaled matrix. This is
# hclust + cutree, not k-means.
set.seed(123)
split <- data.frame(clust = cutree(hclust(dist(mat_scale)), k = 6))
# The factor level order fixes the top-to-bottom order of the row slices
split$clust <- factor(split$clust, levels = c(6, 4, 5, 3, 2, 1))

# Colour scale: 12 viridis colours spread evenly over [-2, 2]. The previous
# breaks, seq(-2, 2, by = 4/12)[1:12], stopped at 1.67, so the scale was
# asymmetric around zero.
hm_breaks <- seq(from = -2, to = 2, length.out = 12)

ht_list <- Heatmap(mat_scale,
    col = colorRamp2(hm_breaks, viridis::viridis(12)),
    name = "Scaled rlog",
    show_column_names = FALSE, # false when using column annotation instead
    show_row_names = FALSE,
    row_names_gp = gpar(fontsize = 4),
    cluster_columns = TRUE,
    column_dend_reorder = c(1,8,10,1,8,10,5,3,5,3), # weights for reordering dendrogram
    cluster_rows = TRUE,
    show_column_dend = TRUE,
    show_row_dend = FALSE,
    width = unit(8, "cm"),
    clustering_method_rows = "complete",

    # split rows by the cluster assignment computed above
    cluster_row_slices = FALSE, # manual ordering
    row_split = split,

    # annotation
    right_annotation = ha,
    top_annotation = colAnn,

    heatmap_legend_param = list(direction = "horizontal")
)

# Draw once and keep the drawn object so row_order() can be queried later
ht_list <- draw(ht_list, annotation_legend_side = "right", heatmap_legend_side = "bottom")




In [None]:
# Re-draw the heatmap into an 8 x 12 inch PDF in the working directory.
heatmap_pdf <- "RNA_hm_deseq_clustering-complete_adj-p_n500_dendro-split-6_cols-clustered_reorder_GRanno_2022-09v4_viridis_legend2_cluster_titles.pdf"
pdf(file = heatmap_pdf, width = 8, height = 12)
draw(ht_list)
dev.off()

### Output gene lists for each row cluster in heatmap

In [None]:
# Export the genes of each heatmap row cluster.
# row_order() on the drawn heatmap gives, per row slice, the row indices of
# mat_scale in plotting order. The list is expected to be named by the slice
# (cluster) label; every output below is keyed on that name, not on position.
clusterlist <- row_order(ht_list)

# Make sure the output directory exists before writing into it
out_dir <- "heatmap_cluster_genes"
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

# Long table: one row per gene with its cluster label.
# rownames(mat_scale)[idx] is used instead of rownames(mat_scale[idx, ]),
# which returns NULL for a cluster holding a single gene.
clu_df <- do.call(rbind, lapply(names(clusterlist), function(i) {
  idx <- clusterlist[[i]]
  data.frame(GeneID = rownames(mat_scale)[idx],
             Cluster = rep(paste0("row_cluster_", i), length(idx)),
             stringsAsFactors = FALSE)
}))

# One txt file per cluster, one gene per line
for (cl in names(clusterlist)) {
  write(clu_df$GeneID[clu_df$Cluster == paste0("row_cluster_", cl)],
        file.path(out_dir, paste0("bulk_RNA_hm_6clust_genelist_c", cl, ".txt")))
}

# Additionally, subset the scaled matrix by cluster and save each as csv
hm_df <- data.frame(mat_scale)
colnames(hm_df) <- c('M1_Act', 'M1_Mem_nm', 'M1_Mem_nem', 'M3_Act', 'M3_Mem_nm', 'M3_Mem_nem', 'M4_Mem', 'M4_N','M5_Mem', 'M5_N')

cluster_mats <- lapply(clusterlist, function(idx) hm_df[idx, , drop = FALSE])
for (cl in names(cluster_mats)) {
  write.csv(cluster_mats[[cl]],
            file.path(out_dir, paste0("bulk_RNA_hm_6clust_matrix_c", cl, ".csv")))
}

# Or write all clusters to one Excel file, one sheet per cluster
library(openxlsx)
sheet_names <- cluster_mats
names(sheet_names) <- paste0("row_cluster_", names(cluster_mats))
openxlsx::write.xlsx(sheet_names, rowNames = TRUE, file = file.path(out_dir, 'bulk_RNA_hm_6clust_matrix_row_clusters.xlsx'))
          

### Output gene lists for each row cluster, but now only for eff and mem annotation genes, separately

In [None]:
# Write, per heatmap row cluster, the signature genes it contains as a single
# comma-separated line.
#
# cluster_df: data frame with columns GeneID and Cluster ("row_cluster_<id>")
# suffix:     file-name suffix identifying the signature ("eff" or "mem")
# out_dir:    output directory, created if missing
#
# Only clusters that contain at least one gene are written. The previous
# write(..., ncolumns = 0) call fails on an empty cluster, which forced
# individual clusters to be commented out by hand.
write_cluster_gene_lists <- function(cluster_df, suffix,
                                     out_dir = "heatmap_cluster_genes") {
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
  for (cluster_name in unique(cluster_df$Cluster)) {
    genes <- cluster_df$GeneID[cluster_df$Cluster == cluster_name]
    cluster_id <- sub("^row_cluster_", "", cluster_name)
    out_file <- file.path(
      out_dir,
      paste0("bulk_RNA_hm_6clust_genelist_c", cluster_id, "_", suffix, ".txt")
    )
    writeLines(paste(genes, collapse = ", "), out_file)
  }
  invisible(NULL)
}

# Effector signature genes per cluster
clu_df_eff <- clu_df %>% dplyr::filter(GeneID %in% EffvMem_UP)
write_cluster_gene_lists(clu_df_eff, "eff")

# Memory signature genes per cluster
clu_df_mem <- clu_df %>% dplyr::filter(GeneID %in% EffvMem_DN)
write_cluster_gene_lists(clu_df_mem, "mem")


### Generate PCA 

In [None]:
# Re-order the condition factor of the heatmap rlog object; the level order
# determines the legend order in the plots drawn from it.
pca_cond_levels <- c("D3", "Naive", "Ag", "hi_hi", "low_hi")
rld_hm$cond <- factor(rld_hm$cond, levels = pca_cond_levels)


In [None]:
# colors
cols <- c("#A50F15", "green2", "darkgreen", "blue2", "deepskyblue1")

# plot theme
pca_theme <- theme(text = element_text(size = 18))

# Bind each condition explicitly to its legend label and colour. Matching by
# name keeps the legend correct whatever the current level order of
# rld_hm$cond is, instead of relying on positional pairing.
cond_levels <- c("D3", "Naive", "Ag", "hi_hi", "low_hi")
cond_labels <- c("Act", "N", "Mem", "nm", "nem")
cond_cols <- setNames(cols, cond_levels)

# PCA on the rlog values of the heatmap gene set (top 500 by adjusted
# p-value among genes differing between D9 and the controls)
pcaData <- plotPCA(rld_hm, intgroup = c("cond"), returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
g <- ggplot(pcaData, aes(PC1, PC2, color = cond)) +
  geom_point(size = 3) +
  xlab(paste0("PC1: ", percentVar[1], "%")) +
  ylab(paste0("PC2: ", percentVar[2], "%")) +
  coord_fixed() +
  scale_color_manual(name = "", breaks = cond_levels, labels = cond_labels,
                     values = cond_cols) +
  theme_bw(base_size = 20) +
  theme(axis.text = element_text(size = 14))

g
# Arguments are spelled out; the previous call relied on `file` partially
# matching `filename`.
save_plot(filename = "RNA_pca_D9_c_0LL.pdf", plot = g)




### Generate sample-distance heatmap (Euclidean distance on rlog values) as alternative to PCA

In [None]:
# Start the sample-distance analysis from the heatmap rlog object.
# NOTE(review): the next cell begins with this same assignment, so this cell
# looks redundant.
mat_dist <- rld_hm

In [None]:
# Choose the object for the sample distance plot and attach a short group
# label per sample (same order as the columns).
mat_dist <- rld_hm
mat_dist$cond_label <- c('Act', 'nm', 'nem', 'Act', 'nm', 'nem', 'Mem', 'N','Mem', 'N')

# Euclidean distances between samples, computed on the rlog values
sampleDists <- dist(t(assay(mat_dist)))
sampleDistMatrix <- as.matrix(sampleDists)
# Row labels: the group label only. The previous paste() with mat_dist$type
# referenced a column that is never created in the sample metadata, which only
# appended a trailing space. Use paste(mat_dist$cond_label, mat_dist$mouse)
# if the mouse should be shown as well.
rownames(sampleDistMatrix) <- mat_dist$cond_label
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)

ht_corr <- Heatmap(sampleDistMatrix,
    name = "Sample Dist",
    show_column_names = FALSE,
    show_row_names = TRUE,
    # row_names_gp = gpar(fontsize = 2),
    cluster_columns = TRUE,
    column_dend_reorder = c(1,8,10,1,8,10,5,3,5,3), # weights for reordering dendrogram
    cluster_rows = TRUE,
    row_dend_reorder = c(1,8,10,1,8,10,5,3,5,3), # weights for reordering dendrogram
    show_column_dend = TRUE, show_row_dend = TRUE, width = unit(8, "cm"),
    col = colors
)
ht_corr <- draw(ht_corr)

# Draw explicitly into the PDF device; a bare `ht_corr` only renders when the
# value is auto-printed, which does not happen when the code is sourced.
pdf("RNA_corr-matrix.pdf", width = 6, height = 4)
draw(ht_corr)
dev.off()

### GSEA analysis

In [None]:
# List the gene sets that will be used for GSEA. This is computed directly
# from msig_df and gs_names_gsea because msig_df_sel is only created in the
# following cell, so the cell would fail when the notebook is run top to
# bottom.
unique(msig_df$gs_name[msig_df$gs_name %in% gs_names_gsea])

In [None]:
# Restrict the MSigDB table to the gene sets chosen for GSEA and turn it into
# a named list: gene-set name -> gene symbols.
msig_df_sel <- msig_df[msig_df$gs_name %in% gs_names_gsea, ]
msig_list <- split(x = msig_df_sel$gene_symbol, f = msig_df_sel$gs_name)

# Ranking statistic: the DESeq2 test statistic of each contrast.
# For this analysis only genes DE between D9 and controls are used.
dat1_gsea <- dat1_D9_c
dat1_rank <- dat1_gsea[, grep("stat", colnames(dat1_gsea))] # all test statistic columns

# Named vector (names = row names of dat1_rank) holding one statistic column.
ranks_for <- function(stat_col) {
  setNames(dat1_rank[[stat_col]], rownames(dat1_rank))
}

# D9 vs D3
ranks_low_hi_D3 <- ranks_for("low_hi_D3.stat")
ranks_hi_hi_D3 <- ranks_for("hi_hi_D3.stat")
# D9 vs D0 Ag
ranks_low_hi_Ag <- ranks_for("low_hi_Ag.stat")
ranks_hi_hi_Ag <- ranks_for("hi_hi_Ag.stat")

# Run fgsea on one ranking and tag the result with a comparison label.
gsea_for <- function(ranks, comp_label) {
  fgsea(pathways = msig_list, stats = ranks) %>% add_column(comp = comp_label)
}

gsea_low_hi_D3 <- gsea_for(ranks_low_hi_D3, "LH_D3")
gsea_hi_hi_D3 <- gsea_for(ranks_hi_hi_D3, "HH_D3")

gsea_low_hi_Ag <- gsea_for(ranks_low_hi_Ag, "LH_Ag")
gsea_hi_hi_Ag <- gsea_for(ranks_hi_hi_Ag, "HH_Ag")



In [None]:
# Stack the per-comparison fgsea results and add -log10(adjusted p-value)
gsea_all <- rbind(
  gsea_low_hi_D3,
  gsea_hi_hi_D3,
  gsea_low_hi_Ag,
  gsea_hi_hi_Ag
)

gsea_res_tidy <- gsea_all %>%
#     filter(padj<0.05) %>%
  as_tibble() %>%
  mutate(log_padj = -log10(padj))

# Order pathways by factor levels (first level is drawn at the bottom of the
# y axis). Pathways not listed here become NA.
gsea_res_tidy$pathway <- factor(
  gsea_res_tidy$pathway,
  levels = c("KAECH_DAY8_EFF_VS_MEMORY_CD8_TCELL_DN",
             "GOLDRATH_EFF_VS_MEMORY_CD8_TCELL_DN",
             "KAECH_DAY8_EFF_VS_MEMORY_CD8_TCELL_UP",
             "GOLDRATH_EFF_VS_MEMORY_CD8_TCELL_UP")
)

# Order comparisons by factor levels
gsea_res_tidy$comp <- factor(
  gsea_res_tidy$comp,
  levels = c("LH_D3", "HH_D3", "LH_Ag", "HH_Ag")
)

# Plot multiple comparisons at once: point size = -log10(padj), colour = NES.
# Every layer is joined with `+`; previously the chain ended after
# scale_color_viridis_c(), so labs() and theme() were never applied.
gsea_plot <- gsea_res_tidy %>%
  ggplot(., aes(comp, pathway)) +
  geom_point(aes(size = log_padj, color = NES)) +
  theme_minimal() +
  scale_color_viridis_c() +
#     scale_color_gradient2(low = "blue", mid = "white",
#                             high = "red") +
  labs(size = "-log10(adj p-value)", color = "NES") +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank())

# Plot a single comparison: LH vs D3 (NES on the x axis)
gsea_plot_LH <- gsea_res_tidy %>%
  filter(comp == "LH_D3") %>%
  ggplot(., aes(NES, pathway)) +
  geom_point(aes(size = log_padj)) +
  theme_minimal() +
  labs(size = "-log10(adj p-value)") +
  theme(axis.title.y = element_blank())

# Plot a single comparison: HH vs D3
gsea_plot_HH <- gsea_res_tidy %>%
  filter(comp == "HH_D3") %>%
  ggplot(., aes(NES, pathway)) +
  geom_point(aes(size = log_padj)) +
  theme_minimal() +
  labs(size = "-log10(adj p-value)") +
  theme(axis.title.y = element_blank())

# Plot LH and HH together, coloured by comparison. alpha is a constant, so it
# is set outside aes() (inside aes() it adds a spurious legend), and the
# colour legend is titled for what it shows.
gsea_plot_LH_HH <- gsea_res_tidy %>%
  filter(comp %in% c("LH_D3", "HH_D3")) %>%
  ggplot(., aes(NES, pathway)) +
  geom_point(aes(size = log_padj, color = comp), alpha = 0.5) +
  theme_minimal() +
  labs(size = "-log10(adj p-value)", color = "Comparison") +
  theme(axis.title.y = element_blank())

gsea_plot
gsea_plot_LH
gsea_plot_HH
gsea_plot_LH_HH



In [None]:
# Save the GSEA plots as PDFs in the working directory. Arguments are named
# in full; the previous calls relied on `file` partially matching `filename`.
save_plot(filename = "gsea_all_v2.pdf", plot = gsea_plot, base_width = 10)
save_plot(filename = "gsea_all_v3_LH.pdf", plot = gsea_plot_LH, base_width = 8)
save_plot(filename = "gsea_all_v3_HH.pdf", plot = gsea_plot_HH, base_width = 8)
save_plot(filename = "gsea_all_v3_LH_HH.pdf", plot = gsea_plot_LH_HH, base_width = 8)
