# Interpretation Notebook

### Load Notch Results

In [None]:
# Read the results table (one CSV on the scratch volume) into a data frame.
res <- read.csv(
  file = "/data/srlab/lrumker/MCSC_Project/mcsc_scratch/sepsisres.csv"
)

In [None]:
# Show the first rows of the loaded results table.
head(res)

In [None]:
# Load the stored expression matrix and transpose it; the cells below index
# genes by row and compare column names against res$index.
gene_expr <- t(
  readRDS(file = "/data/srlab/jkang/cna_sepsis/9_Sepsis_CNA/x_sparse.RDS")
)

In [None]:
# Peek at the top-left 5 x 5 corner of the expression matrix.
gene_expr[1:5,1:5]

In [None]:
# Confirm cell order: every entry of res$index must equal the column name of
# gene_expr at the same position. A hard assertion is used instead of printing
# a match count, because a mismatch would silently misalign the correlations
# computed later, and `==` recycles when the two lengths differ.
stopifnot(
  length(res$index) == ncol(gene_expr),
  all(as.character(res$index) == colnames(gene_expr))
)

### Compute Gene Correlations

In [None]:
# Restrict the correlation analysis to the most variable genes.
# Compute a per-row (per-gene) standard deviation and sort it high to low.
n_var_genes <- 5000
gene_sds <- apply(gene_expr, 1, FUN = sd)
gene_sds <- gene_sds[order(gene_sds, decreasing = TRUE)]

In [None]:
# Take the names of the first n_var_genes entries of the sorted SD vector and
# pull the matching rows of the expression matrix, in that order.
var_genes <- names(gene_sds)[1:n_var_genes]
var_gene_expr <- gene_expr[match(var_genes, rownames(gene_expr)), ]

In [None]:
# Per-row score taken from the `ncorrs` column of the results table.
cna_scores <- res$ncorrs

In [None]:
# Report the dimensions of the variable-gene expression matrix.
dim(var_gene_expr)

In [None]:
# Correlate each variable gene's expression (across cells) with cna_scores.
# cor(matrix, vector) returns an n_genes x 1 matrix; flatten it to a plain
# numeric vector before naming so that downstream consumers receive an
# ordinary named vector rather than a matrix carrying a stray `names`
# attribute.
corrs <- as.vector(cor(t(as.matrix(var_gene_expr)), cna_scores))
names(corrs) <- var_genes

In [None]:
# The gene-level ranking statistic used for enrichment is the correlation.
rankList <- corrs

### Load Genesets

In [None]:
library(fgsea)
library(GSA)

In [None]:
# MSigDB GMT files, in the order they are indexed by i_file below.
geneset_files <- c(
  "/data/srlab/lrumker/MSigDB_Sets/c7.all.v7.0.symbols.gmt.txt", # Immune
  "/data/srlab/lrumker/MSigDB_Sets/h.all.v7.0.symbols.gmt.txt",  # Hallmark
  "/data/srlab/lrumker/MSigDB_Sets/c2.all.v7.0.symbols.gmt.txt"  # Canonical
)

In [None]:
# Bounds on the number of recognised genes a gene set may contain.
GeneSet_SizeBound_Upper <- 500 # Maximum gene set size considered
GeneSet_SizeBound_Lower <- 0   # Minimum gene set size considered

In [None]:
#' Read a GMT gene-set file and keep only sets of an acceptable size.
#'
#' The size of a set is the number of its members that appear in
#' `known_genes`; members not in `known_genes` are ignored for sizing but are
#' left in the returned gene lists.
#'
#' @param filepath Path to a GMT file, passed to `GSA.read.gmt()`.
#' @param known_genes Character vector of gene identifiers to count against.
#' @param sizeBound_Upper Largest recognised-gene count a retained set may have
#'   (inclusive).
#' @param sizeBound_Lower Smallest recognised-gene count a retained set may
#'   have (inclusive).
#' @return The list returned by `GSA.read.gmt()` with `genesets`,
#'   `geneset.names` and `geneset.descriptions` filtered to the retained sets,
#'   plus `geneset.sizes`, the recognised-gene count of each retained set.
parseGeneSetGMT <- function(filepath, known_genes, sizeBound_Upper, sizeBound_Lower) {
  Gene_Sets_DB <- GSA.read.gmt(filepath)

  # Count recognised genes per set directly from the list. This avoids
  # building a dense NA-padded matrix and also copes with empty gene sets and
  # with files that contain no sets at all.
  recognized_counts <- vapply(
    Gene_Sets_DB$genesets,
    function(genes) sum(genes %in% known_genes),
    integer(1),
    USE.NAMES = FALSE
  )

  retain_GeneSet <- (recognized_counts >= sizeBound_Lower) &
    (recognized_counts <= sizeBound_Upper)

  Gene_Sets_DB$genesets <- Gene_Sets_DB$genesets[retain_GeneSet]
  Gene_Sets_DB$geneset.names <- Gene_Sets_DB$geneset.names[retain_GeneSet]
  Gene_Sets_DB$geneset.descriptions <-
    Gene_Sets_DB$geneset.descriptions[retain_GeneSet]
  Gene_Sets_DB$geneset.sizes <- recognized_counts[retain_GeneSet]
  Gene_Sets_DB
}

In [None]:
# Parse the third listed GMT file, sizing each set against the ranked genes,
# and build the named list of gene sets used as enrichment input.
i_file <- 3
Gene_Sets_DB <- parseGeneSetGMT(
  geneset_files[i_file],
  names(rankList),
  GeneSet_SizeBound_Upper,
  GeneSet_SizeBound_Lower
)
input_genesets <- Gene_Sets_DB$genesets
names(input_genesets) <- Gene_Sets_DB$geneset.names


In [None]:
# Run fgsea on the correlation ranking, then order the result table by
# adjusted p-value (ascending).
# NOTE(review): maxSize repeats the literal 500 that is also stored in
# GeneSet_SizeBound_Upper, and minSize (15) differs from
# GeneSet_SizeBound_Lower (0) - confirm both are intended.
FGSEA_output <- fgsea(
  input_genesets,
  rankList,
  nperm = 100000,
  minSize = 15,
  maxSize = 500,
  nproc = 0,
  gseaParam = 1,
  BPPARAM = NULL
)
FGSEA_output <- FGSEA_output[order(FGSEA_output$padj), ]

In [None]:
#FGSEA_output[grep("MONO", FGSEA_output$pathway),]

In [None]:
# Keep only pathways with adjusted p-value below 0.05.
FGSEA_output_sig <- FGSEA_output[which(FGSEA_output$padj < 0.05), ]

In [None]:
# Keep the significant pathways whose name starts with the "PID_" prefix,
# and only the first four result columns. The pattern is anchored because an
# unanchored "PID" also matches unrelated names that merely contain those
# letters (for example anything with "LIPID" in it).
enriched_pathways <- FGSEA_output_sig[grep("^PID_", FGSEA_output_sig$pathway), c(1:4)]

In [None]:
# For each enriched pathway, sum the expression of its member genes per cell.
# The result keeps one leading all-NA placeholder column (removed by a later
# cell), followed by one column per pathway; rows are cells.
# The matrix is allocated once instead of being grown with cbind(), the loop
# is safe when there are no enriched pathways, and drop = FALSE keeps a
# single matched gene as a one-row matrix so apply() still works.
n_enriched_pathways <- nrow(enriched_pathways)
stored_expr <- matrix(
  NA_real_,
  nrow = ncol(gene_expr),
  ncol = n_enriched_pathways + 1,
  dimnames = list(colnames(gene_expr), NULL)
)
for (i_pathway in seq_len(n_enriched_pathways)) {
  print(enriched_pathways$pathway[i_pathway])
  i_geneset <- match(enriched_pathways$pathway[i_pathway], Gene_Sets_DB$geneset.names)
  geneset_genes <- Gene_Sets_DB$genesets[[i_geneset]]
  # Locate the pathway's genes in the expression matrix; skip unknown genes.
  genes_loc <- match(geneset_genes, rownames(gene_expr))
  genes_loc <- genes_loc[!is.na(genes_loc)]
  geneset_expr <- apply(gene_expr[genes_loc, , drop = FALSE], 2, FUN = sum)
  stored_expr[, i_pathway + 1] <- geneset_expr
}

In [None]:
# Remove the leading placeholder column. drop = FALSE keeps the result a
# matrix even when only one pathway column remains, so column names can
# still be assigned afterwards.
stored_expr <- stored_expr[, -1, drop = FALSE]

In [None]:
# Label each score column with its pathway name.
colnames(stored_expr) <- enriched_pathways$pathway

In [None]:
# Save the per-cell gene-set scores as comma-separated text.
write.table(
  x = stored_expr,
  file = "/data/srlab/lrumker/MCSC_Project/mcsc_scratch/sepsis_geneset_scores.csv",
  sep = ","
)


In [None]:
# Save the table of enriched pathways as comma-separated text.
write.table(
  x = enriched_pathways,
  file = "/data/srlab/lrumker/MCSC_Project/mcsc_scratch/sepsis_enriched_pathways.csv",
  sep = ","
)

# Identify population-characteristic genes

In [None]:
library(presto)

In [None]:
# Show the first rows of the results table again for reference.
head(res)

In [None]:
# Row positions in res whose cell_state is one of MS1-MS4.
only_monos_loc <- which(res$cell_state %in% c("MS1", "MS2", "MS3", "MS4"))

In [None]:
# Number of entries in the poscells column.
length(res$poscells)

In [None]:
# Differential expression (presto::wilcoxauc) within the MS1-MS4 cells,
# comparing cells flagged in `poscells` (group 1) with the rest (group 0).
# read.csv() converts a column made only of "True"/"False" to logical, in
# which case a comparison against the string "True" is FALSE everywhere;
# going through as.character() accepts both the string and logical forms.
is_pos_cell <- as.character(res$poscells[only_monos_loc]) %in% c("True", "TRUE")
wilcox_res <- wilcoxauc(var_gene_expr[, only_monos_loc], is_pos_cell * 1)

In [None]:
# Keep rows with adjusted p-value below 0.05.
wilcox_res <- wilcox_res[which(wilcox_res$padj < 0.05), ]

In [None]:
# Keep only the rows reported for group "1".
wilcox_res <- wilcox_res[which(wilcox_res$group == "1"), ]

In [None]:
# Split by direction of the log fold change.
wilcox_res_pos <- wilcox_res[which(wilcox_res$logFC > 0), ]
wilcox_res_neg <- wilcox_res[which(wilcox_res$logFC < 0), ]

In [None]:
# Sort the positive-logFC rows by adjusted p-value, smallest first.
wilcox_res_pos <- wilcox_res_pos[order(wilcox_res_pos$padj, decreasing = FALSE), ]

In [None]:
# Keep the positive-logFC rows whose adjusted p-value is exactly 0.
wilcox_res_toppos <- wilcox_res_pos[which(wilcox_res_pos$padj == 0), ]

In [None]:
library(ggplot2)

In [None]:
# our pop is HLA-DR-mid, CD14++, CD16-/+
# classical (CD14++CD16−)
# intermediate (CD14++CD16+)
# TOP: nonclassical (CD14+CD16++) monocytes

In [None]:
# Colour the tSNE embedding by whether each cell is flagged in `poscells`
# (1 = flagged, 0 = not), on a blue-to-red gradient.
# NOTE(review): `ix` is not defined anywhere in the visible notebook; it
# presumably holds the row positions of the cells to plot - confirm.
# read.csv() may have parsed "True"/"False" into logical values, so compare
# through as.character() and accept both spellings.
plot_df <- data.frame(
  "tSNE1" = res$tSNE1[ix],
  "tSNE2" = res$tSNE2[ix],
  "gene_expr" = (as.character(res$poscells[ix]) %in% c("True", "TRUE")) * 1
)
sp2 <- ggplot(plot_df, aes(x = tSNE1, y = tSNE2, color = gene_expr)) + geom_point()
# Sequential colour scheme from blue (low) to red (high).
sp2 + scale_color_gradient(low = "blue", high = "red")

In [None]:
# Colour the tSNE embedding by CD14 expression, on a blue-to-red gradient.
plot_df <- data.frame(
  tSNE1 = res$tSNE1[ix],
  tSNE2 = res$tSNE2[ix],
  gene_expr = var_gene_expr[which(var_genes == "CD14"), ix]
)
sp2 <- ggplot(plot_df, aes(x = tSNE1, y = tSNE2, color = gene_expr)) +
  geom_point()
# Sequential colour scheme from blue (low) to red (high).
sp2 + scale_color_gradient(low = "blue", high = "red")

In [None]:
# Colour the tSNE embedding by FCGR3A expression, on a blue-to-red gradient.
plot_df <- data.frame(
  tSNE1 = res$tSNE1[ix],
  tSNE2 = res$tSNE2[ix],
  gene_expr = var_gene_expr[which(var_genes == "FCGR3A"), ix]
)
sp2 <- ggplot(plot_df, aes(x = tSNE1, y = tSNE2, color = gene_expr)) +
  geom_point()
# Sequential colour scheme from blue (low) to red (high).
sp2 + scale_color_gradient(low = "blue", high = "red")

In [None]:
# Colour the tSNE embedding by the per-cell sum of HLA-DRA, HLA-DRB1 and
# HLA-DRB5 expression (whichever of them are among the variable genes).
# drop = FALSE keeps the selection a matrix even if only one of the three
# genes is present, so the column-wise apply() still works.
hla_rows <- which(var_genes %in% c('HLA-DRA','HLA-DRB1','HLA-DRB5'))
summed_HLA <- apply(var_gene_expr[hla_rows, ix, drop = FALSE], 2, sum)
plot_df <- data.frame(
  "tSNE1" = res$tSNE1[ix],
  "tSNE2" = res$tSNE2[ix],
  "gene_expr" = summed_HLA
)
sp2 <- ggplot(plot_df, aes(x = tSNE1, y = tSNE2, color = gene_expr)) + geom_point()
# Sequential colour scheme from blue (low) to red (high).
sp2 + scale_color_gradient(low = "blue", high = "red")

In [None]:
# Our population is HLA-DR-mid, CD14+, CD16-

In [None]:
# Colour the tSNE embedding by expression of the feature named 'SLAN'.
# NOTE(review): 'SLAN' may not be present among var_genes - confirm the
# intended feature name. Without the check below, a missing feature surfaces
# only as a row-count mismatch error from data.frame().
slan_row <- which(var_genes == 'SLAN')
if (length(slan_row) != 1) {
  stop(
    "Expected exactly one entry named 'SLAN' in var_genes, found ",
    length(slan_row),
    call. = FALSE
  )
}
plot_df <- data.frame(
  "tSNE1" = res$tSNE1[ix],
  "tSNE2" = res$tSNE2[ix],
  "gene_expr" = var_gene_expr[slan_row, ix]
)
sp2 <- ggplot(plot_df, aes(x = tSNE1, y = tSNE2, color = gene_expr)) + geom_point()
# Sequential colour scheme from blue (low) to red (high).
sp2 + scale_color_gradient(low = "blue", high = "red")

In [None]:
# Show the padj == 0 positive-logFC rows, largest logFC first.
wilcox_res_toppos[order(wilcox_res_toppos$logFC, decreasing = TRUE),]

In [None]:
# Patterns looked up (via grep) in the differential-expression features.
# NOTE(review): "CPV" is matched as a pattern, so it also hits any feature
# name containing it - confirm whether a longer symbol was intended.
intermediate_mono_genes <- c(
  "LYZ", "S100A8", "CD14", "S100A10", "HLA-DRA",
  "CD74", "IFI30", "HLA-DPB1", "CPV", "FCGR3A"
)

In [None]:
# Collect, pattern by pattern, the row positions in wilcox_res whose feature
# name matches (regular-expression match, so partial hits are included).
i_intermediate_mono_genes <- c()
for (gene_pattern in intermediate_mono_genes) {
  pattern_hits <- grep(gene_pattern, wilcox_res$feature)
  i_intermediate_mono_genes <- c(i_intermediate_mono_genes, pattern_hits)
}

In [None]:
# Show selected columns (by position) of wilcox_res for the matched rows.
wilcox_res[i_intermediate_mono_genes,c(1,3,4, 6, 8:10)]