# RA-Notch Interpretation Notebook

### Load Notch Results

In [None]:
# Analysis object; the cells below read its "exprs_norm" element.
notch_data <- readRDS("/data/srlab/lrumker/MCSC_Project/notch/analysis_sc_tissue.rds")

# CNA results table (the cell_id and ncorrs columns are used below).
cna_res <- read.csv("/data/srlab/lrumker/MCSC_Project/notch/notch_cna_res.txt")

# NAM principal components table (the PC1 column is used below).
nampcs <- read.csv("/data/srlab/lrumker/MCSC_Project/notch/notch_cna_NAM_PCs.txt")

In [None]:
# Select and reorder the expression columns so they follow the row order of
# cna_res, looking each cna_res$cell_id up among the expression column names.
gene_expr <- local({
  exprs_norm <- notch_data[["exprs_norm"]]
  exprs_norm[, match(cna_res$cell_id, colnames(exprs_norm))]
})

In [None]:
# Confirm cell order matches: TRUE only when the expression column names are
# exactly the cna_res cell ids, in the same order. identical() yields a strict
# TRUE/FALSE, so an unmatched cell (NA column name) reports FALSE instead of
# the NA that an element-wise comparison followed by sum() would produce.
identical(as.character(cna_res$cell_id), colnames(gene_expr))

In [None]:
# Sanity check: with a single NAM PC in the model, the neighborhood
# correlations should be perfectly correlated with NAM PC1.
cor(x = cna_res$ncorrs, y = nampcs$PC1)

### Compute Gene Correlations

In [None]:
# Only variable genes are examined for correlation.
# Number of most-variable genes to consider.
n_var_genes <- 5000
# Per-gene (row-wise) standard deviation of expression.
gene_sds <- apply(gene_expr, MARGIN = 1, FUN = sd)
# Most variable genes first.
gene_sds <- gene_sds[order(gene_sds, decreasing = TRUE)]

In [None]:
# Keep only the n_var_genes genes with the largest standard deviation.
# gene_sds is already sorted in decreasing order, so these are its leading
# names; head() simply returns every gene when fewer than n_var_genes exist.
var_genes <- head(names(gene_sds), n_var_genes)
# Expression rows for the selected genes, in the same (decreasing-SD) order.
var_gene_expr <- gene_expr[match(var_genes, rownames(gene_expr)), ]

In [None]:
# Score (one value per cell) that gene expression is correlated against.
# Alternatives kept for reference:
#   cna_scores <- cna_res$ncorrs
#   cna_scores <- cna_res$score_notch
#   cna_scores <- cna_res$time
cna_scores <- nampcs$PC1

In [None]:
# Correlation of each selected gene's expression (across cells) with the
# scores. cor() returns a one-column matrix when x is a matrix, so flatten it
# with as.vector(): corrs is then a plain numeric vector named by gene, which
# is the shape fgsea expects for its ranked statistics.
corrs <- setNames(
  as.vector(cor(t(as.matrix(var_gene_expr)), cna_scores)),
  var_genes
)

In [None]:
# Gene-level statistics used to rank genes for the enrichment analysis.
rankList <- corrs

### Most highly correlated genes

In [None]:
# Ten largest (most positive) gene correlations.
corrs[order(corrs, decreasing = TRUE)[seq_len(10)]]

### Load Genesets

In [None]:
library(fgsea)
library(GSA)

In [None]:
# MSigDB v7.0 gene-symbol collections; one is selected by position via i_file.
geneset_files <- c(
  "/data/srlab/lrumker/MSigDB_Sets/c7.all.v7.0.symbols.gmt.txt", # C7: immunologic signatures
  "/data/srlab/lrumker/MSigDB_Sets/h.all.v7.0.symbols.gmt.txt",  # H: hallmark
  "/data/srlab/lrumker/MSigDB_Sets/c2.all.v7.0.symbols.gmt.txt"  # C2: curated (incl. canonical pathways)
)

In [None]:
# Bounds on the number of recognized genes a gene set may have to be retained.
GeneSet_SizeBound_Upper <- 500 # largest gene set size considered
GeneSet_SizeBound_Lower <- 0   # smallest gene set size considered

In [None]:
# Read a GMT gene-set file and keep only the gene sets whose number of
# recognized genes lies within the given bounds.
#
# Arguments:
#   filepath         Path to the .gmt file, passed to GSA.read.gmt().
#   known_genes      Character vector of gene names counted as "recognized".
#   sizeBound_Upper  Largest recognized-gene count to retain (inclusive).
#   sizeBound_Lower  Smallest recognized-gene count to retain (inclusive).
#
# Returns the object produced by GSA.read.gmt() with its genesets,
# geneset.names and geneset.descriptions filtered to the retained sets, plus a
# geneset.sizes element holding the recognized-gene count of each retained set.
parseGeneSetGMT <- function(filepath, known_genes, sizeBound_Upper, sizeBound_Lower) {
  Gene_Sets_DB <- GSA.read.gmt(filepath)

  # Count recognized genes set by set. Working on each set directly (instead of
  # a padded matrix) handles empty gene sets and an empty file, and never
  # counts padding as a gene.
  n_recognized <- vapply(
    Gene_Sets_DB$genesets,
    function(genes) sum(genes %in% known_genes),
    integer(1),
    USE.NAMES = FALSE
  )

  retain_GeneSet <- (n_recognized >= sizeBound_Lower) &
    (n_recognized <= sizeBound_Upper)

  Gene_Sets_DB$genesets <- Gene_Sets_DB$genesets[retain_GeneSet]
  Gene_Sets_DB$geneset.names <- Gene_Sets_DB$geneset.names[retain_GeneSet]
  Gene_Sets_DB$geneset.descriptions <-
    Gene_Sets_DB$geneset.descriptions[retain_GeneSet]
  Gene_Sets_DB$geneset.sizes <- n_recognized[retain_GeneSet]
  Gene_Sets_DB
}

In [None]:
# Position in geneset_files of the collection to analyze (3 = the c2.all file).
i_file <- 3
# Load that collection, counting only genes present in the ranked list.
Gene_Sets_DB <- parseGeneSetGMT(
  geneset_files[i_file],
  names(rankList),
  GeneSet_SizeBound_Upper,
  GeneSet_SizeBound_Lower
)
# List of gene sets, named by gene-set name.
input_genesets <- setNames(Gene_Sets_DB$genesets, Gene_Sets_DB$geneset.names)

In [None]:
# List the loaded gene sets whose name contains "NOTCH".
names(input_genesets)[grepl("NOTCH", names(input_genesets))]

### Conduct geneset enrichment analysis

In [None]:
# Run fgsea on the named gene sets against the ranked gene correlations.
FGSEA_output <- fgsea(
  pathways = input_genesets,
  stats = rankList,
  nperm = 10000,
  minSize = 15,
  maxSize = 500,
  nproc = 0,
  gseaParam = 1,
  BPPARAM = NULL
)
# Order pathways by adjusted p-value, smallest first.
FGSEA_output <- FGSEA_output[order(FGSEA_output$padj), ]

In [None]:
# Pathways with adjusted p-value below 0.05 (which() drops rows with NA padj).
FGSEA_output_sig <- FGSEA_output[which(FGSEA_output$padj < 0.05), ]
# Show at most the first five significant rows, first three columns. head()
# avoids the NA filler rows that indexing rows 1:5 produces when fewer than
# five pathways are significant.
head(FGSEA_output_sig, 5)[, c(1:3)]

In [None]:
# Significant pathways whose name contains "NOTCH" (columns 1 and 3 only).
FGSEA_output_sig[grepl("NOTCH", FGSEA_output_sig$pathway), c(1, 3)]