# RA-Notch Interpretation Notebook

### Load Notch Results

In [166]:
notch_data = readRDS('/data/srlab/lrumker/MCSC_Project/notch/analysis_sc_tissue.rds')

cna_res = read.csv("/data/srlab/lrumker/MCSC_Project/notch/notch_cna_res.txt")

nampcs = read.csv("/data/srlab/lrumker/MCSC_Project/notch/notch_cna_NAM_PCs.txt")

In [167]:
gene_expr = notch_data[['exprs_norm']][,match(cna_res$cell_id,colnames(notch_data[['exprs_norm']]))]

In [168]:
# Confirm cell order matches
sum(cna_res$cell_id==colnames(gene_expr))==dim(gene_expr)[2]

In [169]:
# Confirm NAM PC1 and neighborhood correlations are perfectly correlated,
# since only one NAM PC included in the model
cor(cna_res$ncorrs, nampcs$PC1)

In [170]:
cor(cna_res$score_notch, nampcs$PC1)

In [171]:
cor(cna_res$score_notch, nampcs$PC2)

### Compute Gene Correlations to NAM-PC1

In [172]:
# Examine correlations only with variable genes
n_var_genes = 5000
gene_sds = apply(gene_expr, 1, FUN = sd)
gene_sds = gene_sds[order(gene_sds, decreasing = TRUE)]

In [173]:
var_genes = names(gene_sds)
var_gene_expr = gene_expr[match(var_genes, rownames(gene_expr)),]

In [174]:
# Scores are cell loadings on PC1
cna_scores = nampcs$PC1

In [175]:
corrs = cor(t(as.matrix(var_gene_expr)), cna_scores)
names(corrs) = var_genes

“the standard deviation is zero”


In [176]:
quanitfy_correlation <-function(sel_gene, expr_mat, ncorrs){
    my_expr = expr_mat[match(sel_gene, rownames(expr_mat)),]
    my_data = data.frame(ncorrs, my_expr)
    colnames(my_data) = c("corrs", "expr")
    new_summary = summary(lm(corrs ~ expr, data = my_data))
    new_p = new_summary$coefficients[2,4]
    new_coeff = new_summary$coefficients[2,1]
    new_corr = cor(my_expr, ncorrs)
    return(c(new_corr, new_p))
}

### Most highly correlated genes

In [177]:
corrs = corrs[order(corrs, decreasing = TRUE)]

In [178]:
corrs[1:5]

In [179]:
quanitfy_correlation("PRG4", var_gene_expr, cna_scores)

In [180]:
quanitfy_correlation("FN1", var_gene_expr, cna_scores)

### Load Genesets

In [181]:
rankList = corrs

In [182]:
library(fgsea)
library(GSA)

In [183]:
geneset_files = c("/data/srlab/lrumker/MSigDB_Sets/c2.all.v7.0.symbols.gmt.txt", #Immune
                 "/data/srlab/lrumker/MSigDB_Sets/c2.all.v7.0.symbols.gmt.txt") #Canonical

In [184]:
GeneSet_SizeBound_Upper = 500 # Maximum gene set size considered
GeneSet_SizeBound_Lower = 0 # Minimum gene set size considered

In [185]:
parseGeneSetGMT <-function(filepath, known_genes, sizeBound_Upper, sizeBound_Lower){
  Gene_Sets_DB = GSA.read.gmt(filepath)
  GeneSet_sizes = sapply(Gene_Sets_DB$genesets,length)
  recognized_genes = matrix(NA, nrow=length(GeneSet_sizes), ncol = max(GeneSet_sizes))
  for(i in c(1:length(GeneSet_sizes))){
    recognized_genes[i,c(1:GeneSet_sizes[i])] = Gene_Sets_DB$genesets[[i]]
  }
  recognized_genes = matrix(is.element(recognized_genes, known_genes), ncol = ncol(recognized_genes))
  GeneSet_sizes = apply(recognized_genes, 1, sum)
  retain_GeneSet = (GeneSet_sizes>=sizeBound_Lower)&(GeneSet_sizes<=sizeBound_Upper)
  Gene_Sets_DB$genesets = Gene_Sets_DB$genesets[retain_GeneSet]
  Gene_Sets_DB$geneset.names = Gene_Sets_DB$geneset.names[retain_GeneSet]
  Gene_Sets_DB$geneset.descriptions = Gene_Sets_DB$geneset.descriptions[retain_GeneSet]
  Gene_Sets_DB$geneset.sizes = GeneSet_sizes[retain_GeneSet]
  return(Gene_Sets_DB)
}

In [186]:
Gene_Sets_DB = parseGeneSetGMT(geneset_files[1], names(rankList), GeneSet_SizeBound_Upper,
                               GeneSet_SizeBound_Lower)
input_genesets = Gene_Sets_DB$genesets
names(input_genesets) = Gene_Sets_DB$geneset.names

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693

In [187]:
# Confirm presence of some NOTCH genesets
names(input_genesets)[grep("NOTCH", names(input_genesets))]

### Conduct geneset enrichment analysis

In [188]:
FGSEA_output = fgsea(input_genesets, rankList, nperm=10000, 
                         minSize = 0, maxSize = 500, nproc = 0,
                         gseaParam = 1, BPPARAM = NULL)
FGSEA_output = FGSEA_output[order(FGSEA_output$padj),]

“You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.”
“There are ties in the preranked stats (0.38% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.”


In [190]:
FGSEA_output_sig = FGSEA_output[which(FGSEA_output$padj<0.05),]
FGSEA_output_sig[grep("NOTCH", FGSEA_output_sig$pathway),c(1:3)]

pathway,pval,padj
<chr>,<dbl>,<dbl>
VILIMAS_NOTCH1_TARGETS_UP,0.0002883922,0.00742766
REACTOME_SIGNALING_BY_NOTCH,0.0020489333,0.0258108
NGUYEN_NOTCH1_TARGETS_DN,0.0052888527,0.04771678
REACTOME_NEGATIVE_REGULATION_OF_NOTCH4_SIGNALING,0.0054960541,0.04868755


## Characterizing NAM-PC2

### Compute Gene Correlations to NAM-PC1

In [191]:
# Scores are cell loadings on PC2
cna_scores = nampcs$PC2

In [192]:
corrs = cor(t(as.matrix(var_gene_expr)), cna_scores)
names(corrs) = var_genes

“the standard deviation is zero”


In [193]:
corrs = corrs[order(corrs, decreasing = TRUE)]

### Load Genesets

In [194]:
rankList = corrs

In [205]:
grep("HLA", names(corrs)[order(corrs)][1:100])

In [204]:
names(corrs)[order(corrs)][1:100][grep("HLA", names(corrs)[order(corrs)][1:100])]

In [195]:
Gene_Sets_DB = parseGeneSetGMT(geneset_files[2], names(rankList), GeneSet_SizeBound_Upper,
                               GeneSet_SizeBound_Lower)
input_genesets = Gene_Sets_DB$genesets
names(input_genesets) = Gene_Sets_DB$geneset.names

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693

In [196]:
FGSEA_output = fgsea(input_genesets, rankList, nperm=10000, 
                         minSize = 0, maxSize = 500, nproc = 0,
                         gseaParam = 1, BPPARAM = NULL)
FGSEA_output = FGSEA_output[order(FGSEA_output$padj),]

“You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.”
“There are ties in the preranked stats (0.38% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.”


In [197]:
FGSEA_output_sig_NAMPC2 = FGSEA_output[which(FGSEA_output$padj<0.05),]

In [199]:
head(nampcs)

Unnamed: 0_level_0,X,PC1,PC2
Unnamed: 0_level_1,<int>,<dbl>,<dbl>
1,0,0.002823947,0.0008072186
2,2,-0.0008987393,0.0068240035
3,3,-0.004635763,0.0090568405
4,5,0.005397257,0.0082240497
5,6,-7.306052e-06,0.0140845026
6,7,-0.007311816,0.0058684357


In [198]:
FGSEA_output_sig_NAMPC2[grep("REACTOME", FGSEA_output_sig_NAMPC2$pathway),c(1:4)]

pathway,pval,padj,ES
<chr>,<dbl>,<dbl>,<dbl>
REACTOME_CROSS_PRESENTATION_OF_SOLUBLE_EXOGENOUS_ANTIGENS_ENDOSOMES,0.0002358491,0.003650091,-0.6241756
REACTOME_ANTIGEN_PROCESSING_CROSS_PRESENTATION,0.0002498126,0.003650091,-0.7000089
REACTOME_ENDOSOMAL_VACUOLAR_PATHWAY,0.0002248201,0.003650091,-0.9372824
REACTOME_ANTIVIRAL_MECHANISM_BY_IFN_STIMULATED_GENES,0.0002447381,0.003650091,-0.5853584
REACTOME_SIGNALING_BY_NOTCH4,0.0002466091,0.003650091,-0.5638637
REACTOME_DOWNSTREAM_SIGNALING_EVENTS_OF_B_CELL_RECEPTOR_BCR,0.0002447381,0.003650091,-0.5519878
REACTOME_ACTIVATION_OF_NF_KAPPAB_IN_B_CELLS,0.0002394063,0.003650091,-0.6186325
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,0.0002545825,0.003650091,-0.5056536
REACTOME_SIGNALING_BY_NOTCH3,0.0002357934,0.003650091,-0.6423219
REACTOME_MHC_CLASS_II_ANTIGEN_PRESENTATION,0.0002530364,0.003650091,-0.5852245


In [165]:
FGSEA_output_sig_NAMPC2[grep("REACTOME", FGSEA_output_sig_NAMPC2$pathway),c(1:4)]

pathway,pval,padj
<chr>,<dbl>,<dbl>
REACTOME_ANTIGEN_PROCESSING_CROSS_PRESENTATION,0.0002562132,0.003508898
REACTOME_ENDOSOMAL_VACUOLAR_PATHWAY,0.0002228164,0.003508898
REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL,0.0002556891,0.003508898
REACTOME_ORC1_REMOVAL_FROM_CHROMATIN,0.0002445586,0.003508898
REACTOME_ANTIVIRAL_MECHANISM_BY_IFN_STIMULATED_GENES,0.0002498751,0.003508898
REACTOME_SIGNALING_BY_NOTCH4,0.0002506266,0.003508898
REACTOME_DOWNSTREAM_SIGNALING_EVENTS_OF_B_CELL_RECEPTOR_BCR,0.0002498751,0.003508898
REACTOME_ACTIVATION_OF_NF_KAPPAB_IN_B_CELLS,0.0002411963,0.003508898
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,0.0002566076,0.003508898
REACTOME_SIGNALING_BY_NOTCH3,0.0002368546,0.003508898
