## Effects of Phase 2 (ignoring phase 1)
## both vs. control

GO analysis and GSEA with KEGG

In [59]:
# loading packages
library(clusterProfiler)
library(topGO)
library(dplyr)
library(KEGGREST)
library(ggplot2)

What we need for GO analysis:
- list of DMGs with pvalues (feel like this should be log2FoldChange

In [4]:
# load significant genes df for both vs. control for phase 2
data <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/significant_genes/sig_p2_bc_genes.csv')

# select needed columns (really might only need l2fc
data2 <- select(data, Row.names, log2FoldChange, pvalue, padj)

# renaming columns so they make more sense
colnames(data2) = c('gene', 'l2fc', 'pval', 'padj')
head(data2)

Unnamed: 0_level_0,gene,l2fc,pval,padj
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>
1,LOC111100720,1.655455,2.341829e-05,0.0083837465
2,LOC111100790,1.631569,7.118715e-05,0.0145628566
3,LOC111100869,1.854173,3.033653e-05,0.0087569948
4,LOC111100924,4.698931,0.0003330588,0.0330959131
5,LOC111101050,1.62402,0.000413478,0.037006281
6,LOC111101237,2.978028,2.908375e-07,0.0003437863


In [7]:
# creating numeric vector of gene names and log2FoldChange value
geneList <- data2$l2fc
names(geneList) <- data2$gene

# double checking things look right
head(geneList)
class(geneList) # numeric, used in allGenes for topGO object

In [12]:
# loading conversion df of unique genes with associated GO ids
geneID2GO <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/GO_enrichment_analysis/geneID2GO.txt', sep='\t')

# renaming columns
colnames(geneID2GO) = c('gene','GO_id')

# checking things make sense
head(geneID2GO)
dim(geneID2GO) # have 22,654 unique genes that have GO annotations

Unnamed: 0_level_0,gene,GO_id
Unnamed: 0_level_1,<chr>,<chr>
1,LOC111133408,GO:2001070
2,LOC111121603,"GO:2000781,GO:2000781"
3,LOC111132389,GO:2000145
4,LOC111115105,"GO:1990904,GO:1990904"
5,LOC111129853,"GO:1990904,GO:1990904"
6,LOC111101512,GO:1990904


In [13]:
# have to create annotation file
geneID2GO <- readMappings(file = '/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/GO_enrichment_analysis/geneID2GO.txt')
geneID2GO <- geneID2GO[-1] # removes header
head(geneID2GO)

In [14]:
geneNames <- names(geneID2GO)
head(geneNames)

In [16]:
topDiffGenes <- function(allScore) {
    return(allScore < 0.01)
}

x <- topDiffGenes(geneList)
sum(x) ## the number of selected genes

#### GO analysis: molecular function

In [28]:
# creating GO data object
GOdata_MF <- new("topGOdata", 
              description = 'DMGs in phase 2 both vs. control',
              ontology = "MF", 
              allGenes = geneList,
              geneSel = topDiffGenes,
              annot = annFUN.gene2GO, 
              gene2GO = geneID2GO)
GOdata_MF


Building most specific GOs .....

	( 73 GO terms found. )


Build GO DAG topology ..........

	( 212 GO terms and 268 relations. )


Annotating nodes ...............

	( 66 genes annotated to the GO terms. )




------------------------- topGOdata object -------------------------

 Description:
   -  DMGs in phase 2 both vs. control 

 Ontology:
   -  MF 

 111 available genes (all genes from the array):
   - symbol:  LOC111100720 LOC111100790 LOC111100869 LOC111100924 LOC111101050  ...
   - score :  1.655454694 1.631568593 1.854173395 4.698931389 1.624020169  ...
   - 10  significant genes. 

 66 feasible genes (genes that can be used in the analysis):
   - symbol:  LOC111100790 LOC111100869 LOC111100924 LOC111101050 LOC111101237  ...
   - score :  1.631568593 1.854173395 4.698931389 1.624020169 2.978027599  ...
   - 6  significant genes. 

 GO graph (nodes with at least  1  genes):
   - a graph with directed edges
   - number of nodes = 212 
   - number of edges = 268 

------------------------- topGOdata object -------------------------


In [39]:
# KS stat
resultKS_MF <- runTest(GOdata_MF, algorithm = "weight01", statistic = "ks")

# putting result into readable table
tab_MF <- GenTable(GOdata_MF, raw.p.value = resultKS_MF, topNodes = length(resultKS_MF@score), numChar = 120)

# showing top 10 GO term results
head(tab_MF, 10)


			 -- Weight01 Algorithm -- 

		 the algorithm is scoring 212 nontrivial nodes
		 parameters: 
			 test statistic: ks
			 score order: increasing


	 Level 10:	1 nodes to be scored	(0 eliminated genes)


	 Level 9:	4 nodes to be scored	(0 eliminated genes)


	 Level 8:	13 nodes to be scored	(1 eliminated genes)


	 Level 7:	26 nodes to be scored	(19 eliminated genes)


	 Level 6:	40 nodes to be scored	(25 eliminated genes)


	 Level 5:	50 nodes to be scored	(33 eliminated genes)


	 Level 4:	42 nodes to be scored	(48 eliminated genes)


	 Level 3:	24 nodes to be scored	(62 eliminated genes)


	 Level 2:	11 nodes to be scored	(63 eliminated genes)


	 Level 1:	1 nodes to be scored	(66 eliminated genes)



Unnamed: 0_level_0,GO.ID,Term,Annotated,Significant,Expected,raw.p.value
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<dbl>,<chr>
1,GO:0046923,ER retention sequence binding,1,1,0.09,0.015
2,GO:0005096,GTPase activator activity,2,0,0.18,0.021
3,GO:0018064,protein-L-histidine N-tele-methyltransferase activity,1,1,0.09,0.03
4,GO:0015165,pyrimidine nucleotide-sugar transmembrane transporter activity,1,1,0.09,0.061
5,GO:0017154,semaphorin receptor activity,1,1,0.09,0.076
6,GO:0005245,voltage-gated calcium channel activity,1,1,0.09,0.091
7,GO:0004622,lysophospholipase activity,1,0,0.09,0.121
8,GO:0002020,protease binding,1,0,0.09,0.136
9,GO:0070513,death domain binding,1,0,0.09,0.136
10,GO:0000146,microfilament motor activity,1,0,0.09,0.152


#### GO analysis for cellular component

In [41]:
# creating GO data object
GOdata_CC <- new("topGOdata", 
              description = 'DMGs in phase 2 both vs. control',
              ontology = "CC", 
              allGenes = geneList,
              geneSel = topDiffGenes,
              annot = annFUN.gene2GO, 
              gene2GO = geneID2GO)
GOdata_CC


Building most specific GOs .....

	( 31 GO terms found. )


Build GO DAG topology ..........

	( 85 GO terms and 144 relations. )


Annotating nodes ...............

	( 58 genes annotated to the GO terms. )




------------------------- topGOdata object -------------------------

 Description:
   -  DMGs in phase 2 both vs. control 

 Ontology:
   -  CC 

 111 available genes (all genes from the array):
   - symbol:  LOC111100720 LOC111100790 LOC111100869 LOC111100924 LOC111101050  ...
   - score :  1.655454694 1.631568593 1.854173395 4.698931389 1.624020169  ...
   - 10  significant genes. 

 58 feasible genes (genes that can be used in the analysis):
   - symbol:  LOC111100869 LOC111101237 LOC111104344 LOC111104770 LOC111107351  ...
   - score :  1.854173395 2.978027599 1.262573487 3.466359414 -1.617929664  ...
   - 6  significant genes. 

 GO graph (nodes with at least  1  genes):
   - a graph with directed edges
   - number of nodes = 85 
   - number of edges = 144 

------------------------- topGOdata object -------------------------


In [43]:
# KS stat
resultKS_CC <- runTest(GOdata_CC, algorithm = "weight01", statistic = "ks")

# putting result into readable table
tab_CC <- GenTable(GOdata_CC, raw.p.value = resultKS_CC, topNodes = length(resultKS_CC@score), numChar = 120)

# showing top 10 GO term results
head(tab_CC, 10)


			 -- Weight01 Algorithm -- 

		 the algorithm is scoring 85 nontrivial nodes
		 parameters: 
			 test statistic: ks
			 score order: increasing


	 Level 10:	2 nodes to be scored	(0 eliminated genes)


	 Level 9:	6 nodes to be scored	(0 eliminated genes)


	 Level 8:	9 nodes to be scored	(2 eliminated genes)


	 Level 7:	11 nodes to be scored	(8 eliminated genes)


	 Level 6:	15 nodes to be scored	(13 eliminated genes)


	 Level 5:	15 nodes to be scored	(15 eliminated genes)


	 Level 4:	11 nodes to be scored	(25 eliminated genes)


	 Level 3:	13 nodes to be scored	(26 eliminated genes)


	 Level 2:	2 nodes to be scored	(36 eliminated genes)


	 Level 1:	1 nodes to be scored	(57 eliminated genes)



Unnamed: 0_level_0,GO.ID,Term,Annotated,Significant,Expected,raw.p.value
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<dbl>,<chr>
1,GO:0000139,Golgi membrane,1,1,0.1,0.069
2,GO:0005891,voltage-gated calcium channel complex,1,1,0.1,0.103
3,GO:0016459,myosin complex,1,0,0.1,0.138
4,GO:0016282,eukaryotic 43S preinitiation complex,1,0,0.1,0.207
5,GO:0033290,eukaryotic 48S preinitiation complex,1,0,0.1,0.207
6,GO:0005852,eukaryotic translation initiation factor 3 complex,1,0,0.1,0.207
7,GO:0005789,endoplasmic reticulum membrane,3,1,0.31,0.272
8,GO:0031932,TORC2 complex,1,0,0.1,0.276
9,GO:0005737,cytoplasm,20,4,2.07,0.286
10,GO:0110165,cellular anatomical entity,56,6,5.79,0.333


#### GO analysis for biological process

In [44]:
# creating GO data object
GOdata_BP <- new("topGOdata", 
              description = 'DMGs in phase 2 both vs. control',
              ontology = "BP", 
              allGenes = geneList,
              geneSel = topDiffGenes,
              annot = annFUN.gene2GO, 
              gene2GO = geneID2GO)
GOdata_BP


Building most specific GOs .....

	( 43 GO terms found. )


Build GO DAG topology ..........

	( 231 GO terms and 425 relations. )


Annotating nodes ...............

	( 41 genes annotated to the GO terms. )




------------------------- topGOdata object -------------------------

 Description:
   -  DMGs in phase 2 both vs. control 

 Ontology:
   -  BP 

 111 available genes (all genes from the array):
   - symbol:  LOC111100720 LOC111100790 LOC111100869 LOC111100924 LOC111101050  ...
   - score :  1.655454694 1.631568593 1.854173395 4.698931389 1.624020169  ...
   - 10  significant genes. 

 41 feasible genes (genes that can be used in the analysis):
   - symbol:  LOC111100790 LOC111101050 LOC111101237 LOC111106800 LOC111107351  ...
   - score :  1.631568593 1.624020169 2.978027599 2.904505956 -1.617929664  ...
   - 4  significant genes. 

 GO graph (nodes with at least  1  genes):
   - a graph with directed edges
   - number of nodes = 231 
   - number of edges = 425 

------------------------- topGOdata object -------------------------


In [45]:
# KS stat
resultKS_BP <- runTest(GOdata_BP, algorithm = "weight01", statistic = "ks")

# putting result into readable table
tab_BP <- GenTable(GOdata_BP, raw.p.value = resultKS_BP, topNodes = length(resultKS_BP@score), numChar = 120)

# showing top 10 GO term results
head(tab_BP, 10)


			 -- Weight01 Algorithm -- 

		 the algorithm is scoring 231 nontrivial nodes
		 parameters: 
			 test statistic: ks
			 score order: increasing


	 Level 12:	1 nodes to be scored	(0 eliminated genes)


	 Level 11:	3 nodes to be scored	(0 eliminated genes)


	 Level 10:	8 nodes to be scored	(1 eliminated genes)


	 Level 9:	14 nodes to be scored	(3 eliminated genes)


	 Level 8:	22 nodes to be scored	(10 eliminated genes)


	 Level 7:	26 nodes to be scored	(12 eliminated genes)


	 Level 6:	47 nodes to be scored	(18 eliminated genes)


	 Level 5:	52 nodes to be scored	(22 eliminated genes)


	 Level 4:	33 nodes to be scored	(36 eliminated genes)


	 Level 3:	19 nodes to be scored	(39 eliminated genes)


	 Level 2:	5 nodes to be scored	(40 eliminated genes)


	 Level 1:	1 nodes to be scored	(41 eliminated genes)



Unnamed: 0_level_0,GO.ID,Term,Annotated,Significant,Expected,raw.p.value
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<dbl>,<chr>
1,GO:0006621,protein retention in ER lumen,1,1,0.1,0.024
2,GO:0032259,methylation,1,1,0.1,0.049
3,GO:0016043,cellular component organization,3,1,0.29,0.051
4,GO:0043170,macromolecule metabolic process,14,1,1.37,0.071
5,GO:0008643,carbohydrate transport,1,1,0.1,0.073
6,GO:0034765,regulation of monoatomic ion transmembrane transport,1,1,0.1,0.098
7,GO:0046470,phosphatidylcholine metabolic process,1,0,0.1,0.122
8,GO:0035556,intracellular signal transduction,4,0,0.39,0.146
9,GO:0042981,regulation of apoptotic process,1,0,0.1,0.146
10,GO:0007155,cell adhesion,1,0,0.1,0.244


## Gene Set Enrichment Analysis with clusterProfiler
looking for enriched KEGG pathways with a ranked gene list

In [50]:
# already have a df with DMGs and scores - need just gene and l2fc
df <- select(data2, gene, l2fc)
head(df)

Unnamed: 0_level_0,gene,l2fc
Unnamed: 0_level_1,<chr>,<dbl>
1,LOC111100720,1.655455
2,LOC111100790,1.631569
3,LOC111100869,1.854173
4,LOC111100924,4.698931
5,LOC111101050,1.62402
6,LOC111101237,2.978028


In [48]:
# need to have conversion table for gene name to entrez id
david_df <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/KEGG_pathway/sig_p2_bc_convert.txt', sep='\t')
# only selecting columns that I need
david_df <- select(david_df, From, To)
# renaming columns for merge
colnames(david_df) = c('gene', 'entrez_ID')
head(david_df)

Unnamed: 0_level_0,gene,entrez_ID
Unnamed: 0_level_1,<chr>,<int>
1,LOC111129490,111129490
2,LOC111134206,111134206
3,LOC111110636,111110636
4,LOC111128687,111128687
5,LOC111100790,111100790
6,LOC111124361,111124361


In [52]:
# matching up dataframes so entrez id has a log2FoldChange value
merge <- merge(david_df, df, by = 'gene', all=TRUE)

# grabbing just the entrez_ID and l2fc value
merge_df <- select(merge, entrez_ID, l2fc)
head(merge_df)

Unnamed: 0_level_0,entrez_ID,l2fc
Unnamed: 0_level_1,<int>,<dbl>
1,111100720,1.655455
2,111100790,1.631569
3,111100869,1.854173
4,111100924,4.698931
5,111101050,1.62402
6,111101237,2.978028


In [55]:
# checking that there's only unique genes
length(unique(merge_df$entrez_ID))
length(merge_df$entrez_ID)
# both have 111, so all good there

In [58]:
# Create a vector of the gene unuiverse
kegg_gene_list <- merge_df$l2fc

# Name vector with ENTREZ ids
names(kegg_gene_list) <- merge_df$entrez_ID

# omit any NA values 
kegg_gene_list<-na.omit(kegg_gene_list)

# sort the list in decreasing order (required for clusterProfiler)
kegg_gene_list = sort(kegg_gene_list, decreasing = TRUE)

head(kegg_gene_list)
class(kegg_gene_list) # numeric
length(kegg_gene_list) # 111 genes

In [60]:
kegg_organism = "cvn"
kk2 <- gseKEGG(geneList     = kegg_gene_list,
               organism     = kegg_organism,
               nPerm        = 10000,
               minGSSize    = 1,
               maxGSSize    = 800,
               pvalueCutoff = 1, # if this is set to 1, see more pathways, but 0.05 is statistically signif.
               pAdjustMethod = "BH", # Benjamini–Hochberg FDR (false discover rate)
               scoreType = "pos",
               keyType       = "kegg")

Reading KEGG annotation online: "https://rest.kegg.jp/link/cvn/pathway"...

Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/cvn"...

preparing geneSet collections...

GSEA analysis...

“We do not recommend using nPerm parameter incurrent and future releases”
“You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.”
leading edge analysis...

done...



In [61]:
kk2_df <- as.data.frame(kk2)
kk2_df$Description <- sub(" -.*", "", kk2_df$Description)
head(kk2_df) # actually shows the entire df since there's only 5 pathways with pval<0.05

Unnamed: 0_level_0,ID,Description,setSize,enrichmentScore,NES,pvalue,p.adjust,qvalue,rank,leading_edge,core_enrichment
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
cvn03010,cvn03010,Ribosome,1,0.9727273,1.928202,0.03569643,0.4619538,0.4619538,4,"tags=100%, list=4%, signal=97%",111114067
cvn00240,cvn00240,Pyrimidine metabolism,1,0.9181818,1.820079,0.09149085,0.4619538,0.4619538,10,"tags=100%, list=9%, signal=92%",111121137
cvn00983,cvn00983,Drug metabolism,1,0.9181818,1.820079,0.09149085,0.4619538,0.4619538,10,"tags=100%, list=9%, signal=92%",111121137
cvn01232,cvn01232,Nucleotide metabolism,1,0.9181818,1.820079,0.09149085,0.4619538,0.4619538,10,"tags=100%, list=9%, signal=92%",111121137
cvn04080,cvn04080,Neuroactive ligand-receptor interaction,1,0.9,1.784038,0.10708929,0.4619538,0.4619538,12,"tags=100%, list=11%, signal=90%",111121996
cvn00020,cvn00020,Citrate cycle (TCA cycle),1,0.8727273,1.729976,0.1259874,0.4619538,0.4619538,15,"tags=100%, list=14%, signal=87%",111128687


only have 1 gene per 'enriched pathway' - adjusted pvalues are also high...