# Pathway Enrichment Analysis 
Using KEGG gene set enrichment analysis to explore enriched pathways from DMGs (differentially methylated genes) identified in pairwise comparisons with HC (hypoxic control)

#### load and prep data frames

In [2]:
# loading libraries
library(topGO)
library(KEGGREST)
library(dplyr)
library(clusterProfiler)
library(ggridges)
library(ggplot2)
library(httr)

In [3]:
# Read the per-comparison result tables (one CSV per pairwise contrast
# against HC) from the deseq_res_files directory.

# 1) CC vs. HC: control control vs. hypoxic control
CC_HC <- read.csv(
  file = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/deseq_res_files/CC_vs_HC.csv"
)

# 2) CH vs. HC: control hypoxic vs. hypoxic control
CH_HC <- read.csv(
  file = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/deseq_res_files/CH_vs_HC.csv"
)

# 3) HH vs. HC: hypoxic hypoxic vs. hypoxic control
HH_HC <- read.csv(
  file = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/deseq_res_files/HH_vs_HC.csv"
)


## control control vs. hypoxic control
CC vs. HC

In [4]:
# preview the first six rows to confirm the table was read in as expected
head(CC_HC, n = 6L)

Unnamed: 0_level_0,X,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,LOC111120752,0.3456118,-0.0419579,0.2300421,0,1,
2,LOC111109452,1.5973302,-0.09288309,0.2529871,0,1,
3,LOC111124802,213.0013054,0.06672759,0.1192547,0,1,1.0
4,LOC111101273,76.7130208,-0.20267259,0.1576994,0,1,1.0
5,LOC111101250,123.4748149,-0.13501125,0.17997,0,1,1.0
6,LOC111101262,269.4443993,0.14318006,0.1234759,0,1,1.0


formatting the df as needed to run KEGG

In [5]:
# Keep only the columns needed downstream and give them short names in the
# same step (gene ID, log2 fold change, adjusted p-value).
# select() is namespace-qualified because other attached packages
# (e.g. AnnotationDbi, pulled in by topGO) also export a select() that can
# mask the dplyr verb depending on load order.
cc_hc <- dplyr::select(CC_HC, gene = X, lfc = log2FoldChange, padj)

# keyType = 'kegg' expects the bare numeric gene IDs; my gene names are NCBI
# locus symbols of the form 'LOC<id>', so the 'LOC' prefix has to go.
# The pattern is anchored: only a leading 'LOC' is removed, and any name that
# does not carry the prefix is left untouched instead of silently losing its
# first three characters.
cc_hc$gene <- sub("^LOC", "", cc_hc$gene)

head(cc_hc)

Unnamed: 0_level_0,gene,lfc,padj
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
1,111120752,-0.0419579,
2,111109452,-0.09288309,
3,111124802,0.06672759,1.0
4,111101273,-0.20267259,1.0
5,111101250,-0.13501125,1.0
6,111101262,0.14318006,1.0


In [6]:
# named numeric vector for the KEGG run: log2 fold changes keyed by gene ID
CC_kegg_gene_list <- setNames(cc_hc$lfc, cc_hc$gene)

# drop genes whose fold change is missing
CC_kegg_gene_list <- CC_kegg_gene_list[!is.na(CC_kegg_gene_list)]

# rank from the largest to the smallest fold change
CC_kegg_gene_list <- sort(CC_kegg_gene_list, decreasing = TRUE)

# sanity checks: top of the ranking and the vector's type
head(CC_kegg_gene_list)
class(CC_kegg_gene_list)

now that my gene list looks okay - running KEGG

In [7]:
# KEGG organism code passed to gseKEGG
kegg_organism <- "cvn"

# GSEA p-values come from random permutations, so fix the RNG state (and ask
# gseKEGG to honour it with seed = TRUE); otherwise p-values / NES shift a
# little every time this cell is re-run.
set.seed(1234)

CC_kk2 <- gseKEGG(
  geneList = CC_kegg_gene_list, # ranked, named log2 fold changes
  organism = kegg_organism,
  # minGSSize = 1,
  # maxGSSize = 800,
  pvalueCutoff = 1, # cutoff of 1 = no pathway is dropped on p-value
  pAdjustMethod = "BH", # Benjamini-Hochberg FDR
  scoreType = "std", # standard - only change for one-tailed tests
  keyType = "kegg",
  seed = TRUE
)

Reading KEGG annotation online: "https://rest.kegg.jp/link/cvn/pathway"...

Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/cvn"...

preparing geneSet collections...

GSEA analysis...

“There are ties in the preranked stats (0.18% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.”
leading edge analysis...

done...



In [10]:
# flatten the gseKEGG result into a plain data frame
CC_kk2_df <- as.data.frame(CC_kk2)

# Strip the organism suffix from the pathway names. The suffix is assumed to
# have the form " - <species> (<common name>)" at the very end of the string
# (confirm against the raw Description values).
# Only that trailing piece is removed: cutting at the first " -" would also
# truncate pathway names that contain " - " themselves (e.g.
# "Autophagy - animal" vs. "Autophagy - other") and make them indistinguishable.
CC_kk2_df$Description <- sub(
  " - [^-()]+ \\([^()]+\\)$",
  "",
  CC_kk2_df$Description
)

head(CC_kk2_df)

Unnamed: 0_level_0,ID,Description,setSize,enrichmentScore,NES,pvalue,p.adjust,qvalue,rank,leading_edge,core_enrichment
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
cvn00592,cvn00592,alpha-Linolenic acid metabolism,11,0.7847989,1.980242,0.0005798551,0.07074233,0.07019299,1855,"tags=73%, list=14%, signal=63%",111113990/111115744/111127642/111136066/111123661/111107112/111115745/111136438
cvn00310,cvn00310,Lysine degradation,31,-0.5216369,-1.57922,0.0125270455,0.66371146,0.65855754,1860,"tags=32%, list=14%, signal=28%",111130627/111109254/111115614/111130119/111121380/111125659/111107127/111110608/111112920/111128625
cvn00591,cvn00591,Linoleic acid metabolism,11,0.615816,1.553857,0.0342321447,0.66371146,0.65855754,3475,"tags=55%, list=26%, signal=41%",111127642/111123661/111127589/111127588/111121119/111111230
cvn01040,cvn01040,Biosynthesis of unsaturated fatty acids,18,0.5444173,1.549269,0.0345683297,0.66371146,0.65855754,2069,"tags=44%, list=15%, signal=38%",111113990/111115744/111129730/111136066/111107112/111115745/111136438/111131209
cvn03008,cvn03008,Ribosome biogenesis in eukaryotes,57,-0.421189,-1.421319,0.0358166189,0.66371146,0.65855754,3013,"tags=39%, list=22%, signal=30%",111103436/111122686/111102690/111134591/111123620/111119396/111128896/111123381/111112561/111102803/111105066/111110086/111119458/111125104/111128153/111121480/111132055/111119695/111128265/111120056/111128132/111133163
cvn04068,cvn04068,FoxO signaling pathway,66,-0.4086433,-1.410737,0.0380818054,0.66371146,0.65855754,2298,"tags=38%, list=17%, signal=32%",111128693/111125223/111121739/111121135/111134642/111118834/111131500/111112841/111126185/111103474/111119108/111130138/111105462/111121839/111107163/111120632/111113171/111134713/111102390/111119905/111121740/111120947/111112940/111128744/111104196
