# Install and load packages

In [None]:
# Bootstrap Bioconductor: make sure the BiocManager installer is available,
# then pin the Bioconductor release to 3.17.
# requireNamespace() only tests whether the package can be loaded; unlike
# require() it does not attach it, which is all that is needed here because
# the installer is always called through the BiocManager:: prefix.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

BiocManager::install(version = "3.17")

In [None]:
# Bioconductor packages: clusterProfiler and the org.Hs.eg.db annotation
# database.
BiocManager::install(
  pkgs = c("clusterProfiler", "org.Hs.eg.db")
)

In [2]:
# CRAN packages attached by the library() calls in this notebook.
# dplyr is listed explicitly because it is attached with library("dplyr");
# without this it is only present if another package happens to pull it in.
install.packages(c("plotly", "readxl", "dplyr"))

Installing package into ‘/home/joanismi/R/x86_64-pc-linux-gnu-library/4.3’
(as ‘lib’ is unspecified)

Installing package into ‘/home/joanismi/R/x86_64-pc-linux-gnu-library/4.3’
(as ‘lib’ is unspecified)



In [3]:
library("org.Hs.eg.db")
library("plotly")
library("readxl")
library("dplyr")

Loading required package: AnnotationDbi

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: Biobase

Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


Loading required package: IRanges

Loading required package: S4Vectors

In [15]:
# Data locations (relative paths).
# Each one keeps its trailing slash because file names are appended to it
# with paste0().
raw_data_dir <- "../data/raw/"
intracell_dir <- "../data/processed/intracell_network/"
enrichment_dir <- "../data/processed/enrichment_analysis/"

# Intercell Genes

## Load Datasets

In [10]:
# Hallmark -> GO term mapping table.
# Only the first three columns of the spreadsheet are kept.
hallmarks2goterms <- read_excel(
    path = paste0(raw_data_dir, "hallmarks_to_goterms.xlsx")
)[, seq_len(3)]
head(hallmarks2goterms, n = 2)

Hallmarks,GO terms,Term name
<chr>,<chr>,<chr>
Sustaining Proliferative Signaling,GO:0008283,Cell Proliferation
Sustaining Proliferative Signaling,GO:0007049,Cell Cycle


In [11]:
# Intercellular gene label tables (target and source), each previewed
# right after it is read.
intercell_target <- read.csv(
    file = paste0(intracell_dir, "target_labels.csv")
)
head(intercell_target, n = 2)
intercell_source <- read.csv(
    file = paste0(intracell_dir, "source_labels.csv")
)
head(intercell_source, n = 2)

Unnamed: 0_level_0,gene,curated_label,label,is_curated
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>
1,A1BG,0,0,True
2,ABCB1,0,0,False


Unnamed: 0_level_0,gene,curated_label,label,is_curated
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>
1,A2M,0,0,True
2,ACAN,0,0,True


In [12]:
# Number of distinct genes across the target and source tables.
# Complete network: every gene listed in either table.
all_genes <- unique(c(intercell_target$gene, intercell_source$gene))
length(all_genes)
# Curated network: only rows whose is_curated column equals "True".
all_curated_genes <- unique(c(
    with(intercell_target, gene[is_curated == "True"]),
    with(intercell_source, gene[is_curated == "True"])
))
length(all_curated_genes)

## GO enrichment

In [16]:
# GO Biological Process enrichment for the intercellular gene tables.
#
# For each table (source / target) and each label value n in {0, 1}, two
# enrichments are run:
#   * complete: genes with label == n; universe = every gene in the table
#   * curated : genes with curated_label == n among rows where
#               is_curated == "True"; universe = all such curated genes
#
# Results (the @result slot) are written to <enrichment_dir>intercell_go/ as
#   <table>_nsign_go.csv and <table>_curated_nsign_go.csv   when n == 0
#   <table>_sign_go.csv  and <table>_curated_sign_go.csv    when n == 1
# NOTE(review): "sign"/"nsign" presumably mean significant / non-significant
# association; confirm against the code that produces the label columns.

# Run enrichGO on gene symbols (BP ontology) and append a FoldEnrichment
# column computed as GeneRatio / BgRatio.
#   genes    : character vector of gene symbols to test
#   universe : character vector of background gene symbols
# Returns the enrichResult object; stops with an explicit message if
# enrichGO() returns NULL (instead of failing later inside mutate()).
run_intercell_go <- function(genes, universe) {
    go <- clusterProfiler::enrichGO(
        gene = genes,
        OrgDb = org.Hs.eg.db,
        keyType = "SYMBOL",
        ont = "BP",
        universe = universe
    )
    if (is.null(go)) {
        stop("enrichGO() returned no result for the supplied genes",
             call. = FALSE)
    }
    mutate(
        go,
        FoldEnrichment = DOSE::parse_ratio(GeneRatio) / DOSE::parse_ratio(BgRatio)
    )
}

datasets <- list(source = intercell_source, target = intercell_target)

# Create the output folder up front so write.csv() cannot fail on a missing
# directory after the enrichment has already been computed.
intercell_go_dir <- paste0(enrichment_dir, "intercell_go")
dir.create(intercell_go_dir, recursive = TRUE, showWarnings = FALSE)

for (label in names(datasets)) {

    df <- datasets[[label]]
    # %in% never yields NA, so rows with a missing flag/label are dropped
    # instead of being turned into NA entries in the gene vectors.
    curated_rows <- df$is_curated %in% "True"

    for (n in 0:1) {
        go <- run_intercell_go(
            genes = df[df$label %in% n, "gene"],
            universe = df$gene
        )
        go_curated <- run_intercell_go(
            genes = df[(df$curated_label %in% n) & curated_rows, "gene"],
            universe = df[curated_rows, "gene"]
        )

        # Write results to file
        suffix <- if (n == 0) "_nsign_go.csv" else "_sign_go.csv"
        out_path <- file.path(intercell_go_dir, paste0(label, suffix))
        out_path_curated <- file.path(
            intercell_go_dir, paste0(label, "_curated", suffix)
        )
        write.csv(go@result, out_path, row.names = FALSE)
        write.csv(go_curated@result, out_path_curated, row.names = FALSE)
    }
}



Registered S3 methods overwritten by 'treeio':
  method              from    
  MRCA.phylo          tidytree
  MRCA.treedata       tidytree
  Nnode.treedata      tidytree
  Ntip.treedata       tidytree
  ancestor.phylo      tidytree
  ancestor.treedata   tidytree
  child.phylo         tidytree
  child.treedata      tidytree
  full_join.phylo     tidytree
  full_join.treedata  tidytree
  groupClade.phylo    tidytree
  groupClade.treedata tidytree
  groupOTU.phylo      tidytree
  groupOTU.treedata   tidytree
  inner_join.phylo    tidytree
  inner_join.treedata tidytree
  is.rooted.treedata  tidytree
  nodeid.phylo        tidytree
  nodeid.treedata     tidytree
  nodelab.phylo       tidytree
  nodelab.treedata    tidytree
  offspring.phylo     tidytree
  offspring.treedata  tidytree
  parent.phylo        tidytree
  parent.treedata     tidytree
  root.treedata       tidytree
  rootnode.phylo      tidytree
  sibling.phylo       tidytree



## Cancer Hallmarks Enrichment

### Complete network

In [17]:
# Load the complete-network GO enrichment tables into a named list.
# Each entry is the data frame read from the matching CSV under
# enrichment_dir.
go_enrichment <- lapply(
    c(
        source = "intercell_go/source_sign_go.csv",
        source_nsign = "intercell_go/source_nsign_go.csv",
        target = "intercell_go/target_sign_go.csv",
        target_nsign = "intercell_go/target_nsign_go.csv"
    ),
    function(rel_path) read.csv(paste0(enrichment_dir, rel_path))
)

In [18]:
# Map enriched GO terms to hallmarks: keep rows with adjusted p < 0.05 and
# join them to hallmarks2goterms on the GO identifier.
mapping <- lapply(go_enrichment, function(go_table) {
    keep_cols <- c("ID", "Description", "p.adjust", "Count", "FoldEnrichment", "geneID")
    significant <- go_table[go_table$p.adjust < 0.05, keep_cols]
    merge(hallmarks2goterms, significant, by.x = "GO terms", by.y = "ID")
})

# For every gene set, print how many mapped GO terms fall under each hallmark.
for (set_name in names(mapping)) {
    n_goterms <- mapping[[set_name]] |>
        group_by(Hallmarks) |>
        summarise(n = n())
    print(set_name)
    print(n_goterms)
    print(strrep("-", 40))
}

[1] "source"
# A tibble: 4 × 2
  Hallmarks                              n
  <chr>                              <int>
1 Activating Invasion and Metastasis     3
2 Avoiding Immune Destruction            4
3 Resist Cell Death                      3
4 Sustaining Proliferative Signaling     3
[1] "----------------------------------------"
[1] "source_nsign"
# A tibble: 0 × 2
# ℹ 2 variables: Hallmarks <chr>, n <int>
[1] "----------------------------------------"
[1] "target"
# A tibble: 7 × 2
  Hallmarks                              n
  <chr>                              <int>
1 Activating Invasion and Metastasis     4
2 Avoiding Immune Destruction            4
3 Evading Growth Suppressor              2
4 Inducing Angiogenesis                  2
5 Resist Cell Death                      3


### Curated network

In [19]:
# Load the curated-network GO enrichment tables into a named list.
# Each entry is the data frame read from the matching CSV under
# enrichment_dir.
go_enrichment_curated <- lapply(
    c(
        source = "intercell_go/source_curated_sign_go.csv",
        source_nsign = "intercell_go/source_curated_nsign_go.csv",
        target = "intercell_go/target_curated_sign_go.csv",
        target_nsign = "intercell_go/target_curated_nsign_go.csv"
    ),
    function(rel_path) read.csv(paste0(enrichment_dir, rel_path))
)

In [20]:
# Curated network: keep GO terms with adjusted p < 0.05 and join them to
# hallmarks2goterms on the GO identifier.
mapping_curated <- lapply(go_enrichment_curated, function(go_table) {
    keep_cols <- c("ID", "Description", "p.adjust", "Count", "FoldEnrichment", "geneID")
    significant <- go_table[go_table$p.adjust < 0.05, keep_cols]
    merge(hallmarks2goterms, significant, by.x = "GO terms", by.y = "ID")
})

# For every gene set, print how many mapped GO terms fall under each hallmark.
for (set_name in names(mapping_curated)) {
    n_goterms <- mapping_curated[[set_name]] |>
        group_by(Hallmarks) |>
        summarise(n = n())
    print(set_name)
    print(n_goterms)
    print(strrep("-", 40))
}

[1] "source"
# A tibble: 5 × 2
  Hallmarks                              n
  <chr>                              <int>
1 Activating Invasion and Metastasis     2
2 Avoiding Immune Destruction            4
3 Evading Growth Suppressor              1
4 Resist Cell Death                      2
5 Sustaining Proliferative Signaling     4
[1] "----------------------------------------"
[1] "source_nsign"
# A tibble: 0 × 2
# ℹ 2 variables: Hallmarks <chr>, n <int>
[1] "----------------------------------------"
[1] "target"
# A tibble: 5 × 2
  Hallmarks                              n
  <chr>                              <int>
1 Activating Invasion and Metastasis     1
2 Avoiding Immune Destruction            2
3 Resist Cell Death                      2
4 Sustaining Proliferative Signaling     2


# Intracell Genes

## Load Datasets

In [21]:
# Hallmark -> GO term mapping table.
# Only the first three columns of the spreadsheet are kept.
hallmarks2goterms <- read_excel(
    path = paste0(raw_data_dir, "hallmarks_to_goterms.xlsx")
)[, seq_len(3)]
head(hallmarks2goterms, n = 2)

Hallmarks,GO terms,Term name
<chr>,<chr>,<chr>
Sustaining Proliferative Signaling,GO:0008283,Cell Proliferation
Sustaining Proliferative Signaling,GO:0007049,Cell Cycle


In [23]:
# The genes in the intracell graph make up the background population.
# col.names has a single entry, so the file is read as one column named
# "gene".
all_genes <- read.csv(
    file = paste0(intracell_dir, "intracell_genes.csv"),
    col.names = "gene"
)[["gene"]]
length(all_genes)

In [26]:
# Statistically significant (alpha=0.05) genes with positive association
# with metastasis. Each list entry is the `gene` column of one CSV under
# enrichment_dir.
# Complete network
complete <- lapply(
    c(
        source = "source_sign.csv",
        source_out = "source_sign_outliers.csv",
        target = "target_sign.csv",
        target_out = "target_sign_outliers.csv"
    ),
    function(file_name) read.csv(paste0(enrichment_dir, file_name))$gene
)
# Curated network
curated <- lapply(
    c(
        source = "source_curated_sign.csv",
        source_out = "source_curated_sign_outliers.csv",
        target = "target_curated_sign.csv",
        target_out = "target_curated_sign_outliers.csv"
    ),
    function(file_name) read.csv(paste0(enrichment_dir, file_name))$gene
)
head(complete$source, n = 5)

## GO enrichment

In [27]:
# GO Biological Process enrichment for the intracellular gene sets.
#
# For each network (complete / curated) and each gene set in it, enrichGO is
# run on gene symbols with all_genes as the universe, and a FoldEnrichment
# column (GeneRatio / BgRatio) is appended.
# Results (the @result slot) are written to <enrichment_dir>intracell_go/ as
#   <gene set>_go.csv           for the complete network
#   <gene set>_curated_go.csv   for any other network (here: curated)
networks <- list(complete = complete, curated = curated)

# Create the output folder up front so write.csv() cannot fail on a missing
# directory after the enrichment has already been computed.
intracell_go_dir <- paste0(enrichment_dir, "intracell_go")
dir.create(intracell_go_dir, recursive = TRUE, showWarnings = FALSE)

for (network in names(networks)) {

    datasets <- networks[[network]]
    suffix <- if (network == "complete") "_go.csv" else "_curated_go.csv"

    for (dataset in names(datasets)) {

        genes <- datasets[[dataset]]
        go <- clusterProfiler::enrichGO(
            gene = genes,
            OrgDb = org.Hs.eg.db,
            keyType = "SYMBOL",
            ont = "BP",
            universe = all_genes
        )
        # Fail with a clear message instead of an opaque mutate() error
        # when enrichGO() returns NULL.
        if (is.null(go)) {
            stop(
                "enrichGO() returned no result for ", network, "/", dataset,
                call. = FALSE
            )
        }
        go <- mutate(
            go,
            FoldEnrichment = DOSE::parse_ratio(GeneRatio) / DOSE::parse_ratio(BgRatio)
        )

        # Write results to file
        out_path <- file.path(intracell_go_dir, paste0(dataset, suffix))
        write.csv(go@result, out_path, row.names = FALSE)
    }
}