In [889]:
### Script for Pathway enrichment analysis on MOFA results; requires a prepared pathway set input dataset

#############################################
# Prerequisites - Load Libraries

In [890]:
source('MS0_Libraries.r')

“incomplete final line found by readTableHeader on '../conda_environment/Environment_Configs.csv'”


[1] "/home/icb/corinna.losert/miniconda3/envs/mofa_analysis//lib/R/library"


In [891]:
source('MS2_Plot_Config.r')

In [892]:
source('MS1_Functions.r')

###############################################
# Preqrequisites Configurations & Parameters

In [893]:
### Load the parameters that are set via the configuration files

In [894]:
### Load configurations file
global_configs = read.csv('configurations/Data_Configs.csv', sep = ',')

“incomplete final line found by readTableHeader on 'configurations/Data_Configs.csv'”


In [895]:
head(global_configs,2)

Unnamed: 0_level_0,parameter,value
Unnamed: 0_level_1,<chr>,<chr>
1,data_path,/lustre/groups/epigenereg01/workspace/projects/jove/example_data/
2,result_path,/lustre/groups/epigenereg01/workspace/projects/jove/example_results/


In [896]:
data_path = global_configs$value[global_configs$parameter == 'data_path']

In [897]:
data_path

In [898]:
result_path = global_configs$value[global_configs$parameter == 'result_path']

In [899]:
result_path

In [900]:
### Load the configuration for the pathway enrichment from the config file

In [901]:
pathway_configs = read.csv('configurations/06_Pathway_Configs.csv', sep = ',')

“incomplete final line found by readTableHeader on 'configurations/06_Pathway_Configs.csv'”


In [902]:
head(pathway_configs)

Unnamed: 0_level_0,mofa_result_name,factor_set,coverage_par,types,coverage_plot,p_value_plot,max_pathways_plot,enrichment_plot,top_features_plot,pathway_selection
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<int>,<chr>,<dbl>,<lgl>
1,MI_v1_MOFA,123,0.2,"myeloid.cell,mural.cell,fibroblast.of.cardiac.tissue,mast.cell,lymphocyte,endothelial.cell,cardiac.neuron,cardiac.muscle.cell,adipocyte",0.5,0.05,8,negative,0.125,


In [903]:
### Generate the result data directory if it does not exist yet
if(!file.exists(paste0(result_path, '06_results'))){
    dir.create(file.path(paste0(result_path, '06_results')))
    }

# Define parameters 

In [904]:
### Save values from loaded config file in variables

In [905]:
## For the calculation of the pathway enrichment

In [906]:
mofa_name = pathway_configs$mofa_result_name[1]   # mofa results that should be used

In [907]:
factor_set = as.numeric(unlist(str_split(pathway_configs$factor_set[1], ',')))  # factors for which enrichment analysis should be executed

In [908]:
coverage_par = pathway_configs$coverage_par[1] # coverage parameter: how many of the genes of a pathway need to be included in the MOFA feature set for testing this pathway for enrichment

In [909]:
views_set = unlist(str_split( pathway_configs$types[1], ','))  # extract the views for which a view-specific pathway enrichment analysis should be executed

In [910]:
views_set

In [911]:
## Parameters for the visualization of pathways

In [912]:
### Select pathways based on thresholds (like coverage, p-value, direction of enrichment)
coverage_par = pathway_configs$coverage_plot[1]
p_value_cutoff_plot =pathway_configs$p_value_plot[1]
max_pathways =pathway_configs$max_pathways_plot[1]
select_enrichment = pathway_configs$enrichment_plot[1]

### Alternative: select pathways based on their specified names
pathway_selection_var =pathway_configs$pathway_selection[1]

In [913]:
pathway_selection_var

In [914]:
### For visualization define which genes should be ploted (need to be among the top x% of features for the Factor)
top_var_thres =pathway_configs$top_features_plot[1] # choose threshold of top x % of features of MOFA factor to take into account

In [915]:
## Fixed parameters (may be modified here)

In [916]:
## For enrichment calculation
use_statistic = "rank.sum" # which statistic to use to calcuate the enrichment; alternatives: mean.diff, rank.sum
use_test = 'parametric'  # which test to use to test the enrichment; alternatives: permutation, parametric, "cor.adj.parametric"
p_val_cutoff = 0.05
min_size = 5

# Load Data 

## Model Data

In [917]:
### Load the trained MOFA Model

In [918]:
model_name =  paste0("03_MOFA_MODEL_", mofa_name,'.hdf5')

In [919]:
outfile = file.path( paste0(result_path, '/03_results/',  model_name) )

In [920]:
outfile

In [921]:
model <- load_model(outfile, verbose = TRUE)

Loading data...

Loading expectations for 2 nodes...

Loading model options...

Loading training options and statistics...

Assigning names to the different dimensions...

Re-ordering factors by their variance explained...

Doing quality control...

Checking views names...

Checking groups names...

Checking samples names...

Checking features names...

Checking dimensions...

Checking there are no features with complete missing values...

Checking sample covariates...

Checking expectations...

Checking for intercept factors...

“Factor(s) 2, 8 are strongly correlated with the total number of expressed features for at least one of your omics. Such factors appear when there are differences in the total 'levels' between your samples, *sometimes* because of poor normalisation in the preprocessing steps.
”
Checking for highly correlated factors...



## Pathways

In [922]:
### Load the pre-defined pathway set( needs to include the columns:
# ID (unique identifier of the pathway)
# gene : gene-symbol of the gene belonging to the pathway (will be matched to the MOFA features)
# pathway_name: a textual description of the pathway 

In [923]:
pathways =  read.csv(paste0(data_path, 'Prepared_Pathway_Data.csv'))
pathways$X = NULL

In [924]:
head(pathways,2)

Unnamed: 0_level_0,ID,gene,pathway_name
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,R-HSA-1059683,JAK1,Interleukin-6 signaling
2,R-HSA-1059683,STAT1,Interleukin-6 signaling


# Prepare model data

## Extract the weights from the model

In [925]:
### Get the feature weights from the model and prepare the format

In [926]:
weights = get_weights(model, views = "all", factors = "all")
weight_data = data.frame()
for (i in names(weights)){
    data = data.frame(weights[[i]])
    data$type = i
    weight_data = rbind(weight_data,data)
}
weight_data$variable_name = rownames(weight_data)
weight_data$view <- weight_data$type 
weight_data$gene = sapply(strsplit(weight_data$variable_name, "_"), "[", 3)
head(weight_data)

Unnamed: 0_level_0,Factor1,Factor2,Factor3,Factor4,Factor5,Factor6,Factor7,Factor8,Factor9,Factor10,type,variable_name,view,gene
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
adipocyte__ABI1,-0.16729734,-0.0212781961,0.04680285,-0.016812213,0.030347374,0.203569526,-0.16049232,-0.3918164,-0.006508904,-0.002958227,adipocyte,adipocyte__ABI1,adipocyte,ABI1
adipocyte__ABI2,0.04293374,0.0002131761,-0.08831489,-0.019120189,-0.00419851,-0.001043558,-0.13235444,-1.6351764,0.113636161,0.00467813,adipocyte,adipocyte__ABI2,adipocyte,ABI2
adipocyte__ABL1,0.09751513,-0.0630781553,-0.05471721,0.028032557,0.008731156,0.010634944,0.23650912,-0.9663811,-0.030435844,0.003306451,adipocyte,adipocyte__ABL1,adipocyte,ABL1
adipocyte__ABL2,0.05848752,0.1293690713,-0.02835816,-0.007998253,0.002762416,-0.054200865,-0.08235982,-1.1974129,0.038830668,-0.033597339,adipocyte,adipocyte__ABL2,adipocyte,ABL2
adipocyte__ACLY,0.27296034,0.0323241477,-0.12582902,-0.004415822,0.01971719,-0.256614723,-0.02932352,-1.4521545,-0.072279904,0.002201647,adipocyte,adipocyte__ACLY,adipocyte,ACLY
adipocyte__ACTB,0.14216662,0.1760970278,0.27535008,0.008460503,-0.024404793,-0.01524549,-0.19601077,-0.8561824,0.01573827,0.014856187,adipocyte,adipocyte__ACTB,adipocyte,ACTB


In [927]:
## Transform to long format

In [928]:
feature_weights_long = melt(weight_data)

Using type, variable_name, view, gene as id variables



In [929]:
head(feature_weights_long,2)

Unnamed: 0_level_0,type,variable_name,view,gene,variable,value
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<fct>,<dbl>
1,adipocyte,adipocyte__ABI1,adipocyte,ABI1,Factor1,-0.16729734
2,adipocyte,adipocyte__ABI2,adipocyte,ABI2,Factor1,0.04293374


## Adjust feature names in the model

In [930]:
### Feature names should map to the genes in the pathway set (therefore the view component is  removed)

In [931]:
head(features_names(model)[[1]] )

In [932]:
model_conc = model ## save original model for later
for(i in names(features_names(model))){   
        features_names(model)[[i]] = with(feature_weights_long, gene[match(features_names(model)[[i]], variable_name)])
    }

In [933]:
head(features_names(model)[[1]] )

## Create a MOFA model for the overall enrichment analysis across views

In [934]:
# We need a model that has features of all views concatenated in a single view

In [935]:
views <- names(features_names(model_conc))
tmp <- sapply(views, function(view) model_conc@intercepts[[view]]$group1)
names(tmp) <- NULL  
model_conc@intercepts[['complete']]$group1 = unlist(tmp)
model_conc@expectations$W[['complete']] = do.call(rbind, lapply(views, function(view) model_conc@expectations$W[[view]]))
model_conc@features_metadata = rbind(model_conc@features_metadata, 
                                     data.frame(feature = model_conc@features_metadata$feature, view="complete"))
model_conc@dimensions$D['complete'] = sum(model_conc@dimensions$D)
model_conc@data[['complete']]$group1 = do.call(rbind, lapply(views, function(view) model_conc@data[[view]]$group1))
model_conc@data_options$views = c(model_conc@data_options$views , 'complete')
model_conc@model_options$likelihoods['complete'] = 'gaussian'
model_conc@dimensions$M = length(views) + 1

model_conc@cache$variance_explained$r2_total$group1['complete'] = mean(model_conc@cache$variance_explained$r2_total$group1)
model_conc@cache$variance_explained$r2_per_factor$group1 = cbind(model@cache$variance_explained$r2_per_factor$group1, data.frame('complete' = rowMeans(model_conc@cache$variance_explained$r2_per_factor$group1)))

In [936]:
model_conc

Trained MOFA with the following characteristics: 
 Number of views: 10 
 Views names: adipocyte cardiac.muscle.cell cardiac.neuron endothelial.cell fibroblast.of.cardiac.tissue lymphocyte mast.cell mural.cell myeloid.cell complete 
 Number of features (per view): 414 368 162 166 243 170 99 112 331 2065 
 Number of groups: 1 
 Groups names: group1 
 Number of samples (per group): 53 
 Number of factors: 10 


# Prepare pathway data

In [937]:
### Prepare the pathway data to use for the enrichment

In [938]:
head(pathways,2)

Unnamed: 0_level_0,ID,gene,pathway_name
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,R-HSA-1059683,JAK1,Interleukin-6 signaling
2,R-HSA-1059683,STAT1,Interleukin-6 signaling


## Filter pathways out based on coverage

In [939]:
## get all features included in the MOFA model
mofa_genes = data.frame(gene = unique(feature_weights_long$gene), is_feature = 1)

In [940]:
head(mofa_genes,2)

Unnamed: 0_level_0,gene,is_feature
Unnamed: 0_level_1,<chr>,<dbl>
1,ABI1,1
2,ABI2,1


In [941]:
### Merge features within mofa model to pathways
feature_set = merge(pathways, mofa_genes, all.x = TRUE)  

In [942]:
head(feature_set,2)

Unnamed: 0_level_0,gene,ID,pathway_name,is_feature
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>
1,A1BG,R-HSA-168249,Innate Immune System,
2,A1BG,R-HSA-168256,Immune System,


In [943]:
### Remove pathways for which we have not a high amount of genes in our data (coverage_par)
filter = feature_set %>% group_by(ID, pathway_name) %>% summarise(gene_amount = n(),matched_amount = sum(!is.na(is_feature)),  coverage = sum(!is.na(is_feature)) / n()) %>% filter(coverage >=  coverage_par)

### Get the pathways that have been filtered out because of to low coverage
filtered_pathways = feature_set %>% group_by(ID, pathway_name) %>% summarise(gene_amount = n(),matched_amount = sum(!is.na(is_feature)),  coverage = sum(!is.na(is_feature)) / n()) %>% filter(coverage <  coverage_par)
nrow(filtered_pathways)  # amount of pathways that have een filtred out

[1m[22m`summarise()` has grouped output by 'ID'. You can override using the `.groups`
argument.
[1m[22m`summarise()` has grouped output by 'ID'. You can override using the `.groups`
argument.


In [944]:
head(filter,2)

ID,pathway_name,gene_amount,matched_amount,coverage
<chr>,<chr>,<int>,<int>,<dbl>
R-HSA-1059683,Interleukin-6 signaling,15,9,0.6
R-HSA-1169092,Activation of RAS in B cells,7,4,0.5714286


In [945]:
feature_set = merge(feature_set, filter)

In [946]:
# Overview pathways that have been excluded from testing due to low amount of matching genes
head(filtered_pathways,15) 

ID,pathway_name,gene_amount,matched_amount,coverage
<chr>,<chr>,<int>,<int>,<dbl>
R-HSA-1168372,Downstream signaling events of B Cell Receptor (BCR),101,40,0.3960396
R-HSA-1169091,Activation of NF-kappaB in B cells,73,26,0.3561644
R-HSA-1169408,ISG15 antiviral mechanism,72,31,0.4305556
R-HSA-1169410,Antiviral mechanism by IFN-stimulated genes,80,32,0.4
R-HSA-1170546,Prolactin receptor signaling,21,10,0.4761905
R-HSA-1222556,ROS and RNS production in phagocytes,36,10,0.2777778
R-HSA-1236974,ER-Phagosome pathway,100,22,0.22
R-HSA-1236975,Antigen processing-Cross presentation,164,44,0.2682927
R-HSA-1236977,Endosomal/Vacuolar pathway,11,4,0.3636364
R-HSA-1236978,Cross-presentation of soluble exogenous antigens (endosomes),53,15,0.2830189


In [947]:
### Remove NA entries for not mapped genes in feature set

In [948]:
head(feature_set,2)

Unnamed: 0_level_0,ID,pathway_name,gene,is_feature,gene_amount,matched_amount,coverage
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<int>,<int>,<dbl>
1,R-HSA-1059683,Interleukin-6 signaling,CBL,1,15,9,0.6
2,R-HSA-1059683,Interleukin-6 signaling,JAK2,1,15,9,0.6


In [949]:
feature_set = feature_set[!is.na(feature_set$is_feature),]

In [950]:
nrow(feature_set)

In [951]:
head(feature_set,2)

Unnamed: 0_level_0,ID,pathway_name,gene,is_feature,gene_amount,matched_amount,coverage
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<int>,<int>,<dbl>
1,R-HSA-1059683,Interleukin-6 signaling,CBL,1,15,9,0.6
2,R-HSA-1059683,Interleukin-6 signaling,JAK2,1,15,9,0.6


In [952]:
### Save coverage for later usage
coverage_info = unique(feature_set[,c('ID', 'pathway_name', 'coverage')])

In [953]:
### Get genes that are not mapped / and add a not mapped pathway to use them in the background later
# (this is for including all features that are in the MOFA model in the enrichment analysis, either as belonging to a pathway or only being part of the background set)

In [954]:
non_pathway_genes = unique(mofa_genes$gene[!mofa_genes$gene %in% unique(feature_set$gene)])

In [955]:
add_pathway = data.frame(ID = 'Background', pathway_name = 'Background', gene = non_pathway_genes, 
                         is_feature = 1, gene_amount = length(non_pathway_genes), matched_amount =  length(non_pathway_genes),
                         coverage = 1)

In [956]:
feature_set = rbind(feature_set, add_pathway)

## Transform to binary matrix format

In [957]:
### Transform the pathway feature set to a binary matrix format (1 indicating that a certain feature belongs to the pathway; 0 that it does not)

In [958]:
head(feature_set,2)

Unnamed: 0_level_0,ID,pathway_name,gene,is_feature,gene_amount,matched_amount,coverage
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<int>,<int>,<dbl>
1,R-HSA-1059683,Interleukin-6 signaling,CBL,1,15,9,0.6
2,R-HSA-1059683,Interleukin-6 signaling,JAK2,1,15,9,0.6


In [959]:
feature_set$pathway_id = paste0(feature_set$ID, '_', feature_set$pathway_name)

In [960]:
feature_set$value = 1

In [961]:
### Adjust names for overall approach (concatenate gene names with cell-types and match to pathways)

In [962]:
feature_weights_long_mapped_join = unique(feature_weights_long[,c('gene', 'variable_name')])

In [963]:
head(feature_weights_long_mapped_join,2)

Unnamed: 0_level_0,gene,variable_name
Unnamed: 0_level_1,<chr>,<chr>
1,ABI1,adipocyte__ABI1
2,ABI2,adipocyte__ABI2


In [964]:
feature_set_all = merge(feature_set, feature_weights_long_mapped_join, by.x = 'gene', by.y = 'gene')

In [965]:
nrow(feature_set_all)

In [966]:
### Binarize feature set to use in type-specific enrichment

In [967]:
## Based on gene-name for overall enrichment

feature_set = unique(feature_set[,c('pathway_id', 'gene', 'value')]) %>% dcast(pathway_id ~ gene, value.var = 'value')
feature_set[is.na(feature_set)]= 0

In [968]:
head(feature_set,2)

Unnamed: 0_level_0,pathway_id,ABI1,ABI2,ABL1,ABL2,ACLY,ACTB,ACTG1,ACTR10,ACTR2,⋯,WWP1,XAF1,XRCC5,XRCC6,YES1,YWHAZ,ZBTB16,ZEB1,ZNRF1,ZNRF2
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,Background_Background,0,0,0,0,1,0,0,1,0,⋯,1,1,1,1,0,0,1,1,1,1
2,R-HSA-1059683_Interleukin-6 signaling,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [969]:
### Binarize feature set to use in overall enrichment

In [970]:
feature_set_all = unique(feature_set_all[,c('pathway_id', 'variable_name', 'value')]) %>% dcast(pathway_id ~ variable_name, value.var = 'value')
feature_set_all[is.na(feature_set_all)] = 0

In [971]:
head(feature_set_all,2) # feature set containing only genes

Unnamed: 0_level_0,pathway_id,adipocyte__ABI1,adipocyte__ABI2,adipocyte__ABL1,adipocyte__ABL2,adipocyte__ACLY,adipocyte__ACTB,adipocyte__ACTG1,adipocyte__ACTR10,adipocyte__ACTR2,⋯,myeloid.cell__VAV3,myeloid.cell__VPS35L,myeloid.cell__WASF2,myeloid.cell__WIPF1,myeloid.cell__WSB1,myeloid.cell__WWP1,myeloid.cell__XRCC5,myeloid.cell__YWHAZ,myeloid.cell__ZBTB16,myeloid.cell__ZNRF2
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,Background_Background,0,0,0,0,1,0,0,1,0,⋯,0,1,0,0,1,1,1,0,1,1
2,R-HSA-1059683_Interleukin-6 signaling,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [972]:
## Adjust rownames and convert to matrix

In [973]:
rownames(feature_set) = feature_set$pathway_id
rownames(feature_set_all) = feature_set_all$pathway_id

In [974]:
feature_set$pathway_id = NULL
feature_set_all$pathway_id = NULL

In [975]:
feature_set = as.matrix(feature_set)
feature_set_all = as.matrix(feature_set_all)

# Run pathway enrichment

## Per View

In [976]:
## For each view seperately run the enrichment

In [977]:
if(max(views_set) != ''){
    enrichment_result_types = run_enrichment_pathway(
        model = model, # MOFA Model
        factor_set = factor_set, # list of factors for which to run the enrichment
        views = views_set,
        use_statistic = use_statistic, # which statistic to use
        feature_set = feature_set, # Pathway Feature Set mapping; here use without type names
        min_size = min_size, # Min size of genes within a pathway
        use_test = use_test, # test used for calculating p-value
        p_val_cutoff = p_val_cutoff, # p-value cutoff used
        enrichment_result_p_val = data.frame())# dataset for saving result 
    } else{enrichment_result_types = ''}

Intersecting features names in the model and the gene set annotation results in a total of 331 features.


Running feature set Enrichment Analysis with the following options...
View: myeloid.cell 
Number of feature sets: 27 
Set statistic: rank.sum 
Statistical test: parametric 





Using pathway as id variables

Using pathway as id variables

Intersecting features names in the model and the gene set annotation results in a total of 331 features.


Running feature set Enrichment Analysis with the following options...
View: myeloid.cell 
Number of feature sets: 27 
Set statistic: rank.sum 
Statistical test: parametric 


Subsetting weights with negative sign




Using pathway as id variables

Using pathway as id variables

Intersecting features names in the model and the gene set annotation results in a total of 331 features.


Running feature set Enrichment Analysis with the following options...
View: myeloid.cell 
Number of feature sets: 27 
Set statistic: rank.sum 
Statistical test:

In [978]:
# Example of the resulting enrichment table

In [979]:
if(is.data.frame(enrichment_result_types)){
    head(enrichment_result_types  %>% arrange(padj) ,5)}

Unnamed: 0_level_0,pathway,variable,p,padj,view,enrichment,global_FDR
Unnamed: 0_level_1,<chr>,<fct>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
1,R-HSA-1059683_Interleukin-6 signaling,Factor1,0.001446213,0.01952387,cardiac.muscle.cell,all,0.5639793
2,"R-HSA-512988_Interleukin-3, Interleukin-5 and GM-CSF signaling",Factor1,0.001051805,0.01952387,cardiac.muscle.cell,all,0.5639793
3,R-HSA-2454202_Fc epsilon receptor (FCERI) signaling,Factor2,0.002580297,0.02322268,myeloid.cell,negative,0.5639793
4,R-HSA-2871809_FCERI mediated Ca+2 mobilization,Factor2,0.00201615,0.02322268,myeloid.cell,negative,0.5639793
5,"R-HSA-512988_Interleukin-3, Interleukin-5 and GM-CSF signaling",Factor2,0.001258003,0.02322268,myeloid.cell,negative,0.5639793


## Across Views

In [980]:
## Run an enrichment analysis across all the views

In [981]:
enrichment_result_all = run_enrichment_pathway(
    model = model_conc, # MOFA Model
    views = 'complete', # dimensions for which to run
    factor_set = factor_set, # list of factors for which to run the enrichment
    use_statistic = use_statistic, # which statistic to use
    feature_set = feature_set_all, # Pathway Feature Set mapping; use the concatenated one with cell-types
    min_size = min_size, # Min size of genes within a pathway
    use_test = use_test, # test used for calculating p-value
    p_val_cutoff = p_val_cutoff, # p-value cutoff used
    enrichment_result_p_val = data.frame()) # dataset for saving results

Intersecting features names in the model and the gene set annotation results in a total of 2065 features.


Running feature set Enrichment Analysis with the following options...
View: complete 
Number of feature sets: 39 
Set statistic: rank.sum 
Statistical test: parametric 





Using pathway as id variables

Using pathway as id variables

Intersecting features names in the model and the gene set annotation results in a total of 2065 features.


Running feature set Enrichment Analysis with the following options...
View: complete 
Number of feature sets: 39 
Set statistic: rank.sum 
Statistical test: parametric 


Subsetting weights with negative sign




Using pathway as id variables

Using pathway as id variables

Intersecting features names in the model and the gene set annotation results in a total of 2065 features.


Running feature set Enrichment Analysis with the following options...
View: complete 
Number of feature sets: 39 
Set statistic: rank.sum 
Statistical test: parametr

In [982]:
head(enrichment_result_all %>% arrange(padj) ,5)

Unnamed: 0_level_0,pathway,variable,p,padj,view,enrichment,global_FDR
Unnamed: 0_level_1,<chr>,<fct>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
1,R-HSA-877312_Regulation of IFNG signaling,Factor1,1.333772e-06,5.201711e-05,complete,negative,0.000468154
2,"R-HSA-512988_Interleukin-3, Interleukin-5 and GM-CSF signaling",Factor2,8.505212e-05,0.001695328,complete,negative,0.014926648
3,R-HSA-2454202_Fc epsilon receptor (FCERI) signaling,Factor2,0.0002139521,0.002781378,complete,negative,0.01658069
4,R-HSA-9607240_FLT3 Signaling,Factor1,0.0001504252,0.002933291,complete,negative,0.01658069
5,R-HSA-173623_Classical antibody-mediated complement activation,Factor2,0.0002834306,0.005526897,complete,positive,0.01658069


In [983]:
### Combine both versions and save the result

In [984]:
if(is.data.frame(enrichment_result_types)){
    enrichment_result = rbind(enrichment_result_all, enrichment_result_types  )
    } else{enrichment_result = enrichment_result_all}

In [985]:
enrichment_result$ID = str_replace(enrichment_result$pathway, '_.*', '')
enrichment_result$pathway_name = str_replace(enrichment_result$pathway, '.*_', '')

In [986]:
enrichment_result$pathway = NULL

In [987]:
enrichment_result = merge(enrichment_result, coverage_info)

In [988]:
write.csv(enrichment_result, paste0(result_path, '/06_results/06_Pathway_Enrichment_' ,mofa_name, '.csv'), row.names = FALSE)

# Plot interesting pathways

In [989]:
## Plot a subset of the enriched pathways (based on the parameters of the config file)

In [990]:
pathway_selection_var  # specifies whether a specific pathway has been selected in the config file

In [991]:
# specifies whether pathways will be displayed based on p-value and coverage thresholds specified in the config file
# or specifically selected pathways will be shown
if(pathway_selection_var == '' | is.na(pathway_selection_var) ){
    pathways_selection = enrichment_result[
        (enrichment_result$coverage > coverage_par) & 
        (enrichment_result$enrichment == select_enrichment) &
        (enrichment_result$global_FDR < p_value_cutoff_plot),] %>%  arrange(padj) %>% group_by(variable) %>% top_n(max_pathways, -padj)
    } else{ pathways_selection = enrichment_result[enrichment_result$pathway_name %in% unlist(str_split(pathway_selection_var, ',')),]}

In [992]:
## Select the genes of the pathway that have the highest weights on the selected factor

In [993]:
### Get involved genes

## Define gene-set to merge
geneset_oi_pos_per_factor = feature_weights_long %>% group_by(variable) %>% arrange( desc(value),  .by_group = TRUE)  %>% top_frac(as.numeric(top_var_thres))
geneset_oi_pos_per_factor$direction = 'positive'
geneset_oi_neg_per_factor = feature_weights_long %>% group_by(variable) %>% arrange(desc(value),  .by_group = TRUE)  %>% top_frac(-as.numeric(top_var_thres))
geneset_oi_neg_per_factor$direction = 'negative'
geneset_oi = rbind(geneset_oi_pos_per_factor, geneset_oi_neg_per_factor)

colnames(geneset_oi) = c('type', 'variable_name','view',  'gene', 'variable', 'factor_value', 'factor_value_direction')
head(geneset_oi,2)

[1m[22mSelecting by value
[1m[22mSelecting by value


type,variable_name,view,gene,variable,factor_value,factor_value_direction
<chr>,<chr>,<chr>,<chr>,<fct>,<dbl>,<chr>
myeloid.cell,myeloid.cell__ATP8B4,myeloid.cell,ATP8B4,Factor1,0.6472803,positive
cardiac.muscle.cell,cardiac.muscle.cell__YWHAZ,cardiac.muscle.cell,YWHAZ,Factor1,0.6431828,positive


In [994]:
### Merge genes belongig to pathway to enriched pathway sets

pathways_vis_genes = merge(pathways_selection, pathways) %>% mutate(pvalue=p, view_text=view, cluster=view, name=pathway_name) %>% dplyr::select(-view)

In [995]:
### Add the feature weights to the corresponding genes from geneset_oi

In [996]:
pathways_vis_genes = merge(pathways_vis_genes, geneset_oi, by.x = c('gene', 'variable'), by.y = c('gene','variable' ))

In [997]:
### Summarise to get max/ mean factor value of each gene per pathway (remove cell-cluster/view/type dimension)  --> tBD max or mean (for max --> absolute value?)

pathways_vis_genes_summarized = pathways_vis_genes %>% group_by(gene ,variable,  pathway_name, ID, name, cluster ,view) %>% summarise(factor_value = mean(factor_value), enrichment_type = paste0(unique(enrichment), collapse = '&'), pvalue = min(pvalue))

[1m[22m`summarise()` has grouped output by 'gene', 'variable', 'pathway_name', 'ID', 'name',
'cluster'. You can override using the `.groups` argument.


In [998]:
head(pathways_vis_genes_summarized,2)

gene,variable,pathway_name,ID,name,cluster,view,factor_value,enrichment_type,pvalue
<chr>,<fct>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>
ABL2,Factor1,FLT3 Signaling,R-HSA-9607240,FLT3 Signaling,complete,cardiac.neuron,0.4184974,negative,0.0001504252
ABL2,Factor1,FLT3 Signaling,R-HSA-9607240,FLT3 Signaling,complete,myeloid.cell,0.4747316,negative,0.0001504252


In [999]:
### Plot the pathways with summarized gene values

In [1000]:
pathways_sum_plot = list()

In [1001]:
for(i in unique(pathways_vis_genes_summarized$variable)){

    xlabel = xlab('Gene') 
    ylabel = ylab('Pathway')
    
    pathways_sum_plot[[i]] = ggplot(pathways_vis_genes_summarized[pathways_vis_genes_summarized$variable == i,], aes(gene,  pathway_name, fill= factor_value)) + 
        plot_config_heatmap + 
        geom_tile() + 
        scale_fill_gradient2(low = "#1D2ED8", mid = "white", high = "#D8911D", midpoint = 0)  + 
        scale_x_discrete(position = "top") +
        theme(axis.text.x = element_text(angle = 90), axis.title.x = element_blank(), axis.text.y = element_text(hjust = 0, vjust = 0.5)) +
        xlabel +
        ylabel +  scale_y_discrete(labels = label_wrap(50))+
        ggtitle(paste0(i, ' values of pathway genes (top ', top_var_thres  * 2 *100, '% of features)'))
    }

In [1002]:
options(repr.plot.width=20, repr.plot.height=10)
#pathways_sum_plot[[i]] +  scale_y_discrete(labels = label_wrap(25))

In [1003]:
## Plot factor values for all genes

In [1004]:
pathways_detail_plot = list()

In [1005]:
### Visualized the exact factor values of the genes
for(i in unique(pathways_vis_genes$variable)){
    # Specific Text Descriptions:
    xlabel = xlab('Gene') 
    ylabel = ylab('View')

    plot_data_cluster = unique(pathways_vis_genes[pathways_vis_genes$variable == i,c('gene', 'variable', 'type','view',  'variable_name', 'factor_value', 'factor_value_direction')])

    pathways_detail_plot[[i]] = ggplot(plot_data_cluster, aes(gene,  view, fill= factor_value)) + 
        plot_config_heatmap + 
        geom_tile() + 
        scale_fill_gradient2(low = "#1D2ED8", mid = "white", high ="#D8911D", midpoint = 0)  + 
        scale_x_discrete(position = "top") +
        theme(axis.text.x = element_text(angle = 90)) +
        xlabel +
        ylabel
}

In [1006]:
#pathways_detail_plot[[i]]

In [1007]:
### Combine plot and save

In [1008]:
combined_plot = list()

In [1009]:
for(i in 1:length(pathways_detail_plot)){
    combined_plot[[i]] = ggarrange(pathways_sum_plot[[i]],
              pathways_detail_plot[[i]] + theme(axis.text.x = element_blank(), axis.title.x =element_blank()), align = 'v', ncol = 1)
}

In [1010]:
#combined_plot[[i]]

In [1011]:
## Save the plot

In [1012]:
figure_name = paste0( "FIG06_Pathways_and_Genes")

In [1013]:
# Sizes of the plot
width_par = 8.07
height_par = 4

In [1014]:
pdf(paste0('figures/06_figures/', figure_name, '_',   mofa_name, '.pdf'), width =width_par, height =height_par)

for(j in 1:length(combined_plot)){
        print( combined_plot[[j]])
        }
dev.off()   
