In [95]:
### Script to integrate, preprocess and harmonize all available data sets
### In our use-case
# Single Cell RNA Seq
# Cytokine Data
# Neutrophil Data
# Clinical Data
# Proteomics

#############################################
# Prerequisites - Load Libraries

In [96]:
source('MS0_Libraries.r')

[1] "/opt/conda/envs/mofa_analysis/lib/R/library"


In [97]:
source('MS2_Plot_Config.r')

In [98]:
source('MS1_Functions.r')

###############################################
# Preqrequisites Configurations & Parameters

In [99]:
### Load the parameters that are set via the configuration files

In [100]:
### Load configurations file
global_configs = read.csv('configurations/Data_Configs.csv', sep = ',')

"incomplete final line found by readTableHeader on 'configurations/Data_Configs.csv'"


In [101]:
head(global_configs,3)

Unnamed: 0_level_0,parameter,value
Unnamed: 0_level_1,<chr>,<chr>
1,data_path,/home/icb/corinna.losert/projects/mofa_workflow/input_data/
2,result_path,/home/icb/corinna.losert/projects/mofa_workflow/result_data/
3,configuration_name,MI_v1


In [102]:
data_path = global_configs$value[global_configs$parameter == 'data_path']

In [103]:
data_path

In [104]:
result_path = global_configs$value[global_configs$parameter == 'result_path']

In [105]:
result_path

In [106]:
## Load the configuration file specifying single-cell specific filtering options

In [107]:
sc_configs = read.csv('configurations/02_Pre_Processing_Configs_SC.csv', sep = ',')

"incomplete final line found by readTableHeader on 'configurations/02_Pre_Processing_Configs_SC.csv'"


In [108]:
head(sc_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,data_type,cell_type_exclusion,cell_expr_thres1,cell_expr_thres2
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,MI_v1,Prepared_sc_Data,h5ad,"Platelet,Plasmablast,pDC,Eryth,Doublet,dnT,cdC1,CD8 TCM,CD8 Proliferating,CD4 Proliferating,NK Proliferating,ASDC,NK_CD56bright,ILC,MAIT,HSPC",50;10,40;20
2,,,,,,


In [109]:
sc_configs = sc_configs[sc_configs$data_name != '',]

In [110]:
## Load the configuration file specifying the pre-processing options for all datasets

In [111]:
data_configs = read.csv('configurations/02_Pre_Processing_Configs.csv', sep = ',')

In [112]:
data_configs = data_configs[data_configs$configuration_name != '',]   # remove lines with empty configuration names
data_configs = data_configs[!is.na(data_configs$configuration_name),]  # remove lines with NA in configuration name

In [113]:
head(data_configs)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<lgl>,<lgl>,<lgl>
1,MI_v1,Prepared_Proteomic_Data,csv,proteomic,,1.0,0.2,False,False,1.0,False,False,True
2,MI_v1,Prepared_Cytokine_Data,csv,cytokine,,1.0,0.2,False,True,1.0,False,False,True
3,MI_v1,Prepared_Clinical_Data,csv,clinical,,1.0,0.2,False,True,1.0,False,False,True
4,MI_v1,Prepared_sc_Data,h5ad,sc,,1.0,0.05,True,True,1.0,True,True,True
5,MI_v1,Prepared_Neutrophil_Data,csv,neutrophil,,0.9,0.8,True,True,0.75,True,True,True


In [114]:
### Generate the result data directory if it does not exist yet
if(!file.exists(paste0(result_path, '02_results'))){
    dir.create(file.path(paste0(result_path, '02_results')))
    }

# Load Data

In [115]:
### Load sc Data and exclude cluster_ids as specified in the configuration file

In [116]:
datasets = list()

In [117]:
## Load sc data (pseudobulk) generated in previous step
if(nrow(sc_configs) > 0){
for(j in 1:nrow(sc_configs)){
    sc_data_name = sc_configs$data_name[j]
    sc_data =  fread(paste0(result_path, '/01_results/01_', sc_data_name, 'Pseudobulk_Table', '.csv'))

    sc_data$V1 = NULL

    ## Split up sc to different configs
    for(i in unique(sc_configs$configuration_name)){    
        for(j in unique(sc_configs$data_name[sc_configs$configuration_name == i])){

            data = sc_data[sc_data$dataset == j,]

            ## Exclude cluster_id's (cell-type clusters)
            if(!is.na(sc_configs$cell_type_exclusion[sc_configs$configuration_name == i])){
            data = data[!data$type %in% unlist(strsplit(sc_configs$cell_type_exclusion[sc_configs$configuration_name == i] ,',')),]
                }

            datasets[[i]][[j]] = data
            }
        }
    }    
 }   

In [118]:
sc_data_name

In [119]:
#str(datasets)

In [120]:
length(unique(data$sample_id))

In [121]:
### Load the other datasets specified in the configuration file

In [122]:
for(i in unique(data_configs$configuration_name)){     # for each config
    for(j in unique(data_configs$data_name[data_configs$configuration_name == i])){      # each specifiec data-name
        
        configuration = data_configs[(data_configs$configuration_name == i) & (data_configs$data_name == j),]
        
        if(configuration$file_type == 'csv'){
        data = read.csv(paste0(data_path, j, '.csv'))
        data$X = NULL
        data = melt(data, id.vars = 'sample_id')
        data$dataset = j
        data$type = configuration$data_type
        
        datasets[[i]][[j]] = data
        }
        }
    }

In [123]:
#head(data,2)

In [124]:
#str(datasets)

In [125]:
data_backup = datasets # in case something should be re-executed, so loading of data is not necessary a second time

In [126]:
datasets = data_backup

In [127]:
#str(datasets)

# Pre-Process each dataset as specified in the configuration files

## Sample Filter

In [128]:
### Filter out sample_id's specified in the configuration file

In [129]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<lgl>,<lgl>,<lgl>
1,MI_v1,Prepared_Proteomic_Data,csv,proteomic,,1,0.2,False,False,1,False,False,True
2,MI_v1,Prepared_Cytokine_Data,csv,cytokine,,1,0.2,False,True,1,False,False,True


In [130]:
for(i in 1:nrow(data_configs)){
    ### Remove samples based on specified samples in remove_sample_ids column
    if( (!is.na(data_configs$remove_sample_ids[i])) & (data_configs$remove_sample_ids[i] != '')){
        
        print(paste0('Filtered specific samples for ',data_configs$data_name[i], ' ',  unique( unlist(strsplit(data_configs$remove_sample_ids[i], ',')))))
        
        data = datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]
        
        ### remove samples
        data = data[! data$sample_id %in% unlist(strsplit(data_configs$remove_sample_ids[i], ',')),]  # TBD check!
        
        ### replace adjusted data
        datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]] = data
        
        }
    
     ### Remove samples based on threshold in sample_filtering_thres
     if ( (as.numeric(data_configs$sample_filtering_thres[i]) < 1) & (as.numeric(data_configs$sample_filtering_thres[i]) > 0)){
         
         print(paste0('Filtered samples based on threshold for ',data_configs$data_name[i])) 
         data = datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]
         print(paste0('Amount samples before filtering ', length(unique(data$sample_id))))
         
         ### calculate percentage of features with zero values
         data = data %>% group_by(sample_id, type) %>% mutate(zero_expression_percentage = sum(value == 0)/ n())
         ### filter out samples if percentage higher than threshold
         data = data[data$zero_expression_percentage < data_configs$sample_filtering_thres[i],]
         print(paste0('Amount samples after filtering ', length(unique(data$sample_id))))
         
         ### remove generated columns
         data$zero_expression_percentage = NULL
         
         ### replace adjusted data
         datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]] = data            
    } 
}
        

[1] "Filtered samples based on threshold for Prepared_Neutrophil_Data"
[1] "Amount samples before filtering 121"
[1] "Amount samples after filtering 112"


In [131]:
#str(datasets)

## Feature Removal (based on sample expression)

In [132]:
## Filter out features that are not expressed in a certain amount of sample (threshold set in the configuration file)

In [133]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<lgl>,<lgl>,<lgl>
1,MI_v1,Prepared_Proteomic_Data,csv,proteomic,,1,0.2,False,False,1,False,False,True
2,MI_v1,Prepared_Cytokine_Data,csv,cytokine,,1,0.2,False,True,1,False,False,True


In [134]:
data = datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]

In [135]:
head(data,2)

sample_id,variable,value,dataset,type
<chr>,<fct>,<int>,<chr>,<chr>
k4,A1BG,0,Prepared_Neutrophil_Data,neutrophil
m14.3,A1BG,0,Prepared_Neutrophil_Data,neutrophil


In [136]:
for(i in 1:nrow(data_configs)){

    if( (!is.na(data_configs$feature_filtering_thres[i])) & (data_configs$feature_filtering_thres[i] != '')  & (data_configs$feature_filtering_thres[i] > 0)){
        
        data = datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]
        
        print(paste0(data_configs$configuration_name[i], ' ' ,data_configs$data_name[i]))
        
        ## Determine data to filter
        data$expression = TRUE
        data$expression[data$value == 0] = FALSE
        expression_filter = data %>% group_by(type, variable) %>% summarise(perc_expression = sum(expression)  )
        expression_filter$perc_expression = expression_filter$perc_expression / length(unique(data$sample_id))
        
        ## Apply filter
        filtered_out = expression_filter[expression_filter$perc_expression <= data_configs$feature_filtering_thres[i],]
        print(paste0( 'Filtered: ' ))
        if(nrow(filtered_out) > 0){
            print((head(filtered_out %>% dplyr::group_by(type) %>% dplyr::count())))
            }
        expression_filter = expression_filter[expression_filter$perc_expression >data_configs$feature_filtering_thres[i],]  # kept data
        
        data = merge(data, expression_filter[,c('type', 'variable')], by.x = c('type', 'variable'), by.y = c('type', 'variable'))   # filter the data
        
        ## Remove expression column 
        data$expression = NULL
        
        ## Replace 
        datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]  = data
        
      }
}

[1] "MI_v1 Prepared_Proteomic_Data"


[1m[22m`summarise()` has grouped output by 'type'. You can override using the `.groups` argument.


[1] "Filtered: "
[1] "MI_v1 Prepared_Cytokine_Data"


[1m[22m`summarise()` has grouped output by 'type'. You can override using the `.groups` argument.


[1] "Filtered: "
[90m# A tibble: 1 x 2[39m
[90m# Groups:   type [1][39m
  type         n
  [3m[90m<chr>[39m[23m    [3m[90m<int>[39m[23m
[90m1[39m cytokine     5
[1] "MI_v1 Prepared_Clinical_Data"


[1m[22m`summarise()` has grouped output by 'type'. You can override using the `.groups` argument.


[1] "Filtered: "
[1] "MI_v1 Prepared_sc_Data"


[1m[22m`summarise()` has grouped output by 'type'. You can override using the `.groups` argument.


[1] "Filtered: "
[90m# A tibble: 6 x 2[39m
[90m# Groups:   type [6][39m
  type          n
  [3m[90m<chr>[39m[23m     [3m[90m<int>[39m[23m
[90m1[39m B cell     [4m2[24m472
[90m2[39m CD14 Mono   139
[90m3[39m CD16 Mono  [4m3[24m465
[90m4[39m CD4 CTL    [4m5[24m521
[90m5[39m CD4 Naive  [4m4[24m380
[90m6[39m CD4 TCM     650
[1] "MI_v1 Prepared_Neutrophil_Data"


[1m[22m`summarise()` has grouped output by 'type'. You can override using the `.groups` argument.


[1] "Filtered: "
[90m# A tibble: 1 x 2[39m
[90m# Groups:   type [1][39m
  type           n
  [3m[90m<chr>[39m[23m      [3m[90m<int>[39m[23m
[90m1[39m neutrophil [4m2[24m[4m3[24m955


In [137]:
names(datasets)

## Library Adjustment

In [138]:
## Normalize measured counts for each sample to have the same amount of counts

In [139]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<lgl>,<lgl>,<lgl>
1,MI_v1,Prepared_Proteomic_Data,csv,proteomic,,1,0.2,False,False,1,False,False,True
2,MI_v1,Prepared_Cytokine_Data,csv,cytokine,,1,0.2,False,True,1,False,False,True


In [140]:
for(i in 1:nrow(data_configs)){
    if((data_configs$library_adjustment[i] == 'TRUE')){
        
        print(paste0('Library Adjustment for ',data_configs$data_name[i]))
        
        data = datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]

        ### Calculate scaling factor per sample
        data = data %>% group_by(sample_id,type) %>% mutate(sample_counts = sum(value))
        data = data %>% group_by(type) %>% mutate(mean_sample_counts = mean(sample_counts))
        
        data$scaling_factor = data$sample_counts/ data$mean_sample_counts
        data$scaling_factor[data$scaling_factor == 0] = 1 # avoid dividing by 0; TBD whether to include or exclude samples with only zero counts in a cell-type
        
        ### Apply scaling to counts
        
        data$value = data$value / data$scaling_factor
        
        ### Save transformed data to list
        datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]] = data
        
        }
    }
        

[1] "Library Adjustment for Prepared_sc_Data"
[1] "Library Adjustment for Prepared_Neutrophil_Data"


In [141]:
names(datasets)

## Gene Filtering (according to cells expressing genes - only for sc Data)

In [142]:
### Remove genes from the single-cell dataset that are expressed in a too low amount of cells

In [143]:
head(sc_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,data_type,cell_type_exclusion,cell_expr_thres1,cell_expr_thres2
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,MI_v1,Prepared_sc_Data,h5ad,"Platelet,Plasmablast,pDC,Eryth,Doublet,dnT,cdC1,CD8 TCM,CD8 Proliferating,CD4 Proliferating,NK Proliferating,ASDC,NK_CD56bright,ILC,MAIT,HSPC",50;10,40;20


In [144]:
## Load gene filtering information from previous script

In [145]:
gene_expression_info = data.frame()

In [146]:
for(i in sc_configs$data_name){
    data= read.csv(paste0(result_path, '/01_results/01_' ,i, '_Gene_Expr_per_Cell_Type.csv'))
    data$X = NULL
    
    data$data_name = i
    gene_expression_info = rbind(gene_expression_info, data)
    }

In [147]:
head(gene_expression_info,2)

Unnamed: 0_level_0,perc_cells_expressing_gene,total_amount_cells_expressing_gene,gene,cluster,dataset,data_name
Unnamed: 0_level_1,<dbl>,<int>,<chr>,<chr>,<chr>,<chr>
1,0.23400142,23,AL627309.1,B cell,Prepared_sc_Data,Prepared_sc_Data
2,0.06104385,6,AL627309.4,B cell,Prepared_sc_Data,Prepared_sc_Data


In [148]:
sc_configs

Unnamed: 0_level_0,configuration_name,data_name,data_type,cell_type_exclusion,cell_expr_thres1,cell_expr_thres2
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,MI_v1,Prepared_sc_Data,h5ad,"Platelet,Plasmablast,pDC,Eryth,Doublet,dnT,cdC1,CD8 TCM,CD8 Proliferating,CD4 Proliferating,NK Proliferating,ASDC,NK_CD56bright,ILC,MAIT,HSPC",50;10,40;20


In [149]:
if(nrow(sc_configs) > 0){
for(i in unique(gene_expression_info$data_name)){
        
        print(paste0('Gene Filtering for  ',sc_configs$configuration_name[sc_configs$data_name == i]))
        
        data = datasets[[sc_configs$configuration_name[sc_configs$data_name == i]]][[i]]
        gene_expr_data = gene_expression_info[gene_expression_info$data_name == i,]
    
        ## Get thresholds for config
        thres1 = as.numeric(unlist(str_split(sc_configs$cell_expr_thres1[sc_configs$data_name == i], ';')))
        thres2 = as.numeric(unlist(str_split(sc_configs$cell_expr_thres2[sc_configs$data_name == i], ';')))
        amount_samples = length(unique(data$sample_id))
        print(paste0('Amount Samples', amount_samples))
    
        ## Filter down gene based on the expression info
        gene_filtering =  gene_expr_data[((     gene_expr_data$perc_cells > thres1[1]) & (     gene_expr_data$total_amount_cells_expressing_gene > amount_samples * thres1[2])) |
         ((     gene_expr_data$perc_cells > thres2[1]) & (     gene_expr_data$total_amount_cells_expressing_gene > amount_samples * thres2[2])) ,]
    
        ## Apply to thresholds set in the configuration file
        filtered_data = data.frame()
        for( k in unique(data$type)){
            data_cluster = data[(data$type == k) & (data$variable %in% gene_filtering$gene[gene_filtering$cluster == k]),]
            filtered_data = rbind(filtered_data, data_cluster)
            }
        datasets[[sc_configs$configuration_name[sc_configs$data_name == i]]][[i]] = filtered_data
    }
    }

[1] "Gene Filtering for  MI_v1"
[1] "Amount Samples119"


In [150]:
### Amount of genes after filtering

In [151]:
#unique(datasets[[1]][['Prepared_sc_Data']][,c('type', 'variable')])%>% group_by(type) %>% dplyr::count()

In [152]:
head(filtered_data,2)

type,variable,sample_id,dataset,value,sample_counts,mean_sample_counts,scaling_factor
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
B cell,ACTB,k10,Prepared_sc_Data,9.099435,6192.328,4954.368,1.2498726
B cell,ACTB,k11,Prepared_sc_Data,6.540224,3706.186,4954.368,0.7480643


In [153]:
#str(datasets)

## Log Transformation

In [154]:
## Apply log transformation to the data types specified in the configuration file

In [155]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<lgl>,<lgl>,<lgl>
1,MI_v1,Prepared_Proteomic_Data,csv,proteomic,,1,0.2,False,False,1,False,False,True
2,MI_v1,Prepared_Cytokine_Data,csv,cytokine,,1,0.2,False,True,1,False,False,True


In [156]:
data_configs

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<lgl>,<lgl>,<lgl>
1,MI_v1,Prepared_Proteomic_Data,csv,proteomic,,1.0,0.2,False,False,1.0,False,False,True
2,MI_v1,Prepared_Cytokine_Data,csv,cytokine,,1.0,0.2,False,True,1.0,False,False,True
3,MI_v1,Prepared_Clinical_Data,csv,clinical,,1.0,0.2,False,True,1.0,False,False,True
4,MI_v1,Prepared_sc_Data,h5ad,sc,,1.0,0.05,True,True,1.0,True,True,True
5,MI_v1,Prepared_Neutrophil_Data,csv,neutrophil,,0.9,0.8,True,True,0.75,True,True,True


In [157]:
for(i in 1:nrow(data_configs)){
    if((data_configs$log_transformation[i] == 'TRUE')){
        
        print(paste0('Log Transformation for ',data_configs$data_name[i]))
        data = datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]
        
        data$value = log2(data$value + 1)  # add pseudocount of 1
        
        datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]] = data # save adjusted data
        }
    }
        
        

[1] "Log Transformation for Prepared_Cytokine_Data"
[1] "Log Transformation for Prepared_Clinical_Data"
[1] "Log Transformation for Prepared_sc_Data"
[1] "Log Transformation for Prepared_Neutrophil_Data"


## Variable Gene Filtering

In [158]:
### Filter on highly variable genes if specified in the configuration file

In [159]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<lgl>,<lgl>,<lgl>
1,MI_v1,Prepared_Proteomic_Data,csv,proteomic,,1,0.2,False,False,1,False,False,True
2,MI_v1,Prepared_Cytokine_Data,csv,cytokine,,1,0.2,False,True,1,False,False,True


In [160]:
for(i in 1:nrow(data_configs)){
    ### Filter genes with lowest variance
    if ( (as.numeric(data_configs$variable_genes_filtering[i]) < 1) & (as.numeric(data_configs$variable_genes_filtering[i]) > 0)){
        print(paste0('Variable Genes Filtering for ',data_configs$data_name[i]))
        data = datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]
        
        ### Calculate variance and threshold
        data = data %>% group_by(variable, type) %>% mutate(feature_variance = var(value)) # variance
        data = data %>% group_by(type) %>% mutate(variance_threshold = quantile(feature_variance, probs = seq(0, 1, 0.01), na.rm = FALSE,
         names = TRUE)[(1-as.numeric(data_configs$variable_genes_filtering[i]))*100])   # threshold
        
        ### Filter
        data = data[data$feature_variance > data$variance_threshold,]
        
        ### remove generated columns
        data$feature_variance = NULL
        data$variance_threshold = NULL
        
        ### Save transformed data to list
        datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]] = data
        
        }
    }
        

[1] "Variable Genes Filtering for Prepared_Neutrophil_Data"


## Sample Quantile Normalization

In [161]:
### Apply Sample Quantile Normalization for the data-types specified in the configuration file

In [162]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<lgl>,<lgl>,<lgl>
1,MI_v1,Prepared_Proteomic_Data,csv,proteomic,,1,0.2,False,False,1,False,False,True
2,MI_v1,Prepared_Cytokine_Data,csv,cytokine,,1,0.2,False,True,1,False,False,True


In [163]:
for(i in 1:nrow(data_configs)){
    if((data_configs$quantile_normalization_samples[i] == 'TRUE')){
        
        print(paste0('Sample Quantile Normalization for ',data_configs$data_name[i]))
        data = datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]
        transformed_data = data.frame()
        
        for(k in unique(data$type)){
            data_type = data[data$type == k,]
            data_type = data_type %>% dcast(variable ~ sample_id, value.var = 'value')
            features = data_type$variable
            rownames(data_type) = features
            data_type$variable = NULL
            data_type = data_type[,colSums(is.na(data_type)) != nrow(data_type)] # remove na samples
            data_type  = quantile_normalization(data_type ) 
            data_type = data.frame(data_type)
            data_type$variable = features
            data_type = melt(data_type)
            colnames(data_type) = c('variable', 'sample_id', 'value')
            
            data_type$type = k 
            data_type$dataset = data_configs$data_name[i]
            transformed_data = rbind(transformed_data, data_type)
            
            }
        datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]] = transformed_data
        }
    }
            
        
        
        

[1] "Sample Quantile Normalization for Prepared_sc_Data"


Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables



[1] "Sample Quantile Normalization for Prepared_Neutrophil_Data"


Using variable as id variables



## Gene Removal (ribosomal, mitochondrial)

In [164]:
### Remove ribosomal and mitochondrial genes (only works if 'Gene' annotation is given as SYMBOL

In [165]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<lgl>,<lgl>,<lgl>
1,MI_v1,Prepared_Proteomic_Data,csv,proteomic,,1,0.2,False,False,1,False,False,True
2,MI_v1,Prepared_Cytokine_Data,csv,cytokine,,1,0.2,False,True,1,False,False,True


In [166]:
for(i in 1:nrow(data_configs)){
    if((data_configs$ribosomal_mitochondrial_gene_filtering[i] == 'TRUE')){
        
        print(paste0('Remove ribosomal and mitochondrial genes for ',data_configs$data_name[i]))
        data = datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]
        
        ## Remove ribosomal and mitochondiral genes
        data = data[is.na(str_extract(data$variable, '^MT.*|^RPL.*|^RPS.*')),]
        
        datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]] = data
        }
    }
        

        
        

[1] "Remove ribosomal and mitochondrial genes for Prepared_sc_Data"
[1] "Remove ribosomal and mitochondrial genes for Prepared_Neutrophil_Data"


# Merge data types and process

In [167]:
### Combine all the datasets to one dataset

In [168]:
#str(datasets)

In [169]:
### Combine all types into one dataset
datasets = lapply(datasets, function(x){
    data = do.call(rbind, x)
    })

In [170]:
### Overview amount of features per type/ view 

In [171]:
unique(datasets[[1]][,c('type', 'variable')]) %>% group_by(type) %>% dplyr::count()

type,n
<chr>,<int>
B cell,686
CD14 Mono,1589
CD16 Mono,1059
CD4 CTL,542
CD4 Naive,473
CD4 TCM,661
CD4 TEM,737
CD8 Naive,397
CD8 TEM,570
NK,508


# Feature Wise Quantile Normalization

In [172]:
### Apply feature wise quantile normalization if specified in the configuration file

In [173]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<lgl>,<lgl>,<lgl>
1,MI_v1,Prepared_Proteomic_Data,csv,proteomic,,1,0.2,False,False,1,False,False,True
2,MI_v1,Prepared_Cytokine_Data,csv,cytokine,,1,0.2,False,True,1,False,False,True


In [174]:
data_configs$feature_wise_quantile_normalization

In [175]:
names(datasets)

In [176]:
for(i in names(datasets)){
    
    data = datasets[[i]]
    data$ident = paste0(data$type, '_0_', data$variable)
    final_data = dcast(data, sample_id ~ ident , value.var = "value") # ! with this merging there might be NA values for some samples on some data types
    rownames(final_data) = final_data$sample_id
    final_data$sample_id = NULL
    
    # Remove samples with only NA's
    data_nas = is.na(final_data)
    rowSums(data_nas)  # TBD maybe plot amount of NA per sample
    keep_samples = names(rowSums(data_nas))[rowSums(data_nas) != ncol(final_data)]
    final_data = final_data[keep_samples,]
    data_nas = data_nas[keep_samples,]
    
    # Feature Wise Quantile Normalization in kind of TRUE value
    if(unique(data_configs$feature_wise_quantile_normalization[data_configs$configuration_name == i]) == 'TRUE'){
        print('Applying Feature Wise Quantile Normalization')
        final_data = apply(final_data, 2,stdnorm)
        final_data = data.frame(final_data)
        final_data[data_nas] = NA
        final_data$sample_id = rownames(final_data)
        data_long = melt(final_data)
        data_long$type = str_extract(data_long$variable, '.*_0_')
        data_long$type  = str_replace(data_long$type , '_0_', '')
        data_long$variable = str_replace(data_long$variable, '.*_0_', '')
        datasets[[i]] = data_long
        
        }
    
    }
        
        
        

[1] "Applying Feature Wise Quantile Normalization"


Using sample_id as id variables



In [177]:
head(datasets[[1]],2)

Unnamed: 0_level_0,sample_id,variable,value,type
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>
1,k10,ACTB,0.4307273,B.cell
2,k11,ACTB,-0.7009514,B.cell


# Save the data

In [178]:
### Save the data to use as input in the next script

In [179]:
### Adjust variable names
datasets = lapply(datasets, function(x){
    x$gene = x$variable
    x$variable = paste0(x$type, '__', x$variable)
    return(x)
    })

In [180]:
for(i in names(datasets)){
    write.csv(datasets[[i]], paste0(result_path, '/02_results/02_Combined_Data_', i, '_INTEGRATED',  '.csv'))
    }

In [181]:
### Example of structure of dataset

In [182]:
head(datasets[[1]],2)

Unnamed: 0_level_0,sample_id,variable,value,type,gene
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>,<chr>
1,k10,B.cell__ACTB,0.4307273,B.cell,ACTB
2,k11,B.cell__ACTB,-0.7009514,B.cell,ACTB


In [183]:
length(unique(datasets[[1]]$sample_id))

# Update Configuration File

In [184]:
paste0(unique(datasets[[i]]$type), collapse = ',')  # TBD make config specific

In [185]:
### Adjust 06 Pathway Analysis Configs

In [186]:
configs06 = data.frame(
    mofa_result_name = paste0(unique(data_configs$configuration_name), '_MOFA'),
    factor_set = '1',
    coverage_par = 0.2,
    types = paste0(unique(datasets[[i]]$type), collapse = ','),
    coverage_plot  = 0.5,
    p_value_plot = 0.05,
    max_pathways_plot = 8,
    enrichment_plot = 'positive',
    top_features_plot = 0.125,
    pathway_selection = '')

In [187]:
configs06

mofa_result_name,factor_set,coverage_par,types,coverage_plot,p_value_plot,max_pathways_plot,enrichment_plot,top_features_plot,pathway_selection
<chr>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>
MI_v1_MOFA,1,0.2,"B.cell,CD14.Mono,CD16.Mono,CD4.CTL,CD4.Naive,CD4.TCM,CD4.TEM,CD8.Naive,CD8.TEM,NK,Treg,cDC2,clinical,cytokine,gdT,neutrophil,proteomic",0.5,0.05,8,positive,0.125,


In [188]:
write.csv(configs06, 'configurations/06_Pathway_Configs.csv', row.names = FALSE)