In [533]:
### Script to integrate, preprocess and harmonize all available data sets
### In our use-case
# Single Cell RNA Seq
# Cytokine Data
# Neutrophil Data
# Clinical Data
# Proteomics

#############################################
# Prerequisites - Load Libraries

In [534]:
source('MS0_Libraries.r')

“incomplete final line found by readTableHeader on '../conda_environment/Environment_Configs.csv'”


[1] "/home/icb/corinna.losert/miniconda3/envs/mofa_analysis//lib/R/library"


In [535]:
source('MS2_Plot_Config.r')

In [536]:
source('MS1_Functions.r')

###############################################
# Prerequisites - Configurations & Parameters

In [537]:
### Load the parameters that are set via the configuration files

In [538]:
### Load configurations file
global_configs = read.csv('configurations/Data_Configs.csv', sep = ',')

“incomplete final line found by readTableHeader on 'configurations/Data_Configs.csv'”


In [539]:
head(global_configs,2)

Unnamed: 0_level_0,parameter,value
Unnamed: 0_level_1,<chr>,<chr>
1,data_path,/lustre/groups/epigenereg01/workspace/projects/jove/example_data/
2,result_path,/lustre/groups/epigenereg01/workspace/projects/jove/example_results/


In [540]:
## Base directory of the input data: the value of the 'data_path' row of the global configuration.
## Exactly one such row is required; otherwise every path built from data_path further down
## would silently be empty or a vector.
data_path = global_configs$value[global_configs$parameter == 'data_path']
if(length(data_path) != 1){
    stop("configurations/Data_Configs.csv must contain exactly one 'data_path' entry")
    }

In [541]:
data_path

In [542]:
## Base directory for all results: the value of the 'result_path' row of the global configuration.
## Exactly one such row is required; otherwise every path built from result_path further down
## would silently be empty or a vector.
result_path = global_configs$value[global_configs$parameter == 'result_path']
if(length(result_path) != 1){
    stop("configurations/Data_Configs.csv must contain exactly one 'result_path' entry")
    }

In [543]:
result_path

In [544]:
## Load the configuration file specifying single-cell specific filtering options

In [545]:
sc_configs = read.csv('configurations/02_Pre_Processing_Configs_SC.csv', sep = ',')

“incomplete final line found by readTableHeader on 'configurations/02_Pre_Processing_Configs_SC.csv'”


In [546]:
head(sc_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,data_type,cell_expr_thres1,cell_expr_thres2,cell_type_exclusion
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,MI_v1,dcm_acm,h5seurat,30;10,20;5,"Platelet,Plasmablast,pDC,Eryth,Doublet,dnT,cdC1,CD8 TCM,CD8 Proliferating,CD4 Proliferating,NK Proliferating,ASDC,NK_CD56bright,ILC,MAIT,HSPC"


In [547]:
## Keep only rows with a usable data_name. Rows with an empty OR missing (NA) name cannot be
## resolved to an input file; a plain `!= ''` comparison would keep NA rows as all-NA rows.
sc_configs = sc_configs[!is.na(sc_configs$data_name) & (sc_configs$data_name != ''),]

In [548]:
## Load the configuration file specifying the pre-processing options for all datasets

In [549]:
data_configs = read.csv('configurations/02_Pre_Processing_Configs.csv', sep = ',')

“incomplete final line found by readTableHeader on 'configurations/02_Pre_Processing_Configs.csv'”


In [550]:
## Keep only the rows that carry a configuration name,
## i.e. drop rows where the name is missing (NA) or an empty string
data_configs = data_configs[
    !is.na(data_configs$configuration_name) & (data_configs$configuration_name != ''),
]

In [551]:
head(data_configs)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<int>,<dbl>,<lgl>,<lgl>,<int>,<lgl>,<lgl>,<lgl>
1,MI_v1,dcm_acm,h5seurat,sc,,1,0.05,True,True,1,True,True,True


In [552]:
### Generate the result data directory if it does not exist yet.
### dir.exists() is only TRUE for directories (file.exists() is also TRUE for a regular file
### with that name) and recursive = TRUE creates result_path itself when it is missing too.
if(!dir.exists(paste0(result_path, '02_results'))){
    dir.create(paste0(result_path, '02_results'), recursive = TRUE)
    }

# Load Data

In [553]:
### Load sc Data and exclude cluster_ids as specified in the configuration file

In [554]:
datasets = list()

In [555]:
## Load sc data (pseudobulk) generated in previous step.
## One pseudobulk table is read per row of sc_configs and stored under
## datasets[[configuration_name]][[data_name]] after removing the excluded cell types.
## Every row is handled on its own: a table loaded for one dataset never overwrites the
## entry of another dataset, and the exclusion list always belongs to the row being processed.
for(row_idx in seq_len(nrow(sc_configs))){
    sc_config_name = sc_configs$configuration_name[row_idx]
    sc_data_name = sc_configs$data_name[row_idx]

    sc_data = fread(paste0(result_path, '/01_results/01_', sc_data_name, 'Pseudobulk_Table', '.csv'))

    sc_data$V1 = NULL   # presumably the row-index column written by the previous step - TODO confirm

    ## Keep only the rows that belong to this dataset
    data = sc_data[sc_data$dataset == sc_data_name,]

    ## Exclude cluster_id's (cell-type clusters) given as a comma separated list
    excluded_cell_types = sc_configs$cell_type_exclusion[row_idx]
    if(!is.na(excluded_cell_types) && (excluded_cell_types != '')){
        data = data[!data$type %in% unlist(strsplit(as.character(excluded_cell_types), ',')),]
        }

    datasets[[sc_config_name]][[sc_data_name]] = data
    }

In [556]:
sc_data_name

In [557]:
length(unique(data$sample_id))

In [558]:
### Load the other datasets specified in the configuration file

In [559]:
## Read every csv based dataset listed in data_configs, bring it into long format
## (one row per sample_id and feature) and store it under datasets[[configuration]][[data_name]]
for(i in unique(data_configs$configuration_name)){     # for each config
    rows_of_config = data_configs$configuration_name == i

    for(j in unique(data_configs$data_name[rows_of_config])){      # each specified data-name

        configuration = data_configs[rows_of_config & (data_configs$data_name == j),]

        ## Only csv inputs are read in this cell, everything else is skipped
        if(configuration$file_type != 'csv'){
            next
            }

        data = read.csv(paste0(data_path, j, '.csv'))
        data$X = NULL   # presumably a row-index column of the csv export - TODO confirm
        data = melt(data, id.vars = 'sample_id')
        data$dataset = j
        data$type = configuration$data_type

        datasets[[i]][[j]] = data
        }
    }

In [560]:
#head(data,2)

In [561]:
#str(datasets)

In [562]:
data_backup = datasets # in case something should be re-executed, so loading of data is not necessary a second time

In [563]:
datasets = data_backup

In [564]:
#str(datasets)

# Pre-Process each dataset as specified in the configuration files

## Sample Filter

In [565]:
### Filter out sample_id's specified in the configuration file

In [566]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<int>,<dbl>,<lgl>,<lgl>,<int>,<lgl>,<lgl>,<lgl>
1,MI_v1,dcm_acm,h5seurat,sc,,1,0.05,True,True,1,True,True,True


In [567]:
### Sample filtering, one pass per row of data_configs:
###   1. remove the sample_id's listed (comma separated) in remove_sample_ids
###   2. remove samples whose fraction of zero values reaches sample_filtering_thres
###      (only applied for thresholds strictly between 0 and 1)
for(i in seq_len(nrow(data_configs))){
    cfg_name = data_configs$configuration_name[i]
    ds_name = data_configs$data_name[i]

    ### Remove samples based on specified samples in remove_sample_ids column
    ids_to_remove = data_configs$remove_sample_ids[i]
    if(!is.na(ids_to_remove) && (ids_to_remove != '')){

        ## trim whitespace so that a list written as 'A, B' matches the sample_id's as well
        ids_to_remove = unique(trimws(unlist(strsplit(as.character(ids_to_remove), ','))))

        ## one message naming all removed samples (instead of one line per sample)
        print(paste0('Filtered specific samples for ', ds_name, ' ', paste(ids_to_remove, collapse = ', ')))

        data = datasets[[cfg_name]][[ds_name]]

        ### remove samples
        data = data[! data$sample_id %in% ids_to_remove,]

        ### replace adjusted data
        datasets[[cfg_name]][[ds_name]] = data

        }

    ### Remove samples based on threshold in sample_filtering_thres
    ### (a missing threshold switches the filter off instead of aborting the loop)
    sample_thres = as.numeric(data_configs$sample_filtering_thres[i])
    if(!is.na(sample_thres) && (sample_thres < 1) && (sample_thres > 0)){

        print(paste0('Filtered samples based on threshold for ', ds_name))
        data = datasets[[cfg_name]][[ds_name]]
        print(paste0('Amount samples before filtering ', length(unique(data$sample_id))))

        ### calculate percentage of features with zero values per sample and type;
        ### missing values are not counted as zeros, and the grouping is dropped again afterwards
        data = data %>%
            group_by(sample_id, type) %>%
            mutate(zero_expression_percentage = sum(value == 0, na.rm = TRUE)/ n()) %>%
            ungroup()

        ### filter out samples if percentage higher than threshold
        data = data[data$zero_expression_percentage < sample_thres,]
        print(paste0('Amount samples after filtering ', length(unique(data$sample_id))))

        ### remove generated columns
        data$zero_expression_percentage = NULL

        ### replace adjusted data
        datasets[[cfg_name]][[ds_name]] = data
        }
    }
        

In [568]:
#str(datasets)

## Feature Removal (based on sample expression)

In [569]:
## Filter out features that are expressed in too few samples (threshold set in the configuration file)

In [570]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<int>,<dbl>,<lgl>,<lgl>,<int>,<lgl>,<lgl>,<lgl>
1,MI_v1,dcm_acm,h5seurat,sc,,1,0.05,True,True,1,True,True,True


In [571]:
## Preview table: the dataset of the last row of data_configs. The row is selected explicitly,
## so this cell does not depend on a loop index left behind by a previously executed cell.
preview_row = nrow(data_configs)
data = datasets[[data_configs$configuration_name[preview_row]]][[data_configs$data_name[preview_row]]]

In [572]:
head(data,2)

sample_id,variable,value,dataset,type
<chr>,<chr>,<dbl>,<chr>,<chr>
D1,A1BG,0,dcm_acm,adipocyte
D2,A1BG,0,dcm_acm,adipocyte


In [573]:
### Feature filtering, one pass per row of data_configs:
### drop every (type, variable) combination whose value is non-zero in at most
### feature_filtering_thres (a fraction) of the samples. Rows without a positive threshold are skipped.
for(i in seq_len(nrow(data_configs))){

    ## as.numeric() makes the comparisons numeric even if the column was read as text;
    ## an empty or missing entry becomes NA and disables the filter
    feature_thres = suppressWarnings(as.numeric(data_configs$feature_filtering_thres[i]))

    if(!is.na(feature_thres) && (feature_thres > 0)){
        cfg_name = data_configs$configuration_name[i]
        ds_name = data_configs$data_name[i]

        data = datasets[[cfg_name]][[ds_name]]

        print(paste0(cfg_name, ' ' , ds_name))

        ## Determine data to filter: fraction of samples with a non-zero value per type and feature
        ## NOTE(review): the denominator is the number of samples of the whole dataset,
        ## not the number of samples per type - confirm that this is intended
        data$expression = TRUE
        data$expression[data$value == 0] = FALSE
        expression_filter = data %>%
            group_by(type, variable) %>%
            summarise(perc_expression = sum(expression), .groups = 'drop')   # .groups silences the grouping message
        expression_filter$perc_expression = expression_filter$perc_expression / length(unique(data$sample_id))

        ## Apply filter
        filtered_out = expression_filter[expression_filter$perc_expression <= feature_thres,]
        print(paste0( 'Filtered: ' ))
        if(nrow(filtered_out) > 0){
            print(filtered_out %>% dplyr::group_by(type) %>% dplyr::count())
            }
        expression_filter = expression_filter[expression_filter$perc_expression > feature_thres,]  # kept data

        ## keep only the (type, variable) combinations that passed the filter
        data = merge(data, expression_filter[,c('type', 'variable')], by.x = c('type', 'variable'), by.y = c('type', 'variable'))

        ## Remove expression column
        data$expression = NULL

        ## Replace
        datasets[[cfg_name]][[ds_name]]  = data

      }
}

[1] "MI_v1 dcm_acm"


[1m[22m`summarise()` has grouped output by 'type'. You can override using the `.groups`
argument.


[1] "Filtered: "
[90m# A tibble: 34 × 2[39m
[90m# Groups:   type [34][39m
   type                                 n
   [3m[90m<chr>[39m[23m                            [3m[90m<int>[39m[23m
[90m 1[39m H01_cardiac muscle cell           [4m1[24m919
[90m 2[39m H01_cardiac neuron                [4m1[24m919
[90m 3[39m H01_endothelial cell              [4m1[24m919
[90m 4[39m H01_fibroblast of cardiac tissue  [4m1[24m919
[90m 5[39m H01_lymphocyte                    [4m1[24m919
[90m 6[39m H01_mural cell                    [4m1[24m919
[90m 7[39m H01_myeloid cell                  [4m1[24m919
[90m 8[39m H02_adipocyte                     [4m1[24m919
[90m 9[39m H02_cardiac muscle cell           [4m1[24m919
[90m10[39m H02_cardiac neuron                [4m1[24m919
[90m# ℹ 24 more rows[39m


## Library Adjustment

In [574]:
## Normalize measured counts for each sample to have the same amount of counts

In [575]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<int>,<dbl>,<lgl>,<lgl>,<int>,<lgl>,<lgl>,<lgl>
1,MI_v1,dcm_acm,h5seurat,sc,,1,0.05,True,True,1,True,True,True


In [576]:
### Library adjustment, one pass per row of data_configs with library_adjustment == TRUE:
### within each type, every sample is divided by (its total counts / mean total counts),
### so that all samples of a type end up with the same total.
### isTRUE() treats a missing (NA) flag as "do not adjust" instead of aborting the loop.
for(i in seq_len(nrow(data_configs))){
    if(isTRUE(data_configs$library_adjustment[i] == 'TRUE')){
        cfg_name = data_configs$configuration_name[i]
        ds_name = data_configs$data_name[i]

        print(paste0('Library Adjustment for ', ds_name))

        data = datasets[[cfg_name]][[ds_name]]

        ### Calculate scaling factor per sample
        data = data %>% group_by(sample_id,type) %>% mutate(sample_counts = sum(value))
        data = data %>% group_by(type) %>% mutate(mean_sample_counts = mean(sample_counts)) %>% ungroup()

        data$scaling_factor = data$sample_counts/ data$mean_sample_counts
        ## avoid dividing by 0 (sample without counts) or by NaN (0/0: no counts in the whole type);
        ## TBD whether to include or exclude samples with only zero counts in a cell-type
        data$scaling_factor[which(is.nan(data$scaling_factor) | (data$scaling_factor == 0))] = 1

        ### Apply scaling to counts
        data$value = data$value / data$scaling_factor

        ### remove generated columns, so the table keeps the column layout shared by all
        ### datasets (required when the datasets are row-bound later on)
        data$sample_counts = NULL
        data$mean_sample_counts = NULL
        data$scaling_factor = NULL

        ### Save transformed data to list
        datasets[[cfg_name]][[ds_name]] = data

        }
    }
        

[1] "Library Adjustment for dcm_acm"


## Gene Filtering (according to cells expressing genes - only for sc Data)

In [577]:
### Remove genes from the single-cell dataset that are expressed in too few cells

In [578]:
head(sc_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,data_type,cell_expr_thres1,cell_expr_thres2,cell_type_exclusion
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,MI_v1,dcm_acm,h5seurat,30;10,20;5,"Platelet,Plasmablast,pDC,Eryth,Doublet,dnT,cdC1,CD8 TCM,CD8 Proliferating,CD4 Proliferating,NK Proliferating,ASDC,NK_CD56bright,ILC,MAIT,HSPC"


In [579]:
## Load gene filtering information from previous script

In [580]:
gene_expression_info = data.frame()

In [581]:
## Read the per cell-type gene expression table of the previous script for every single-cell
## dataset, tag its rows with the dataset name and append all of them to gene_expression_info
gene_expression_tables = lapply(sc_configs$data_name, function(sc_name){
    gene_expr = read.csv(paste0(result_path, '/01_results/01_' , sc_name , '_Gene_Expr_per_Cell_Type.csv'))
    gene_expr$X = NULL
    gene_expr$data_name = sc_name
    gene_expr
    })
gene_expression_info = rbind(gene_expression_info, do.call(rbind, gene_expression_tables))

In [582]:
head(gene_expression_info,2)

Unnamed: 0_level_0,perc_cells_expressing_gene,total_amount_cells_expressing_gene,gene,cluster,data_name
Unnamed: 0_level_1,<dbl>,<int>,<chr>,<chr>,<chr>
1,1.314940055,1711,ISG15,mural cell,dcm_acm
2,0.006148171,8,TNFRSF18,mural cell,dcm_acm


In [583]:
### Gene filtering for the single-cell datasets, one pass per row of sc_configs.
### A gene is kept for a cell type (cluster) if it passes at least one of the two threshold pairs
### 'perc;factor' given in cell_expr_thres1 / cell_expr_thres2:
###   perc_cells_expressing_gene > perc  AND  total_amount_cells_expressing_gene > amount_samples * factor
for(i in seq_len(nrow(sc_configs))){
        cfg_name = sc_configs$configuration_name[i]
        sc_name = sc_configs$data_name[i]

        print(paste0('Gene Filtering for  ', cfg_name))

        data = datasets[[cfg_name]][[sc_name]]

        ## Expression statistics of exactly the dataset configured in this row
        ## (selected by its name, not by the position of the row in sc_configs)
        gene_expr_data = gene_expression_info[gene_expression_info$data_name == sc_name,]

        ## Get thresholds for config
        thres1 = as.numeric(unlist(str_split(sc_configs$cell_expr_thres1[i], ';')))
        thres2 = as.numeric(unlist(str_split(sc_configs$cell_expr_thres2[i], ';')))
        amount_samples = length(unique(data$sample_id))
        print(paste0('Amount Samples', amount_samples))

        ## Filter down gene based on the expression info
        ## (full column names, no reliance on partial matching of `$`)
        passes_thres1 = (gene_expr_data$perc_cells_expressing_gene > thres1[1]) &
                        (gene_expr_data$total_amount_cells_expressing_gene > amount_samples * thres1[2])
        passes_thres2 = (gene_expr_data$perc_cells_expressing_gene > thres2[1]) &
                        (gene_expr_data$total_amount_cells_expressing_gene > amount_samples * thres2[2])
        gene_filtering = gene_expr_data[passes_thres1 | passes_thres2,]

        ## Apply to thresholds set in the configuration file:
        ## per cell type keep only the genes that passed for that cluster
        filtered_data = data.frame()
        for( k in unique(data$type)){
            data_cluster = data[(data$type == k) & (data$variable %in% gene_filtering$gene[gene_filtering$cluster == k]),]
            filtered_data = rbind(filtered_data, data_cluster)
            }
        datasets[[cfg_name]][[sc_name]] = filtered_data
    }

[1] "Gene Filtering for  MI_v1"
[1] "Amount Samples53"


In [584]:
### Amount of genes after filtering

In [585]:
#unique(datasets[[1]][['Prepared_sc_Data']][,c('type', 'variable')])%>% group_by(type) %>% dplyr::count()

## Log Transformation

In [586]:
## Apply log transformation to the data types specified in the configuration file

In [587]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<int>,<dbl>,<lgl>,<lgl>,<int>,<lgl>,<lgl>,<lgl>
1,MI_v1,dcm_acm,h5seurat,sc,,1,0.05,True,True,1,True,True,True


In [588]:
### Log transformation: replace value by log2(value + 1) for every row of data_configs whose
### log_transformation flag is TRUE. isTRUE() treats a missing (NA) flag as
### "do not transform" instead of aborting the loop with an error.
for(i in seq_len(nrow(data_configs))){
    if(isTRUE(data_configs$log_transformation[i] == 'TRUE')){

        print(paste0('Log Transformation for ',data_configs$data_name[i]))
        data = datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]

        data$value = log2(data$value + 1)  # add pseudocount of 1

        datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]] = data # save adjusted data
        }
    }
        
        

[1] "Log Transformation for dcm_acm"


## Variable Gene Filtering

In [589]:
### Filter on highly variable genes if specified in the configuration file

In [590]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<int>,<dbl>,<lgl>,<lgl>,<int>,<lgl>,<lgl>,<lgl>
1,MI_v1,dcm_acm,h5seurat,sc,,1,0.05,True,True,1,True,True,True


In [591]:
### Variable gene filtering, one pass per row of data_configs:
### for a setting strictly between 0 and 1, keep per type only the features whose variance lies
### above the (1 - setting) quantile of the feature variances, i.e. the most variable fraction.
for(i in seq_len(nrow(data_configs))){
    keep_fraction = as.numeric(data_configs$variable_genes_filtering[i])

    ### Filter genes with lowest variance (a missing setting disables the filter)
    if(!is.na(keep_fraction) && (keep_fraction < 1) && (keep_fraction > 0)){
        print(paste0('Variable Genes Filtering for ',data_configs$data_name[i]))
        data = datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]

        ### Calculate variance and threshold
        data = data %>% group_by(variable, type) %>% mutate(feature_variance = var(value)) # variance
        ## The quantile is requested directly via probs. Picking element (1 - x) * 100 out of a
        ## 0%..100% grid is one position off (element k is the (k-1)% quantile) and the index
        ## can be truncated by floating point error.
        ## na.rm = TRUE: features with undefined variance (e.g. a single sample) are ignored
        data = data %>%
            group_by(type) %>%
            mutate(variance_threshold = quantile(feature_variance, probs = 1 - keep_fraction, na.rm = TRUE, names = FALSE)) %>%
            ungroup()   # threshold

        ### Filter (which() drops rows with undefined variance instead of turning them into NA rows)
        data = data[which(data$feature_variance > data$variance_threshold),]

        ### remove generated columns
        data$feature_variance = NULL
        data$variance_threshold = NULL

        ### Save transformed data to list
        datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]] = data

        }
    }
        

## Sample Quantile Normalization

In [592]:
### Apply Sample Quantile Normalization for the data-types specified in the configuration file

In [593]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<int>,<dbl>,<lgl>,<lgl>,<int>,<lgl>,<lgl>,<lgl>
1,MI_v1,dcm_acm,h5seurat,sc,,1,0.05,True,True,1,True,True,True


In [594]:
### Sample quantile normalization, one pass per row of data_configs with
### quantile_normalization_samples == TRUE. Per type, the long table is reshaped to a
### feature x sample table, passed to quantile_normalization() (defined outside this notebook)
### and reshaped back to long format with the columns variable, sample_id, value, type, dataset.
### isTRUE() treats a missing (NA) flag as "do not normalize" instead of aborting the loop.
for(i in seq_len(nrow(data_configs))){
    if(isTRUE(data_configs$quantile_normalization_samples[i] == 'TRUE')){

        print(paste0('Sample Quantile Normalization for ',data_configs$data_name[i]))
        data = datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]
        transformed_data = data.frame()

        for(k in unique(data$type)){
            data_type = data[data$type == k,]
            data_type = data_type %>% dcast(variable ~ sample_id, value.var = 'value')
            features = data_type$variable
            rownames(data_type) = features
            data_type$variable = NULL
            ## drop = FALSE keeps the table shape when only one sample column is left
            data_type = data_type[,colSums(is.na(data_type)) != nrow(data_type), drop = FALSE] # remove na samples
            data_type  = quantile_normalization(data_type )
            ## check.names = FALSE keeps the sample_id's exactly as they are; otherwise ids that
            ## are no syntactic R names (leading digit, '-', blanks) would be rewritten and
            ## would not match the same samples in the other datasets any more
            data_type = data.frame(data_type, check.names = FALSE)
            data_type$variable = features
            data_type = melt(data_type, id.vars = 'variable')
            colnames(data_type) = c('variable', 'sample_id', 'value')

            data_type$type = k
            data_type$dataset = data_configs$data_name[i]
            transformed_data = rbind(transformed_data, data_type)

            }
        datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]] = transformed_data
        }
    }
            
        
        
        

[1] "Sample Quantile Normalization for dcm_acm"


Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables

Using variable as id variables



## Gene Removal (ribosomal, mitochondrial)

In [595]:
### Remove ribosomal and mitochondrial genes (only works if the 'Gene' annotation is given as SYMBOL)

In [596]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<int>,<dbl>,<lgl>,<lgl>,<int>,<lgl>,<lgl>,<lgl>
1,MI_v1,dcm_acm,h5seurat,sc,,1,0.05,True,True,1,True,True,True


In [597]:
### Ribosomal / mitochondrial gene removal, one pass per row of data_configs with
### ribosomal_mitochondrial_gene_filtering == TRUE (gene names must be SYMBOLs).
### isTRUE() treats a missing (NA) flag as "do not filter" instead of aborting the loop.
for(i in seq_len(nrow(data_configs))){
    if(isTRUE(data_configs$ribosomal_mitochondrial_gene_filtering[i] == 'TRUE')){

        print(paste0('Remove ribosomal and mitochondrial genes for ',data_configs$data_name[i]))
        data = datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]]

        ## Remove ribosomal (RPL*, RPS*) and mitochondrial (MT-*) genes.
        ## The mitochondrial prefix includes the separator: a bare '^MT' would also remove
        ## nuclear genes that merely start with MT (e.g. MTOR, MTHFR, MT1A).
        ## '.' is accepted as separator as well, in case the names were made syntactic upstream.
        data = data[!grepl('^MT[-.]|^RPL|^RPS', data$variable),]

        datasets[[data_configs$configuration_name[i]]][[data_configs$data_name[i]]] = data
        }
    }
        

        
        

[1] "Remove ribosomal and mitochondrial genes for dcm_acm"


# Merge data types and process

In [598]:
### Combine all the datasets to one dataset

In [599]:
#str(datasets)

In [600]:
### Combine all types into one dataset:
### per configuration, stack the tables of all of its datasets row-wise
datasets = lapply(datasets, function(tables_of_config) do.call(rbind, tables_of_config))

In [601]:
### Overview amount of features per type/ view 

In [602]:
unique(datasets[[1]][,c('type', 'variable')]) %>% group_by(type) %>% dplyr::count()

type,n
<chr>,<int>
adipocyte,414
cardiac muscle cell,368
cardiac neuron,162
endothelial cell,166
fibroblast of cardiac tissue,243
lymphocyte,170
mast cell,99
mural cell,112
myeloid cell,331


# Feature Wise Quantile Normalization

In [603]:
### Apply feature wise quantile normalization if specified in the configuration file

In [604]:
head(data_configs,2)

Unnamed: 0_level_0,configuration_name,data_name,file_type,data_type,remove_sample_ids,sample_filtering_thres,feature_filtering_thres,library_adjustment,log_transformation,variable_genes_filtering,quantile_normalization_samples,ribosomal_mitochondrial_gene_filtering,feature_wise_quantile_normalization
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<int>,<dbl>,<lgl>,<lgl>,<int>,<lgl>,<lgl>,<lgl>
1,MI_v1,dcm_acm,h5seurat,sc,,1,0.05,True,True,1,True,True,True


In [605]:
data_configs$feature_wise_quantile_normalization

In [606]:
### Feature wise quantile normalization, one pass per configuration:
### the long table is reshaped to a sample x feature table (feature = '<type>_0_<variable>'),
### samples without any value are dropped, every feature column is transformed with stdnorm()
### (defined outside this notebook) and the result is written back in long format.
for(i in names(datasets)){

    data = datasets[[i]]
    data$ident = paste0(data$type, '_0_', data$variable)
    final_data = dcast(data, sample_id ~ ident , value.var = "value") # ! with this merging there might be NA values for some samples on some data types
    rownames(final_data) = final_data$sample_id
    final_data$sample_id = NULL

    # Remove samples with only NA's
    data_nas = is.na(final_data)
    nas_per_sample = rowSums(data_nas)  # TBD maybe plot amount of NA per sample
    keep_samples = names(nas_per_sample)[nas_per_sample != ncol(final_data)]
    final_data = final_data[keep_samples, , drop = FALSE]
    data_nas = data_nas[keep_samples, , drop = FALSE]

    # Feature wise quantile normalization if the flag of the configuration is TRUE.
    # The flag is read per configuration; several rows of one configuration may carry it.
    fwqn_flags = unique(data_configs$feature_wise_quantile_normalization[data_configs$configuration_name == i])
    if(length(fwqn_flags) > 1){
        warning(paste0('Inconsistent feature_wise_quantile_normalization settings for configuration ', i,
                       '; the normalization is applied if any of them is TRUE'))
        }
    if(any(fwqn_flags == 'TRUE', na.rm = TRUE)){
        print('Applying Feature Wise Quantile Normalization')
        final_data = apply(final_data, 2,stdnorm)
        ## check.names = FALSE keeps the '<type>_0_<variable>' column names untouched; otherwise
        ## blanks and '-' would be replaced by '.', which alters type and gene names in the result
        final_data = data.frame(final_data, check.names = FALSE)
        final_data[data_nas] = NA   # restore the missing values after the transformation
        final_data$sample_id = rownames(final_data)
        data_long = melt(final_data, id.vars = 'sample_id')
        ## split '<type>_0_<variable>' back into its two parts
        data_long$type = str_extract(data_long$variable, '.*_0_')
        data_long$type  = str_replace(data_long$type , '_0_', '')
        data_long$variable = str_replace(data_long$variable, '.*_0_', '')
        datasets[[i]] = data_long

        }

    }
        
        
        

[1] "Applying Feature Wise Quantile Normalization"


Using sample_id as id variables



In [607]:
head(datasets[[1]],2)

Unnamed: 0_level_0,sample_id,variable,value,type
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>
1,D1,ABI1,-1.1243382,adipocyte
2,D2,ABI1,-0.8572543,adipocyte


# Save the data

In [608]:
### Save the data to use as input in the next script

In [609]:
### Adjust variable names: keep the plain feature name in `gene` and
### prefix `variable` with its type ('<type>__<variable>')
datasets = lapply(datasets, function(long_table){
    feature_names = long_table$variable
    long_table$gene = feature_names
    long_table$variable = paste0(long_table$type, '__', feature_names)
    long_table
    })

In [610]:
### Write one combined table per configuration into the 02_results folder
for(i in names(datasets)){
    output_file = paste0(result_path, '/02_results/02_Combined_Data_', i, '_INTEGRATED',  '.csv')
    write.csv(datasets[[i]], output_file)
    }

In [611]:
### Example of structure of dataset

In [612]:
head(datasets[[1]],2)

Unnamed: 0_level_0,sample_id,variable,value,type,gene
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>,<chr>
1,D1,adipocyte__ABI1,-1.1243382,adipocyte,ABI1
2,D2,adipocyte__ABI1,-0.8572543,adipocyte,ABI1


In [613]:
length(unique(datasets[[1]]$sample_id))