In [1]:
### Filter and normalize data from E1 to prepare for MOFA input

#############################################
# Prerequisites - Load Libraries

In [2]:
source('MS0_Libraries.r')

In [14]:
source('MS4_Plot_Config.r')

“[1m[22mThe `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
[36mℹ[39m Please use the `linewidth` argument instead.”


###############################################
# Prerequisites - Configurations & Parameters

In [15]:
# Root directory of the input data (not referenced in the cells shown in this notebook section)
data_path =   '../data/current'

In [16]:
# Root directory for results: the combined input table, the sample meta data and the
# saved output of this notebook are all located below it
result_path =  '../results/current'

In [17]:
# If TRUE, values are centred and scaled feature-wise (mean / sd computed with na.rm = TRUE)
# in the standardization cell further below
standardize = FALSE

In [18]:
# If TRUE, exact zeros in the wide feature matrix are replaced by NA before normalization
set_zero_na = FALSE

In [19]:
# If TRUE, every column of the wide feature matrix is transformed with stdnorm() and the
# long table is rebuilt from the transformed matrix
quantile_norm_feat = TRUE

In [20]:
# Sample subset to analyse. Values handled by the sample selection cell below:
#   'all'     - every sample listed in the meta data
#   'acs_ccs' - all samples except classifications 'vollstaendiger_ausschluss' and 'koronarsklerose'
#   'no_acs'  - classifications 'vollstaendiger_ausschluss', 'ccs' and 'koronarsklerose'
#   'acs'     - classifications 'acs_subacute', 'acs_w_infection' and 'acs_w_o_infection'
#samples_var = 'acs_ccs'
# samples_var = 'no_acs'
samples_var = 'all'
# samples_var = 'acs'

In [21]:
name = 'V_FINAL'   # name of dataset to load (suffix of Combined_Data_<name>.csv)
name_save = 'V_FINAL_INTEGRATED'   # suffix of the Combined_Data_<name_save>.csv written at the end


# Functions

In [22]:
### Function for quantile normalization

quantile_normalization = function(X){
  # Quantile-normalize the columns of X so that they share one empirical distribution.
  #
  # Args:
  #   X: numeric matrix or data frame; the columns are normalized against each other.
  #
  # Returns:
  #   Numeric matrix with the dim and dimnames of X. Every entry is replaced by the mean
  #   (across columns) of the sorted values at that entry's within-column rank; tied
  #   entries share the lowest rank (ties.method = 'min'). NA entries stay NA.
  #
  # Stops with an error when the columns contain different numbers of non-missing values,
  # because the sorted columns can then not be averaged position by position.

  # NOTE(review): rank(ties.method = 'min') and sort() draw no random numbers, so this call
  # only resets the global RNG state; kept so that callers see the same side effect as before.
  set.seed(42)

  X = as.matrix(X)
  n_row = nrow(X)
  n_col = ncol(X)
  if (n_row == 0 || n_col == 0) {
    return(X)
  }

  # Within-column ranks; rebuilding the matrix keeps the shape when X has a single row
  # (apply() would otherwise collapse the result to a plain vector)
  ranks = matrix(apply(X, 2, rank, ties.method = 'min'),
                 nrow = n_row, ncol = n_col, dimnames = dimnames(X))

  # Sorted values per column (sort() drops NA)
  sorted_cols = lapply(seq_len(n_col), function(j) sort(X[, j]))
  if (length(unique(lengths(sorted_cols))) > 1) {
    stop('quantile_normalization: all columns must contain the same number of non-missing values')
  }

  # Reference distribution: mean of the k-th smallest values across all columns
  means = rowMeans(do.call(cbind, sorted_cols))

  # Substitute the reference values for the ranks (an NA rank yields NA)
  normalized_data = matrix(means[as.vector(ranks)],
                           nrow = n_row, ncol = n_col, dimnames = dimnames(X))
  return(normalized_data)
}


In [23]:
### Feature-wise (gene-wise) rank-based inverse normal transformation: ranks are mapped to standard-normal quantiles

stdnorm <- function(x) {
  # Rank-based inverse normal transform of a single numeric vector.
  #
  # The non-missing entries are ranked (ties receive their average rank), the ranks are
  # divided by (number of non-missing entries + 1) and mapped through qnorm().
  # NA entries are returned unchanged; the result has the same length as x.
  set.seed(42)
  observed <- !is.na(x)
  n_observed <- sum(observed)
  rank_fraction <- rank(x[observed], ties.method = "average") / (n_observed + 1)
  x[observed] <- qnorm(rank_fraction)
  x
}

# Load Data 

## Prepared combined data

In [24]:
# Read the long-format combined table and print when the file was last modified
path = file.path(result_path, 'E-Analysis', paste0('Combined_Data_', name, '.csv'))
data_long = read.csv(path)
print(file.mtime(path))

[1] "2023-07-18 09:12:52 UTC"


In [25]:
head(data_long,2)

Unnamed: 0_level_0,X,sample_id,type,variable,value,config
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>
1,1,k10,single_cell,X0_T.cell.CD4__SSU72,0.5896835,FALSE-FALSE-TRUE-0.2-FALSE-TRUE-
2,2,k10,single_cell,X0_T.cell.CD4__PARK7,0.8587511,FALSE-FALSE-TRUE-0.2-FALSE-TRUE-


In [26]:
nrow(data_long)

In [27]:
nrow(unique(data_long[,c('sample_id', 'type', 'variable')]))

In [28]:
length(unique(data_long$sample_id))

## Sample Data

In [31]:
# Sample-level meta data; the cells below use its columns sample_id, library, classification,
# CK, CK_MB, Troponin and CRP
sample_data = read.csv(paste0(result_path, '/00_Data_Overview/Merged_Sample_Meta_Data.csv'))

In [33]:
# Keep the untransformed CK values; CK itself is log-transformed in the next cell
sample_data$CK_raw = sample_data$CK

In [34]:
# log2(x + 1) transform of CK
sample_data$CK = log2(sample_data$CK+1)

In [35]:
# log2(x + 1) transform of Troponin
sample_data$Troponin = log2(sample_data$Troponin+1)

In [36]:
# CK_MB is coerced to numeric first; entries that cannot be parsed as numbers become NA
# (this is the source of the coercion warning), then log2(x + 1) is applied
sample_data$CK_MB = log2(as.numeric(sample_data$CK_MB)+1)

“NAs introduced by coercion”


In [37]:
# CRP is coerced to numeric first; entries that cannot be parsed as numbers become NA
# (this is the source of the coercion warning), then log2(x + 1) is applied
sample_data$CRP =  log2(as.numeric(sample_data$CRP)+1)

“NAs introduced by coercion”


In [38]:
# Relabel sample 'm13.2' from library L6 as 'm13.22'
# NOTE(review): presumably this separates a repeated measurement of the same sample ID - confirm
sample_data$sample_id[(sample_data$sample_id == 'm13.2')  & (sample_data$library %in%  c('L6'))] = 'm13.22'

In [39]:
# Relabel sample 'm6.4' from library L10 as 'm6.42'
# NOTE(review): presumably this separates a repeated measurement of the same sample ID - confirm
sample_data$sample_id[(sample_data$sample_id == 'm6.4')  & (sample_data$library %in%  c('L10'))] = 'm6.42'

In [40]:
# Relabel sample 'm6.1' from library L3 as 'm6.12'
# NOTE(review): presumably this separates a repeated measurement of the same sample ID - confirm
sample_data$sample_id[(sample_data$sample_id == 'm6.1')  & (sample_data$library %in%  c('L3'))] = 'm6.12'

# Potential Pre-Processing / Data Adjustments

###  Filter samples (remove no-acs patients)

In [41]:
head(data_long,2)

Unnamed: 0_level_0,X,sample_id,type,variable,value,config
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>
1,1,k10,single_cell,X0_T.cell.CD4__SSU72,0.5896835,FALSE-FALSE-TRUE-0.2-FALSE-TRUE-
2,2,k10,single_cell,X0_T.cell.CD4__PARK7,0.8587511,FALSE-FALSE-TRUE-0.2-FALSE-TRUE-


In [43]:
unique(sample_data$classification)

In [44]:
samples_var

In [45]:
# Select the sample IDs to keep according to `samples_var`.
# switch() evaluates only the matching branch and stops on an unrecognised value, so
# `samples` can never silently stay undefined or keep a stale value from an earlier run.
samples = switch(samples_var,
    # no filter: every sample in the meta data
    all = unique(sample_data$sample_id),
    # everything except full exclusions and coronary sclerosis
    acs_ccs = unique(sample_data$sample_id[!sample_data$classification %in% c('vollstaendiger_ausschluss', 'koronarsklerose')]),
    # non-ACS classifications only
    no_acs = unique(sample_data$sample_id[sample_data$classification %in% c('vollstaendiger_ausschluss', 'ccs', 'koronarsklerose')]),
    # ACS classifications only
    acs = unique(sample_data$sample_id[sample_data$classification %in% c('acs_subacute', 'acs_w_infection', 'acs_w_o_infection')]),
    stop("Unknown samples_var '", samples_var, "'; expected one of 'all', 'acs_ccs', 'no_acs', 'acs'")
)
    

In [46]:
length(samples)

In [47]:
#samples

In [48]:
length(unique(data_long$sample_id))

In [49]:
# Keep only the measurements that belong to the selected samples
data_long = data_long[data_long$sample_id %in% samples,]

In [50]:
length(unique(data_long$sample_id))

### Filter out variables

In [51]:
head(data_long,2)

Unnamed: 0_level_0,X,sample_id,type,variable,value,config
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>
1,1,k10,single_cell,X0_T.cell.CD4__SSU72,0.5896835,FALSE-FALSE-TRUE-0.2-FALSE-TRUE-
2,2,k10,single_cell,X0_T.cell.CD4__PARK7,0.8587511,FALSE-FALSE-TRUE-0.2-FALSE-TRUE-


In [52]:
unique(data_long$type)

In [53]:
# Remove the variable 'delta_ef_value' from the table (for every data type)
data_long = data_long[data_long$variable != 'delta_ef_value',]

In [54]:
unique(data_long[data_long$type == 'clinical_data', c('type', 'variable')])

Unnamed: 0_level_0,type,variable
Unnamed: 0_level_1,<chr>,<chr>
1557210,clinical_data,CK
1557356,clinical_data,CK_MB
1557502,clinical_data,Troponin
1557794,clinical_data,CRP


### Filter on expression

In [55]:
# Start by flagging every measurement as expressed; exact zeros are switched to FALSE in the next cell
data_long$expression = TRUE

In [56]:
# A value of exactly 0 counts as not expressed.
# NOTE(review): rows whose value is NA are not matched by `== 0`, so they keep
# expression == TRUE and are counted as expressed by the filter below - confirm this is intended
data_long$expression[data_long$value == 0] = FALSE

In [57]:
# Count, per (type, variable), the measurements flagged as expressed.
# The column is named perc_expression but holds a count here; it only becomes a fraction
# once it is divided by the number of samples.
# .groups = 'drop' returns an ungrouped table (no leftover grouping by `type`) and
# silences the "grouped output" message.
expression_filter = data_long %>%
    group_by(type, variable) %>%
    summarise(perc_expression = sum(expression), .groups = 'drop')

[1m[22m`summarise()` has grouped output by 'type'. You can override using the `.groups` argument.


In [58]:
# Turn the count into a fraction of all samples remaining in data_long
# (the same denominator is used for every data type)
expression_filter$perc_expression = expression_filter$perc_expression / length(unique(data_long$sample_id))

In [59]:
nrow(expression_filter)

In [60]:
nrow(unique(data_long[,c('type', 'variable')]))

In [61]:
unique(expression_filter$type[expression_filter$perc_expression <= 0.2])

In [62]:
# Features flagged as expressed in at most 20% of the samples (kept separately for inspection)
filtered_out = expression_filter[expression_filter$perc_expression <= 0.2,]

In [63]:
# Features flagged as expressed in more than 20% of the samples pass the filter
expression_filter = expression_filter[expression_filter$perc_expression > 0.2,]

In [64]:
#filtered_out

In [65]:
head(expression_filter,2)

type,variable,perc_expression
<chr>,<chr>,<dbl>
clinical_data,CK,1
clinical_data,CK_MB,1


In [66]:
nrow(expression_filter)

In [67]:
head(expression_filter)

type,variable,perc_expression
<chr>,<chr>,<dbl>
clinical_data,CK,1.0
clinical_data,CK_MB,1.0
clinical_data,CRP,1.0
clinical_data,Troponin,1.0
cytokine,BCA1__CXCL13,0.8630137
cytokine,CTACK__CCL27,0.8630137


In [68]:
# Inner join on (type, variable): only rows of features that passed the expression filter remain
data_long = merge(data_long, expression_filter[,c('type', 'variable')], by.x = c('type', 'variable'), by.y = c('type', 'variable'))   # filter the data

In [69]:
unique(data_long$type)

In [70]:
nrow(data_long)

In [72]:
length(unique(data_long$variable))

In [73]:
length(unique(data_long$sample_id))

### Normalization & wide format

In [74]:
### Standardize values

In [75]:
standardize

In [76]:
if(standardize == TRUE){
    # z-score every feature. A feature is identified by (type, variable), matching the
    # grouping of the expression filter and the identifier used for the wide format.
    # Mean and sd are computed on all non-missing values, zeros included.
    feature_stats = data_long %>%
        group_by(type, variable) %>%
        summarise(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE), .groups = 'drop')
    data_long = merge(data_long, feature_stats, by = c('type', 'variable'))

    # Features without variation (sd == 0) or with fewer than two values (sd is NA) cannot be scaled
    data_long = data_long[!is.na(data_long$sd) & (data_long$sd != 0),]

    # Exact zeros are treated as missing. Only the value column is touched: blanking zeros
    # across the whole table would also erase a feature mean of exactly 0 (and with it
    # every value of that feature) as well as the FALSE entries of other columns.
    data_long$value[which(data_long$value == 0)] = NA

    # Missing values stay NA because NA propagates through the arithmetic
    data_long$value = (data_long$value - data_long$mean)/data_long$sd

    data_long$mean = NULL
    data_long$sd = NULL
    }

In [77]:
unique(data_long$type)

In [78]:
## Prepare wide format for correlations

In [79]:
# Feature identifier '<type>_0_<variable>'; the '_0_' separator is what the normalization
# cell later uses to split the identifier back into type and variable
data_long$ident = paste0(data_long$type, '_0_', data_long$variable)

In [80]:
nrow(unique(data_long[,c('sample_id', 'ident')]))

In [81]:
nrow(data_long)

In [83]:
### Transform to wide

In [84]:
# One row per sample, one column per feature identifier, cells filled from `value`
final_data = dcast(data_long, sample_id ~ ident , value.var = "value") # ! with this merging there might be NA values for some samples on some data types



In [86]:
# Use the sample IDs as row names so that the ID column can be dropped in the next cell
rownames(final_data) = final_data$sample_id

In [87]:
# Drop the ID column; only feature columns remain
final_data$sample_id = NULL

In [89]:
ncol(final_data)

In [90]:
nrow(final_data)

### Deal with NA - Set NA for 0 observation + remove samples with only NA

In [92]:
set_zero_na

In [93]:
# Optional: treat exact zeros in the wide matrix as missing observations
if(set_zero_na == TRUE) final_data[final_data == 0] = NA

In [94]:
### Remember NA's

In [95]:
# Logical matrix (samples x features) marking the entries that are missing before normalization
data_nas = is.na(final_data)

In [96]:
head(data_nas,2)

Unnamed: 0,clinical_data_0_CK,clinical_data_0_CK_MB,clinical_data_0_CRP,clinical_data_0_Troponin,cytokine_0_BCA1__CXCL13,cytokine_0_CTACK__CCL27,cytokine_0_EGF__EGF,cytokine_0_ENA78__CXCL5,cytokine_0_Eotaxin__CCL11,cytokine_0_Eotaxin2__CCL24,⋯,single_cell_0_X9_Monocytes...CD16_FCGR3A__ZEB2,single_cell_0_X9_Monocytes...CD16_FCGR3A__ZFAND5,single_cell_0_X9_Monocytes...CD16_FCGR3A__ZFAS1,single_cell_0_X9_Monocytes...CD16_FCGR3A__ZFP36,single_cell_0_X9_Monocytes...CD16_FCGR3A__ZFP36L1,single_cell_0_X9_Monocytes...CD16_FCGR3A__ZFP36L2,single_cell_0_X9_Monocytes...CD16_FCGR3A__ZNF106,single_cell_0_X9_Monocytes...CD16_FCGR3A__ZNF207,single_cell_0_X9_Monocytes...CD16_FCGR3A__ZNF706,single_cell_0_X9_Monocytes...CD16_FCGR3A__ZYX
k1,False,True,False,False,False,False,False,False,False,False,⋯,True,True,True,True,True,True,True,True,True,True
k10,True,True,False,True,False,False,False,False,False,False,⋯,False,False,False,False,False,False,False,False,False,False


In [97]:
rowSums(data_nas)

In [98]:
ncol(final_data)

In [99]:
# Samples with at least one non-missing feature (a row that is NA in every column is dropped)
keep_samples = rownames(data_nas)[rowSums(data_nas) != ncol(final_data)]

In [100]:
# Restrict the feature matrix to the samples kept above (selection by row name)
final_data = final_data[keep_samples,]

In [101]:
# Restrict the NA mask to the same samples; drop = FALSE keeps it a matrix even when
# only a single sample remains (plain matrix subsetting would collapse it to a vector)
data_nas = data_nas[keep_samples, , drop = FALSE]

In [102]:
nrow(final_data)

### Apply feature wise quantile normalization

In [108]:
quantile_norm_feat

In [109]:
if(quantile_norm_feat == TRUE){
    # Transform every feature (column) with stdnorm() and rebuild the long table.
    # NOTE(review): data.frame() applies check.names, so feature names that are not
    # syntactically valid R names are altered at this point - confirm downstream code expects that.
    final_data = data.frame(apply(final_data, 2, stdnorm))

    # Re-apply the NA mask recorded before normalization
    final_data[data_nas] = NA

    final_data$sample_id = rownames(final_data)
    # Name the id column explicitly instead of letting melt() guess it
    data_long = melt(final_data, id.vars = 'sample_id')

    # Split '<type>_0_<variable>' at the FIRST '_0_'. A greedy '.*_0_' would split at the
    # last occurrence and mis-assign type and variable for a variable name containing '_0_'.
    feature_ident = as.character(data_long$variable)
    data_long$type = str_replace(feature_ident, '_0_.*$', '')
    data_long$variable = str_replace(feature_ident, '^.*?_0_', '')
    }

Using sample_id as id variables



In [111]:
length(unique(data_long$variable))

# Save Prepared Data

In [113]:
name_save

In [115]:
# Save the prepared long table as Combined_Data_<name_save>.csv.
# write.csv() also writes the row names as a first, unnamed column by default.
write.csv(data_long, paste0(result_path, '/E-Analysis/Combined_Data_', name_save, '.csv'))

In [116]:
length(unique(data_long$variable))