In [1]:
### Make adjustments to data for MOFA input from E1 + Prepare

#############################################
# Prerequisites - Load Libraries

In [2]:
source('MS0_Libraries.r')


Attaching package: ‘igraph’


The following objects are masked from ‘package:stats’:

    decompose, spectrum


The following object is masked from ‘package:base’:

    union



Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
   

In [3]:
source('MS4_Plot_Config.r')

###############################################
# Prerequisites - Configurations & Parameters

In [4]:
data_path = '../data/current'

In [5]:
result_path = '../results/current'

In [6]:
data_path

In [7]:
standardize = FALSE

In [8]:
set_zero_na = FALSE

In [9]:
quantile_norm_feat = TRUE

In [10]:
samples_var = 'all'
# samples_var = 'acs'

In [11]:
name = 'V_AZIMUTH'   # name of dataset to load
name_save = 'V_AZIMUTH_INTEGRATED'

# Functions

In [12]:
### Function for quantile normalization

# Quantile-normalise the columns of X so that every column shares the same
# empirical distribution (the mean of the column-wise sorted values).
#
# X:       numeric matrix or data frame, one column per distribution.
# returns: numeric matrix with the dimensions and dimnames of X.
#
# Ties receive the minimum rank (ties.method = 'min'). The computation is
# deterministic, so the global RNG state is left untouched.
quantile_normalization <- function(X) {
  X <- as.matrix(X)

  # rank of every entry within its own column
  ranks <- apply(X, 2, rank, ties.method = 'min')
  # column-wise sorted values; row i holds the i-th smallest value of each column
  sorted <- apply(X, 2, sort)

  # apply() only returns matrices when every column yields the same number (> 1)
  # of values; anything else cannot be averaged row by row
  if (!is.matrix(ranks) || !is.matrix(sorted)) {
    stop('quantile_normalization needs at least two rows and the same number of non-NA values in every column')
  }

  # reference distribution: mean of the i-th smallest values across columns
  means <- unname(apply(sorted, 1, mean))

  # replace each rank by the reference value of that rank
  normalized_data <- apply(ranks, 2, function(r) means[r])
  # restore the labels of the input (indexing by rank would otherwise scramble them)
  dimnames(normalized_data) <- dimnames(X)

  normalized_data
}


In [13]:
### Gene wise quantile normalization

# Rank-based inverse normal transform of one feature vector.
#
# x:       numeric vector, may contain NA.
# returns: vector of the same length; non-missing entries are replaced by
#          qnorm(rank / (n + 1)), with n the number of non-missing entries and
#          ties sharing their average rank. NA entries stay NA.
#
# Ranking is deterministic, so no seed is set and the global RNG state of the
# caller is not modified.
stdnorm <- function(x) {
  observed <- !is.na(x)
  r <- rank(x[observed], ties.method = "average")
  # dividing by n + 1 keeps the probabilities strictly inside (0, 1)
  x[observed] <- qnorm(r / (sum(observed) + 1))
  x
}

# Load Data 

## Prepared combined data

In [14]:
# Read the combined long-format table for dataset `name` and print the
# modification time of the file that was loaded.
path = file.path(result_path, 'G-Analysis', paste0('Combined_Data_', name, '.csv'))
data_long = read.csv(path)
print(file.mtime(path))

[1] "2023-02-01 11:41:54 CET"


In [15]:
head(data_long,2)

Unnamed: 0_level_0,X,sample_id,type,variable,value,config
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>
1,1,k10,single_cell,B.cell__PARK7,0.9412188,FALSE-FALSE-TRUE-0.2-FALSE-TRUE-
2,2,k10,single_cell,B.cell__ENO1,0.8205378,FALSE-FALSE-TRUE-0.2-FALSE-TRUE-


In [16]:
nrow(data_long)

In [17]:
nrow(unique(data_long[,c('sample_id', 'type', 'variable')]))

In [21]:
length(unique(data_long$sample_id))

## Sample Data

In [24]:
sample_data = read.csv(paste0(result_path, '/00_Data_Overview/Merged_Sample_Meta_Data.csv'))

In [26]:
sample_data$CK_raw = sample_data$CK

In [27]:
sample_data$CK = log2(sample_data$CK+1)

In [28]:
sample_data$Troponin = log2(sample_data$Troponin+1)

In [29]:
sample_data$CK_MB = log2(as.numeric(sample_data$CK_MB)+1)

“NAs introduced by coercion”


In [30]:
sample_data$CRP =  log2(as.numeric(sample_data$CRP)+1)

“NAs introduced by coercion”


In [31]:
sample_data$sample_id[(sample_data$sample_id == 'm13.2')  & (sample_data$library %in%  c('L6'))] = 'm13.22'

In [32]:
sample_data$sample_id[(sample_data$sample_id == 'm6.4')  & (sample_data$library %in%  c('L10'))] = 'm6.42'

In [33]:
sample_data$sample_id[(sample_data$sample_id == 'm6.1')  & (sample_data$library %in%  c('L3'))] = 'm6.12'

# Potential Pre-Processing / Data Adjustments

###  Filter samples (remove no-acs patients)

In [54]:
head(data_long,2)

Unnamed: 0_level_0,X,sample_id,type,variable,value,config
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>
1,1,k10,single_cell,B.cell__PARK7,0.9412188,FALSE-FALSE-TRUE-0.2-FALSE-TRUE-
2,2,k10,single_cell,B.cell__ENO1,0.8205378,FALSE-FALSE-TRUE-0.2-FALSE-TRUE-


In [55]:
head(sample_data,2)

Unnamed: 0_level_0,X.1,sample_id,sample,id,measurement,library,id.y,name,read,pattern,⋯,delta_ef_value_group,delta_ef_value,delta_ef_value_class,ef_classification_data,CK,CK_MB,Troponin,CRP,clinical_data,CK_raw
Unnamed: 0_level_1,<int>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<dbl>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>
1,1,k1,K1,1,TP0,L13,HTO_B0251,No-CCS-1,R2,5PNNNNNNNNNN(BC),⋯,,,,0,5.459432,,0.01863417,0.4854268,1,43.0
2,2,k10,K10,10,TP0,L11,HTO_B0256,Ch-CCS-10,R2,5PNNNNNNNNNN(BC),⋯,,,,0,,,,0.2630344,1,


In [56]:
unique(sample_data$classification)

In [57]:
samples_var

In [58]:
# Choose the sample ids to keep, depending on samples_var:
#   'all'     - every sample, no filter
#   'acs_ccs' - everything except 'vollstaendiger_ausschluss' and 'koronarsklerose'
#   'no_acs'  - only 'vollstaendiger_ausschluss', 'ccs' and 'koronarsklerose'
#   'acs'     - only the three acs_* classifications
# Any other value stops with an error instead of leaving `samples` undefined
# (or silently reusing a value from an earlier run of the notebook).
samples = switch(samples_var,
    all = unique(sample_data$sample_id),
    acs_ccs = unique(sample_data$sample_id[!sample_data$classification %in% c('vollstaendiger_ausschluss', 'koronarsklerose')]),
    no_acs = unique(sample_data$sample_id[sample_data$classification %in% c('vollstaendiger_ausschluss', 'ccs', 'koronarsklerose')]),
    acs = unique(sample_data$sample_id[sample_data$classification %in% c('acs_subacute', 'acs_w_infection', 'acs_w_o_infection')]),
    stop("Unknown samples_var '", samples_var, "': expected one of 'all', 'acs_ccs', 'no_acs', 'acs'")
)
    

In [59]:
length(samples)

In [60]:
#samples

In [61]:
length(unique(data_long$sample_id))

In [62]:
data_long = data_long[data_long$sample_id %in% samples,]

In [63]:
length(unique(data_long$sample_id))

### Filter out variables

In [64]:
head(data_long,2)

Unnamed: 0_level_0,X,sample_id,type,variable,value,config
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>
1,1,k10,single_cell,B.cell__PARK7,0.9412188,FALSE-FALSE-TRUE-0.2-FALSE-TRUE-
2,2,k10,single_cell,B.cell__ENO1,0.8205378,FALSE-FALSE-TRUE-0.2-FALSE-TRUE-


In [65]:
unique(data_long$type)

In [66]:
data_long = data_long[data_long$variable != 'delta_ef_value',]

In [67]:
unique(data_long[data_long$type == 'clinical_data', c('type', 'variable')])

Unnamed: 0_level_0,type,variable
Unnamed: 0_level_1,<chr>,<chr>
1064907,clinical_data,CK
1065053,clinical_data,CK_MB
1065199,clinical_data,Troponin
1065491,clinical_data,CRP


### Filter on expression

In [68]:
data_long$expression = TRUE

In [69]:
# Flag rows with a value of exactly 0 as not expressed.
# NOTE(review): rows whose value is NA are not selected by this comparison and
# therefore keep expression == TRUE, so they count as expressed in the
# per-feature sum computed afterwards - confirm this is intended.
data_long$expression[data_long$value == 0] = FALSE

In [71]:
# Per (type, variable) feature: number of rows flagged as expressed.
# The column is named perc_expression already but holds a count at this point.
# .groups = 'drop' returns an ungrouped table, so later subsetting is not
# affected by leftover grouping and no grouping message is printed.
expression_filter = data_long %>%
    group_by(type, variable) %>%
    summarise(perc_expression = sum(expression), .groups = 'drop')

[1m[22m`summarise()` has grouped output by 'type'. You can override using the
`.groups` argument.


In [72]:
expression_filter$perc_expression = expression_filter$perc_expression / length(unique(data_long$sample_id))

In [73]:
nrow(expression_filter)

In [74]:
nrow(unique(data_long[,c('type', 'variable')]))

In [75]:
unique(expression_filter$type[expression_filter$perc_expression <= 0.2])

In [76]:
filtered_out = expression_filter[expression_filter$perc_expression <= 0.2,]

In [77]:
expression_filter = expression_filter[expression_filter$perc_expression > 0.2,]

In [78]:
#filtered_out

In [79]:
head(expression_filter,2)

type,variable,perc_expression
<chr>,<chr>,<dbl>
clinical_data,CK,1
clinical_data,CK_MB,1


In [80]:
nrow(expression_filter)

In [81]:
head(expression_filter)

type,variable,perc_expression
<chr>,<chr>,<dbl>
clinical_data,CK,1.0
clinical_data,CK_MB,1.0
clinical_data,CRP,1.0
clinical_data,Troponin,1.0
cytokine,BCA1__CXCL13,0.8630137
cytokine,CTACK__CCL27,0.8630137


In [82]:
# Inner join on (type, variable): only rows of features that are still listed
# in expression_filter are kept.
data_long = merge(
    data_long,
    expression_filter[, c('type', 'variable')],
    by = c('type', 'variable')
)

In [83]:
unique(data_long$type)

In [84]:
nrow(data_long)

In [86]:
length(unique(data_long$variable))

In [87]:
length(unique(data_long$sample_id))

### Normalization & wide format

In [88]:
### Standardize values

In [89]:
standardize

In [90]:
# Optional z-scoring of every feature (only runs when standardize is TRUE).
if (isTRUE(standardize)) {
    # Mean and sd per feature. Features are keyed on type AND variable, the same
    # key used for the expression filter and for the wide-format identifier.
    data_long = merge(
        data_long,
        data_long %>%
            group_by(type, variable) %>%
            summarise(value_mean = mean(value, na.rm = TRUE),
                      value_sd = sd(value, na.rm = TRUE),
                      .groups = 'drop'),
        by = c('type', 'variable')
    )

    # Zero measurements are treated as missing. Only the value column is touched;
    # blanking every zero in the whole table would also erase expression == FALSE
    # flags and zero means.
    data_long$value[!is.na(data_long$value) & data_long$value == 0] = NA

    # Features without variation (sd of 0 or undefined) cannot be scaled
    data_long = data_long[!is.na(data_long$value_sd) & (data_long$value_sd != 0),]

    # z-score; missing values stay NA through the arithmetic
    data_long$value = (data_long$value - data_long$value_mean) / data_long$value_sd

    data_long$value_mean = NULL
    data_long$value_sd = NULL
    }

In [91]:
unique(data_long$type)

In [92]:
## Prepare wide format for correlations

In [93]:
data_long$ident = paste0(data_long$type, '_0_', data_long$variable)

In [94]:
nrow(unique(data_long[,c('sample_id', 'ident')]))

In [95]:
nrow(data_long)

In [96]:
data_long[(data_long$sample_id == 'k11') & (data_long$variable == 'CRP'),]

Unnamed: 0_level_0,type,variable,X,sample_id,value,config,expression,ident
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<dbl>,<chr>,<lgl>,<chr>
295,clinical_data,CRP,1065493,k11,0.1375035,FALSE-FALSE-TRUE-0.2-FALSE-TRUE-,True,clinical_data_0_CRP


In [97]:
### Transform to wide

In [98]:
# Long -> wide: one row per sample, one column per feature identifier.
# dcast() silently falls back to counting (length) when a sample/feature pair
# occurs more than once, which would replace the measurements by counts - stop
# early in that case.
if (anyDuplicated(data_long[, c('sample_id', 'ident')]) > 0) {
    stop('data_long contains duplicated sample_id / ident pairs; dcast would aggregate them to counts')
}
final_data = dcast(data_long, sample_id ~ ident , value.var = "value") # ! samples without a row for a feature get NA in that column

In [99]:
head(final_data,2)

Unnamed: 0_level_0,sample_id,clinical_data_0_CK,clinical_data_0_CK_MB,clinical_data_0_CRP,clinical_data_0_Troponin,cytokine_0_BCA1__CXCL13,cytokine_0_CTACK__CCL27,cytokine_0_EGF__EGF,cytokine_0_ENA78__CXCL5,cytokine_0_Eotaxin__CCL11,⋯,single_cell_0_Treg__VIM,single_cell_0_Treg__YBX1,single_cell_0_Treg__YPEL5,single_cell_0_Treg__YWHAB,single_cell_0_Treg__YWHAZ,single_cell_0_Treg__ZC3HAV1,single_cell_0_Treg__ZFAS1,single_cell_0_Treg__ZFP36,single_cell_0_Treg__ZFP36L2,single_cell_0_Treg__ZNF331
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,k1,5.459432,,0.4854268,0.01863417,5.69655,9.821662,5.045268,10.562366,7.42341,⋯,,,,,,,,,,
2,k10,,,0.2630344,,4.697663,9.70122,1.922198,7.724173,9.19842,⋯,2.356913,1.735349,1.695393,1.772633,1.975818,0.9466424,1.417687,1.670544,1.474395,1.695393


In [100]:
rownames(final_data) = final_data$sample_id

In [101]:
final_data$sample_id = NULL

In [103]:
ncol(final_data)

In [104]:
nrow(final_data)

### Handle NAs - optionally set zero observations to NA + remove samples that contain only NA

In [106]:
set_zero_na

In [107]:
# When set_zero_na is switched on, every exact-zero entry of the wide table is
# turned into a missing value.
if (set_zero_na == TRUE) {
    is.na(final_data) = final_data == 0
}

In [108]:
### Remember NA's

In [109]:
data_nas = is.na(final_data)

In [110]:
head(data_nas,2)

Unnamed: 0,clinical_data_0_CK,clinical_data_0_CK_MB,clinical_data_0_CRP,clinical_data_0_Troponin,cytokine_0_BCA1__CXCL13,cytokine_0_CTACK__CCL27,cytokine_0_EGF__EGF,cytokine_0_ENA78__CXCL5,cytokine_0_Eotaxin__CCL11,cytokine_0_Eotaxin2__CCL24,⋯,single_cell_0_Treg__VIM,single_cell_0_Treg__YBX1,single_cell_0_Treg__YPEL5,single_cell_0_Treg__YWHAB,single_cell_0_Treg__YWHAZ,single_cell_0_Treg__ZC3HAV1,single_cell_0_Treg__ZFAS1,single_cell_0_Treg__ZFP36,single_cell_0_Treg__ZFP36L2,single_cell_0_Treg__ZNF331
k1,False,True,False,False,False,False,False,False,False,False,⋯,True,True,True,True,True,True,True,True,True,True
k10,True,True,False,True,False,False,False,False,False,False,⋯,False,False,False,False,False,False,False,False,False,False


In [111]:
rowSums(data_nas)

In [112]:
ncol(final_data)

In [113]:
# Samples (row names) that have at least one non-missing feature,
# i.e. whose NA count differs from the number of feature columns.
keep_samples = rownames(data_nas)[rowSums(data_nas) != ncol(final_data)]

In [114]:
final_data = final_data[keep_samples,]

In [115]:
data_nas = data_nas[keep_samples,]

In [116]:
nrow(final_data)

### Apply feature wise quantile normalization

In [136]:
quantile_norm_feat

In [137]:
# Feature-wise rank-based inverse normal transform, followed by a conversion
# back to long format (columns: sample_id, variable, value, type).
if (isTRUE(quantile_norm_feat)) {
    # stdnorm is applied to every feature column. check.names = FALSE keeps the
    # '<type>_0_<variable>' column names exactly as built, so feature names are
    # not rewritten by make.names() before they are split again below.
    final_data = data.frame(apply(final_data, 2, stdnorm), check.names = FALSE)
    # re-apply the remembered missing-value mask
    final_data[data_nas] = NA
    final_data$sample_id = rownames(final_data)

    # explicit id column instead of letting melt() guess it
    data_long = melt(final_data, id.vars = 'sample_id')

    # Split the identifier at the FIRST '_0_': type is everything before it,
    # variable everything after it. A greedy '.*_0_' would split at the last
    # occurrence and mis-assign variables whose own name contains '_0_'.
    data_long$variable = as.character(data_long$variable)
    data_long$type = str_replace(data_long$variable, '_0_.*$', '')
    data_long$variable = str_replace(data_long$variable, '^.*?_0_', '')
    }

Using sample_id as id variables



In [139]:
length(unique(data_long$variable))

# Remove certain variables

In [234]:
unique(data_long$variable[data_long$type == 'clinical_data'])

In [235]:
data_long = data_long[data_long$variable != 'delta_ef_value',]

In [236]:
head(data_long,2)

Unnamed: 0_level_0,sample_id,variable,value,type
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>
1,k1,CK,-2.397022,clinical_data
2,k10,CK,,clinical_data


In [237]:
unique(data_long$variable[data_long$type == 'clinical_data'])

In [238]:
length(unique(data_long$variable))

# Save Prepared Data

In [239]:
name_save

In [240]:
# Store the prepared long-format table under the name given by name_save.
write.csv(
    data_long,
    file.path(result_path, 'G-Analysis', paste0('Combined_Data_', name_save, '.csv'))
)

In [242]:
length(unique(data_long$variable))