In [1]:
### Script to integrate all available data source into a combined dataset
# Single Cell RNA Seq
# Cytokine Data
# Neutrophil Data
# Proteomics

#############################################
# Prerequisites - Load Libraries

In [2]:
source('MS0_Libraries.r')

“incomplete final line found by readTableHeader on '../conda_environment/Environment_Configs.csv'”


[1] "/home/icb/corinna.losert/miniconda3/envs/stark_stemi_R_Env_4_1//lib/R/library"



Attaching package: ‘igraph’


The following objects are masked from ‘package:stats’:

    decompose, spectrum


The following object is masked from ‘package:base’:

    union



Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
   

###############################################
# Preqrequisites Configurations & Parameters

In [3]:
data_path =  '../data/current'

In [4]:
result_path =  '../results/current'

In [5]:
data_path

In [6]:
### Define pre-processing options for the different data-types

In [7]:
quantile_normalization_cyto = FALSE # quantile normalization on cytokines?

In [8]:
quantile_normalization_proteomics = FALSE # quantile normalization on proteomics?

In [9]:
library_adjustment_neutrophils = TRUE # quantile normalization on neutrophils?

In [10]:
regress_neutrophils = FALSE   # keep only neutrophil residuals

In [11]:
neutrophil_threshold = 0.2 # decide how many 0 are allowed on genes measured(percentage zeros across the samples)

In [12]:
quantile_normalization_neutrophils = TRUE # quantile normalization on neutrophil data?

In [13]:
quantile_normalization_single_cell = TRUE # quantile normalization on single-cell?

In [14]:
align_genes = FALSE  # decide whether to take for neutrophils only genes of single-cell data

In [15]:
# Name on which to Save the Data
name = 'V_AZIMUTH'

# Functions

In [16]:
### Function for quantile normalization

quantile_normalization = function(X){
  ranks = apply(X, 2, rank, ties.method = 'min')  # determine ranks of each entry
  
  sorted = data.frame(apply(X, 2, sort)) # sort the entries
  means = apply(sorted, 1, mean) # calculate the means
  
  normalized_data = apply(ranks, 2 ,function(x){ means[x]}) # substitute the means into ranks matrix
}


In [17]:
### Gene wise quantile normalization


stdnorm <- function(x) {
  r = rank(x, ties.method="random")
  qnorm(r / (length(x) + 1))
}

# Load Data 

## Sample Meta Data

### Load

In [18]:
path = paste0(result_path, '/00_Data_Overview/Available_Data_per_Sample_Overview.csv')
all_samples_info = read.csv(path)
print(file.info(path)$mtime)

[1] "2023-12-10 11:16:00 CET"


In [19]:
path = paste0(result_path, '/00_Data_Overview/Merged_Sample_Meta_Data.csv')
sample_data = read.csv(path)
print(file.info(path)$mtime)

[1] "2023-12-10 11:16:00 CET"


In [20]:
## Define whether to filter only on a subset of samples

In [21]:
#patients_filter = unique(sample_data$sample_id[is.na(str_extract(sample_data$sample_id, 'k'))])  # use only acs samples
patients_filter = unique(sample_data$sample_id) #  use all samples

### Process Clinical Data

In [22]:
### Select relevant columns

In [23]:
clinical_data = sample_data[,c('sample_id', 'measurement', 'CK', 'CK_MB', 'Troponin','CRP', 'delta_ef_value', 'sample')]

In [24]:
clinical_data = clinical_data[clinical_data$sample_id %in% patients_filter,]

In [25]:
### Data transformations (log)

In [26]:
clinical_data$CK_MB = as.numeric(clinical_data$CK_MB)

“NAs introduced by coercion”


In [27]:
clinical_data$CRP = as.numeric(clinical_data$CRP)

“NAs introduced by coercion”


In [28]:
clinical_data[,3:6] = log2(clinical_data[,3:6]+1)   # logarithmize

In [29]:
### Create timepoint column from meta-data

In [30]:
clinical_data$timepoint = str_replace(clinical_data$measurement, 'TP', '')

In [31]:
clinical_data$timepoint  = as.numeric(clinical_data$timepoint )

In [32]:
### Remove some columns

In [33]:
clinical_data$measurement = NULL

In [34]:
clinical_data$sample = NULL

In [35]:
clinical_data$timepoint = NULL

In [36]:
### Summarise in case of multiple measurements

In [37]:
clinical_data = clinical_data %>% group_by(sample_id) %>% summarise(CK = mean(CK), CK_MB = mean(CK_MB), Troponin = mean(Troponin), delta_ef_value = mean(delta_ef_value), CRP =mean(CRP) )

In [38]:
clinical_data = data.frame(clinical_data)

In [39]:
### Long format to integrate in clustering

In [40]:
clinical_data_long = melt(clinical_data)

Using sample_id as id variables



In [41]:
clinical_data_long$type = 'clinical_data'

In [42]:
unique(clinical_data_long$variable)

## Cytokine Data

### Load

In [43]:
### Load processed cytokine data

In [44]:
path = paste0(result_path, '/00_Data_Overview/Prepared_Cytokine_Data.csv')
cytokines = read.csv( path)
print(file.info(path)$mtime)

[1] "2023-12-10 11:16:00 CET"


In [45]:
head(cytokines,2)

Unnamed: 0_level_0,X,sample_id,sCD40L,EGF,Eotaxin,FGF2,FLT3L,Fractalkine,GCSF,GMCSF,⋯,MCP4,MIP1.,SCF,SDF1alpha.beta,TARC,TPO,TRAIL,TSLP,id,cytokine_data
Unnamed: 0_level_1,<int>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,⋯,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<int>
1,1,M1.1,1033.94,4.49,48.09,52.77,14.4,57.66,8.70,OOR <,⋯,91.13,2665.03,5.24,4033.41,59.34,354.84,44.25,4.65,m1.1,1
2,2,M11.1,232.19,16.74,32.19,54.38,52.98,107.78,OOR <,OOR <,⋯,73.06,5105.8,74.24,10498.23,10.47,OOR <,84.87,OOR <,m11.1,1


In [46]:
### Load cytokine gene mapping (mapping of cytokines to gene-codes)

In [47]:
path = paste0(data_path, '/preprocessed-data/meta-data/Cytokine_Gene_Mapping.csv')
cytokine_gene_mapping = read.csv( path)
print(file.info(path)$mtime)

[1] "2022-07-13 11:17:31 CEST"


In [48]:
head(cytokine_gene_mapping,2)

Unnamed: 0_level_0,cytokine,mapped_name
Unnamed: 0_level_1,<chr>,<chr>
1,IL8,IL8__CXCL8
2,MIP1alpha,MIP1alpha__CCL3


In [49]:
ncol(cytokines)  # about 75 cytokines

### Pre-process

In [50]:
#### Set OOR values to 0

In [51]:
cytokines[cytokines == 'OOR <'] = 0

In [52]:
cytokines[cytokines == 'OOR'] = 0

In [53]:
cytokines[cytokines == ''] = 0

In [54]:
rownames(cytokines) = cytokines$id

In [55]:
samples = cytokines$id

In [56]:
## remove columns and ajust column names

In [57]:
cytokines$id = NULL

In [58]:
cytokines$sample_id = NULL

In [59]:
cytokines$X = NULL

In [60]:
colnames(cytokines) = str_replace(colnames(cytokines), '\\.|\\.\\.|\\.\\.\\.', '_')

In [61]:
for(i in colnames(cytokines)){
    cytokines[,i] = as.numeric(   cytokines[,i])
    }

In [62]:
head(cytokines,2)

Unnamed: 0_level_0,sCD40L,EGF,Eotaxin,FGF2,FLT3L,Fractalkine,GCSF,GMCSF,GROalpha,IFNalpha2,⋯,MCP2,MCP4,MIP1_,SCF,SDF1alpha_beta,TARC,TPO,TRAIL,TSLP,cytokine_data
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
m1.1,1033.94,4.49,48.09,52.77,14.4,57.66,8.7,0,3.43,16.31,⋯,34.73,91.13,2665.03,5.24,4033.41,59.34,354.84,44.25,4.65,1
m11.1,232.19,16.74,32.19,54.38,52.98,107.78,0.0,0,8.2,18.32,⋯,24.26,73.06,5105.8,74.24,10498.23,10.47,0.0,84.87,0.0,1


In [63]:
cytokine_names = colnames(cytokines)

In [64]:
cytokines_trans_adapted = cytokines

In [65]:
cytokines_trans_adapted = data.frame(cytokines_trans_adapted)

In [66]:
cytokines_trans_adapted$X = NULL

In [67]:
cytokines_trans_adapted$sample_id = NULL

In [68]:
#### Logarithmize the values 

In [69]:
cytokines_trans_adapted = log2(cytokines_trans_adapted + 1)

In [70]:
cytokines_trans_adapted$sample_id = samples

In [71]:
cytokines_trans_adapted$cytokine_data = NULL

In [72]:
head(cytokines_trans_adapted,2)

Unnamed: 0_level_0,sCD40L,EGF,Eotaxin,FGF2,FLT3L,Fractalkine,GCSF,GMCSF,GROalpha,IFNalpha2,⋯,MCP2,MCP4,MIP1_,SCF,SDF1alpha_beta,TARC,TPO,TRAIL,TSLP,sample_id
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
m1.1,10.015331,2.456806,5.617357,5.74873,3.944858,5.874305,3.277985,0,2.147307,4.113534,⋯,5.159064,6.525599,11.38048,2.641546,11.97814,5.915043,8.475085,5.499846,2.498251,m1.1
m11.1,7.865362,4.148934,5.052677,5.791293,5.754353,6.76527,0.0,0,3.201634,4.272023,⋯,4.658783,6.210623,12.3182,6.233428,13.358,3.519793,0.0,6.424082,0.0,m11.1


In [73]:
#### Apply quantile normalization

In [74]:
quantile_normalization_cyto

In [75]:
if(quantile_normalization_cyto == TRUE){
    rownames(cytokines_trans_adapted) = cytokines_trans_adapted$sample_id
    cytokines_trans_adapted$sample_id = NULL
    cytokines_trans_adapted = data.frame(t(cytokines_trans_adapted))
    cyto_names = rownames(cytokines_trans_adapted)
    
    
    cytokines_trans_adapted = quantile_normalization(cytokines_trans_adapted)
    rownames(cytokines_trans_adapted) = cyto_names
    cytokines_trans_adapted = data.frame(t(cytokines_trans_adapted))
    cytokines_trans_adapted$sample_id = rownames(cytokines_trans_adapted)
    }

In [76]:
#### Generate cytokine long format 

In [77]:
cytokines_trans_adapted$id = NULL

In [78]:
cytokines_trans_adapted$cytokine_data = NULL

In [79]:
cytokines_long = melt(cytokines_trans_adapted)

Using sample_id as id variables



In [80]:
cytokines_long$type = 'cytokine'

In [81]:
head(cytokines_long,2)

Unnamed: 0_level_0,sample_id,variable,value,type
Unnamed: 0_level_1,<chr>,<fct>,<dbl>,<chr>
1,m1.1,sCD40L,10.015331,cytokine
2,m11.1,sCD40L,7.865362,cytokine


In [82]:
### Adjust names to include also mapping to gene-code

In [83]:
cytokines_long$variable = as.character(cytokines_long$variable)

In [84]:
cytokines_long = merge(cytokines_long, cytokine_gene_mapping, by.x = c('variable'), by.y = c('cytokine'), all.x = TRUE)

In [85]:
cytokines_long$mapped_name[is.na(cytokines_long$mapped_name)] = cytokines_long$variable[is.na(cytokines_long$mapped_name)]

In [86]:
cytokines_long$variable = cytokines_long$mapped_name

In [87]:
cytokines_long$mapped_name = NULL

In [88]:
head(cytokines_long,2)

Unnamed: 0_level_0,variable,sample_id,value,type
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>
1,BCA1__CXCL13,m1.1,5.570766,cytokine
2,BCA1__CXCL13,m11.1,5.952334,cytokine


In [89]:
### Add duplicates

In [90]:
length(unique(cytokines_long$variable))   # amount of cytokines in data

## Proteomic Data

### Load

In [91]:
### Load prepared proteomic data


In [92]:
path = paste0(result_path, '/00_Data_Overview/Prepared_Proteomic_Data.csv')
proteomics = read.csv( path)
print(file.info(path)$mtime)

[1] "2023-12-10 11:16:09 CET"


In [93]:
head(proteomics,2)

Unnamed: 0_level_0,X,SERPINA1_A0A024R6I7,IGLV4.69_A0A075B6H9,IGLV8.61_A0A075B6I0,IGLV4.60_A0A075B6I1,IGLV10.54_A0A075B6I4,IGLV7.46_A0A075B6I9,IGLV2.18_A0A075B6J9,IGLV3.16_A0A075B6K0,IGLV3.12_A0A075B6K2,⋯,MINPP1_Q9UNW1,TLN1_Q9Y490,ANGPTL3_Q9Y5C1,LYVE1_Q9Y5Y7,FCGBP_Q9Y6R7,COLEC10_Q9Y6Z7,IGHV3OR16.9_S4R460,APOA2_V9GYM3,sample_id,proteomics_data
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<int>
1,M2.3_P10.2,-1.66338,-2.414588,1.467756,-6.164178,-1.858412,-2.588001,2.128858,-0.110549,1.968863,⋯,4.3852723,-2.882139,-6.1443596,-4.26206,0.1977374,-1.653755,6.184887,4.235612,m2.3,1
2,M5.1_P1,3.744514,-2.672991,2.308556,-5.999634,-3.812528,-2.425649,1.186534,-2.204914,1.172303,⋯,-0.5717449,-3.766868,0.4187132,-5.92088,-0.1863114,-1.655054,6.121172,4.837075,m5.1,1


### Pre-Process

In [94]:
rownames(proteomics) = proteomics$X

In [95]:
proteomics$X = NULL

In [96]:
proteomics$proteomics_data = NULL

In [97]:
ncol(proteomics)  # about 490 proteins measured

In [98]:
proteomic_names = colnames(proteomics)

#### Adjust distribution

In [99]:
## Apply quantile normalization?

In [100]:
quantile_normalization_proteomics

In [101]:
if (quantile_normalization_proteomics == TRUE){
    rownames(proteomics) = proteomics$sample_id
    proteomics$sample_id = NULL
    proteomics = t(proteomics)
    names = rownames(proteomics)
    
    proteomics  = quantile_normalization(proteomics )  # works on proteomics data
    rownames(proteomics) = names
    proteomics = data.frame(t(proteomics))
    proteomics$sample_id = rownames(proteomics)
    }

#### Prepare long format

In [102]:
## Adjust to long format

In [103]:
proteomics_long =  melt(proteomics)

Using sample_id as id variables



In [104]:
proteomics_long$type = 'proteomics'

In [105]:
head(proteomics_long,2)

Unnamed: 0_level_0,sample_id,variable,value,type
Unnamed: 0_level_1,<chr>,<fct>,<dbl>,<chr>
1,m2.3,SERPINA1_A0A024R6I7,-1.66338,proteomics
2,m5.1,SERPINA1_A0A024R6I7,3.744514,proteomics


In [106]:
length(unique(proteomics_long$variable))   # amount proteomic features

## Neutrophil Data

### Load

In [107]:
### Load prepared neutrophil data

In [108]:
path = paste0(result_path, '/00_Data_Overview/Prepared_Neutrophil_Data.csv')
neutrophils = read.csv(path)
print(file.info(path)$mtime)

[1] "2023-12-10 11:16:08 CET"


In [109]:
nrow(neutrophils)

In [110]:
head(neutrophils,2)

Unnamed: 0_level_0,X,ENSG00000000003.15,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,⋯,ENSG00000288612.1,ENSG00000288615.1,ENSG00000288617.1,ENSG00000288621.1,ENSG00000288631.1,ENSG00000288637.1,ENSG00000288638.1,ENSG00000288642.1,sample_id,neutrophil_data
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<int>
1,k4_0_ccs_ccs,0,1,0,0,14,0,0,0,1,⋯,0,0,0,0,0,0,0,0,k4,1
2,m14_3_acs_acs_noinf,0,0,1,0,6,0,0,0,0,⋯,0,0,0,0,0,0,0,0,m14.3,1


### Pre-Process

#### Adjust gene-names

In [111]:
### Convert gene names to SYMBOL annotations

In [112]:
rownames(neutrophils) = neutrophils$sample_id

In [113]:
neutrophils$X = NULL

In [114]:
neutrophils$sample_id = NULL

In [115]:
neutrophils$neutrophil_data = NULL

In [116]:
head(neutrophils,2)

Unnamed: 0_level_0,ENSG00000000003.15,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,ENSG00000001460.18,⋯,ENSG00000288607.1,ENSG00000288611.1,ENSG00000288612.1,ENSG00000288615.1,ENSG00000288617.1,ENSG00000288621.1,ENSG00000288631.1,ENSG00000288637.1,ENSG00000288638.1,ENSG00000288642.1
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
k4,0,1,0,0,14,0,0,0,1,0,⋯,0,0,0,0,0,0,0,0,0,0
m14.3,0,0,1,0,6,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [117]:
genes = colnames(neutrophils)[!is.na(str_extract(colnames(neutrophils), 'ENSG'))]

In [118]:
length(unique(genes))

In [119]:
length(genes)

In [120]:
genes = str_replace(genes, '\\..*', '') ## Adjust format for mapping

In [121]:
genes = genes[! genes %in% names(table(genes)[table(genes) > 1])]

In [122]:
length(genes)

In [123]:
length(unique(genes))

In [124]:
### Map genes to SYMBOL

In [125]:
genes_mapped = bitr(genes, fromType="ENSEMBL", toType="SYMBOL", OrgDb = 'org.Hs.eg.db') ### Map genes to SYMBOL

'select()' returned 1:many mapping between keys and columns

“33.75% of input gene IDs are fail to map...”


In [126]:
head(genes_mapped,2)

Unnamed: 0_level_0,ENSEMBL,SYMBOL
Unnamed: 0_level_1,<chr>,<chr>
1,ENSG00000000003,TSPAN6
2,ENSG00000000419,DPM1


In [127]:
nrow(genes_mapped)  # 25.338 genes that can be mapped to SYMBOL!

In [128]:
### Adjust neutrophil gene names

In [129]:
head(neutrophils,2)

Unnamed: 0_level_0,ENSG00000000003.15,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,ENSG00000001460.18,⋯,ENSG00000288607.1,ENSG00000288611.1,ENSG00000288612.1,ENSG00000288615.1,ENSG00000288617.1,ENSG00000288621.1,ENSG00000288631.1,ENSG00000288637.1,ENSG00000288638.1,ENSG00000288642.1
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
k4,0,1,0,0,14,0,0,0,1,0,⋯,0,0,0,0,0,0,0,0,0,0
m14.3,0,0,1,0,6,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [130]:
neutrophils$neutrophil_data = NULL

In [131]:
neutrophils$sample_id = NULL

In [132]:
neutrophils = data.frame(t(neutrophils))

In [133]:
neutrophils$gene = rownames(neutrophils)

In [134]:
neutrophils$gene = str_replace(neutrophils$gene, '\\..*', '')

In [135]:
head(genes_mapped, 2)

Unnamed: 0_level_0,ENSEMBL,SYMBOL
Unnamed: 0_level_1,<chr>,<chr>
1,ENSG00000000003,TSPAN6
2,ENSG00000000419,DPM1


In [136]:
neutrophils = merge(neutrophils, genes_mapped, by.x = 'gene', by.y = 'ENSEMBL')

In [137]:
nrow(neutrophils)

In [138]:
## Summarise in case of multiple entries

In [139]:
neutrophils = neutrophils %>% group_by(SYMBOL) %>% summarise(across(-gene, sum, na.rm = TRUE))

[1m[22m[36mℹ[39m In argument: `across(-gene, sum, na.rm = TRUE)`.
[36mℹ[39m In group 1: `SYMBOL = "A1BG"`.
[1m[22m[33m![39m The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.

  # Previously
  across(a:b, mean, na.rm = TRUE)

  # Now
  across(a:b, \(x) mean(x, na.rm = TRUE))”


In [140]:
neutrophils = data.frame(neutrophils)

In [141]:
nrow(neutrophils)

In [142]:
rownames(neutrophils) = neutrophils$SYMBOL

In [143]:
neutrophils$SYMBOL = NULL

In [144]:
neutrophils = data.frame(t(neutrophils))

In [145]:
neutrophils$sample_id = rownames(neutrophils)

In [146]:
neutrophils$sample_id = NULL

In [147]:
head(neutrophils,2)

Unnamed: 0_level_0,A1BG,A1BG.AS1,A1CF,A2M,A2M.AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,⋯,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZYXP1,ZZEF1,ZZZ3
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
k4,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,1,0,0,9,0,0,0
m14.3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,2,13,0,0,0


In [148]:
neutrophil_names = colnames(neutrophils)

#### Filter out ribosomal and mitochondrial genes

In [149]:
head(neutrophils,2)

Unnamed: 0_level_0,A1BG,A1BG.AS1,A1CF,A2M,A2M.AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,⋯,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZYXP1,ZZEF1,ZZZ3
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
k4,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,1,0,0,9,0,0,0
m14.3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,2,13,0,0,0


In [150]:
dim(neutrophils)

In [151]:
neutrophils = neutrophils[,is.na(str_extract(colnames(neutrophils), '^MT.*|^RPL.*|^RPS.*'))]

In [152]:
dim(neutrophils)

In [153]:
neutrophils$sample_id = rownames(neutrophils)

#### Filter out low expressed genes

In [154]:
### filter out genes that do not show an expression in sufficient number of samples

In [155]:
neutrophils$sample_id = NULL

In [156]:
dim(neutrophils)

In [157]:
neutrophil_threshold

In [158]:
ncol(neutrophils)

In [159]:
nrow(neutrophils)

In [160]:
neutrophils = neutrophils[,((colSums(neutrophils == 0))/ nrow(neutrophils)) <= neutrophil_threshold]

In [161]:
ncol(neutrophils)

### Remove samples with high amount of 0

In [162]:
head(neutrophils,2)

Unnamed: 0_level_0,AATK,ABCA7,ABHD2,ABHD5,ABTB1,ACAP2,ACOX1,ACSL1,ACTB,ACTG1,⋯,ZFAS1,ZFP36,ZFP36L1,ZFP36L2,ZMAT2,ZNF106,ZNF281,ZNF467,ZNFX1,ZYX
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
k4,1,2,4,7,8,3,0,9,34,12,⋯,5,6,11,5,2,0,2,4,0,9
m14.3,4,1,8,3,17,6,4,8,58,24,⋯,1,19,29,9,0,3,2,3,1,13


In [163]:
### Calculate percentage of zero values per sample

In [164]:
sample_perc_zero = rowSums(neutrophils==0)/ ncol(neutrophils) 

In [165]:
### Remove samples with high amount of zero values

In [166]:
remove_samples = sample_perc_zero[sample_perc_zero > 0.1]

In [167]:
neutrophils = neutrophils[!rownames(neutrophils) %in% names(remove_samples),]

In [168]:
length(unique(rownames(neutrophils)))

### Normalization

In [169]:
### Adjust for library size 

In [170]:
neutrophils$sample_id =  NULL

In [171]:
dim(neutrophils)

In [172]:
### Calculate amount of counts per sample by mean amount of counts across all samples

In [173]:
scaling_factor = rowSums(neutrophils) /mean(rowSums(neutrophils))

In [174]:
head(scaling_factor)

In [175]:
mean(rowSums(neutrophils))

In [176]:
## Adjust counts by scaling factor

In [177]:
if(library_adjustment_neutrophils == TRUE){
    neutrophils = apply(neutrophils,2, function(x){ x/scaling_factor})
    }

In [178]:
head(neutrophils,2)

Unnamed: 0,AATK,ABCA7,ABHD2,ABHD5,ABTB1,ACAP2,ACOX1,ACSL1,ACTB,ACTG1,⋯,ZFAS1,ZFP36,ZFP36L1,ZFP36L2,ZMAT2,ZNF106,ZNF281,ZNF467,ZNFX1,ZYX
m14.3,6.325736,1.581434,12.651473,4.744302,26.88438,9.488604,6.325736,12.65147,91.72318,37.95442,⋯,1.581434,30.04725,45.86159,14.23291,0.0,4.744302,3.162868,4.744302,1.581434,20.55864
m26.2,5.21303,3.909773,2.606515,15.639091,17.59398,13.032576,3.909773,27.36841,73.63406,20.20049,⋯,7.819546,15.63909,20.85212,14.33583,5.21303,7.167917,3.909773,3.909773,3.258144,14.98746


In [179]:
#### logarithmize neutrophil data 

In [180]:

neutrophils = data.frame(log2(neutrophils + 1))

In [181]:
neutrophils = data.frame(neutrophils)

In [182]:
head(neutrophils,2)

Unnamed: 0_level_0,AATK,ABCA7,ABHD2,ABHD5,ABTB1,ACAP2,ACOX1,ACSL1,ACTB,ACTG1,⋯,ZFAS1,ZFP36,ZFP36L1,ZFP36L2,ZMAT2,ZNF106,ZNF281,ZNF467,ZNFX1,ZYX
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
m14.3,2.872974,1.368173,3.770985,2.522132,4.801385,3.390751,2.872974,3.770985,6.534858,5.283715,⋯,1.368173,4.956393,5.550334,3.929119,0.0,2.522132,2.057578,2.522132,1.368173,4.430194
m26.2,2.635297,2.295656,1.850606,4.056505,4.216764,3.810708,2.295656,4.826213,6.221762,4.406026,⋯,3.140704,4.056505,4.449701,3.938835,2.635297,3.029968,2.295656,2.295656,2.090225,3.998869


### Filter on highly variable genes

In [183]:
head(neutrophils,2)

Unnamed: 0_level_0,AATK,ABCA7,ABHD2,ABHD5,ABTB1,ACAP2,ACOX1,ACSL1,ACTB,ACTG1,⋯,ZFAS1,ZFP36,ZFP36L1,ZFP36L2,ZMAT2,ZNF106,ZNF281,ZNF467,ZNFX1,ZYX
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
m14.3,2.872974,1.368173,3.770985,2.522132,4.801385,3.390751,2.872974,3.770985,6.534858,5.283715,⋯,1.368173,4.956393,5.550334,3.929119,0.0,2.522132,2.057578,2.522132,1.368173,4.430194
m26.2,2.635297,2.295656,1.850606,4.056505,4.216764,3.810708,2.295656,4.826213,6.221762,4.406026,⋯,3.140704,4.056505,4.449701,3.938835,2.635297,3.029968,2.295656,2.295656,2.090225,3.998869


In [184]:
variance = apply(neutrophils, 2, var) # calculate variance per gene

In [185]:
head(variance)

In [186]:
### Set a variance threshold for filtering

In [187]:
var_threshold = quantile(variance, probs = seq(0, 1, 0.01), na.rm = FALSE,
         names = TRUE)['25%']

In [188]:
var_threshold

In [189]:
keep_genes = names(variance[variance > var_threshold])

In [190]:
### Filter based on threshold

In [191]:
ncol(neutrophils)

In [192]:
neutrophils = neutrophils[, keep_genes]

In [193]:
ncol(neutrophils)

### Apply quantile normalization

In [194]:
neutrophils$sample_id = NULL

In [195]:
head(neutrophils,2)

Unnamed: 0_level_0,AATK,ABCA7,ABHD2,ABHD5,ABTB1,ACAP2,ACOX1,ACSL1,ACTG1,ADAM10,⋯,ZBTB7B,ZC3HAV1,ZFAND5,ZFAS1,ZFP36L2,ZMAT2,ZNF106,ZNF281,ZNF467,ZNFX1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
m14.3,2.872974,1.368173,3.770985,2.522132,4.801385,3.390751,2.872974,3.770985,5.283715,2.522132,⋯,0.0,0.0,1.368173,1.368173,3.929119,0.0,2.522132,2.057578,2.522132,1.368173
m26.2,2.635297,2.295656,1.850606,4.056505,4.216764,3.810708,2.295656,4.826213,4.406026,2.090225,⋯,1.203676,2.295656,0.0,3.140704,3.938835,2.635297,3.029968,2.295656,2.295656,2.090225


In [196]:
quantile_normalization_neutrophils

In [197]:
if(quantile_normalization_neutrophils  == TRUE){
    neutrophils = t(neutrophils)
    genes_neutrophils = rownames(neutrophils)
    neutrophils  = quantile_normalization(neutrophils ) 
    rownames(neutrophils) = genes_neutrophils
    neutrophils = data.frame(t(neutrophils))
    }

### Filter only genes that are also in single cell 

In [198]:
### Filter neutrophil genes only on subset of genes also available in single-cell data?

In [199]:
path = paste0(result_path, '/C-Analysis/C0_Filter_Genes_Input_Correlations_Perc_Values.csv')
genes_subset = read.csv(path) # cluster alternative
print(file.info(path)$mtime)

[1] "2024-01-03 14:10:30 CET"


In [200]:
head(genes_subset,2)

Unnamed: 0_level_0,X,perc_cells_expressing_gene,total_amount_cells_expressing_gene,gene,cluster
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<chr>
1,AL627309.1,0.24327612,18,AL627309.1,8_B-cell
2,AL627309.4,0.05406136,4,AL627309.4,8_B-cell


In [201]:
genes = unique(genes_subset$gene)

In [202]:
align_genes

In [203]:
if(align_genes == TRUE){
    neutrophils = neutrophils[,colnames(neutrophils) %in% genes]
    }

In [204]:
dim(neutrophils)

### Prepare long format

In [205]:
neutrophils$sample_id = rownames(neutrophils)

In [206]:
neutrophils_long = melt(neutrophils)

Using sample_id as id variables



In [207]:
neutrophils_long$type = 'neutrophil'

In [208]:
head(neutrophils_long[neutrophils_long$sample_id %in% c( 'k1', 'k10'),],2)#
# TBD: just different ordering of samples

Unnamed: 0_level_0,sample_id,variable,value,type
Unnamed: 0_level_1,<chr>,<fct>,<dbl>,<chr>
5,k1,AATK,2.746071,neutrophil
46,k10,AATK,3.074688,neutrophil


In [209]:
length(unique(neutrophils_long$variable))

In [210]:
length(unique(neutrophils_long$sample_id))

## RNA-Single-Seq

### Load info about gene-cell-expression per cluster

In [211]:
## Load dataframe from previous step containing information about percentage of cells expressing genes

In [212]:
path = paste0(result_path, '/G-Analysis/G0_Filter_Genes_Input_Correlations_Perc_Values.csv')
cell_perc_cluster = read.csv( path) 
print(path)
print(file.info(path)$mtime)

[1] "../results/current/G-Analysis/G0_Filter_Genes_Input_Correlations_Perc_Values.csv"
[1] "2024-01-08 22:23:26 CET"


In [213]:
head(cell_perc_cluster,2)

Unnamed: 0_level_0,X,perc_cells_expressing_gene,total_amount_cells_expressing_gene,gene,cluster
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<chr>
1,AL627309.1,0.23400142,23,AL627309.1,B cell
2,AL627309.4,0.06104385,6,AL627309.4,B cell


In [214]:
nrow(cell_perc_cluster)

In [215]:
length(unique(cell_perc_cluster$gene))

In [216]:
length(unique(cell_perc_cluster$cluster))

In [217]:
### Filter the dataset based on threshold to exclude lowly expressed genes

In [218]:
cell_perc_cluster =  cell_perc_cluster[((cell_perc_cluster$perc_cells > 50) & 
                                        (cell_perc_cluster$total_amount_cells_expressing_gene > 1200)) | 
                                       ((cell_perc_cluster$perc_cells > 40) & 
                                        (cell_perc_cluster$total_amount_cells_expressing_gene > 3000)) ,] 

In [219]:
nrow(cell_perc_cluster)

### Load info about amount of cells per cluster

In [220]:
### Load dataframe from previous script

In [221]:
path = paste0(result_path, '/G-Analysis/G1_Cell_Sample_Cluster_Distribution.csv')
amount_cells_data = read.csv(path)
print(path)
print(file.info(path)$mtime)

[1] "../results/current/G-Analysis/G1_Cell_Sample_Cluster_Distribution.csv"
[1] "2024-01-08 22:20:01 CET"


In [222]:
amount_cells_data$X = NULL

In [223]:
amount_cells_data = data.frame(cluster = names(colSums(amount_cells_data)), amount_cells = colSums(amount_cells_data))

In [224]:
head(amount_cells_data,2)

Unnamed: 0_level_0,cluster,amount_cells
Unnamed: 0_level_1,<chr>,<dbl>
ASDC,ASDC,8
B.cell,B.cell,9829


### Load Pseudobulk aggregated RNA data from C0

In [225]:
## Load pseudobulk data

In [226]:
name

In [227]:
path = paste0(result_path, '/G-Analysis/G0_aggregated_RNA_input_correlations_all.RDS')
load(path)   
print(path)
print(file.info(path)$mtime)

[1] "../results/current/G-Analysis/G0_aggregated_RNA_input_correlations_all.RDS"
[1] "2024-01-08 22:41:23 CET"


In [228]:
## get all genes in data

In [229]:
all_genes = rownames(pb)

In [230]:
head(all_genes)

In [231]:
length(all_genes)

In [232]:
pb

class: SingleCellExperiment 
dim: 19221 121 
metadata(2): experiment_info agg_pars
assays(29): ASDC B cell ... Platelet Treg
rownames(19221): AL627309.1 AL627309.4 ... AC004556.1 AC240274.1
rowData names(0):
colnames(121): 1.1-L1 10-L11 ... 9.2-L4 9.3-L7
colData names(26): group_id classification_measurement ... library_char
  ident
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

### Pre-Process

#### Remove Clusters

In [233]:
### Filter out clusters with too low amount of cells per sample per cluster

In [234]:
names(assays(pb))

In [235]:
assay(pb, 'Platelet') = NULL

In [236]:
assay(pb, 'Plasmablast') = NULL

In [237]:
assay(pb, 'pDC') = NULL

In [238]:
assay(pb, 'NK_CD56bright') = NULL

In [239]:
assay(pb, 'NK Proliferating') = NULL

In [240]:
assay(pb, 'ILC') = NULL

In [241]:
assay(pb, 'HSPC') = NULL

In [242]:
assay(pb, 'Eryth') = NULL

In [243]:
assay(pb, 'Doublet') = NULL

In [244]:
assay(pb, 'dnT') = NULL

In [245]:
assay(pb, 'cDC1') = NULL

In [246]:
assay(pb, 'CD8 TCM') = NULL

In [247]:
assay(pb, 'CD8 Proliferating') = NULL

In [248]:
assay(pb, 'CD4 Proliferating') = NULL

In [249]:
assay(pb, 'ASDC') = NULL

In [250]:
length(names(assays(pb)))

In [251]:
names(assays(pb))

#### Prepare gene-cluster dataframe + normalize

In [252]:
## get names of the clusters/ cell-types
nodes = names(assays(pb))

In [253]:
head(nodes)

In [254]:
cell_types = nodes

In [255]:
# generate data.frame to save counts

In [256]:
final_data = data.frame(samples = colnames(pb))

In [257]:
rownames(final_data) = final_data$samples

In [258]:
head(final_data,2)

Unnamed: 0_level_0,samples
Unnamed: 0_level_1,<chr>
1.1-L1,1.1-L1
10-L11,10-L11


In [259]:
### set genes to use to filtered data frame based on expression

In [260]:
genes_subset = cell_perc_cluster

In [261]:
### Apply normalization to data

In [262]:

for(i in unique(genes_subset$cluster)){
    data = assay(pb, i)


    ##### Normalize counts per sample (library size) 

    scaling_factor = colSums(data) /mean(colSums(data))

    for (j in 1:ncol(data)){
        if(scaling_factor[j] != 0){
            data[,j] = data[,j]/ scaling_factor[j]
            }
        }
        

    ### Subset data on genes with minimum expression in cluster
    data = data[rownames(data) %in% genes_subset$gene[genes_subset$cluster == i],]


    ##### Normalize data (Log + Quantile normalization)

    if(is.na(str_extract(name, 'scano')) == TRUE){
        data = log2(data+1) # logarithmize count values (optional!)
        }

    #### Quantile normalization 
    if(quantile_normalization_single_cell == TRUE){
        data_rows = rownames(data)
        data  = quantile_normalization(data ) 
        rownames(data) = data_rows
        }

    rownames(data) = paste0(i, '__' ,rownames(data))

    data = data.frame(t(data))

    expr_mean = data.frame( mean_expr = rowMeans(data))
    colnames(expr_mean) = i
    rownames(expr_mean) = rownames(data)

    final_data = merge(final_data, data, by = 0)

    rownames(final_data) =  final_data$Row.names
    final_data$Row.names = NULL
    }

   

In [263]:
### Merge data to sample_data

In [264]:
sample_data$sample_merge = paste0(sample_data$id, '-', sample_data$library)

In [265]:
#head(sample_data)

In [266]:
dim(final_data)

In [267]:
final_data = merge(final_data, sample_data[,c('sample_id', 'sample_merge')], by.x = 'samples', by.y = 'sample_merge')

In [268]:
dim(final_data)

In [269]:
rownames(final_data)  = final_data$samples

In [270]:
head(final_data,2)

Unnamed: 0_level_0,samples,B.cell__RPL22,B.cell__PARK7,B.cell__ENO1,B.cell__PRDM2,B.cell__CAPZB,B.cell__HP1BP3,B.cell__CDC42,B.cell__HNRNPR,B.cell__RPL11,⋯,CD4.CTL__MT.CO2,CD4.CTL__MT.ATP6,CD4.CTL__MT.CO3,CD4.CTL__MT.ND3,CD4.CTL__MT.ND4L,CD4.CTL__MT.ND4,CD4.CTL__MT.ND5,CD4.CTL__MT.ND6,CD4.CTL__MT.CYB,sample_id
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1.1-L1,1.1-L1,3.489087,0.7517455,0.4938072,1.332314,0.6630314,0.7887402,1.698647,0.7887402,5.000988,⋯,5.380742,5.755433,4.975793,5.198533,0.2775368,4.667043,3.326732,0.6091993,5.072069,m1.1
10-L11,10-L11,3.462751,0.9412188,0.8205378,1.18275,0.9206895,0.7610354,1.479767,0.7213612,4.968836,⋯,5.039021,4.625803,4.605801,4.482661,1.4139015,4.136982,2.834366,1.2093461,4.524055,k10


In [271]:
dim(final_data)

#### Filter genes

In [272]:
### Remove mitochondrial & ribosomal genes

In [273]:
head(final_data,2)

Unnamed: 0_level_0,samples,B.cell__RPL22,B.cell__PARK7,B.cell__ENO1,B.cell__PRDM2,B.cell__CAPZB,B.cell__HP1BP3,B.cell__CDC42,B.cell__HNRNPR,B.cell__RPL11,⋯,CD4.CTL__MT.CO2,CD4.CTL__MT.ATP6,CD4.CTL__MT.CO3,CD4.CTL__MT.ND3,CD4.CTL__MT.ND4L,CD4.CTL__MT.ND4,CD4.CTL__MT.ND5,CD4.CTL__MT.ND6,CD4.CTL__MT.CYB,sample_id
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1.1-L1,1.1-L1,3.489087,0.7517455,0.4938072,1.332314,0.6630314,0.7887402,1.698647,0.7887402,5.000988,⋯,5.380742,5.755433,4.975793,5.198533,0.2775368,4.667043,3.326732,0.6091993,5.072069,m1.1
10-L11,10-L11,3.462751,0.9412188,0.8205378,1.18275,0.9206895,0.7610354,1.479767,0.7213612,4.968836,⋯,5.039021,4.625803,4.605801,4.482661,1.4139015,4.136982,2.834366,1.2093461,4.524055,k10


In [274]:
ncol(final_data)

In [275]:
final_data = final_data[, !colnames(final_data) %in% (colnames(final_data)[!is.na(str_extract(colnames(final_data), '__MT.*|__RPL.*|__RPS.*'))])]

In [276]:
ncol(final_data) 

In [277]:
head(final_data,2)

Unnamed: 0_level_0,samples,B.cell__PARK7,B.cell__ENO1,B.cell__PRDM2,B.cell__CAPZB,B.cell__HP1BP3,B.cell__CDC42,B.cell__HNRNPR,B.cell__PNRC2,B.cell__SRSF10,⋯,CD4.CTL__SNU13,CD4.CTL__SMDT1,CD4.CTL__TSPO,CD4.CTL__SOD1,CD4.CTL__SON,CD4.CTL__TTC3,CD4.CTL__HMGN1,CD4.CTL__ITGB2,CD4.CTL__PRMT2,sample_id
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1.1-L1,1.1-L1,0.7517455,0.4938072,1.332314,0.6630314,0.7887402,1.698647,0.7887402,0.8199788,0.9933239,⋯,0.8392725,1.2053873,0.6091993,0.8392725,1.57955,0.2775368,1.756926,1.053814,0.6091993,m1.1
10-L11,10-L11,0.9412188,0.8205378,1.18275,0.9206895,0.7610354,1.479767,0.7213612,0.7969667,0.8442397,⋯,0.9897038,0.4184606,1.0203473,1.3312251,1.57955,0.8628635,1.331225,1.560533,0.7464948,k10


In [278]:
final_data$samples = NULL

In [279]:
final_data$sample_id = NULL

In [280]:
final_data$samples = rownames(final_data)

In [281]:
final_data = merge(final_data, sample_data[,c('sample_id', 'sample_merge')], by.x = 'samples', by.y = 'sample_merge')

#### Prepare long format

In [282]:
final_data_long = melt(final_data)

Using samples, sample_id as id variables



In [283]:
head(final_data_long,2)

Unnamed: 0_level_0,samples,sample_id,variable,value
Unnamed: 0_level_1,<chr>,<chr>,<fct>,<dbl>
1,1.1-L1,m1.1,B.cell__PARK7,0.7517455
2,10-L11,k10,B.cell__PARK7,0.9412188


In [284]:
final_data_long$samples = NULL

In [285]:
final_data_long$type = 'single_cell'

In [286]:
final_data_long = final_data_long %>% group_by(sample_id, type, variable) %>% summarise(value = mean(value))  # take average for samples measured twice

[1m[22m`summarise()` has grouped output by 'sample_id', 'type'. You can override using
the `.groups` argument.


In [287]:
length(unique(final_data_long$variable))

# Integration of all data sources (V1 with gene-gene correletations)

## Combine all data sources and save the data

In [288]:
data_long = rbind(final_data_long, cytokines_long,proteomics_long, neutrophils_long, clinical_data_long )
#data_long = rbind(final_data_long, cytokines_long,proteomics_long, neutrophils_long ) # version without clinical data

In [289]:
length(unique(final_data_long$variable))

In [290]:
head(data_long,2)

sample_id,type,variable,value
<chr>,<chr>,<chr>,<dbl>
k10,single_cell,B.cell__PARK7,0.9412188
k10,single_cell,B.cell__ENO1,0.8205378


In [291]:
length(unique(data_long$variable))

In [292]:
unique(data_long$type)

In [293]:
data_long$config = paste0(quantile_normalization_cyto, '-', quantile_normalization_proteomics, '-', quantile_normalization_neutrophils, '-', neutrophil_threshold, '-', regress_neutrophils, '-', library_adjustment_neutrophils, '-')

In [294]:
print(paste0(result_path, '/G-Analysis/Combined_Data_', name, '.csv'))

[1] "../results/current/G-Analysis/Combined_Data_V_AZIMUTH.csv"


In [295]:
write.csv(data_long, paste0(result_path, '/G-Analysis/Combined_Data_', name, '.csv'))

In [296]:
name