In [1]:
### Script to integrate and align all available data sources:
# Single Cell RNA Seq
# Cytokine Data
# Neutrophil Data
# Proteomics

#############################################
# Prerequisites - Load Libraries

In [2]:
source('MS0_Libraries.r')


Attaching package: ‘igraph’


The following objects are masked from ‘package:stats’:

    decompose, spectrum


The following object is masked from ‘package:base’:

    union



Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
   

###############################################
# Preqrequisites Configurations & Parameters

In [3]:
data_path = '../data/current'

In [4]:
result_path = '../results/current'

In [5]:
data_path

In [6]:
### Should quantile normalization be applied?

In [7]:
quantile_normalization_cyto = FALSE

In [8]:
quantile_normalization_proteomics = FALSE

In [9]:
library_adjustment_neutrophils = TRUE

In [10]:
regress_neutrophils = FALSE

In [11]:
neutrophil_threshold = 0.2 # decide how many 0 are allowed on genes measured(percentage zeros)

In [12]:
quantile_normalization_neutrophils = TRUE

In [13]:
quantile_normalization_single_cell = TRUE

In [14]:
align_genes = FALSE  # decide whether to take only genes of single-cell data

In [15]:
add_duplicates = FALSE

In [16]:
# Name on which to Save the Data
name = 'V_AZIMUTH'

# Functions

In [17]:
### Function for quantile normalization

quantile_normalization = function(X){
  ranks = apply(X, 2, rank, ties.method = 'min')  # determine ranks of each entry
  
  sorted = data.frame(apply(X, 2, sort)) # sort the entries
  means = apply(sorted, 1, mean) # calculate the means
  
  normalized_data = apply(ranks, 2 ,function(x){ means[x]}) # substitute the means into ranks matrix
}


In [18]:
### Gene wise quantile normalization


stdnorm <- function(x) {
  r = rank(x, ties.method="random")
  qnorm(r / (length(x) + 1))
}

# Load Data 

## Sample Meta Data

### Load

In [19]:
path = paste0(result_path, '/00_Data_Overview/Available_Data_per_Sample_Overview.csv')
all_samples_info = read.csv(path)
print(file.info(path)$mtime)

[1] "2023-01-30 10:34:54 CET"


In [20]:
path = paste0(result_path, '/00_Data_Overview/Merged_Sample_Meta_Data.csv')
sample_data = read.csv(path)
print(file.info(path)$mtime)

[1] "2023-01-30 10:34:54 CET"


In [23]:
#patients_filter = unique(sample_data$sample_id[is.na(str_extract(sample_data$sample_id, 'k'))])  # use only acs samples
patients_filter = unique(sample_data$sample_id) #  use all samples

### Process Clinical Data

In [26]:
### Select relevant columsn

In [27]:
clinical_data = sample_data[,c('sample_id', 'measurement', 'CK', 'CK_MB', 'Troponin','CRP', 'delta_ef_value', 'sample')]

In [28]:
clinical_data = clinical_data[clinical_data$sample_id %in% patients_filter,]

In [29]:
### Data transformations

In [30]:
clinical_data$CK_MB = as.numeric(clinical_data$CK_MB)

“NAs introduced by coercion”


In [31]:
clinical_data$CRP = as.numeric(clinical_data$CRP)

“NAs introduced by coercion”


In [33]:
clinical_data[,3:6] = log2(clinical_data[,3:6]+1)   # logarithmize

In [35]:
### Column Generation

In [55]:
clinical_data$measurement = NULL

In [56]:
clinical_data$sample = NULL

In [57]:
clinical_data$timepoint = NULL

In [58]:
clinical_data = clinical_data %>% group_by(sample_id) %>% summarise(CK = mean(CK), CK_MB = mean(CK_MB), Troponin = mean(Troponin), delta_ef_value = mean(delta_ef_value), CRP =mean(CRP) )

In [59]:
clinical_data = data.frame(clinical_data)

In [72]:
### Long format to integrate in clustering

In [74]:
clinical_data_long = melt(clinical_data)

Using sample_id as id variables



In [76]:
clinical_data_long$type = 'clinical_data'

In [77]:
clinical_data_dupl = clinical_data_long[clinical_data_long$sample_id %in% c('m13.2', 'm6.4'),]

In [78]:
clinical_data_dupl$sample_id[clinical_data_dupl$sample_id == 'm13.2'] = 'm13.22'

In [79]:
clinical_data_dupl$sample_id[clinical_data_dupl$sample_id == 'm6.4'] = 'm6.42'

In [80]:
head(clinical_data_dupl,2)

Unnamed: 0_level_0,sample_id,variable,value,type
Unnamed: 0_level_1,<chr>,<fct>,<dbl>,<chr>
52,m13.22,CK,11.608255,clinical_data
134,m6.42,CK,5.906891,clinical_data


In [81]:
if(add_duplicates == TRUE){
    clinical_data_long = rbind(clinical_data_long, clinical_data_dupl)
    }

In [82]:
unique(clinical_data_long$variable)

In [83]:
sum(clinical_data_long$sample_id == 'm.20.1')

## Cytokine Data

### Load

In [84]:
### Load processed cytokine data

In [85]:
path = paste0(result_path, '/00_Data_Overview/Prepared_Cytokine_Data.csv')
cytokines = read.csv( path)
print(file.info(path)$mtime)

[1] "2023-01-30 10:34:54 CET"


In [86]:
### Load cytokine gene mapping

In [87]:
path = paste0(data_path, '/preprocessed-data/meta-data/Cytokine_Gene_Mapping.csv')
cytokine_gene_mapping = read.csv( path)
print(file.info(path)$mtime)

[1] "2022-07-13 11:17:31 CEST"


In [88]:
head(cytokines,2)

Unnamed: 0_level_0,X,sample_id,sCD40L,EGF,Eotaxin,FGF2,FLT3L,Fractalkine,GCSF,GMCSF,⋯,MCP4,MIP1.,SCF,SDF1alpha.beta,TARC,TPO,TRAIL,TSLP,id,cytokine_data
Unnamed: 0_level_1,<int>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,⋯,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<int>
1,1,M1.1,1033.94,4.49,48.09,52.77,14.4,57.66,8.70,OOR <,⋯,91.13,2665.03,5.24,4033.41,59.34,354.84,44.25,4.65,m1.1,1
2,2,M11.1,232.19,16.74,32.19,54.38,52.98,107.78,OOR <,OOR <,⋯,73.06,5105.8,74.24,10498.23,10.47,OOR <,84.87,OOR <,m11.1,1


In [89]:
head(cytokine_gene_mapping,2)

Unnamed: 0_level_0,cytokine,mapped_name
Unnamed: 0_level_1,<chr>,<chr>
1,IL8,IL8__CXCL8
2,MIP1alpha,MIP1alpha__CCL3


In [90]:
cytokine_gene_mapping[cytokine_gene_mapping$cytokine == 'IL15',]

Unnamed: 0_level_0,cytokine,mapped_name
Unnamed: 0_level_1,<chr>,<chr>
26,IL15,IL15__IL15


In [92]:
ncol(cytokines)  # about 75 cytokines

### Pre-process

In [93]:
#### Set OOR values to 0

In [94]:
cytokines[cytokines == 'OOR <'] = 0

In [95]:
cytokines[cytokines == 'OOR'] = 0

In [96]:
cytokines[cytokines == ''] = 0

In [97]:
rownames(cytokines) = cytokines$id

In [98]:
samples = cytokines$id

In [99]:
samples[samples == 'm.20.1']

In [100]:
cytokines$id = NULL

In [101]:
cytokines$sample_id = NULL

In [102]:
cytokines$X = NULL

In [103]:
colnames(cytokines) = str_replace(colnames(cytokines), '\\.|\\.\\.|\\.\\.\\.', '_')

In [104]:
for(i in colnames(cytokines)){
    cytokines[,i] = as.numeric(   cytokines[,i])
    }

In [105]:
head(cytokines,2)

Unnamed: 0_level_0,sCD40L,EGF,Eotaxin,FGF2,FLT3L,Fractalkine,GCSF,GMCSF,GROalpha,IFNalpha2,⋯,MCP2,MCP4,MIP1_,SCF,SDF1alpha_beta,TARC,TPO,TRAIL,TSLP,cytokine_data
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
m1.1,1033.94,4.49,48.09,52.77,14.4,57.66,8.7,0,3.43,16.31,⋯,34.73,91.13,2665.03,5.24,4033.41,59.34,354.84,44.25,4.65,1
m11.1,232.19,16.74,32.19,54.38,52.98,107.78,0.0,0,8.2,18.32,⋯,24.26,73.06,5105.8,74.24,10498.23,10.47,0.0,84.87,0.0,1


In [106]:
cytokine_names = colnames(cytokines)

In [107]:
cytokines_trans_adapted = cytokines

In [108]:
cytokines_trans_adapted = data.frame(cytokines_trans_adapted)

In [109]:
cytokines_trans_adapted$X = NULL

In [110]:
cytokines_trans_adapted$sample_id = NULL

In [112]:
#### Logarithmize the values (TBD - other transformations?)

In [113]:
cytokines_trans_adapted = log2(cytokines_trans_adapted + 1)

In [114]:
cytokines_trans_adapted$sample_id = samples

In [116]:
cytokines_trans_adapted$cytokine_data = NULL

In [117]:
head(cytokines_trans_adapted,2)

Unnamed: 0_level_0,sCD40L,EGF,Eotaxin,FGF2,FLT3L,Fractalkine,GCSF,GMCSF,GROalpha,IFNalpha2,⋯,MCP2,MCP4,MIP1_,SCF,SDF1alpha_beta,TARC,TPO,TRAIL,TSLP,sample_id
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
m1.1,10.015331,2.456806,5.617357,5.74873,3.944858,5.874305,3.277985,0,2.147307,4.113534,⋯,5.159064,6.525599,11.38048,2.641546,11.97814,5.915043,8.475085,5.499846,2.498251,m1.1
m11.1,7.865362,4.148934,5.052677,5.791293,5.754353,6.76527,0.0,0,3.201634,4.272023,⋯,4.658783,6.210623,12.3182,6.233428,13.358,3.519793,0.0,6.424082,0.0,m11.1


In [118]:
#### Apply quantile normalization

In [119]:
quantile_normalization_cyto

In [120]:
if(quantile_normalization_cyto == TRUE){
    rownames(cytokines_trans_adapted) = cytokines_trans_adapted$sample_id
    cytokines_trans_adapted$sample_id = NULL
    cytokines_trans_adapted = data.frame(t(cytokines_trans_adapted))
    cyto_names = rownames(cytokines_trans_adapted)
    
    
    cytokines_trans_adapted = quantile_normalization(cytokines_trans_adapted)
    rownames(cytokines_trans_adapted) = cyto_names
    cytokines_trans_adapted = data.frame(t(cytokines_trans_adapted))
    cytokines_trans_adapted$sample_id = rownames(cytokines_trans_adapted)
    }

In [121]:
#### Generate cytokine long format for visualization

In [122]:
head(cytokines_trans_adapted,2)

Unnamed: 0_level_0,sCD40L,EGF,Eotaxin,FGF2,FLT3L,Fractalkine,GCSF,GMCSF,GROalpha,IFNalpha2,⋯,MCP2,MCP4,MIP1_,SCF,SDF1alpha_beta,TARC,TPO,TRAIL,TSLP,sample_id
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
m1.1,10.015331,2.456806,5.617357,5.74873,3.944858,5.874305,3.277985,0,2.147307,4.113534,⋯,5.159064,6.525599,11.38048,2.641546,11.97814,5.915043,8.475085,5.499846,2.498251,m1.1
m11.1,7.865362,4.148934,5.052677,5.791293,5.754353,6.76527,0.0,0,3.201634,4.272023,⋯,4.658783,6.210623,12.3182,6.233428,13.358,3.519793,0.0,6.424082,0.0,m11.1


In [123]:
cytokines_trans_adapted$id = NULL

In [124]:
cytokines_trans_adapted$cytokine_data = NULL

In [125]:
cytokines_long = melt(cytokines_trans_adapted)

Using sample_id as id variables



In [126]:
cytokines_long$type = 'cytokine'

In [127]:
head(cytokines_long,2)

Unnamed: 0_level_0,sample_id,variable,value,type
Unnamed: 0_level_1,<chr>,<fct>,<dbl>,<chr>
1,m1.1,sCD40L,10.015331,cytokine
2,m11.1,sCD40L,7.865362,cytokine


In [128]:
### Adjust names for later mapping

In [129]:
cytokines_long$variable = as.character(cytokines_long$variable)

In [130]:
cytokines_long = merge(cytokines_long, cytokine_gene_mapping, by.x = c('variable'), by.y = c('cytokine'), all.x = TRUE)

In [131]:
cytokines_long$mapped_name[is.na(cytokines_long$mapped_name)] = cytokines_long$variable[is.na(cytokines_long$mapped_name)]

In [132]:
cytokines_long$variable = cytokines_long$mapped_name

In [133]:
cytokines_long$mapped_name = NULL

In [134]:
head(cytokines_long[cytokines_long$variable == 'IL15__IL15',],2)

Unnamed: 0_level_0,variable,sample_id,value,type
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>
2521,IL15__IL15,m11.1,3.350497,cytokine
2522,IL15__IL15,m2.1,3.307429,cytokine


In [135]:
cytokine_names

In [136]:
unique(cytokines_long$variable[is.na(str_extract(cytokines_long$variable , '__'))])

In [137]:
cytokine_names = unique(cytokines_long$variable)

In [138]:
head(cytokines_long,2)

Unnamed: 0_level_0,variable,sample_id,value,type
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>
1,BCA1__CXCL13,m1.1,5.570766,cytokine
2,BCA1__CXCL13,m11.1,5.952334,cytokine


In [139]:
### Add duplicates

In [140]:
cytokines_long_dupl = cytokines_long[cytokines_long$sample_id %in% c('m13.2', 'm6.4'),]

In [141]:
cytokines_long_dupl$sample_id[cytokines_long_dupl$sample_id == 'm13.2'] = 'm13.22'

In [142]:
cytokines_long_dupl$sample_id[cytokines_long_dupl$sample_id == 'm6.4'] = 'm6.42'

In [143]:
head(cytokines_long_dupl,2)

Unnamed: 0_level_0,variable,sample_id,value,type
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>
38,BCA1__CXCL13,m13.22,5.786074,cytokine
80,BCA1__CXCL13,m6.42,5.835419,cytokine


In [144]:
if(add_duplicates == TRUE){
    cytokines_long = rbind(cytokines_long, cytokines_long_dupl)
    }

In [146]:
head(cytokines_long,2)

Unnamed: 0_level_0,variable,sample_id,value,type
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>
1,BCA1__CXCL13,m1.1,5.570766,cytokine
2,BCA1__CXCL13,m11.1,5.952334,cytokine


In [147]:
length(unique(cytokines_long$variable))

## Proteomic Data

### Load

In [148]:
path = paste0(result_path, '/00_Data_Overview/Prepared_Proteomic_Data.csv')
proteomics = read.csv( path)
print(file.info(path)$mtime)

[1] "2023-01-30 10:35:09 CET"


In [149]:
head(proteomics,2)

Unnamed: 0_level_0,X,SERPINA1_A0A024R6I7,IGLV4.69_A0A075B6H9,IGLV8.61_A0A075B6I0,IGLV4.60_A0A075B6I1,IGLV10.54_A0A075B6I4,IGLV7.46_A0A075B6I9,IGLV2.18_A0A075B6J9,IGLV3.16_A0A075B6K0,IGLV3.12_A0A075B6K2,⋯,MINPP1_Q9UNW1,TLN1_Q9Y490,ANGPTL3_Q9Y5C1,LYVE1_Q9Y5Y7,FCGBP_Q9Y6R7,COLEC10_Q9Y6Z7,IGHV3OR16.9_S4R460,APOA2_V9GYM3,sample_id,proteomics_data
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<int>
1,M2.3_P10.2,-1.66338,-2.414588,1.467756,-6.164178,-1.858412,-2.588001,2.128858,-0.110549,1.968863,⋯,4.3852723,-2.882139,-6.1443596,-4.26206,0.1977374,-1.653755,6.184887,4.235612,m2.3,1
2,M5.1_P1,3.744514,-2.672991,2.308556,-5.999634,-3.812528,-2.425649,1.186534,-2.204914,1.172303,⋯,-0.5717449,-3.766868,0.4187132,-5.92088,-0.1863114,-1.655054,6.121172,4.837075,m5.1,1


### Pre-Process

In [150]:
rownames(proteomics) = proteomics$X

In [151]:
proteomics$X = NULL

In [152]:
proteomics$proteomics_data = NULL

In [153]:
ncol(proteomics)  # about 490 proteins measured

In [154]:
head(sort(colnames(proteomics)))

In [155]:
proteomic_names = colnames(proteomics)

#### Adjust distribution

In [156]:
head(proteomics,2)

Unnamed: 0_level_0,SERPINA1_A0A024R6I7,IGLV4.69_A0A075B6H9,IGLV8.61_A0A075B6I0,IGLV4.60_A0A075B6I1,IGLV10.54_A0A075B6I4,IGLV7.46_A0A075B6I9,IGLV2.18_A0A075B6J9,IGLV3.16_A0A075B6K0,IGLV3.12_A0A075B6K2,IGLV3.10_A0A075B6K4,⋯,WWC3_Q9ULE0,MINPP1_Q9UNW1,TLN1_Q9Y490,ANGPTL3_Q9Y5C1,LYVE1_Q9Y5Y7,FCGBP_Q9Y6R7,COLEC10_Q9Y6Z7,IGHV3OR16.9_S4R460,APOA2_V9GYM3,sample_id
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
M2.3_P10.2,-1.66338,-2.414588,1.467756,-6.164178,-1.858412,-2.588001,2.128858,-0.110549,1.968863,2.523862,⋯,1.002701,4.3852723,-2.882139,-6.1443596,-4.26206,0.1977374,-1.653755,6.184887,4.235612,m2.3
M5.1_P1,3.744514,-2.672991,2.308556,-5.999634,-3.812528,-2.425649,1.186534,-2.204914,1.172303,1.799312,⋯,-1.063912,-0.5717449,-3.766868,0.4187132,-5.92088,-0.1863114,-1.655054,6.121172,4.837075,m5.1


In [157]:
#colMeans(proteomics)

In [160]:
quantile_normalization_proteomics

In [161]:
if (quantile_normalization_proteomics == TRUE){
    rownames(proteomics) = proteomics$sample_id
    proteomics$sample_id = NULL
    proteomics = t(proteomics)
    names = rownames(proteomics)
    
    proteomics  = quantile_normalization(proteomics )  # works on proteomics data
    rownames(proteomics) = names
    proteomics = data.frame(t(proteomics))
    proteomics$sample_id = rownames(proteomics)
    }

#### Prepare long format

In [162]:
proteomics_long =  melt(proteomics)

Using sample_id as id variables



In [163]:
proteomics_long$type = 'proteomics'

In [164]:
head(proteomics_long,2)

Unnamed: 0_level_0,sample_id,variable,value,type
Unnamed: 0_level_1,<chr>,<fct>,<dbl>,<chr>
1,m2.3,SERPINA1_A0A024R6I7,-1.66338,proteomics
2,m5.1,SERPINA1_A0A024R6I7,3.744514,proteomics


In [165]:
### Add dupl

In [166]:
proteomics_long_dupl = proteomics_long[proteomics_long$sample_id %in% c('m13.2', 'm6.4'),]

In [167]:
proteomics_long_dupl$sample_id[proteomics_long_dupl$sample_id == 'm13.2'] = 'm13.22'

In [168]:
proteomics_long_dupl$sample_id[proteomics_long_dupl$sample_id == 'm6.4'] = 'm6.42'

In [169]:
head(proteomics_long_dupl,2)

Unnamed: 0_level_0,sample_id,variable,value,type
Unnamed: 0_level_1,<chr>,<fct>,<dbl>,<chr>
42,m13.22,SERPINA1_A0A024R6I7,-3.496639,proteomics
45,m6.42,SERPINA1_A0A024R6I7,4.621727,proteomics


In [170]:
if(add_duplicates == TRUE){
    proteomics_long = rbind(proteomics_long, proteomics_long_dupl)
    }

In [171]:
length(unique(proteomics_long$variable))

## Neutrophil Data

### Load

In [172]:
path = paste0(result_path, '/00_Data_Overview/Prepared_Neutrophil_Data.csv')
neutrophils = read.csv(path)
print(file.info(path)$mtime)

[1] "2023-01-30 10:35:09 CET"


In [173]:
nrow(neutrophils)

In [174]:
head(neutrophils,2)

Unnamed: 0_level_0,X,ENSG00000000003.15,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,⋯,ENSG00000288612.1,ENSG00000288615.1,ENSG00000288617.1,ENSG00000288621.1,ENSG00000288631.1,ENSG00000288637.1,ENSG00000288638.1,ENSG00000288642.1,sample_id,neutrophil_data
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<int>
1,k4_0_ccs_ccs,0,1,0,0,14,0,0,0,1,⋯,0,0,0,0,0,0,0,0,k4,1
2,m14_3_acs_acs_noinf,0,0,1,0,6,0,0,0,0,⋯,0,0,0,0,0,0,0,0,m14.3,1


### Pre-Process

#### Adjust gene-names

In [175]:
rownames(neutrophils) = neutrophils$sample_id

In [176]:
neutrophils$X = NULL

In [177]:
neutrophils$sample_id = NULL

In [178]:
neutrophils$neutrophil_data = NULL

In [179]:
head(neutrophils,2)

Unnamed: 0_level_0,ENSG00000000003.15,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,ENSG00000001460.18,⋯,ENSG00000288607.1,ENSG00000288611.1,ENSG00000288612.1,ENSG00000288615.1,ENSG00000288617.1,ENSG00000288621.1,ENSG00000288631.1,ENSG00000288637.1,ENSG00000288638.1,ENSG00000288642.1
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
k4,0,1,0,0,14,0,0,0,1,0,⋯,0,0,0,0,0,0,0,0,0,0
m14.3,0,0,1,0,6,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [180]:
summary(rowSums(neutrophils))

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1840   12677   19266   24080   27994  105324 

In [181]:
genes = colnames(neutrophils)[!is.na(str_extract(colnames(neutrophils), 'ENSG'))]

In [182]:
length(unique(genes))

In [183]:
length(genes)

In [184]:
tail(genes)

In [185]:
genes = str_replace(genes, '\\..*', '') ## Adjust format for mapping

In [188]:
genes = genes[! genes %in% names(table(genes)[table(genes) > 1])]

In [189]:
length(genes)

In [190]:
length(unique(genes))

In [191]:
length(unique(genes))

In [192]:
### Map genes to SYMBOL

In [193]:
genes_mapped = bitr(genes, fromType="ENSEMBL", toType="SYMBOL", OrgDb = 'org.Hs.eg.db') ### Map genes to SYMBOL

'select()' returned 1:many mapping between keys and columns

“33.75% of input gene IDs are fail to map...”


In [194]:
head(genes_mapped,2)

Unnamed: 0_level_0,ENSEMBL,SYMBOL
Unnamed: 0_level_1,<chr>,<chr>
1,ENSG00000000003,TSPAN6
2,ENSG00000000419,DPM1


In [195]:
nrow(genes_mapped)  # 25.338 genes that can be mapped to SYMBOL!

In [198]:
### Adjust neutrophil gene names

In [199]:
head(neutrophils,2)

Unnamed: 0_level_0,ENSG00000000003.15,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,ENSG00000001460.18,⋯,ENSG00000288607.1,ENSG00000288611.1,ENSG00000288612.1,ENSG00000288615.1,ENSG00000288617.1,ENSG00000288621.1,ENSG00000288631.1,ENSG00000288637.1,ENSG00000288638.1,ENSG00000288642.1
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
k4,0,1,0,0,14,0,0,0,1,0,⋯,0,0,0,0,0,0,0,0,0,0
m14.3,0,0,1,0,6,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [200]:
neutrophils$neutrophil_data = NULL

In [201]:
neutrophils$sample_id = NULL

In [202]:
neutrophils = data.frame(t(neutrophils))

In [203]:
head(neutrophils,2)

Unnamed: 0_level_0,k4,m14.3,m26.2,m14.4,m3.3,k1,k2,m24.4,m25.2,m21.1,⋯,m20.3,m10.2,m14.1,m24.1,m13.2,k7,m18.3,k26,m4.3,k9
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
ENSG00000000003.15,0,0,0,0,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000000419.13,1,0,1,0,1,0,1,1,0,3,⋯,0,2,0,2,0,1,0,1,0,0


In [204]:
neutrophils$gene = rownames(neutrophils)

In [205]:
neutrophils$gene = str_replace(neutrophils$gene, '\\..*', '')

In [206]:
head(genes_mapped, 2)

Unnamed: 0_level_0,ENSEMBL,SYMBOL
Unnamed: 0_level_1,<chr>,<chr>
1,ENSG00000000003,TSPAN6
2,ENSG00000000419,DPM1


In [207]:
neutrophils = merge(neutrophils, genes_mapped, by.x = 'gene', by.y = 'ENSEMBL')

In [208]:
nrow(neutrophils)

In [209]:
neutrophils = neutrophils %>% group_by(SYMBOL) %>% summarise(across(-gene, sum, na.rm = TRUE))

[1m[22m[36mℹ[39m In argument: `across(-gene, sum, na.rm = TRUE)`.
[36mℹ[39m In group 1: `SYMBOL = "A1BG"`.
[1m[22m[33m![39m The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.

  # Previously
  across(a:b, mean, na.rm = TRUE)

  # Now
  across(a:b, \(x) mean(x, na.rm = TRUE))”


In [210]:
neutrophils = data.frame(neutrophils)

In [211]:
nrow(neutrophils)

In [212]:
rownames(neutrophils) = neutrophils$SYMBOL

In [213]:
neutrophils$SYMBOL = NULL

In [214]:
neutrophils = data.frame(t(neutrophils))

In [215]:
neutrophils$sample_id = rownames(neutrophils)

In [216]:
neutrophils$sample_id = NULL

In [217]:
head(neutrophils,2)

Unnamed: 0_level_0,A1BG,A1BG.AS1,A1CF,A2M,A2M.AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,⋯,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZYXP1,ZZEF1,ZZZ3
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
k4,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,1,0,0,9,0,0,0
m14.3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,2,13,0,0,0


In [218]:
neutrophil_names = colnames(neutrophils)

In [219]:
neutrophil_evaluation =  neutrophils

In [220]:
#colMeans(neutrophils)

#### Filter out ribosomal and mitochondrial genes

In [221]:
head(neutrophils,2)

Unnamed: 0_level_0,A1BG,A1BG.AS1,A1CF,A2M,A2M.AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,⋯,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZYXP1,ZZEF1,ZZZ3
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
k4,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,1,0,0,9,0,0,0
m14.3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,2,13,0,0,0


In [222]:
dim(neutrophils)

In [223]:
neutrophils = neutrophils[,is.na(str_extract(colnames(neutrophils), '^MT.*|^RPL.*|^RPS.*'))]

In [224]:
dim(neutrophils)

#### Investigate and Adjust distribution of values

In [225]:
head(neutrophils,2)

Unnamed: 0_level_0,A1BG,A1BG.AS1,A1CF,A2M,A2M.AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,⋯,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZYXP1,ZZEF1,ZZZ3
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
k4,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,1,0,0,9,0,0,0
m14.3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,2,13,0,0,0


In [226]:
### Check out distribution per sample

In [227]:
neutrophils$sample_id = rownames(neutrophils)

#### Filter out low expressed genes

In [245]:
### genes expressed in less than 50% of samples

In [247]:
neutrophils$sample_id = NULL

In [248]:
head(neutrophils)

Unnamed: 0_level_0,A1BG,A1BG.AS1,A1CF,A2M,A2M.AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,⋯,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZYXP1,ZZEF1,ZZZ3
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
k4,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,1,0,0,9,0,0,0
m14.3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,2,13,0,0,0
m26.2,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,1,0,0,23,0,0,0
m14.4,0,0,0,0,1,0,2,0,0,0,⋯,0,0,0,0,0,1,14,0,2,0
m3.3,0,0,0,0,0,0,0,0,13,0,⋯,0,0,0,0,1,1,26,0,3,2
k1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,1,13,0,0,0


In [249]:
nrow(neutrophils)

In [250]:
dim(neutrophils)

In [259]:
neutrophil_threshold

In [260]:
ncol(neutrophils)

In [261]:
nrow(neutrophils)

In [262]:
neutrophils = neutrophils[,((colSums(neutrophils == 0))/ nrow(neutrophils)) <= neutrophil_threshold]

In [263]:
ncol(neutrophils)

In [264]:
ncol(neutrophils[,highly_expressed_genes])

In [270]:
### Check out distribution per sample

In [271]:
neutrophils$sample_id = rownames(neutrophils)

### Optional: remove samples with high amount of 0

In [292]:
head(neutrophils)

Unnamed: 0_level_0,AATK,ABCA7,ABHD2,ABHD5,ABTB1,ACAP2,ACOX1,ACSL1,ACTB,ACTG1,⋯,ZFP36,ZFP36L1,ZFP36L2,ZMAT2,ZNF106,ZNF281,ZNF467,ZNFX1,ZYX,sample_id
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>
k4,1,2,4,7,8,3,0,9,34,12,⋯,6,11,5,2,0,2,4,0,9,k4
m14.3,4,1,8,3,17,6,4,8,58,24,⋯,19,29,9,0,3,2,3,1,13,m14.3
m26.2,8,6,4,24,27,20,6,42,113,31,⋯,24,32,22,8,11,6,6,5,23,m26.2
m14.4,3,1,5,3,23,9,3,12,73,25,⋯,15,23,6,1,2,1,4,0,14,m14.4
m3.3,2,3,5,7,17,10,2,18,172,215,⋯,29,34,11,2,9,5,7,6,26,m3.3
k1,4,1,1,4,17,2,1,11,44,18,⋯,14,33,2,3,6,2,4,0,13,k1


In [293]:
sample_perc_zero = rowSums(neutrophils==0)/ ncol(neutrophils) 

In [294]:
head(sample_perc_zero,2)

In [295]:
remove_samples = sample_perc_zero[sample_perc_zero > 0.1]

In [296]:
sort(names(remove_samples))

In [297]:
length(names(remove_samples))

In [298]:
neutrophils = neutrophils[!rownames(neutrophils) %in% names(remove_samples),]

In [299]:
length(unique(rownames(neutrophils)))

### Normalization

In [300]:
### Adjust for library size (10.000 counts per sample)

In [301]:
dim(neutrophils)

In [302]:
neutrophils$sample_id =  NULL

In [303]:
scaling_factor = rowSums(neutrophils) /mean(rowSums(neutrophils))

In [304]:
head(scaling_factor)

In [305]:
mean(rowSums(neutrophils))

In [306]:
if(library_adjustment_neutrophils == TRUE){
    neutrophils = apply(neutrophils,2, function(x){ x/scaling_factor})
    }

In [307]:
head(neutrophils,2)

Unnamed: 0,AATK,ABCA7,ABHD2,ABHD5,ABTB1,ACAP2,ACOX1,ACSL1,ACTB,ACTG1,⋯,ZFAS1,ZFP36,ZFP36L1,ZFP36L2,ZMAT2,ZNF106,ZNF281,ZNF467,ZNFX1,ZYX
m14.3,6.325736,1.581434,12.651473,4.744302,26.88438,9.488604,6.325736,12.65147,91.72318,37.95442,⋯,1.581434,30.04725,45.86159,14.23291,0.0,4.744302,3.162868,4.744302,1.581434,20.55864
m26.2,5.21303,3.909773,2.606515,15.639091,17.59398,13.032576,3.909773,27.36841,73.63406,20.20049,⋯,7.819546,15.63909,20.85212,14.33583,5.21303,7.167917,3.909773,3.909773,3.258144,14.98746


In [308]:
ncol(neutrophils[,highly_expressed_genes])

In [309]:
head(rowSums(neutrophils) ) # check out counts per sample  - TBD compare with pseudobulk RNA- DATA

In [311]:
head(neutrophils,2)

Unnamed: 0,AATK,ABCA7,ABHD2,ABHD5,ABTB1,ACAP2,ACOX1,ACSL1,ACTB,ACTG1,⋯,ZFAS1,ZFP36,ZFP36L1,ZFP36L2,ZMAT2,ZNF106,ZNF281,ZNF467,ZNFX1,ZYX
m14.3,6.325736,1.581434,12.651473,4.744302,26.88438,9.488604,6.325736,12.65147,91.72318,37.95442,⋯,1.581434,30.04725,45.86159,14.23291,0.0,4.744302,3.162868,4.744302,1.581434,20.55864
m26.2,5.21303,3.909773,2.606515,15.639091,17.59398,13.032576,3.909773,27.36841,73.63406,20.20049,⋯,7.819546,15.63909,20.85212,14.33583,5.21303,7.167917,3.909773,3.909773,3.258144,14.98746


In [312]:
#### logarithmize neutrophil data 
neutrophils = data.frame(log2(neutrophils + 1))

In [313]:
neutrophils = data.frame(neutrophils)

In [314]:
head(neutrophils,2)

Unnamed: 0_level_0,AATK,ABCA7,ABHD2,ABHD5,ABTB1,ACAP2,ACOX1,ACSL1,ACTB,ACTG1,⋯,ZFAS1,ZFP36,ZFP36L1,ZFP36L2,ZMAT2,ZNF106,ZNF281,ZNF467,ZNFX1,ZYX
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
m14.3,2.872974,1.368173,3.770985,2.522132,4.801385,3.390751,2.872974,3.770985,6.534858,5.283715,⋯,1.368173,4.956393,5.550334,3.929119,0.0,2.522132,2.057578,2.522132,1.368173,4.430194
m26.2,2.635297,2.295656,1.850606,4.056505,4.216764,3.810708,2.295656,4.826213,6.221762,4.406026,⋯,3.140704,4.056505,4.449701,3.938835,2.635297,3.029968,2.295656,2.295656,2.090225,3.998869


In [318]:
### Inspect variance

In [319]:
head(neutrophils,2)

Unnamed: 0_level_0,AATK,ABCA7,ABHD2,ABHD5,ABTB1,ACAP2,ACOX1,ACSL1,ACTB,ACTG1,⋯,ZFAS1,ZFP36,ZFP36L1,ZFP36L2,ZMAT2,ZNF106,ZNF281,ZNF467,ZNFX1,ZYX
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
m14.3,2.872974,1.368173,3.770985,2.522132,4.801385,3.390751,2.872974,3.770985,6.534858,5.283715,⋯,1.368173,4.956393,5.550334,3.929119,0.0,2.522132,2.057578,2.522132,1.368173,4.430194
m26.2,2.635297,2.295656,1.850606,4.056505,4.216764,3.810708,2.295656,4.826213,6.221762,4.406026,⋯,3.140704,4.056505,4.449701,3.938835,2.635297,3.029968,2.295656,2.295656,2.090225,3.998869


In [320]:
variance = apply(neutrophils, 2, var)

In [321]:
head(variance)

In [322]:
mean(apply(neutrophils, 2, var))

In [323]:
ncol(neutrophils[,highly_expressed_genes])

In [324]:
var_threshold = quantile(variance, probs = seq(0, 1, 0.01), na.rm = FALSE,
         names = TRUE)['25%']

In [325]:
var_threshold

In [326]:
length(variance)

In [327]:
keep_genes = names(variance[variance > var_threshold])

In [328]:
ncol(neutrophils)

In [329]:
neutrophils = neutrophils[, keep_genes]

In [330]:
ncol(neutrophils)

In [331]:
ncol(neutrophils[,colnames(neutrophils) %in% highly_expressed_genes])

In [332]:
### Test relation to mean expression and amount of counts

In [350]:
neutrophils$sample_id = rownames(neutrophils)

In [351]:
neutrophils_long = melt(neutrophils)

Using sample_id as id variables



In [352]:
summ_stats = neutrophils_long %>% group_by(sample_id) %>% summarise(sum_counts = mean(value), amount_zero = mean(value ==0))

In [353]:
head(summ_stats,2)

sample_id,sum_counts,amount_zero
<chr>,<dbl>,<dbl>
k1,2.374504,0.08520179
k10,2.446978,0.03699552


In [354]:
neutrophils_long = merge(neutrophils_long, neutrophils_mean_raw)

In [355]:
neutrophils_long = merge(neutrophils_long, summ_stats)

In [356]:
head(neutrophils_long,2)

Unnamed: 0_level_0,sample_id,variable,value,mean_expr,sum_counts,amount_zero
Unnamed: 0_level_1,<chr>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>
1,k1,CHI3L1,3.100819,0.5886713,2.374504,0.08520179
2,k1,NACA,2.011361,0.5886713,2.374504,0.08520179


In [357]:
model=  neutrophils_long %>% group_by(variable) %>% do(model = lm(formula = value ~ mean_expr + amount_zero + mean_expr * amount_zero, data = .))

In [359]:
neutrophils_long = cbind(neutrophils_long, data.frame(residuals = unlist(lapply(model$model, function(x){ x = (residuals(x))}))))

In [360]:
head(neutrophils_long,2)

Unnamed: 0_level_0,sample_id,variable,value,mean_expr,sum_counts,amount_zero,residuals
Unnamed: 0_level_1,<chr>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,k1,CHI3L1,3.100819,0.5886713,2.374504,0.08520179,1.021388
2,k1,NACA,2.011361,0.5886713,2.374504,0.08520179,1.276871


In [361]:
regress_neutrophils

In [362]:
if(regress_neutrophils == TRUE){
    neutrophils_long$value = neutrophils_long$residuals}

In [363]:
neutrophils = neutrophils_long %>% dcast(sample_id ~variable, value.var = "value")

In [364]:
head(neutrophils,2)

Unnamed: 0_level_0,sample_id,AATK,ABCA7,ABHD2,ABHD5,ABTB1,ACAP2,ACOX1,ACSL1,ACTG1,⋯,ZBTB7B,ZC3HAV1,ZFAND5,ZFAS1,ZFP36L2,ZMAT2,ZNF106,ZNF281,ZNF467,ZNFX1
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,k1,2.820332,1.331024,1.331024,2.820332,4.742481,2.011361,1.331024,4.143551,4.821946,⋯,2.471821,0.0,0.0,2.471821,2.011361,2.471821,3.335551,2.011361,2.820332,0.0
2,k10,3.103145,0.0,2.934288,3.836689,3.73752,3.929477,2.934288,4.88134,4.320647,⋯,1.535169,2.522448,1.535169,3.254293,3.254293,2.934288,3.103145,1.535169,2.261967,1.94386


In [365]:
rownames(neutrophils) = neutrophils$sample_id

In [366]:
### Check out distribution per sample

In [367]:
neutrophils$sample_id = rownames(neutrophils)

In [375]:
neutrophils$sample_id = NULL

In [376]:
head(neutrophils,2)

Unnamed: 0_level_0,AATK,ABCA7,ABHD2,ABHD5,ABTB1,ACAP2,ACOX1,ACSL1,ACTG1,ADAM10,⋯,ZBTB7B,ZC3HAV1,ZFAND5,ZFAS1,ZFP36L2,ZMAT2,ZNF106,ZNF281,ZNF467,ZNFX1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
k1,2.820332,1.331024,1.331024,2.820332,4.742481,2.011361,1.331024,4.143551,4.821946,3.100819,⋯,2.471821,0.0,0.0,2.471821,2.011361,2.471821,3.335551,2.011361,2.820332,0.0
k10,3.103145,0.0,2.934288,3.836689,3.73752,3.929477,2.934288,4.88134,4.320647,1.94386,⋯,1.535169,2.522448,1.535169,3.254293,3.254293,2.934288,3.103145,1.535169,2.261967,1.94386


In [377]:
quantile_normalization_neutrophils

In [378]:
if(quantile_normalization_neutrophils  == TRUE){
    neutrophils = t(neutrophils)
    genes_neutrophils = rownames(neutrophils)
    neutrophils  = quantile_normalization(neutrophils ) 
    rownames(neutrophils) = genes_neutrophils
    neutrophils = data.frame(t(neutrophils))
    }

In [379]:
### Check out distribution per sample

In [380]:
neutrophils$sample_id = rownames(neutrophils)

### Filter only genes that are also in single cell (?)

In [394]:
path = paste0(result_path, '/C-Analysis/C0_Filter_Genes_Input_Correlations_Perc_Values.csv')
genes_subset = read.csv(path) # cluster alternative
print(file.info(path)$mtime)

[1] "2023-01-21 12:28:53 CET"


In [395]:
head(genes_subset,2)

Unnamed: 0_level_0,X,perc_cells_expressing_gene,total_amount_cells_expressing_gene,gene,cluster
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<chr>
1,AL627309.1,0.24327612,18,AL627309.1,8_B-cell
2,AL627309.4,0.05406136,4,AL627309.4,8_B-cell


In [396]:
genes = unique(genes_subset$gene)

In [397]:
align_genes

In [398]:
if(align_genes == TRUE){
    neutrophils = neutrophils[,colnames(neutrophils) %in% genes]
    }

In [399]:
dim(neutrophils)

In [400]:
### Check out distribution per sample

In [401]:
neutrophils$sample_id = rownames(neutrophils)

### Prepare long format

In [415]:
neutrophils$sample_id = rownames(neutrophils)

In [416]:
neutrophils_long = melt(neutrophils)

Using sample_id as id variables



In [417]:
neutrophils_long$type = 'neutrophil'

In [418]:
head(neutrophils_long,2)

Unnamed: 0_level_0,sample_id,variable,value,type
Unnamed: 0_level_1,<chr>,<fct>,<dbl>,<chr>
1,k1,AATK,2.746071,neutrophil
2,k10,AATK,3.074688,neutrophil


In [419]:
### Add duplicates

In [420]:
neutrophils_long_dupl = neutrophils_long[neutrophils_long$sample_id %in% c('m13.2', 'm6.4'),]

In [421]:
neutrophils_long_dupl$sample_id[neutrophils_long_dupl$sample_id == 'm13.2'] = 'm13.22'

In [422]:
neutrophils_long_dupl$sample_id[neutrophils_long_dupl$sample_id == 'm6.4'] = 'm6.42'

In [423]:
head(neutrophils_long_dupl,2)

Unnamed: 0_level_0,sample_id,variable,value,type
Unnamed: 0_level_1,<chr>,<fct>,<dbl>,<chr>
85,m6.42,AATK,1.162339,neutrophil
177,m6.42,ABCA7,1.919833,neutrophil


In [424]:
if(add_duplicates == TRUE){
    neutrophils_long = rbind(neutrophils_long, neutrophils_long_dupl)
}

In [425]:
length(unique(neutrophils_long$variable))

In [426]:
length(unique(neutrophils_long$sample_id))

## RNA-Single-Seq

### Load cell-expression- gene cluster info

In [427]:
path = paste0(result_path, '/G-Analysis/G0_Filter_Genes_Input_Correlations_Perc_Values.csv')
cell_perc_cluster = read.csv( path) # cluster alternative
print(path)
print(file.info(path)$mtime)

[1] "../results/current/G-Analysis/G0_Filter_Genes_Input_Correlations_Perc_Values.csv"
[1] "2023-02-18 15:19:03 CET"


In [429]:
head(cell_perc_cluster,2)

Unnamed: 0_level_0,X,perc_cells_expressing_gene,total_amount_cells_expressing_gene,gene,cluster
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<chr>
1,AL627309.1,0.23400142,23,AL627309.1,B cell
2,AL627309.4,0.06104385,6,AL627309.4,B cell


In [430]:
head(cell_perc_cluster[cell_perc_cluster$gene == 'CD28',],2)

Unnamed: 0_level_0,X,perc_cells_expressing_gene,total_amount_cells_expressing_gene,gene,cluster
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<chr>
2995,CD28,4.262896,419,CD28,B cell
22216,CD281,5.363248,251,CD28,CD16 Mono


In [431]:
nrow(cell_perc_cluster)

In [432]:
length(unique(cell_perc_cluster$gene))

In [433]:
length(unique(cell_perc_cluster$cluster))

In [434]:
cell_perc_cluster[((cell_perc_cluster$perc_cells > 10) & (cell_perc_cluster$total_amount_cells_expressing_gene > 1200))  ,] %>% group_by(cluster) %>% count()  # investigate amount of genes after filtering

cluster,n
<chr>,<int>
B cell,3827
CD14 Mono,6322
CD16 Mono,2847
CD4 CTL,624
CD4 Naive,1581
CD4 TCM,3898
CD4 TEM,2222
CD8 Naive,561
CD8 TEM,3432
NK,3553


In [435]:
name

In [436]:
##### Decide on conditions for filtering genes out of single-cell data! (uncommented no filtering!)
if(name %in%  c( 'V_AZIMUTH')){
    cell_perc_cluster =  cell_perc_cluster[((cell_perc_cluster$perc_cells > 50) & (cell_perc_cluster$total_amount_cells_expressing_gene > 1200)) | ((cell_perc_cluster$perc_cells > 40) & (cell_perc_cluster$total_amount_cells_expressing_gene > 3000)) ,] 
    }
# condition removed for complete data

In [437]:
head(cell_perc_cluster,2)

Unnamed: 0_level_0,X,perc_cells_expressing_gene,total_amount_cells_expressing_gene,gene,cluster
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<chr>
95,RPL22,99.39974,9770,RPL22,B cell
119,PARK7,49.74056,4889,PARK7,B cell


In [439]:
nrow(cell_perc_cluster)

### Load data about amount of cells per cluster

In [443]:
path = paste0(result_path, '/G-Analysis/G1_Cell_Sample_Cluster_Distribution.csv')
amount_cells_data = read.csv(path)
print(path)
print(file.info(path)$mtime)

[1] "../results/current/G-Analysis/G1_Cell_Sample_Cluster_Distribution.csv"
[1] "2023-02-18 15:16:22 CET"


In [444]:
amount_cells_data$X = NULL

In [445]:
amount_cells_data = data.frame(cluster = names(colSums(amount_cells_data)), amount_cells = colSums(amount_cells_data))

In [446]:
head(amount_cells_data,2)

Unnamed: 0_level_0,cluster,amount_cells
Unnamed: 0_level_1,<chr>,<dbl>
ASDC,ASDC,8
B.cell,B.cell,9829


### Load Pseudobulk aggregated RNA data from C0

In [449]:
name

In [450]:
if(name %in% c('V_AZIMUTH')){
    path = paste0(result_path, '/G-Analysis/G0_aggregated_RNA_input_correlations_all.RDS')
    load(path)   
    print(path)
    print(file.info(path)$mtime)
    }

[1] "../results/current/G-Analysis/G0_aggregated_RNA_input_correlations_all.RDS"
[1] "2023-02-18 15:37:53 CET"


In [451]:
all_genes = rownames(pb)

In [452]:
head(all_genes)

In [453]:
length(all_genes)

In [454]:
pb

class: SingleCellExperiment 
dim: 19221 121 
metadata(2): experiment_info agg_pars
assays(29): ASDC B cell ... Platelet Treg
rownames(19221): AL627309.1 AL627309.4 ... AC004556.1 AC240274.1
rowData names(0):
colnames(121): 1.1-L1 10-L11 ... 9.2-L4 9.3-L7
colData names(26): group_id classification_measurement ... library_char
  ident
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

In [462]:
assays(pb)

List of length 29
names(29): ASDC B cell CD14 Mono CD16 Mono ... pDC Plasmablast Platelet Treg

### Pre-Process

#### Remove Clusters

In [465]:
### Filte pb on relevant clusters

In [466]:
pb

class: SingleCellExperiment 
dim: 19221 121 
metadata(2): experiment_info agg_pars
assays(29): ASDC B cell ... Platelet Treg
rownames(19221): AL627309.1 AL627309.4 ... AC004556.1 AC240274.1
rowData names(0):
colnames(121): 1.1-L1 10-L11 ... 9.2-L4 9.3-L7
colData names(26): group_id classification_measurement ... library_char
  ident
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

In [467]:
names(assays(pb))

In [468]:
assay(pb, 'Platelet') = NULL

In [469]:
assay(pb, 'Plasmablast') = NULL

In [470]:
assay(pb, 'pDC') = NULL

In [471]:
assay(pb, 'NK_CD56bright') = NULL

In [472]:
assay(pb, 'NK Proliferating') = NULL

In [473]:
assay(pb, 'ILC') = NULL

In [474]:
assay(pb, 'HSPC') = NULL

In [475]:
assay(pb, 'Eryth') = NULL

In [476]:
assay(pb, 'Doublet') = NULL

In [477]:
assay(pb, 'dnT') = NULL

In [478]:
assay(pb, 'cDC1') = NULL

In [479]:
assay(pb, 'CD8 TCM') = NULL

In [480]:
assay(pb, 'CD8 Proliferating') = NULL

In [481]:
assay(pb, 'CD4 Proliferating') = NULL

In [482]:
assay(pb, 'ASDC') = NULL

In [483]:
length(names(assays(pb)))

In [484]:
names(assays(pb))

#### Prepare gene-cluster dataframe + normalize

In [485]:
assays(pb)

List of length 14
names(14): B cell CD14 Mono CD16 Mono CD4 CTL CD4 Naive ... gdT MAIT NK Treg

In [487]:
nodes = names(assays(pb))

In [488]:
head(nodes)

In [489]:
cell_types = nodes

In [490]:
#nodes = nodes[1:3]

In [491]:
final_data = data.frame(samples = colnames(pb))

In [492]:
final_data_vis = data.frame(samples = colnames(pb))

In [493]:
rownames(final_data) = final_data$samples

In [494]:
rownames(final_data_vis) = final_data_vis$samples

In [495]:
head(final_data,2)

Unnamed: 0_level_0,samples
Unnamed: 0_level_1,<chr>
1.1-L1,1.1-L1
10-L11,10-L11


In [496]:
head(genes_subset,2)

Unnamed: 0_level_0,X,perc_cells_expressing_gene,total_amount_cells_expressing_gene,gene,cluster
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<chr>
1,AL627309.1,0.24327612,18,AL627309.1,8_B-cell
2,AL627309.4,0.05406136,4,AL627309.4,8_B-cell


In [497]:
nodes

In [508]:
genes_subset = cell_perc_cluster

In [529]:

for(i in unique(genes_subset$cluster)){
    data = assay(pb, i)


    ##### Normalize counts per sample (library size) 

    if(is.na(str_extract(name, 'scano')) == TRUE){
        scaling_factor = colSums(data) /mean(colSums(data))

        for (j in 1:ncol(data)){
            if(scaling_factor[j] != 0){
                data[,j] = data[,j]/ scaling_factor[j]
                }
            }
        }

    ### Subset data on genes with minimum expression in cluster
    data = data[rownames(data) %in% genes_subset$gene[genes_subset$cluster == i],]


    ##### TBD pre-processing stepd

    if(is.na(str_extract(name, 'scano')) == TRUE){
        data = log2(data+1) # logarithmize count values (optional!)
        }

    #### Quantile normalization 

    if(quantile_normalization_single_cell == TRUE){
        data_rows = rownames(data)
        data  = quantile_normalization(data ) 
        rownames(data) = data_rows
        }

    rownames(data) = paste0(i, '__' ,rownames(data))

    data = data.frame(t(data))

    expr_mean = data.frame( mean_expr = rowMeans(data))
    colnames(expr_mean) = i
    rownames(expr_mean) = rownames(data)

    final_data = merge(final_data, data, by = 0)
    final_data_vis = merge(final_data_vis, expr_mean, by = 0)

    rownames(final_data) =  final_data$Row.names
    rownames(final_data_vis) = final_data_vis$Row.names
    final_data$Row.names = NULL
    final_data_vis$Row.names = NULL
    }

   

In [530]:
head(final_data,2)

Unnamed: 0_level_0,samples,B.cell__RPL22,B.cell__PARK7,B.cell__ENO1,B.cell__PRDM2,B.cell__CAPZB,B.cell__HP1BP3,B.cell__CDC42,B.cell__HNRNPR,B.cell__RPL11,⋯,CD4.CTL__MT.CO1,CD4.CTL__MT.CO2,CD4.CTL__MT.ATP6,CD4.CTL__MT.CO3,CD4.CTL__MT.ND3,CD4.CTL__MT.ND4L,CD4.CTL__MT.ND4,CD4.CTL__MT.ND5,CD4.CTL__MT.ND6,CD4.CTL__MT.CYB
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1.1-L1,1.1-L1,3.489087,0.7517455,0.4938072,1.332314,0.6630314,0.7887402,1.698647,0.7887402,5.000988,⋯,5.153109,5.380742,5.755433,4.975793,5.198533,0.2775368,4.667043,3.326732,0.6091993,5.072069
10-L11,10-L11,3.462751,0.9412188,0.8205378,1.18275,0.9206895,0.7610354,1.479767,0.7213612,4.968836,⋯,4.884216,5.039021,4.625803,4.605801,4.482661,1.4139015,4.136982,2.834366,1.2093461,4.524055


In [531]:
ncol(final_data)

In [545]:
nrow(genes_subset)

In [546]:
sample_data$sample_merge = paste0(sample_data$id, '-', sample_data$library)

In [547]:
#head(sample_data)

In [548]:
dim(final_data)

In [549]:
final_data = merge(final_data, sample_data[,c('sample_id', 'sample_merge')], by.x = 'samples', by.y = 'sample_merge')

In [550]:
dim(final_data)

In [551]:
rownames(final_data)  = final_data$samples

In [555]:
dim(final_data)

In [556]:
dim(final_data_vis)

#### Filter genes

In [557]:
### Remove mitochondrial & ribosomal genes

In [558]:
head(final_data,2)

Unnamed: 0_level_0,samples,B.cell__RPL22,B.cell__PARK7,B.cell__ENO1,B.cell__PRDM2,B.cell__CAPZB,B.cell__HP1BP3,B.cell__CDC42,B.cell__HNRNPR,B.cell__RPL11,⋯,CD4.CTL__MT.CO2,CD4.CTL__MT.ATP6,CD4.CTL__MT.CO3,CD4.CTL__MT.ND3,CD4.CTL__MT.ND4L,CD4.CTL__MT.ND4,CD4.CTL__MT.ND5,CD4.CTL__MT.ND6,CD4.CTL__MT.CYB,sample_id
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1.1-L1,1.1-L1,3.489087,0.7517455,0.4938072,1.332314,0.6630314,0.7887402,1.698647,0.7887402,5.000988,⋯,5.380742,5.755433,4.975793,5.198533,0.2775368,4.667043,3.326732,0.6091993,5.072069,m1.1
10-L11,10-L11,3.462751,0.9412188,0.8205378,1.18275,0.9206895,0.7610354,1.479767,0.7213612,4.968836,⋯,5.039021,4.625803,4.605801,4.482661,1.4139015,4.136982,2.834366,1.2093461,4.524055,k10


In [559]:
ncol(final_data)

In [560]:
final_data = final_data[, !colnames(final_data) %in% (colnames(final_data)[!is.na(str_extract(colnames(final_data), '__MT.*|__RPL.*|__RPS.*'))])]

In [561]:
ncol(final_data)   # minus sample + sample_id column --> 11.831

In [562]:
head(final_data,2)

Unnamed: 0_level_0,samples,B.cell__PARK7,B.cell__ENO1,B.cell__PRDM2,B.cell__CAPZB,B.cell__HP1BP3,B.cell__CDC42,B.cell__HNRNPR,B.cell__PNRC2,B.cell__SRSF10,⋯,CD4.CTL__SNU13,CD4.CTL__SMDT1,CD4.CTL__TSPO,CD4.CTL__SOD1,CD4.CTL__SON,CD4.CTL__TTC3,CD4.CTL__HMGN1,CD4.CTL__ITGB2,CD4.CTL__PRMT2,sample_id
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1.1-L1,1.1-L1,0.7517455,0.4938072,1.332314,0.6630314,0.7887402,1.698647,0.7887402,0.8199788,0.9933239,⋯,0.8392725,1.2053873,0.6091993,0.8392725,1.57955,0.2775368,1.756926,1.053814,0.6091993,m1.1
10-L11,10-L11,0.9412188,0.8205378,1.18275,0.9206895,0.7610354,1.479767,0.7213612,0.7969667,0.8442397,⋯,0.9897038,0.4184606,1.0203473,1.3312251,1.57955,0.8628635,1.331225,1.560533,0.7464948,k10


In [564]:
## Genes with high variance

In [565]:
head(final_data,2)

Unnamed: 0_level_0,samples,B.cell__PARK7,B.cell__ENO1,B.cell__PRDM2,B.cell__CAPZB,B.cell__HP1BP3,B.cell__CDC42,B.cell__HNRNPR,B.cell__PNRC2,B.cell__SRSF10,⋯,CD4.CTL__SNU13,CD4.CTL__SMDT1,CD4.CTL__TSPO,CD4.CTL__SOD1,CD4.CTL__SON,CD4.CTL__TTC3,CD4.CTL__HMGN1,CD4.CTL__ITGB2,CD4.CTL__PRMT2,sample_id
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1.1-L1,1.1-L1,0.7517455,0.4938072,1.332314,0.6630314,0.7887402,1.698647,0.7887402,0.8199788,0.9933239,⋯,0.8392725,1.2053873,0.6091993,0.8392725,1.57955,0.2775368,1.756926,1.053814,0.6091993,m1.1
10-L11,10-L11,0.9412188,0.8205378,1.18275,0.9206895,0.7610354,1.479767,0.7213612,0.7969667,0.8442397,⋯,0.9897038,0.4184606,1.0203473,1.3312251,1.57955,0.8628635,1.331225,1.560533,0.7464948,k10


In [566]:
final_data$samples = NULL

In [567]:
final_data$sample_id = NULL

In [568]:
gene_variance = apply(final_data, 2, function(x) {var( x,na.rm = TRUE)})

In [569]:
head(gene_variance)

In [585]:
ncol(final_data)

In [586]:
final_data$samples = rownames(final_data)

In [587]:
final_data = merge(final_data, sample_data[,c('sample_id', 'sample_merge')], by.x = 'samples', by.y = 'sample_merge')

#### Prepare long format

In [588]:
final_data_long = melt(final_data)

Using samples, sample_id as id variables



In [589]:
### Decide what to do with duplicates

In [590]:
head(final_data_long,2)

Unnamed: 0_level_0,samples,sample_id,variable,value
Unnamed: 0_level_1,<chr>,<chr>,<fct>,<dbl>
1,1.1-L1,m1.1,B.cell__PARK7,0.7517455
2,10-L11,k10,B.cell__PARK7,0.9412188


In [652]:
add_duplicates

In [591]:
if(add_duplicates == TRUE){
    final_data_long$sample_id[final_data_long$samples == '13.2-L6']  = 'm13.22'      #13.2-L5, 13.2-L6	, 6.4-L10, 6.4-L14	
    final_data_long$sample_id[final_data_long$samples == '6.4-L14']  = 'm6.42'
    }


In [592]:
final_data_long$samples = NULL

In [593]:
final_data_long$type = 'single_cell'

In [594]:
final_data_long = final_data_long %>% group_by(sample_id, type, variable) %>% summarise(value = mean(value))  # take average for samples measured twice

[1m[22m`summarise()` has grouped output by 'sample_id', 'type'. You can override using
the `.groups` argument.


In [596]:
length(unique(final_data_long$variable))

# Integration of all data sources (V1 with gene-gene correletations)

## Combine all data sources

In [613]:
data_long = rbind(final_data_long, cytokines_long,proteomics_long, neutrophils_long, clinical_data_long )
#data_long = rbind(final_data_long, cytokines_long,proteomics_long, neutrophils_long ) # vesion without clinical data

In [614]:
length(unique(final_data_long$variable))

In [615]:
head(data_long,2)

sample_id,type,variable,value
<chr>,<chr>,<chr>,<dbl>
k10,single_cell,B.cell__PARK7,0.9412188
k10,single_cell,B.cell__ENO1,0.8205378


In [616]:
length(unique(data_long$variable))

In [617]:
unique(data_long$type)

In [618]:
data_long$config = paste0(quantile_normalization_cyto, '-', quantile_normalization_proteomics, '-', quantile_normalization_neutrophils, '-', neutrophil_threshold, '-', regress_neutrophils, '-', library_adjustment_neutrophils, '-')

In [619]:
print(paste0(result_path, '/G-Analysis/Combined_Data_', name, '.csv'))

[1] "../results/current/G-Analysis/Combined_Data_V_AZIMUTH.csv"


In [620]:
write.csv(data_long, paste0(result_path, '/G-Analysis/Combined_Data_', name, '.csv'))

In [621]:
name