Jupyter notebook 
-------

This notebook shows the code used to derive the log2 fold change (Case vs. Control) of each protein from the protein expression data used in the paper **"Data independent acquisition mass spectrometry in severe Rheumatic Heart Disease (RHD) identifies a proteomic signature showing ongoing inflammation and effectively classifying RHD cases"**

Author: **Jing Yang**

Date: **17/11/2021**

Contact: Jing.Yang@manchester.ac.uk


In [1]:
library(data.table)
library(tidyverse)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.5     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m   masks [34mdata.table[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mfirst()[39m     masks [34mdata.table[39m::first()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m       masks [34mstats[39m::lag()
[31m✖[39m [34mdplyr[39m::[32mlast()[39m      masks [34mdata.table[39m::last()
[31m✖[39m [34mpurrr[39m::[32mtranspose()[39m masks [34mdata.table[39m::transpose(

In [2]:
# Print the R version, platform, locale and attached package versions so the
# environment this notebook was run in is recorded alongside the results.
sessionInfo()

R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04.7 LTS

Matrix products: default
BLAS:   /usr/lib/openblas-base/libblas.so.3
LAPACK: /usr/lib/libopenblasp-r0.2.18.so

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] forcats_0.5.1     stringr_1.4.0     dplyr_1.0.7       purrr_0.3.4      
 [5] readr_2.0.2       tidyr_1.1.4       tibble_3.1.5      ggplot2_3.3.5    
 [9] tidyverse_1.3.1   data.table_1.14.2

loaded via a namespace (and not attached):
 [1] pbdZMQ_0.3-5     tidyselect_1.1.1 repr_1.1.3       have

### read protein expression data

In [3]:
# Load the filtered protein expression table (CSV in the working directory).
data <- read.csv("RHD_data_filtered.csv")

In [4]:
### protein expression data is already log2 scaled
# Preview the first rows: one row per sample (`StollerID`), one numeric column
# per UniProt accession (missing measurements appear as NA), and the
# Case/Control label in `Group`.
head(data)


Unnamed: 0_level_0,StollerID,A5YKK6,B9A064,O00187,O00391,O00429,O00533,O14556,O14791,O14980,⋯,Q9UQ35,Q9Y2S2,Q9Y2Z0,Q9Y446,Q9Y490,Q9Y4L1,Q9Y5Y7,Q9Y6R7,Q9Y6U3,Group
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
1,RHD01_1,18.72726,13.32757,9.545403,10.93532,,9.693684,10.765847,13.54541,14.28571,⋯,,16.33353,17.04409,13.99638,18.5501,10.78719,11.558106,,16.0257,Case
2,RHD01_10,17.09342,11.99219,11.11964,11.36314,12.65523,,11.287226,13.51076,,⋯,,,15.68102,14.41961,18.27449,11.17612,,17.5759,15.99541,Control
3,RHD01_100,,16.9941,9.84126,12.05345,13.49063,10.120308,9.788836,13.07768,15.6301,⋯,,16.21441,14.98246,15.27436,18.64911,11.67593,9.798376,18.05495,15.21582,Control
4,RHD01_101,,11.05973,10.855083,11.52179,14.05859,10.599453,10.542719,13.9848,15.74108,⋯,10.03196,16.51174,,,19.14743,12.67218,10.626237,,16.07679,Control
5,RHD01_102,18.01907,14.42647,,12.03946,13.58525,10.212223,10.447536,14.15161,15.58056,⋯,9.875406,16.80517,,15.85524,18.39157,11.65981,11.53189,19.18588,16.05441,Control
6,RHD01_103,17.32733,14.57,10.600376,12.43219,13.43931,10.531699,10.678076,13.50614,14.92608,⋯,10.815293,16.79194,,15.80052,19.43061,11.51,11.223427,,16.29087,Control


### calculate log2 fold change of the data

In [5]:
# Per-protein comparison of Case vs. Control samples.
#
# The expression values are already log2 scaled, so the log2 fold change is
# simply the difference of the two group means. For every UniProtID this
# produces one row with:
#   mean_Case, mean_Control  - group means (NAs ignored)
#   log2foldchange           - mean_Case - mean_Control
#   p_value, t_value         - from t.test(Case, Control) with its defaults
# Rows are sorted by decreasing log2foldchange and the result is ungrouped.
protein_foldchange <- data %>%
  select(-StollerID) %>%
  # Long format: one row per (sample, protein) measurement.
  pivot_longer(-Group, names_to = "UniProtID", values_to = "value") %>%
  group_by(UniProtID) %>%
  summarise(
    mean_Case = mean(value[Group == "Case"], na.rm = TRUE),
    mean_Control = mean(value[Group == "Control"], na.rm = TRUE),
    log2foldchange = mean_Case - mean_Control,
    # Fit the test once per protein and keep the fitted object, instead of
    # running t.test() separately for the p-value and for the statistic.
    t_fit = list(t.test(value[Group == "Case"], value[Group == "Control"])),
    .groups = "drop"
  ) %>%
  mutate(
    p_value = vapply(t_fit, function(fit) fit$p.value, numeric(1)),
    t_value = vapply(t_fit, function(fit) unname(fit$statistic), numeric(1))
  ) %>%
  select(-t_fit) %>%
  arrange(desc(log2foldchange))

`summarise()` has grouped output by 'UniProtID'. You can override using the `.groups` argument.



In [6]:
### mapping between UniProtID and protein name

# Whitespace-delimited lookup table with a header row; the preview below shows
# the columns `UniProtID` and `ProteinName`.
protein_withname <- read.table("protein_withname.txt", header = TRUE)

In [7]:
# Preview the UniProtID -> ProteinName lookup table.
head(protein_withname)

Unnamed: 0_level_0,UniProtID,ProteinName
Unnamed: 0_level_1,<fct>,<fct>
1,A5YKK6,CNOT1
2,B9A064,IGLL5
3,O00187,MASP2
4,O00391,QSOX1
5,O00429,DNM1L
6,O00533,CHL1


In [8]:
# Attach the protein name to every fold-change row. The join key is spelled out
# so the match cannot silently widen to other columns the two tables might
# come to share; proteins without a name entry are kept with ProteinName = NA.
protein_foldchange_full <- protein_foldchange %>%
  left_join(protein_withname, by = "UniProtID") %>%
  # Put the identifier columns first, followed by the statistics.
  select(UniProtID, ProteinName, everything())

Joining, by = "UniProtID"



In [9]:
# Preview the named fold-change table (rows with the largest log2 fold change
# come first because the table was sorted in descending order).
head(protein_foldchange_full)

UniProtID,ProteinName,mean_Case,mean_Control,log2foldchange,p_value,t_value
<chr>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
P05164,MPO,17.43545,15.98783,1.4476155,0.002440581,3.064512
P02741,CRP,13.93564,12.76735,1.1682886,4.488543e-08,5.591432
P0DJI8,SAA1,12.33971,11.19757,1.1421394,7.991483e-06,4.556095
Q15848,ADIPOQ,11.21368,10.0774,1.1362792,6.494611e-22,10.646823
P04406,GAPDH,13.44243,12.47953,0.9629039,0.01315826,2.507173
P01861,IGHG4,13.83671,12.978,0.8587099,0.0004533116,3.546973


### write results to a file

In [10]:
# Export the table as comma-separated text without quoting or row names.
# TRUE/FALSE are spelled out because the T/F shorthands are ordinary variables
# that can be reassigned.
write.table(
  protein_foldchange_full,
  file = "Protein_withfoldchange.csv",
  quote = FALSE,
  row.names = FALSE,
  sep = ","
)