-
Notifications
You must be signed in to change notification settings - Fork 2
/
sharing.Rmd
199 lines (162 loc) · 6.15 KB
/
sharing.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
---
title: "The benefit of sharing information across genes"
author: "John Blischak"
date: "2018-08-20"
output: workflowr::wflow_html
---
## Introduction
The simulation and visualizations below demonsrate the differences in the
results due to limma sharing information across genes to shrink the estimates of
the variance.
## Setup
```{r packages, message=FALSE}
library("cowplot")
library("dplyr")
library("ggplot2")
theme_set(theme_classic(base_size = 16))
library("knitr")
opts_chunk$set(fig.width = 10, fig.height = 5, message = FALSE)
library("stringr")
library("tidyr")
```
## Simulation
Create some synthetic data for illustrating concepts. The simulated gene
expression matrix has 100 genes and 6 samples (3 treatment and 3 control).
```{r simulation}
set.seed(12345)
create_exp_mat <- function(n1, n2, ng,
alpha_mean, beta_mean, epsilon_sd) {
status <- c(rep(0, n1), rep(1, n2))
ns <- length(status)
status <- matrix(status, nrow = 1)
alpha <- rnorm(ng, mean = alpha_mean, sd = 1)
beta <- matrix(rnorm(ng, mean = beta_mean, sd = 1), ncol = 1)
epsilon <- matrix(rnorm(ng * ns, mean = 0, sd = epsilon_sd),
nrow = ng, ncol = ns)
Yg <- alpha + beta %*% status + epsilon
return(Yg)
}
gexp <- rbind(
# 30 non-DE genes with high variance
create_exp_mat(n1 = 3, n2 = 3, ng = 30, alpha_mean = 10, beta_mean = -1:1, epsilon_sd = 3),
# 30 non-DE genes with low variance
create_exp_mat(n1 = 3, n2 = 3, ng = 30, alpha_mean = 10, beta_mean = -1:1, epsilon_sd = 1),
# 10 upregulated DE genes with low variance
create_exp_mat(n1 = 3, n2 = 3, ng = 10, alpha_mean = 10, beta_mean = 5, epsilon_sd = 1),
# 10 upregulated DE genes with high variance
create_exp_mat(n1 = 3, n2 = 3, ng = 10, alpha_mean = 10, beta_mean = 5, epsilon_sd = 3),
# 10 downregulated DE genes with low variance
create_exp_mat(n1 = 3, n2 = 3, ng = 10, alpha_mean = 10, beta_mean = -5, epsilon_sd = 1),
# 10 downregulated DE genes with high variance
create_exp_mat(n1 = 3, n2 = 3, ng = 10, alpha_mean = 10, beta_mean = -5, epsilon_sd = 3)
)
# Add names for samples
group <- rep(c("con", "treat"), each = ncol(gexp) / 2)
samples <- paste0(group, 1:3)
colnames(gexp) <- samples
# Add names for genes
genes <- sprintf("gene%02d", 1:nrow(gexp))
rownames(gexp) <- genes
heatmap(gexp)
```
## Standard linear model
Find differentially expressed genes using a standard linear model.
```{r lm}
lm_beta <- numeric(length = nrow(gexp))
lm_se <- numeric(length = nrow(gexp))
lm_p <- numeric(length = nrow(gexp))
for (i in 1:length(lm_p)) {
mod <- lm(gexp[i, ] ~ group)
result <- summary(mod)
lm_beta[i] <- result$coefficients[2, 1]
lm_se[i] <- result$coefficients[2, 2]
lm_p[i] <- result$coefficients[2, 4]
}
hist(lm_p, xlab = "p-values", main = "Standard linear model")
```
## limma linear model
Find differentially expressed genes using limma.
```{r limma}
library("limma")
design <- model.matrix(~group)
colnames(design) <- c("Intercept", "treat")
fit <- lmFit(gexp, design)
head(fit$coefficients)
fit <- eBayes(fit)
results <- decideTests(fit[, 2])
summary(results)
stats <- topTable(fit, coef = "treat", number = nrow(fit), sort.by = "none")
hist(stats[, "P.Value"], xlab = "p-values", main = "limma linear model")
```
## Comparison
Compare the p-values from `lm` and `limma` (both adjusted for multiple testing
with the BH FDR).
```{r comparison}
stats <- cbind(stats,
sd = apply(gexp, 1, sd),
var = apply(gexp, 1, var),
lm_beta, lm_se,
lm_p = p.adjust(lm_p, method = "BH"))
stats$labels_pre <- c(rep("non-DE; high-var", 30),
rep("non-DE; low-var", 30),
rep("DE-up; low-var", 10),
rep("DE-up; high-var", 10),
rep("DE-down; low-var", 10),
rep("DE-down; high-var", 10))
stats$labels <- rep("non-DE", nrow(stats))
stats$labels[stats$adj.P.Val < 0.05 & stats$lm_p < 0.05] <- "DE"
stats$labels[stats$adj.P.Val < 0.05 & stats$lm_p >= 0.05] <- "limma-only"
stats$labels[stats$adj.P.Val >= 0.05 & stats$lm_p < 0.05] <- "lm-only"
table(stats$labels)
table(stats$labels, stats$labels_pre)
stopifnot(stats$logFC == stats$lm_beta)
de <- data.frame(effect_size = stats$lm_beta,
std_dev = stats$sd,
lm = stats$lm_p < 0.05,
limma = stats$adj.P.Val < 0.05)
head(de)
# View the number of discrepancies
table(de$lm, de$limma)
# Plot effect size (y-axis) vs. standard deviation (x-axis)
ggplot(de, aes(x = std_dev, y = effect_size, color = limma)) +
geom_point()
ggplot(stats, aes(x = sd, y = logFC, color = labels)) +
geom_point()
ggplot(stats, aes(x = logFC, y = -log10(P.Value), color = labels)) +
geom_point()
```
## Example genes
Visualize example genes with boxplots. Note that the limma-only gene has higher
variance compared to the lm-only gene.
```{r example-genes}
# Find a good example of a DE gene
index <- which(stats$labels_pre == "DE-up; low-var" & stats$labels == "DE")[1]
single_gene <- gexp %>% as.data.frame %>%
slice(index) %>%
gather(key = "group", value = "gene") %>%
mutate(group = str_extract(group, "[a-z]*")) %>%
as.data.frame()
# Find a gene that is DE for both, DE for lm-only, and DE for limma-only
de_not <- de_lm <- which(stats$labels == "non-DE" &
stats$labels_pre == "non-DE; high-var" &
stats$logFC > 0)[1]
de_both <- which(stats$labels == "DE" &
stats$labels_pre == "DE-up; low-var")[1]
de_lm <- which(stats$labels == "lm-only" &
stats$labels_pre == "non-DE; low-var" &
stats$logFC > 0)[1]
de_limma <- which(stats$labels == "limma-only" &
stats$labels_pre == "DE-up; high-var")[1]
compare <- gexp %>%
as.data.frame() %>%
slice(c(de_not, de_both, de_lm, de_limma)) %>%
mutate(type = c("neither", "both", "lm-only", "limma-only")) %>%
gather(key = "group", value = "gene", con1:treat3) %>%
mutate(group = str_extract(group, "[a-z]*")) %>%
as.data.frame()
head(compare)
# Plot gene expression (gene; y-axis) vs. group (x-axis)
ggplot(compare, aes(x = group, y = gene)) +
geom_boxplot() +
facet_wrap(~type, nrow = 1)
```