-
Notifications
You must be signed in to change notification settings - Fork 2
/
pca_tf.Rmd
179 lines (134 loc) · 4.83 KB
/
pca_tf.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
---
title: "PCA vs Technical Variables"
author: "Po-Yuan Tung"
date: 2018-01-31
output: workflowr::wflow_html
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Setup
```{r packages, message=FALSE}
library("cowplot")
library("dplyr")
library("edgeR")
library("ggplot2")
library("heatmap3")
library("reshape2")
library("SingleCellExperiment")
source("code/utility.R")
```
## PCA
### Before fileter
```{r data}
sce_raw <- readRDS("data/sce-raw.rds")
## look at human genes
sce_raw_hs <- sce_raw[rowData(sce_raw)$source == "H. sapiens", ]
head(colData(sce_raw_hs))
## remove genes of all 0s
sce_raw_hs_clean <- sce_raw_hs[rowSums(assay(sce_raw_hs)) != 0, ]
dim(sce_raw_hs_clean)
## convert to log2 cpm
mol_raw_hs_cpm <- edgeR::cpm(assay(sce_raw_hs_clean), log = TRUE)
mol_raw_hs_cpm_means <- rowMeans(mol_raw_hs_cpm)
summary(mol_raw_hs_cpm_means)
## keep genes with reasonable expression levels
mol_raw_hs_cpm <- mol_raw_hs_cpm[mol_raw_hs_cpm_means > median(mol_raw_hs_cpm_means), ]
dim(mol_raw_hs_cpm)
anno_raw = data.frame(colData(sce_raw))
anno_raw_hs = data.frame(colData(sce_raw_hs))
```
```{r before-filter}
## pca of genes with reasonable expression levels
pca_raw_hs <- run_pca(mol_raw_hs_cpm)
## a function of pca vs technical factors
get_r2 <- function(x, y) {
stopifnot(length(x) == length(y))
model <- lm(y ~ x)
stats <- summary(model)
return(stats$adj.r.squared)
}
## selection of technical factor
covariates <- anno_raw %>% dplyr::select(experiment, well, concentration, raw:unmapped,
starts_with("detect"), chip_id, molecules)
## look at the first 6 PCs
pcs <- pca_raw_hs$PCs[, 1:6]
## generate the data
r2_before <- matrix(NA, nrow = ncol(covariates), ncol = ncol(pcs),
dimnames = list(colnames(covariates), colnames(pcs)))
for (cov in colnames(covariates)) {
for (pc in colnames(pcs)) {
r2_before[cov, pc] <- get_r2(covariates[, cov], pcs[, pc])
}
}
## plot
heatmap3(r2_before, cexRow=1, cexCol=1, margins=c(8,8), scale = "none",
ylab="technical factor", main = "Before filter")
plot_pca(pca_raw_hs$PCs, pcx = 1, pcy = 2, explained = pca_raw_hs$explained,
metadata = anno_raw_hs, color="chip_id")
```
### After filter
```{r filter-data}
sce_filtered = sce_raw[,sce_raw$filter_all == TRUE]
```
Compute log2 CPM based on the library size before filtering.
```{r log2}
log2cpm <- edgeR::cpm(assay(sce_filtered), log = TRUE)
dim(log2cpm)
```
```{r after-filter}
pca_log2cpm <- run_pca(log2cpm)
anno = data.frame(colData(sce_filtered))
anno$experiment <- as.factor(anno$experiment)
plot_pca(x=pca_log2cpm$PCs, explained=pca_log2cpm$explained,
metadata=anno, color="chip_id")
plot_pca(x=pca_log2cpm$PCs, explained=pca_log2cpm$explained,
metadata=anno, color="experiment")
```
```{r after-filter-tf}
## selection of technical factor
covariates <- anno %>% dplyr::select(experiment, well, chip_id,
concentration, raw:unmapped,
starts_with("detect"), molecules)
## look at the first 6 PCs
pcs <- pca_log2cpm$PCs[, 1:6]
## generate the data
r2 <- matrix(NA, nrow = ncol(covariates), ncol = ncol(pcs),
dimnames = list(colnames(covariates), colnames(pcs)))
for (cov in colnames(covariates)) {
for (pc in colnames(pcs)) {
r2[cov, pc] <- get_r2(covariates[, cov], pcs[, pc])
}
}
## plot heatmap
heatmap3(r2, cexRow=1, cexCol=1, margins=c(8,8), scale = "none",
ylab="technical factor", main = "After filter")
```
PC1 correlated with number of genes detected, which is described in [Hicks et al 2017](https://academic.oup.com/biostatistics/advance-article/doi/10.1093/biostatistics/kxx053/4599254)
Number of genes detected also highly correlated with sequencing metrics, especially total molecule number per sample.
```{r cor}
cor_tech <- cor(as.matrix(covariates[,4:11]),use="pairwise.complete.obs")
heatmap(cor_tech, symm = TRUE)
```
Look at the top 10% expression genes to see if the correlation of PC1 and number of detected gene would go away. However, the PC1 is still not individual (chip_id).
```{r top}
## look at top 10% of genes
log2cpm_mean <- rowMeans(log2cpm)
summary(log2cpm_mean)
log2cpm_top <- log2cpm[rank(log2cpm_mean) / length(log2cpm_mean) > 1 - 0.1, ]
dim(log2cpm_top)
pca_top <- run_pca(log2cpm_top)
## look at the first 6 PCs
pcs <- pca_top$PCs[, 1:6]
## generate the data
r2_top <- matrix(NA, nrow = ncol(covariates), ncol = ncol(pcs),
dimnames = list(colnames(covariates), colnames(pcs)))
for (cov in colnames(covariates)) {
for (pc in colnames(pcs)) {
r2_top[cov, pc] <- get_r2(covariates[, cov], pcs[, pc])
}
}
## plot heatmap
heatmap3(r2_top, cexRow=1, cexCol=1, margins=c(8,8), scale = "none",
ylab="technical factor", main = "Top 10 % gene")
```