-
Notifications
You must be signed in to change notification settings - Fork 0
/
Toil_Norm.Rmd
252 lines (217 loc) · 10.1 KB
/
Toil_Norm.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
---
title: "Toil_Selection"
author: "Michael Kesling"
date: "9/22/2019"
output: rmarkdown::github_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
require(matrixStats)
######## FUNCTIONS NOT TO BE PRINTED IN MARKDOWN DOC:
scaleData <- function(DM, MEAN=TRUE, ROW=FALSE){
# this function takes a data matrix, DM, and scales the data by each row's
# standard deviation (if ROW=TRUE). If ROW=FALSE, then all scaling will be
# performed by column. It may also first subtract off the (row)'s mean if
# MEAN=TRUE. For (rows) whose SD=0, they are set aside and added back once
# the normalization is finished
# We assume that counts have already been normalized across samples!!
rowcol <- ifelse(ROW==TRUE, 1, 2) # normalization by row or column
rowcolInv <- ifelse(rowcol==1, 2, 1) # inverse val needed for norm
SD <- apply(DM, rowcol, sd) # calc std deviations
numSDzeros <- sum(SD==0)
# remove row / col if SD == 0
if(numSDzeros>0){ # rm row/col if SD == 0
zeroSD <- which(SD == 0) # id examples with zero SD
DM <- ifelse(rowcol==2, list(DM[, -zeroSD]), list(DM[-zeroSD,]))[[1]]
SD <- SD[-zeroSD]
}
means <- apply(DM, rowcol, mean)
# apply normalization with or without mean subtraction:
if(MEAN==FALSE){
DM <- t(apply(DM, rowcolInv, function(x){x / SD}))
}
else{
DM <- t(apply(DM, rowcolInv, function(x){(x - means) / SD}))
}
if(rowcol == 1){ # transpose matrix if normalization is across rows
DM <- t(DM)
}
# add back all-zero row / column taken out earlier (if done at all)
if(numSDzeros>0){
if(rowcol ==1){
zeros <- matrix(rep(0, length(zeroSD)*dim(DM)[2]), ncol=dim(DM)[2])
rownames(zeros) <- names(zeroSD)
DM <- rbind(DM, zeros)
}
else{
zeros <- matrix(rep(0, length(zeroSD)*dim(DM)[1]), nrow=dim(DM)[1])
colnames(zeros) <- names(zeroSD)
DM <- cbind(DM, zeros)
}
}
return(DM)
}
```
### Looking at Batch Effects
As I've had trouble with the samples that went through the ComBat Batch normalization, I've downloaded the input files of that process, but which have already gone through the Toil-standardization (RSEM_expected_counts with DESeq2-standardization).
Here, I'm going to subset the 8GB file with those files that are contained within the 398 samples I've been working with.
Then, I'm going to convert the Ensemble IDs to the more readable HUGO gene IDs.
```{r}
require(dplyr)
require(magrittr)
wangMatrix <- read.table("/Users/mjk/Desktop/Tresorit_iOS/projects/RNA-Seq/data/wangBreastFPKM398_Attrib.txt", header=TRUE) # rownames okay now
wangSelectedSamples <- colnames(wangMatrix)
wangTCGAsamples <- wangSelectedSamples[grep("^TCGA", wangSelectedSamples)] %>%
{gsub("\\.", "-", .)} %>%
{gsub("([TCGA]-[^-]+[-][^-]+[-][^-]+)[-].*", "\\1", .)} %>%
{gsub("[AB]$", "", .)}
wangGTEXsamples <- wangSelectedSamples[grep("^GTEX", wangSelectedSamples)] %>%
{gsub("\\.", "-", .)}
wangSamples <- c(wangGTEXsamples, wangTCGAsamples)
#toilSampleNames <- system(" head -1 /Users/mjk/RNA-Seq_2019/TOIL_Data/TCGA-GTEx-TARGET-gene-exp-counts.deseq2-normalized.log2 | sed 's/\t/ /g'", intern = TRUE) %>% strsplit(" ")
toilSampleNames <- read.table("toilDataHeader", sep="\t")
#grep(wangSamples[1], toilSampleNames[[1]])
#apply(wangSamples, grep, toilSampleNames[[1]])
multiGrep <- function(var1, var2){
return(grep(var1, var2))
}
wangSampleMapping <- sapply(wangSamples, multiGrep, var2=toilSampleNames[[1]])
# missing samples in TOIL set:
missingSamples <- wangSampleMapping[grep("integer", wangSampleMapping)]
```
Next, I read in the toil dataset (8GB+) and subset it by the Wang sample labels.
```{r}
# code only run the first time:
#toilData <- read.table("/Users/mjk/RNA-Seq_2019/TOIL_Data/TCGA-GTEx-TARGET-gene-exp-counts.deseq2-normalized.log2", header=TRUE, stringsAsFactors = FALSE)
# QA:
#sum(gsub("-", "\\.", toilSampleNames[[1]]) %in% colnames(toilData))
#length(toilSampleNames[[1]])
#write.table(paste(unique(sort(unlist(wangSampleMapping[grep("integer", wangSampleMapping, invert=TRUE)], use.names = FALSE))),
# collapse=","), "wangSamples.uniq", sep=",",
# row.names = FALSE, col.names = FALSE)
#relevantCols <- unique(sort(unlist(wangSampleMapping[grep("integer", wangSampleMapping, invert=TRUE)], use.names = FALSE)-1))
#toilSubset <- toilData[, relevantCols]
#write.table(toilSubset, "toilSubset382.txt", sep="\t")
```
```{r}
# just read already-subsetted dataframe
toilSubset <- read.table("toilSubset382.txt", sep="\t")
```
Next, I add gene names to the dataframe based on the Ensembl ID:
```{r}
require(dplyr)
toilGeneAnnot <- read.table("~/RNA-Seq_2019/TOIL_Data/gencode.v23.annotation.gene.probemap",
header=TRUE)
id2gene <- setNames(as.list(as.character(toilGeneAnnot$gene)),
toilGeneAnnot$id)
toilSubset <- toilSubset %>% tibble::rownames_to_column()
toilSubset <- toilSubset %>% mutate(gene=id2gene[toilSubset$rowname])
```
Next, I reorder the toilSubset data frame to group "like" samples and make the gene name the row name.
```{r}
colReOrder <- grep("^GTEX", colnames(toilSubset))
colReOrder <- c(colReOrder, grep("11$", colnames(toilSubset)))
colReOrder <- c(colReOrder, grep("01$", colnames(toilSubset)))
tmp <- toilSubset[,colReOrder]
rownames(tmp) <- paste0(toilSubset$gene, "-", toilSubset$rowname) #complicated
# but needed to avoid duplicates
toilSubset <- tmp
rm(tmp)
```
Next, I'll remove every gene if >75% of it's samples have XXX
```{r}
require(ggplot2)
# Let's start with distribution of median counts for each gene.
quant75LevelsGenes <- apply(toilSubset, 1, quantile)[4,]
hist(quant75LevelsGenes, breaks=100)
```
Remember that these values are on the log2 scale! Therefore, genes under log2(exp) = 5 are likely very noisy. For now, I'm only going to filter out the lowest genes:
```{r}
print(paste("The maximum log2(Gene 75th quantile) is ", max(quant75LevelsGenes), " and the number of genes whose log2(75th quantile expression-level) is less than 0.2 is ", sum(quant75LevelsGenes < 0.2)))
```
Gene Filtering, Convert Values to Normal Scale, and transpose Matrix
```{r}
# filter genes virtually non-existent:
toilSubset <- toilSubset %>% tibble::rownames_to_column("genename") %>% filter(quant75LevelsGenes > 0.2) # 31995 genes left across 382 samples plus genename column
# to natural scale and transpose the matrix:
toilSubNatural <- as.matrix(exp(toilSubset[,2:dim(toilSubset)[2]]) -1)
rownames(toilSubNatural) <- toilSubset$genename
toilSubNatural <- t(toilSubNatural)
```
# t-SNE to view Cancer/Healthy Diagnosis Partitioning
```{r}
require(Rtsne);
require(ggplot2);
set.seed(31234)
tSNEout_toilSN <- Rtsne(toilSubNatural, dims=2)
prognosis <- c(rep(1, 76), rep(2,109), rep(3, 197)) # 1/2 = healthy, 3 = cancer
tsne_plot_toil <- data.frame(x=tSNEout_toilSN$Y[,1], y = tSNEout_toilSN$Y[,2], col=prognosis)
colors3pal <- c("#FDAE6B", "#E6550D", "#56B4E9")
#tsne_plot_3class <- data.frame(x=tSNEout_full$Y[,1], y = tSNEout_full$Y[,2], col=prognosis)
ggplot(tsne_plot_toil) +
geom_point(aes(x=x, y=y, color=as.factor(col))) +
ggtitle("t-SNE Plot of Training Data Using All Filtered Predictors of Toil Data") +
scale_color_manual(name="Category",
breaks = c("1", "2", "3"),
values = c(colors3pal[1], colors3pal[2], colors3pal[3]),
labels = c("Healthy-GTEX", "Healthy-TCGA", "Cancer-TCGA"))
```
The t-SNE plot shows significant batch effects between healthy TCGA samples and healthy GTEX samples.
### Logistic Regression with Lasso Regularizer on Toil Data
I'd like to compare the performance on this breast cancer dataset in the absence of batch normalization (ComBat).
```{r, fig.height=8, fig.width=8}
require(glmnet);
require(ggplot2);
# install_github("ririzarr/rafalib")
require(rafalib);
dim(toilSubNatural)
##############
### Split toil matrix into training and test sets:
require(caTools)
set.seed(233992812)
outcome <- c(rep(0, 185), rep(1, 197))
# bind outcome variable on data frame for even, random partitioning
toilSubNatural <- data.frame(cbind(toilSubNatural, outcome))
idxTrain <- sample.split(toilSubNatural$outcome, SplitRatio = 0.75)
toilSubNatTrain <- subset(toilSubNatural, idxTrain==TRUE)
outcomeTrain <- subset(toilSubNatural$outcome, idxTrain==TRUE)
toilSubNatTest <- subset(toilSubNatural, idxTrain==FALSE)
outcomeTest <- subset(toilSubNatural$outcome, idxTrain==FALSE)
# remove outcome variable:
toilSubNatTrainPred <- toilSubNatTrain %>% select(-outcome)
toilSubNatTestPred <- toilSubNatTest %>% select(-outcome)
# convert back to matrices:
toilSubNatTrainPred <- as.matrix(toilSubNatTrainPred)
toilSubNatTestPred <- as.matrix(toilSubNatTestPred)
# center and scale the data:
toilTrainScaled <- scaleData(toilSubNatTrainPred, TRUE, FALSE)
toilTestScaled <- scaleData(toilSubNatTestPred, TRUE, FALSE)
# QA STDEV and MEAN of each column
all(abs(apply(toilTrainScaled, 2, mean)) < 0.00001)
all(abs(apply(toilTestScaled, 2, mean)) < 0.00001)
all(abs(apply(toilTrainScaled, 2, sd)) < 1.01 & abs(apply(toilTrainScaled, 2, sd)) > 0.99)
all(abs(apply(toilTestScaled, 2, sd)) < 1.01 & abs(apply(toilTestScaled, 2, sd)) > 0.99)
###############
### Fitting Logistic Regression with Lasso Regularizer on toilSubNatural matrix
set.seed(1011)
fitToil.lasso <- glmnet(toilTrainScaled, outcomeTrain, family="binomial",
alpha = 1)
plot(fitToil.lasso, xvar="lambda", label=TRUE)
```
It's clear that scaling the predictors before Lasso enables many more to be used. It would be interesting to see how many of these genes have a low level of expression (and therefore might be noisy data).
```{r}
#fitToil.lasso
set.seed(1011)
cv.Toil.lasso <- cv.glmnet(toilTrainScaled, outcomeTrain, family="binomial", alpha=1)
#type.measure = "deviance")
plot(cv.Toil.lasso)
```
```{r}
toilTest <- cbind(1, toilTestScaled)
colnames(toilTest)[1] <- "(Intercept)"
toilTest <- as.matrix(toilTest)
coefsToil <- coef(fitToil.lasso, s=cv.Toil.lasso$lambda.1se)
testPredictions_Toil <- ifelse(toilTest %*% coefsToil > 0, 1, 0)
table(outcomeTest, testPredictions_Toil)
```
##### (in progress)