Toil_Norm.Rmd

---
title: "Toil_Selection"
author: "Michael Kesling"
date: "9/22/2019"
output: rmarkdown::github_document 
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
require(matrixStats)

######## FUNCTIONS NOT TO BE PRINTED IN MARKDOWN DOC:
scaleData <- function(DM, MEAN=TRUE, ROW=FALSE){
   # this function takes a data matrix, DM, and scales the data by each row's
   # standard deviation (if ROW=TRUE).  If ROW=FALSE, then all scaling will be
   # performed by column.  It may also first subtract off the (row)'s mean if
   # MEAN=TRUE.  For (rows) whose SD=0, they are set aside and added back once
   # the normalization is finished
   # We assume that counts have already been normalized across samples!!
   rowcol <- ifelse(ROW==TRUE, 1, 2)          # normalization by row or column
   rowcolInv <- ifelse(rowcol==1, 2, 1)       # inverse val needed for norm
   SD <- apply(DM, rowcol, sd)                # calc std deviations
   numSDzeros <- sum(SD==0)
   # remove row / col if SD == 0
   if(numSDzeros>0){                          # rm row/col if SD == 0
      zeroSD <- which(SD == 0)                # id examples with zero SD
      DM <- ifelse(rowcol==2, list(DM[, -zeroSD]), list(DM[-zeroSD,]))[[1]]
      SD <- SD[-zeroSD]
   }

   means <- apply(DM, rowcol, mean)
   
   # apply normalization with or without mean subtraction:
   if(MEAN==FALSE){
      DM <- t(apply(DM, rowcolInv, function(x){x / SD}))
   }
   else{
      DM <- t(apply(DM, rowcolInv, function(x){(x - means) / SD}))
   }

   if(rowcol == 1){           # transpose matrix if normalization is across rows
      DM <- t(DM)
   }
   
   # add back all-zero row / column taken out earlier (if done at all)
   if(numSDzeros>0){
      if(rowcol ==1){
         zeros <- matrix(rep(0, length(zeroSD)*dim(DM)[2]), ncol=dim(DM)[2])
         rownames(zeros) <- names(zeroSD)
         DM <- rbind(DM, zeros)
      }
      else{
         zeros <- matrix(rep(0, length(zeroSD)*dim(DM)[1]), nrow=dim(DM)[1])
         colnames(zeros) <- names(zeroSD)
         DM <- cbind(DM, zeros)
      }
   }
   return(DM)
}
```
### Looking at Batch Effects

As I've had trouble with the samples that went through the ComBat Batch normalization, I've downloaded the input files of that process, but which have already gone through the Toil-standardization (RSEM_expected_counts with DESeq2-standardization).

Here, I'm going to subset the 8GB file with those files that are contained within the 398 samples I've been working with.

Then, I'm going to convert the Ensemble IDs to the more readable HUGO gene IDs.

```{r}
require(dplyr)
require(magrittr)
wangMatrix <- read.table("/Users/mjk/Desktop/Tresorit_iOS/projects/RNA-Seq/data/wangBreastFPKM398_Attrib.txt", header=TRUE) # rownames okay now
wangSelectedSamples <- colnames(wangMatrix)
wangTCGAsamples <- wangSelectedSamples[grep("^TCGA", wangSelectedSamples)] %>%
   {gsub("\\.", "-", .)} %>% 
   {gsub("([TCGA]-[^-]+[-][^-]+[-][^-]+)[-].*", "\\1", .)} %>% 
   {gsub("[AB]$", "", .)}

wangGTEXsamples <- wangSelectedSamples[grep("^GTEX", wangSelectedSamples)] %>%
   {gsub("\\.", "-", .)}

wangSamples <- c(wangGTEXsamples, wangTCGAsamples)

#toilSampleNames <- system(" head -1 /Users/mjk/RNA-Seq_2019/TOIL_Data/TCGA-GTEx-TARGET-gene-exp-counts.deseq2-normalized.log2 | sed 's/\t/ /g'", intern = TRUE) %>% strsplit(" ")
toilSampleNames <- read.table("toilDataHeader", sep="\t")

#grep(wangSamples[1], toilSampleNames[[1]])
#apply(wangSamples, grep, toilSampleNames[[1]])
multiGrep <- function(var1, var2){
   return(grep(var1, var2))
}

wangSampleMapping <- sapply(wangSamples, multiGrep, var2=toilSampleNames[[1]])

# missing samples in TOIL set:
missingSamples <- wangSampleMapping[grep("integer", wangSampleMapping)]
```
Next, I read in the toil dataset (8GB+) and subset it by the Wang sample labels.
```{r}
# code only run the first time:
#toilData <- read.table("/Users/mjk/RNA-Seq_2019/TOIL_Data/TCGA-GTEx-TARGET-gene-exp-counts.deseq2-normalized.log2", header=TRUE, stringsAsFactors = FALSE)

# QA:
#sum(gsub("-", "\\.", toilSampleNames[[1]]) %in% colnames(toilData))
#length(toilSampleNames[[1]])


#write.table(paste(unique(sort(unlist(wangSampleMapping[grep("integer", wangSampleMapping, invert=TRUE)], use.names = FALSE))), 
#                  collapse=","), "wangSamples.uniq", sep=",",
#            row.names = FALSE, col.names = FALSE)
#relevantCols <- unique(sort(unlist(wangSampleMapping[grep("integer", wangSampleMapping, invert=TRUE)], use.names = FALSE)-1))

#toilSubset <- toilData[, relevantCols]

#write.table(toilSubset, "toilSubset382.txt", sep="\t")
```

```{r}
# just read already-subsetted dataframe
toilSubset <- read.table("toilSubset382.txt", sep="\t")
```
Next, I add gene names to the dataframe based on the Ensembl ID:
```{r}
require(dplyr)
toilGeneAnnot <- read.table("~/RNA-Seq_2019/TOIL_Data/gencode.v23.annotation.gene.probemap",
                            header=TRUE)
id2gene <- setNames(as.list(as.character(toilGeneAnnot$gene)),
                    toilGeneAnnot$id)
toilSubset <- toilSubset %>% tibble::rownames_to_column()
toilSubset <- toilSubset %>% mutate(gene=id2gene[toilSubset$rowname])
```
Next, I reorder the toilSubset data frame to group "like" samples and make the gene name the row name.
```{r}
colReOrder <- grep("^GTEX", colnames(toilSubset))
colReOrder <- c(colReOrder, grep("11$", colnames(toilSubset)))
colReOrder <- c(colReOrder, grep("01$", colnames(toilSubset)))
tmp <- toilSubset[,colReOrder]
rownames(tmp) <- paste0(toilSubset$gene, "-", toilSubset$rowname) #complicated
# but needed to avoid duplicates
toilSubset <- tmp
rm(tmp)
```
Next, I'll remove every gene if >75% of it's samples have XXX
```{r}
require(ggplot2)
# Let's start with distribution of median counts for each gene.
quant75LevelsGenes <- apply(toilSubset, 1, quantile)[4,]
hist(quant75LevelsGenes, breaks=100)
```
Remember that these values are on the log2 scale!  Therefore, genes under log2(exp) = 5 are likely very noisy.  For now, I'm only going to filter out the lowest genes:

```{r}
print(paste("The maximum log2(Gene 75th quantile) is ", max(quant75LevelsGenes), " and the number of genes whose log2(75th quantile expression-level) is less than 0.2 is ", sum(quant75LevelsGenes < 0.2)))

```
Gene Filtering, Convert Values to Normal Scale, and transpose Matrix
```{r}
# filter genes virtually non-existent:
toilSubset <- toilSubset %>% tibble::rownames_to_column("genename") %>%  filter(quant75LevelsGenes > 0.2)    # 31995 genes left across 382 samples plus genename column
# to natural scale and transpose the matrix:
toilSubNatural <- as.matrix(exp(toilSubset[,2:dim(toilSubset)[2]]) -1)
rownames(toilSubNatural) <- toilSubset$genename
toilSubNatural <- t(toilSubNatural)

```
# t-SNE to view Cancer/Healthy Diagnosis Partitioning
```{r}
require(Rtsne);
require(ggplot2);
set.seed(31234)
tSNEout_toilSN <- Rtsne(toilSubNatural, dims=2)
prognosis <- c(rep(1, 76), rep(2,109), rep(3, 197))       # 1/2 = healthy, 3 = cancer
tsne_plot_toil <- data.frame(x=tSNEout_toilSN$Y[,1], y = tSNEout_toilSN$Y[,2], col=prognosis)
colors3pal <- c("#FDAE6B", "#E6550D",  "#56B4E9")
#tsne_plot_3class <- data.frame(x=tSNEout_full$Y[,1], y = tSNEout_full$Y[,2], col=prognosis)
ggplot(tsne_plot_toil) + 
   geom_point(aes(x=x, y=y, color=as.factor(col))) + 
   ggtitle("t-SNE Plot of Training Data Using All Filtered Predictors of Toil Data") +
   scale_color_manual(name="Category",
                      breaks = c("1", "2", "3"),
                      values = c(colors3pal[1], colors3pal[2], colors3pal[3]),
                      labels = c("Healthy-GTEX", "Healthy-TCGA", "Cancer-TCGA"))
```
The t-SNE plot shows significant batch effects between healthy TCGA samples and healthy GTEX samples.
### Logistic Regression with Lasso Regularizer on Toil Data
I'd like to compare the performance on this breast cancer dataset in the absence of batch normalization (ComBat).
```{r, fig.height=8, fig.width=8}
require(glmnet);
require(ggplot2);
# install_github("ririzarr/rafalib")
require(rafalib);
dim(toilSubNatural)
##############
### Split toil matrix into training and test sets:
require(caTools)
set.seed(233992812)
outcome <- c(rep(0, 185), rep(1, 197))
# bind outcome variable on data frame for even, random partitioning
toilSubNatural <- data.frame(cbind(toilSubNatural, outcome))
idxTrain <- sample.split(toilSubNatural$outcome, SplitRatio = 0.75)
toilSubNatTrain <- subset(toilSubNatural, idxTrain==TRUE)
outcomeTrain <- subset(toilSubNatural$outcome, idxTrain==TRUE)
toilSubNatTest <- subset(toilSubNatural, idxTrain==FALSE)
outcomeTest <- subset(toilSubNatural$outcome, idxTrain==FALSE)
# remove outcome variable:
toilSubNatTrainPred <- toilSubNatTrain %>% select(-outcome)
toilSubNatTestPred <- toilSubNatTest %>% select(-outcome)
# convert back to matrices:
toilSubNatTrainPred <- as.matrix(toilSubNatTrainPred)
toilSubNatTestPred <- as.matrix(toilSubNatTestPred)

# center and scale the data:
toilTrainScaled <- scaleData(toilSubNatTrainPred, TRUE, FALSE)
toilTestScaled <- scaleData(toilSubNatTestPred, TRUE, FALSE)

# QA STDEV and MEAN of each column
all(abs(apply(toilTrainScaled, 2, mean)) < 0.00001)
all(abs(apply(toilTestScaled, 2, mean)) < 0.00001)
all(abs(apply(toilTrainScaled, 2, sd)) < 1.01 & abs(apply(toilTrainScaled, 2, sd)) > 0.99)
all(abs(apply(toilTestScaled, 2, sd)) < 1.01 & abs(apply(toilTestScaled, 2, sd)) > 0.99)
###############
### Fitting Logistic Regression with Lasso Regularizer on toilSubNatural matrix
set.seed(1011)
fitToil.lasso <- glmnet(toilTrainScaled, outcomeTrain, family="binomial",
                           alpha = 1)
plot(fitToil.lasso, xvar="lambda", label=TRUE)

```
It's clear that scaling the predictors before Lasso enables many more to be used.  It would be interesting to see how many of these genes have a low level of expression (and therefore might be noisy data).
```{r}
#fitToil.lasso
set.seed(1011)
cv.Toil.lasso <- cv.glmnet(toilTrainScaled, outcomeTrain, family="binomial", alpha=1) 
                      #type.measure = "deviance")
plot(cv.Toil.lasso)
```
```{r}
toilTest <- cbind(1, toilTestScaled)
colnames(toilTest)[1] <- "(Intercept)"
toilTest <- as.matrix(toilTest)

coefsToil <- coef(fitToil.lasso, s=cv.Toil.lasso$lambda.1se)
testPredictions_Toil <- ifelse(toilTest %*% coefsToil > 0, 1, 0)
table(outcomeTest, testPredictions_Toil)
```
##### (in progress)