# 3.4 Calling differentially expressed peaks with DESeq2 and limma

### IMPORTANT: Please make sure that you are using the R kernel to run this notebook. ###


In this tutorial, we will focus on calling differential peaks: 
![Analysis pipeline](images/part4.png)

## Running DESeq

DESeq(https://bioconductor.org/packages/release/bioc/html/DESeq2.html) uses read count data, such as in our matrix **all.readcount.txt**, to estimate differential gene expression across conditions specified in a metdata file.  We run DESeq with 4 comparisons (which we call "contrasts"): 
* Media 
    * glucose vs ethanol
* Strain: 
    *  WT vs asf1
    *  WT vs rtt109
    *  asf1 vs rtt109 
   

In [1]:
#change to your working directory 
username="annashch"
setwd(paste("/scratch/",username,sep=""))

In [2]:
#load the DESeq2 library
library(DESeq2,quietly = TRUE)



Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which, which.max, which.min


Attaching package: ‘S4Vectors’

The following object is masked from ‘package:base’:

    expand.grid

Welcome to Bioconductor

    Vignettes contain introductory material; view with
    '

In [3]:
#We read in the counts data matrix and the metdata matrix in the same manner as we did in tutorial 3.1 
#load the read count matrix
count_data=read.table("all.readcount.txt",header=TRUE)
rownames(count_data)=paste(count_data$Chrom,count_data$Start,count_data$End,sep='\t')
#remove the columns we will not use 
count_data$Chrom=NULL
count_data$Start=NULL
count_data$End=NULL
count_data$ID=NULL
#We mustn't forget to swap the sample columns for "jkcheng_rtt109_YPGE_3" and "jkcheng_WT_YPD_3"
count_data$tmp1=count_data$jkcheng_rtt109_YPGE_3
count_data$tmp2=count_data$jkcheng_WT_YPD_3
count_data$jkcheng_rtt109_YPGE_3=count_data$tmp2
count_data$jkcheng_WT_YPD_3=count_data$tmp1
count_data$tmp1=NULL
count_data$tmp2=NULL

head(count_data)

Unnamed: 0_level_0,cln3.SCD.0_6MNaCl.Rep1_R1_001,cln3.SCD.0_6MNaCl.Rep2_R1_001,cln3.SCD.Rep1_R1_001,cln3.SCD.Rep2_R1_001,cln3.SCE.0_6MNaCl.Rep1_R1_001,cln3.SCE.0_6MNaCl.Rep2_R1_001,cln3.SCE.Rep1_R1_001,cln3.SCE.Rep2_R1_001,whi5.cln3.SCE.Rep1_R1_001,whi5.cln3.SCE.Rep2_R1_001,whi5.SCE.Rep1_R1_001,whi5.SCE.Rep2_R1_001,WT.SCD.0_6MNaCl.Rep1_R1_001,WT.SCD.0_6MNaCl.Rep2_R1_001,WT.SCD.Rep1_R1_001,WT.SCD.Rep2_R1_001,WT.SCE.0_6MNaCl.Rep1_R1_001,WT.SCE.0_6MNaCl.Rep2_R1_001,WT.SCE.Rep1_R1_001,WT.SCE.Rep2_R1_001
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chrI	0	781,0,0,151,191,226,158,210,127,292,296,232,188,83,246,25,182,241,203,9,244
chrI	6332	6549,0,0,537,820,1342,1050,1157,590,1460,1624,1562,713,590,1585,115,732,2227,2032,90,1230
chrI	9138	9609,0,0,175,222,366,251,304,160,401,483,410,261,143,379,34,220,379,383,17,344
chrI	20611	21197,0,0,249,309,369,282,316,189,394,406,322,314,134,342,60,370,334,310,19,410
chrI	28155	29092,0,0,50,50,48,37,42,22,57,65,55,72,12,49,7,47,65,64,1,60
chrI	29173	30197,0,0,88,115,215,226,225,129,241,390,284,118,86,224,27,164,324,316,25,249


In [5]:
metadata=read.table("/metadata/TC2019_samples.tsv",header=TRUE)
#We use the "factor" function to tell R which variables are categorical rather than continuous 
metadata$Strain=factor(metadata$Strain)
metadata$Media=factor(metadata$Media)
metadata$Researcher=factor(metadata$Researcher)
#we don't need the other metadata columns for this analysis 
metadata$Sample=NULL
metadata$Replicate=NULL
rownames(metadata)=metadata$ID
metadata$ID=NULL
metadata$ResearcherIntended=NULL
metadata$Confounding=NULL
metadata$Lane=NULL
#make sure the rows in metadata match the order of the columns in count_data 
metadata=metadata[names(count_data),]
head(metadata)

ERROR: Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, : line 1 did not have 8 elements


In [None]:
#We set threshold for determining differential expression 
padjust_thresh=0.01 


In [None]:
#create a DESeq2 object with the data, metadata, and model information 
ddsMat=DESeqDataSetFromMatrix(countData=as.matrix(count_data),
                            colData=metadata,
                            design=~Strain+Media+Researcher)


In [None]:
#Run DESeq2 analysis 
dds<-DESeq(ddsMat)

In [None]:
#We can examine several contrasts in the resulting DESeq2 object
resultsNames(dds)

In [None]:
#Specify the contrasts we want to examine (we indicated these above)
deseq_contrasts=list(c("Media","YPD","YPGE"),
                     c("Strain","WT","asf1"),
                     c("Strain","WT","rtt109"),
                     c("Strain","asf1","rtt109"))
contrast_names=c("Media_YPD_vs_YPGE",
        "Strain_WT_vs_asf1",
        "Strain_WT_vs_rtt109",
        "Strain_asf1_vs_rtt109")



In [None]:
#Query the DESeq2 results to find differential peaks for each contrast, using our padjust_thresh and lfc_thresh values.
for(contrast_index in seq(1,4))
{
        comparison_name=unlist(contrast_names[contrast_index])    
        print(comparison_name)
        ds=results(dds,
           contrast=unlist(deseq_contrasts[contrast_index]))
       
        #write  entries for all peaks
        write.table(ds,file=paste(comparison_name,".txt",sep=""),quote=FALSE,row.names=TRUE,col.names=TRUE,sep='\t')
    
        #subset the peak set to just the differential peaks 
        ds=na.omit(ds)
        sig=ds[ds$padj<padjust_thresh,] 
    
        #find positive log fold change peaks 
        positive_sig=sig[sig$log2FoldChange > 0,]
    
        #find negative log fold change peaks 
        negative_sig=sig[sig$log2FoldChange <0,]
    
        write.table(positive_sig,
                    file=paste(comparison_name,".differential.positive.txt",sep=""),
                    quote=FALSE,row.names=FALSE,col.names=FALSE,sep='\t')
        write.table(negative_sig,
                    file=paste(comparison_name,".differential.negative.txt",sep=""),
                    quote=FALSE,row.names=FALSE,col.names=FALSE,sep='\t')
}


This code will generate 4 sets of files: 

* Media_YPD_vs_YPGE.txt  
* Media_YPD_vs_YPGE.differential.positive.txt  
* Media_YPD_vs_YPGE.differential.negative.txt  


* Strain_WT_vs_asf1.txt  
* Strain_WT_vs_asdf1.differential.positive.txt
* Strain_WT_vs_asdf1.differential.negative.txt


* Strain_WT_vs_rtt109.txt  
* Strain_WT_vs_rtt109.differential.positive.txt  
* Strain_WT_vs_rtt109.differential.negative.txt  


* Strain_asf1_vs_rtt109.txt
* Strain_asf1_vs_rtt109.differential.positive.txt
* Strain_asf1_vs_rtt109.differential.negative.txt

The first is the raw output from DESeq for all peaks. We will not have time to discuss everything in this file, but feel free to read the DESeq manual and see if you can understand it. The second,  contains a list of the IDs of the differentially open peaks from ATAC‐seq. The p‐value cutoff for differential openness that we use is 0.01. 

### Running limma ###

If you recall, we used the R limma package to remove the "Researcher" batch effect in our data. Limma can also be used for differential peak calling. Limma uses a similar algorithm to DESeq2. We will go through the process of calling differential peaks with limma and see how the peak rankings differ between limma and DESeq2 -- it's always best to sanity check your results by running them through several similar analysis algorithms. 

In [6]:
#import the limma library 

library(limma)
#design the model 
design=model.matrix(~0+Strain+Media+Researcher,data=metadata)

#We use the "voom" function associated with the limma package to normalize the count data 
vm=voom(count_data,design)

#fit the model to the data 
fit=lmFit(vm,design=vm$design)


#We'll examine the media contrast 
cont.matrix=makeContrasts(media="MediaYPGE",levels=fit)
media_model=eBayes(contrasts.fit(fit,cont.matrix))
res_limma=topTable(media_model,n=nrow(count_data))
head(res_limma)


Attaching package: ‘limma’

The following object is masked from ‘package:DESeq2’:

    plotMA

The following object is masked from ‘package:BiocGenerics’:

    plotMA



ERROR: Error in as.data.frame.default(data, optional = TRUE): cannot coerce class ‘structure("standardGeneric", package = "methods")’ to a data.frame


### Comparing DESeq2 and limma voom outputs ### 

In [None]:
#Let's extract the media comparison from DESeq2
res_deseq2=results(dds,
           contrast=unlist(deseq_contrasts[1]))
res_deseq2=as.data.frame(res_deseq2)


In [None]:
#We need to merge the two result dataframes by peak name So that we can generate a scatterplot of
#padj in one vs the other 
res_limma$peak=rownames(res_limma)
res_deseq2$peak=rownames(res_deseq2)
nrow(res_limma)
nrow(res_deseq2)

In [None]:
merged_df=merge(res_limma,res_deseq2,by="peak")
merged_df$limma_padj=-10*log10(merged_df$padj)
merged_df$deseq2_padj=-10*log10(merged_df$adj.P.Val)



In [None]:
head(merged_df)

In [None]:
library(ggplot2)
ggplot(merged_df,aes(x=deseq2_padj,y=limma_padj))+
    geom_point(alpha=0.1)+
    xlim(0,400)+
    ylim(0,400)

The p-values appear to be pretty correlated. Let's make sure by computing the spearman and pearson correlations: 

In [None]:
spearman_cor=cor(merged_df$limma_padj,merged_df$deseq2_padj,method="spearman")
spearman_cor

In [None]:
pearson_cor=cor(merged_df$limma_padj,merged_df$deseq2_padj,method="pearson")
pearson_cor

Excellent, the correlations are near 90%. 

Finally, we plot the rank comparison of the p-values across the two methods. 

In [None]:
#use the "rank" function to generate rank columns for the p-values 
merged_df$limma_padj_rank=rank(merged_df$limma_padj)
merged_df$deseq2_padj_rank=rank(merged_df$deseq2_padj)

ggplot(merged_df,aes(x=deseq2_padj_rank,y=limma_padj_rank))+
    geom_point(alpha=0.1)