# 01__DESeq2

In this notebook, I run the raw counts for the differentiated/undifferentiated populations through DESeq2 to get a log2 fold change (l2fc) for each sgRNA in the library.

figures in this notebook:
- Fig S5E: PCA plots of samples before and after batch-correction

In [1]:
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")

In [2]:
# BiocManager::install("DESeq2")

In [3]:
# install.packages("gridExtra")

In [4]:
suppressMessages(library("DESeq2"))

In [5]:
library("ggplot2")
library("gridExtra")
library("gtable")
library("cowplot")


Attaching package: ‘gridExtra’


The following object is masked from ‘package:Biobase’:

    combine


The following object is masked from ‘package:BiocGenerics’:

    combine



********************************************************

Note: As of version 1.0.0, cowplot does not change the

  default ggplot2 theme anymore. To recover the previous

  behavior, execute:
  theme_set(theme_cowplot())

********************************************************




## 1. import data

In [6]:
# Load the raw sgRNA read counts from the tab-separated counts file, using the
# "sgRNA" column as row names, and convert the table to a matrix.
cts <- as.matrix(
  read.delim(
    "../../../data/02__screen/00__counts/Biol_Reps.sgRNA_counts.txt",
    row.names = "sgRNA"
  )
)
head(cts)

Unnamed: 0,DZ_Rep1,DZ_Rep2,Pos_Rep1,Pos_Rep2,Neg_Rep1,Neg_Rep2
TTATCTGCATACTTATACAG,728,637,921,420,0,524
TATGCTTTATCCACTCCGAC,1016,1206,1754,733,660,347
TGAAAGCATAGGATATGGCA,1859,2038,2540,1065,1730,1004
AATGGTAAGTTGCTGCTCAG,961,925,2061,758,1471,1253
TAAGCATGCGTGGTATCAAG,2257,1383,2558,921,1250,0
GCTGGGAGGCGCTGGACGGC,1075,1102,1466,847,916,664


In [7]:
# Load the tab-separated sample sheet, using its "column" field as row names,
# then encode the condition, replicate and time annotations as factors.
cols <- read.delim(
  "../../../data/02__screen/01__normalized_counts/col_info.txt",
  row.names = "column"
)
cols[c("condition", "rep", "time")] <- lapply(
  cols[c("condition", "rep", "time")],
  as.factor
)
head(cols)

Unnamed: 0_level_0,condition,rep,time
Unnamed: 0_level_1,<fct>,<fct>,<fct>
DZ_Rep1,DZ,Rep1,initial
DZ_Rep2,DZ,Rep2,initial
Pos_Rep1,Pos,Rep1,late
Pos_Rep2,Pos,Rep2,late
Neg_Rep1,Neg,Rep1,late
Neg_Rep2,Neg,Rep2,late


In [8]:
# Counts for the Neg vs Pos comparison, ordered Neg first.
# Samples are selected by column name rather than by position, so a change in
# the column order of the counts file raises an error instead of silently
# picking the wrong samples.
fc_cts <- cts[, c("Neg_Rep1", "Neg_Rep2", "Pos_Rep1", "Pos_Rep2")]
head(fc_cts)

Unnamed: 0,Neg_Rep1,Neg_Rep2,Pos_Rep1,Pos_Rep2
TTATCTGCATACTTATACAG,0,524,921,420
TATGCTTTATCCACTCCGAC,660,347,1754,733
TGAAAGCATAGGATATGGCA,1730,1004,2540,1065
AATGGTAAGTTGCTGCTCAG,1471,1253,2061,758
TAAGCATGCGTGGTATCAAG,1250,0,2558,921
GCTGGGAGGCGCTGGACGGC,916,664,1466,847


In [9]:
# Counts for the day-zero (DZ) vs Pos comparison, ordered DZ first.
# Samples are selected by column name rather than by position, so a change in
# the column order of the counts file raises an error instead of silently
# picking the wrong samples.
fc_dz_cts <- cts[, c("DZ_Rep1", "DZ_Rep2", "Pos_Rep1", "Pos_Rep2")]
head(fc_dz_cts)

Unnamed: 0,DZ_Rep1,DZ_Rep2,Pos_Rep1,Pos_Rep2
TTATCTGCATACTTATACAG,728,637,921,420
TATGCTTTATCCACTCCGAC,1016,1206,1754,733
TGAAAGCATAGGATATGGCA,1859,2038,2540,1065
AATGGTAAGTTGCTGCTCAG,961,925,2061,758
TAAGCATGCGTGGTATCAAG,2257,1383,2558,921
GCTGGGAGGCGCTGGACGGC,1075,1102,1466,847


In [10]:
# Sample annotations for the Neg vs Pos comparison.
# Rows are looked up by the column names of `fc_cts`, so the metadata is in the
# same order as the count matrix without relying on hard-coded row positions.
# The check guards against a sample that is missing from the sample sheet,
# which would otherwise produce an all-NA row.
stopifnot(all(colnames(fc_cts) %in% rownames(cols)))
fc_cols <- cols[colnames(fc_cts), ]
fc_cols

Unnamed: 0_level_0,condition,rep,time
Unnamed: 0_level_1,<fct>,<fct>,<fct>
Neg_Rep1,Neg,Rep1,late
Neg_Rep2,Neg,Rep2,late
Pos_Rep1,Pos,Rep1,late
Pos_Rep2,Pos,Rep2,late


In [11]:
# Sample annotations for the day-zero (DZ) vs Pos comparison.
# R indices are 1-based and a 0 index is silently dropped, so rows are selected
# by sample name instead of a `0:4` range. Looking them up via the column names
# of `fc_dz_cts` also keeps the metadata in the same order as the count matrix.
# The check guards against a sample that is missing from the sample sheet,
# which would otherwise produce an all-NA row.
stopifnot(all(colnames(fc_dz_cts) %in% rownames(cols)))
fc_dz_cols <- cols[colnames(fc_dz_cts), ]
fc_dz_cols

Unnamed: 0_level_0,condition,rep,time
Unnamed: 0_level_1,<fct>,<fct>,<fct>
DZ_Rep1,DZ,Rep1,initial
DZ_Rep2,DZ,Rep2,initial
Pos_Rep1,Pos,Rep1,late
Pos_Rep2,Pos,Rep2,late


## 2. DESeq2

In [12]:
# Build a DESeq2 dataset from the full count matrix and sample sheet, with
# condition and replicate as design terms. This object is used below for
# normalization, the variance-stabilizing transformation and the PCA plots.
dds <- DESeqDataSetFromMatrix(
  countData = cts,
  colData = cols,
  design = ~ condition + rep
)

In [13]:
# Estimate per-sample size factors for the full dataset.
dds <- estimateSizeFactors(object = dds)

In [14]:
# Extract the size-factor-normalized counts and preview the first rows.
norm_cts <- counts(object = dds, normalized = TRUE)
head(norm_cts)

Unnamed: 0,DZ_Rep1,DZ_Rep2,Pos_Rep1,Pos_Rep2,Neg_Rep1,Neg_Rep2
TTATCTGCATACTTATACAG,704.2718,586.4672,563.0922,561.5087,0.0,541.6455
TATGCTTTATCCACTCCGAC,982.8848,1110.3288,1072.3819,979.9664,653.5782,358.6851
TGAAAGCATAGGATATGGCA,1798.4084,1876.3267,1552.9362,1423.8257,1713.1671,1037.8093
AATGGTAAGTTGCTGCTCAG,929.6775,851.6203,1260.0794,1013.3896,1456.6872,1295.1943
TAAGCATGCGTGGTATCAAG,2183.4361,1273.2875,1563.9413,1231.3085,1237.8375,0.0
GCTGGGAGGCGCTGGACGGC,1039.9618,1014.579,896.301,1132.376,907.0873,686.3599


In [15]:
# Apply the variance-stabilizing transformation (with blind = FALSE) and
# preview the transformed values.
vsd <- vst(object = dds, blind = FALSE)
head(assay(vsd))

-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



Unnamed: 0,DZ_Rep1,DZ_Rep2,Pos_Rep1,Pos_Rep2,Neg_Rep1,Neg_Rep2
TTATCTGCATACTTATACAG,10.28074,10.20942,10.19445,10.19342,8.179329,10.180377
TATGCTTTATCCACTCCGAC,10.43807,10.51012,10.4885,10.43643,10.250736,10.038638
TGAAAGCATAGGATATGGCA,10.9178,10.96297,10.77196,10.69434,10.867699,10.468938
AATGGTAAGTTGCTGCTCAG,10.40833,10.3647,10.59696,10.45519,10.714068,10.617665
TAAGCATGCGTGGTATCAAG,11.13464,10.60473,10.77857,10.58009,10.583908,8.179329
GCTGGGAGGCGCTGGACGGC,10.47015,10.45585,10.38969,10.52275,10.395718,10.270231


In [16]:
# PCA of the samples on the top 100 sgRNAs, before batch correction
# (left panel of Fig S5E).
g1 <- plotPCA(vsd, intgroup = c("condition", "rep"), ntop = 100)

In [17]:
# Remove the replicate (batch) effect from the variance-stabilized values and
# store the corrected matrix back in `vsd`.
mat <- limma::removeBatchEffect(assay(vsd), batch = vsd$rep)
assay(vsd) <- mat

In [18]:
# PCA of the samples on the top 100 sgRNAs, using the current contents of
# `vsd` (right panel of Fig S5E).
g2 <- plotPCA(vsd, intgroup = c("condition", "rep"), ntop = 100)

In [19]:
# Convert both PCA plots to grobs so they can be drawn side by side.
g1grob <- ggplotGrob(x = g1)
g2grob <- ggplotGrob(x = g2)

In [20]:
# Write Fig S5E: the two PCA panels placed side by side in a single PDF, with
# the panel sizes taken from the first grob.
pdf(file = "FigS5E.pdf", width = 6, height = 4)
grid::grid.draw(cbind(g1grob, g2grob, size = "first"))
dev.off()

In [21]:
# Re-estimate size factors on the full dataset.
# NOTE(review): `dds` already had size factors estimated in an earlier cell and
# is rebuilt from `fc_cts` in the next cell, so this call looks redundant.
dds <- estimateSizeFactors(object = dds)

In [22]:
# Rebuild `dds` from the Neg/Pos samples only, with replicate and condition as
# design terms (condition listed last).
dds <- DESeqDataSetFromMatrix(
  countData = fc_cts,
  colData = fc_cols,
  design = ~ rep + condition
)

factor levels were dropped which had no samples



In [23]:
# Run the DESeq2 pipeline on the Neg/Pos dataset with the beta prior enabled.
dds <- DESeq(object = dds, betaPrior = TRUE)

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.

final dispersion estimates

fitting model and testing



In [24]:
# Extract the results table (reported as "condition Pos vs Neg"), adding the
# lfcMLE column alongside the default log2 fold change.
res <- results(object = dds, addMLE = TRUE)
head(res)

log2 fold change (MAP): condition Pos vs Neg 
Wald test p-value: condition Pos vs Neg 
DataFrame with 6 rows and 7 columns
                      baseMean log2FoldChange    lfcMLE     lfcSE      stat
                     <numeric>      <numeric> <numeric> <numeric> <numeric>
TTATCTGCATACTTATACAG   412.256       0.993958  5.664040  1.856036  0.535528
TATGCTTTATCCACTCCGAC   758.031       1.017933  1.070549  0.800159  1.272163
TGAAAGCATAGGATATGGCA  1418.357       0.142474  0.145629  0.496143  0.287163
AATGGTAAGTTGCTGCTCAG  1245.174      -0.287383 -0.292510  0.495230 -0.580303
TAAGCATGCGTGGTATCAAG   997.275       1.401174  6.039779  1.736270  0.807002
GCTGGGAGGCGCTGGACGGC   896.654       0.328608  0.341134  0.707654  0.464362
                        pvalue      padj
                     <numeric> <numeric>
TTATCTGCATACTTATACAG  0.592285  0.891095
TATGCTTTATCCACTCCGAC  0.203315  0.609879
TGAAAGCATAGGATATGGCA  0.773988  0.951552
AATGGTAAGTTGCTGCTCAG  0.561711  0.878275
TAAGCATGCGTGGTATCAAG  0

## 3. write results

In [25]:
# Save the Pos vs Neg results as a tab-separated file without quoting.
write.table(
  x = res,
  file = "../../../data/02__screen/01__normalized_counts/l2fcs.DESeq2.with_batch.txt",
  quote = FALSE,
  sep = "\t"
)

## 4. also calculate l2fc from day zero --> late time points

In [26]:
# Build a DESeq2 dataset from the day-zero (DZ) and Pos samples, with replicate
# and condition as design terms (condition listed last).
dds_dz <- DESeqDataSetFromMatrix(
  countData = fc_dz_cts,
  colData = fc_dz_cols,
  design = ~ rep + condition
)

factor levels were dropped which had no samples



In [27]:
# Estimate size factors for the DZ/Pos dataset; the DESeq() call in the next
# cell reports that it uses these pre-existing size factors.
dds_dz <- estimateSizeFactors(object = dds_dz)

In [28]:
# Run the DESeq2 pipeline on the DZ/Pos dataset with the beta prior enabled.
dds_dz <- DESeq(object = dds_dz, betaPrior = TRUE)

using pre-existing size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.

final dispersion estimates

fitting model and testing



In [29]:
# Extract the results table (reported as "condition Pos vs DZ"), adding the
# lfcMLE column alongside the default log2 fold change.
res_dz <- results(object = dds_dz, addMLE = TRUE)
head(res_dz)

log2 fold change (MAP): condition Pos vs DZ 
Wald test p-value: condition Pos vs DZ 
DataFrame with 6 rows and 7 columns
                      baseMean log2FoldChange      lfcMLE     lfcSE       stat
                     <numeric>      <numeric>   <numeric> <numeric>  <numeric>
TTATCTGCATACTTATACAG   649.939    -0.11821849 -0.16087762  0.214223 -0.5518478
TATGCTTTATCCACTCCGAC  1116.347     0.00537320  0.00676402  0.188102  0.0285654
TGAAAGCATAGGATATGGCA  1789.186    -0.23757230 -0.27115108  0.159001 -1.4941597
AATGGTAAGTTGCTGCTCAG  1093.491     0.30986277  0.37866970  0.187126  1.6559021
TAAGCATGCGTGGTATCAAG  1681.481    -0.20311175 -0.23460636  0.170238 -1.1931073
GCTGGGAGGCGCTGGACGGC  1099.573     0.00418771  0.00421272  0.190324  0.0220030
                        pvalue      padj
                     <numeric> <numeric>
TTATCTGCATACTTATACAG 0.5810526  0.849839
TATGCTTTATCCACTCCGAC 0.9772112  0.994167
TGAAAGCATAGGATATGGCA 0.1351339  0.496056
AATGGTAAGTTGCTGCTCAG 0.0977417  0.431911
T

## 5. write results day zero --> late

In [30]:
# Save the Pos vs DZ results as a tab-separated file without quoting.
write.table(
  x = res_dz,
  file = "../../../data/02__screen/01__normalized_counts/l2fcs_DZ.DESeq2.with_batch.txt",
  quote = FALSE,
  sep = "\t"
)