Create analysis

iakerman · Oct 15, 2020 · e26295c · e26295c
commit e26295c
Showing 1 changed file with 137 additions and 0 deletions.
diff --git a/analysis b/analysis
@@ -0,0 +1,137 @@
+# The following pipeline is written by Dr. Ildem Akerman (IMSR, University of Birmingham) for the analysis of Lexogen 5' Quantseq RNA-seq data generated by the University of Birmingham Genomics Facility 
+# Quantseq Library kits information : https://www.lexogen.com/quantseq-3mrna-sequencing/ 
+# Pipeline modified from PMID: 28041957  PMCID: PMC5300904  DOI: 10.1016/j.cmet.2016.11.016 Akerman Et al 2017 
+
+# please note that the true directory names have been removed and replaced with the words "folder or directory" - please insert your own directory names. 
+# Any special parameters used for specific pipelines are indicated in the article's materials and methods section. 
+# Please reach me from University of Birmingham e-mail address for questions or to report a fault; https://www.birmingham.ac.uk/staff/profiles/metabolism-systems/akerman-ildem.aspx
+
+#############################################################################################
+# TRIMMING adapters, low quality and polyA tails READS and fastQC to test if it makes a difference 
+#############################################################################################
+# Quality controls were performed using fastQC and reads are trimmed using Trimmomatic and/or bbduk 
+#!/bin/bash
+for file in *.fastq
+do
+java -jar ${TMATIC_ROOT}/trimmomatic-0.32.jar SE -trimlog Triming_LOG_"${file}" ${file} tt_"${file}" ILLUMINACLIP:TruSeq3-SE:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
+done
+echo "trimmomatic done"
+###### ######################
+#Alternatively bbduk script from BBMAP suite 
+#cd /bbmap/
+#for i in *.fastq
+#do 
+#cat $i | bbduk.sh in=${i} out=/trimmo_bbdukpA/"${i}" ref=sourcefolderhere/polyA.fa.gz k=13 ktrim=r useshortkmers=t mink=5 qtrim=r trimq=10 minlength=20
+#done
+########################
+
+
+#############################################################################################
+# Align to human or mouse genome (example mouse) 
+#############################################################################################
+#!/bin/bash
+# Remember to create an index directory for STAR (/genomeDirectory/STARmm10index)
+STAR --runThreadN 6 --runMode genomeGenerate --genomeDir genomeDirectory/STARmm10index --genomeFastaFiles /folder/mm10.fa --sjdbGTFfile folder/gencode.vM20.annotation.gtf --sjdbOverhang 74
+
+# Now align to the mouse (or human) genome
+for file in *.fastq
+do
+STAR --runThreadN 8 --runMode alignReads --genomeDir /genomeDirectory/STARmm10index --readFilesIn ${file} --outFileNamePrefix mm10_"${file}" --outFilterScoreMinOverLread 0.3 --outFilterMatchNminOverLread 0.3 --outFilterMultimapNmax 20 --outReadsUnmapped Fastx_failed --outSAMtype BAM SortedByCoordinate  
+done
+
+#############################################################################################
+# HT-seq count 
+#############################################################################################
+# Count the number of reads taht fall on each gene (gene annotation indicated below for mouse)
+#!/bin/bash
+for file in *.bam
+do
+htseq-count -m intersection-nonempty -s yes -f bam -r pos ${file} directory/gencode.vM20.annotation.gtf > "${file}"_HTseq
+done
+
+# Please note that -s is used for stranded libraries ONLY. 
+
+
+#############################################################################################
+# DeSeq2 in R (not bash)  Example analysis for Nasteska et al (David Hodson Lab) in R Studio / R 
+#############################################################################################
+
+# Get the gene name conversion from source - for MOUSE (similar for human). This makes life easier as one can see gene symbols, rather than gencode names. 
+#source("https://bioconductor.org/biocLite.R")
+#biocLite("EnsDb.Hsapiens.v92")
+source("https://bioconductor.org/biocLite.R")
+biocLite("ensembldb")
+library('biomaRt')
+#### convert genes 
+#get the gene names from gencode mouse file
+x <- read.table("U:AkermanFolder/HTseq_COUNTS_GencodevM20_mm10.txt", header = FALSE)
+x <- x[,1]
+foo <- data.frame(do.call('rbind', strsplit(as.character(x),'.',fixed=TRUE)))
+genes <- foo[,1]
+############
+mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl"))
+G_list <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","mgi_symbol","entrezgene", "description"),values=genes,mart= mart)
+###
+x <- read.table("U:AkermanFolder/DH_ttbb_HTseq_COUNTS_GencodevM20_mm10.txt", header = FALSE)
+x <- x[,1]
+merged <- cbind(x,foo)
+colnames(merged) <- c("ENSG.version","ensembl_gene_id","version") 
+x <- merge(merged,G_list, by = "ensembl_gene_id")
+setwd("U:AkermanFolder")
+write.table(x, file = "GencodeMousev20_conversion_table.txt", sep = "\t", col.names = T, row.names = F, qmethod = "double", quote=FALSE)
+
+############################## deseq2 ###############################
+library(DESeq2)
+
+sampleTable <- read.table("U:AkermanFolder/sample_table.txt", header = TRUE)
+sampleTable <- data.frame(sampleTable)
+directory<-'U:AkermanFolder/HT-seq'
+
+### create the deseq object  
+ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable=sampleTable, directory=directory, design= ~ individual + condition)
+colData(ddsHTSeq)$condition<-factor(colData(ddsHTSeq)$condition, levels=c('M3C','WT')) # CTL vs treatment first 
+
+
+#run deseq2
+############## cut off low expression (at least 3 samples 10 or above) 
+dds<-DESeq(ddsHTSeq)
+dds <- estimateSizeFactors(dds)
+idx <- rowSums( counts(dds, normalized=TRUE) >= 10 ) >= 3
+#This would say, e.g. filter out genes where there are less than 3 samples with normalized counts greater than or equal to 5.
+dds <- dds[idx,]
+dds <- DESeq(dds)
+res  <- results(dds, contrast=c("condition","WT","M3C"))
+
+#write.table(as.data.frame(res),file="HodsonMelton3_WTvM3C_paired_DEseq2.txt")
+## get baseMean for each condition 
+BM <- sapply( levels(dds$condition), function(lvl) rowMeans( counts(dds,normalized=TRUE)[,dds$condition == lvl] ) )
+BM <- as.data.frame(BM)
+BM$ENSG.version <- row.names(BM)
+############ with gene name 
+
+res <- as.data.frame(res)
+rn <- row.names(res)
+res$ENSG.version <- rn
+DH <- merge(res, x, by = "ENSG.version", all.x = TRUE)
+DH <- merge(DH, BM, by = "ENSG.version", all.x = TRUE)
+DH <- DH[,c(1,14,13,2:12)]
+write.table(DH, file = "DH4b_WTvM3_1-3-2019_DESeq2cutoff10.txt", sep = "\t", col.names = T, row.names = F, qmethod = "double", quote=FALSE)
+
+
+
+
+# Please find an example sample table below (text file for DeSeq2) 
+sampleName	fileName	condition	individual
+A1_M3C-3	mm10_4c_50pttpe_A1-Exp-3-L16-M2-M3C_S3.fastqAligned.sortedByCoord.out.bam_HTseq	M3C	exp3
+B2_WT-1	mm10_4c_50pttpe_B2-Exp1-L14-WT-3_S2.fastqAligned.sortedByCoord.out.bam_HTseq	WT	exp1
+B2_WT-6	mm10_4c_50pttpe_B2-Exp-6-L24-M3-WT_S9.fastqAligned.sortedByCoord.out.bam_HTseq	WT	exp6
+C1_WT-3	mm10_4c_50pttpe_C1-Exp-3-L16-M5-WT_S4.fastqAligned.sortedByCoord.out.bam_HTseq	WT	exp3
+C1_M3C-5	mm10_4c_50pttpe_C1-Exp-5-L22-M1-M3C_S7.fastqAligned.sortedByCoord.out.bam_HTseq	M3C	exp5
+C2_M3C-6	mm10_4c_50pttpe_C2-Exp-6-L24-M4-M3C_S1.fastqAligned.sortedByCoord.out.bam_HTseq	M3C	exp6
+D1_WT-5	mm10_4c_50pttpe_D1-Exp-5-L22-M2-WT_S8.fastqAligned.sortedByCoord.out.bam_HTseq	WT	exp5
+E1_WT-3	mm10_4c_50pttpe_E1-Exp-3-L16-M7-WT_S5.fastqAligned.sortedByCoord.out.bam_HTseq	WT	exp3
+F1_M3C-1	mm10_4c_50pttpe_F1-Exp-1-L14-M3C-1_S1.fastqAligned.sortedByCoord.out.bam_HTseq	M3C	exp1
+F1_M3C-3	mm10_4c_50pttpe_F1-Exp-3-L16-M8-M3C_S6.fastqAligned.sortedByCoord.out.bam_HTseq	M3C	exp3
+
+
+