diff --git a/analysis b/analysis new file mode 100644 index 0000000..7b6fabb --- /dev/null +++ b/analysis @@ -0,0 +1,137 @@ +# The following pipeline is written by Dr. Ildem Akerman (IMSR, University of Birmingham) for the analysis of Lexogen 5' Quantseq RNA-seq data generated by the University of Birmingham Genomics Facility +# Quantseq Library kits information : https://www.lexogen.com/quantseq-3mrna-sequencing/ +# Pipeline modified from PMID: 28041957 PMCID: PMC5300904 DOI: 10.1016/j.cmet.2016.11.016 Akerman Et al 2017 + +# please note that the true directory names have been removed and replaced with the words "folder or directory" - please insert your own directory names. +# Any special parameters used for specific pipelines are indicated in the article's materials and methods section. +# Please reach me from University of Birmingham e-mail address for questions or to report a fault; https://www.birmingham.ac.uk/staff/profiles/metabolism-systems/akerman-ildem.aspx + +############################################################################################# +# TRIMMING adapters, low quality and polyA tails READS and fastQC to test if it makes a difference +############################################################################################# +# Quality controls were performed using fastQC and reads are trimmed using Trimmomatic and/or bbduk +#!/bin/bash +for file in *.fastq +do +java -jar ${TMATIC_ROOT}/trimmomatic-0.32.jar SE -trimlog Triming_LOG_"${file}" ${file} tt_"${file}" ILLUMINACLIP:TruSeq3-SE:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 +done +echo "trimmomatic done" +###### ###################### +#Alternatively bbduk script from BBMAP suite +#cd /bbmap/ +#for i in *.fastq +#do +#cat $i | bbduk.sh in=${i} out=/trimmo_bbdukpA/"${i}" ref=sourcefolderhere/polyA.fa.gz k=13 ktrim=r useshortkmers=t mink=5 qtrim=r trimq=10 minlength=20 +#done +######################## + + +############################################################################################# +# Align to human or mouse genome (example mouse) +############################################################################################# +#!/bin/bash +# Remember to create an index directory for STAR (/genomeDirectory/STARmm10index) +STAR --runThreadN 6 --runMode genomeGenerate --genomeDir genomeDirectory/STARmm10index --genomeFastaFiles /folder/mm10.fa --sjdbGTFfile folder/gencode.vM20.annotation.gtf --sjdbOverhang 74 + +# Now align to the mouse (or human) genome +for file in *.fastq +do +STAR --runThreadN 8 --runMode alignReads --genomeDir /genomeDirectory/STARmm10index --readFilesIn ${file} --outFileNamePrefix mm10_"${file}" --outFilterScoreMinOverLread 0.3 --outFilterMatchNminOverLread 0.3 --outFilterMultimapNmax 20 --outReadsUnmapped Fastx_failed --outSAMtype BAM SortedByCoordinate +done + +############################################################################################# +# HT-seq count +############################################################################################# +# Count the number of reads taht fall on each gene (gene annotation indicated below for mouse) +#!/bin/bash +for file in *.bam +do +htseq-count -m intersection-nonempty -s yes -f bam -r pos ${file} directory/gencode.vM20.annotation.gtf > "${file}"_HTseq +done + +# Please note that -s is used for stranded libraries ONLY. + + +############################################################################################# +# DeSeq2 in R (not bash) Example analysis for Nasteska et al (David Hodson Lab) in R Studio / R +############################################################################################# + +# Get the gene name conversion from source - for MOUSE (similar for human). This makes life easier as one can see gene symbols, rather than gencode names. +#source("https://bioconductor.org/biocLite.R") +#biocLite("EnsDb.Hsapiens.v92") +source("https://bioconductor.org/biocLite.R") +biocLite("ensembldb") +library('biomaRt') +#### convert genes +#get the gene names from gencode mouse file +x <- read.table("U:AkermanFolder/HTseq_COUNTS_GencodevM20_mm10.txt", header = FALSE) +x <- x[,1] +foo <- data.frame(do.call('rbind', strsplit(as.character(x),'.',fixed=TRUE))) +genes <- foo[,1] +############ +mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl")) +G_list <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","mgi_symbol","entrezgene", "description"),values=genes,mart= mart) +### +x <- read.table("U:AkermanFolder/DH_ttbb_HTseq_COUNTS_GencodevM20_mm10.txt", header = FALSE) +x <- x[,1] +merged <- cbind(x,foo) +colnames(merged) <- c("ENSG.version","ensembl_gene_id","version") +x <- merge(merged,G_list, by = "ensembl_gene_id") +setwd("U:AkermanFolder") +write.table(x, file = "GencodeMousev20_conversion_table.txt", sep = "\t", col.names = T, row.names = F, qmethod = "double", quote=FALSE) + +############################## deseq2 ############################### +library(DESeq2) + +sampleTable <- read.table("U:AkermanFolder/sample_table.txt", header = TRUE) +sampleTable <- data.frame(sampleTable) +directory<-'U:AkermanFolder/HT-seq' + +### create the deseq object +ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable=sampleTable, directory=directory, design= ~ individual + condition) +colData(ddsHTSeq)$condition<-factor(colData(ddsHTSeq)$condition, levels=c('M3C','WT')) # CTL vs treatment first + + +#run deseq2 +############## cut off low expression (at least 3 samples 10 or above) +dds<-DESeq(ddsHTSeq) +dds <- estimateSizeFactors(dds) +idx <- rowSums( counts(dds, normalized=TRUE) >= 10 ) >= 3 +#This would say, e.g. filter out genes where there are less than 3 samples with normalized counts greater than or equal to 5. +dds <- dds[idx,] +dds <- DESeq(dds) +res <- results(dds, contrast=c("condition","WT","M3C")) + +#write.table(as.data.frame(res),file="HodsonMelton3_WTvM3C_paired_DEseq2.txt") +## get baseMean for each condition +BM <- sapply( levels(dds$condition), function(lvl) rowMeans( counts(dds,normalized=TRUE)[,dds$condition == lvl] ) ) +BM <- as.data.frame(BM) +BM$ENSG.version <- row.names(BM) +############ with gene name + +res <- as.data.frame(res) +rn <- row.names(res) +res$ENSG.version <- rn +DH <- merge(res, x, by = "ENSG.version", all.x = TRUE) +DH <- merge(DH, BM, by = "ENSG.version", all.x = TRUE) +DH <- DH[,c(1,14,13,2:12)] +write.table(DH, file = "DH4b_WTvM3_1-3-2019_DESeq2cutoff10.txt", sep = "\t", col.names = T, row.names = F, qmethod = "double", quote=FALSE) + + + + +# Please find an example sample table below (text file for DeSeq2) +sampleName fileName condition individual +A1_M3C-3 mm10_4c_50pttpe_A1-Exp-3-L16-M2-M3C_S3.fastqAligned.sortedByCoord.out.bam_HTseq M3C exp3 +B2_WT-1 mm10_4c_50pttpe_B2-Exp1-L14-WT-3_S2.fastqAligned.sortedByCoord.out.bam_HTseq WT exp1 +B2_WT-6 mm10_4c_50pttpe_B2-Exp-6-L24-M3-WT_S9.fastqAligned.sortedByCoord.out.bam_HTseq WT exp6 +C1_WT-3 mm10_4c_50pttpe_C1-Exp-3-L16-M5-WT_S4.fastqAligned.sortedByCoord.out.bam_HTseq WT exp3 +C1_M3C-5 mm10_4c_50pttpe_C1-Exp-5-L22-M1-M3C_S7.fastqAligned.sortedByCoord.out.bam_HTseq M3C exp5 +C2_M3C-6 mm10_4c_50pttpe_C2-Exp-6-L24-M4-M3C_S1.fastqAligned.sortedByCoord.out.bam_HTseq M3C exp6 +D1_WT-5 mm10_4c_50pttpe_D1-Exp-5-L22-M2-WT_S8.fastqAligned.sortedByCoord.out.bam_HTseq WT exp5 +E1_WT-3 mm10_4c_50pttpe_E1-Exp-3-L16-M7-WT_S5.fastqAligned.sortedByCoord.out.bam_HTseq WT exp3 +F1_M3C-1 mm10_4c_50pttpe_F1-Exp-1-L14-M3C-1_S1.fastqAligned.sortedByCoord.out.bam_HTseq M3C exp1 +F1_M3C-3 mm10_4c_50pttpe_F1-Exp-3-L16-M8-M3C_S6.fastqAligned.sortedByCoord.out.bam_HTseq M3C exp3 + + +