Permalink
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
259 lines (210 sloc) 6.84 KB
% ChIP-Seq
%
% a ChIP-Seq workflow
%
% Copyright:: 2015-2018 Jörgen Brandt
%
% Licensed under the Apache License, Version 2.0 (the "License");
% you may not use this file except in compliance with the License.
% You may obtain a copy of the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS,
% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
% See the License for the specific language governing permissions and
% limitations under the License.
%
% Input data:
%
% Escherichia coli reference genome
% ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz
%
% Tag sample
% ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByStudy/sra/SRP/SRP015/SRP015911/SRR576933/SRR576933.sra
%
% Control sample
% ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByStudy/sra/SRP/SRP015/SRP015911/SRR576938/SRR576938.sra
%
%
% Tools and versions:
%
% sra-toolkit 2.8.2-5
% FastQC 0.11.5
% Bowtie 1.2.2
% MACS 1.4.2
% deepTools 3.1.3
% bedtools 2.26.0
% SAMtools 1.7-1
%
% MACS
% https://github.com/taoliu/MACS/archive/v1.4.2.tar.gz
%
%--------------------------------------------------------------------
%%===================================================================
%% Function definitions
%%===================================================================
def gunzip( gz : File ) -> <file : File>
in Bash *{
file=unzipped_${gz%.gz}
gzip -c -d $gz > $file
}*
def fastq-dump( sra : File ) -> <fastq : File>
in Bash *{
fastq=$sra.fastq
fastq-dump -Z $sra > $fastq
}*
def fastqc( fastq : File ) -> <zip : File>
in Bash *{
fastqc -f fastq --noextract -o ./ $fastq
zip=`ls *.zip`
}*
def bowtie-build( fa : File ) -> <idx : File>
in Bash *{
idx=idx.tar
bowtie-build $fa idx
tar cf $idx idx.*
}*
def bowtie-align( idx : File, fastq : File ) -> <bam : File>
in Bash *{
bam=$fastq.bam
tar xf $idx
bowtie -S idx $fastq | samtools view -b - > $bam
}*
def macs( tag : File, ctl : File ) ->
<peaks : File,
summits : File,
xls : [File],
tag_bed : File,
ctl_bed : File>
in Bash *{
macs14 -t $tag \
-c $ctl \
--format BAM \
--gsize 4639675 \
--name "macs14" \
--bw 400 \
--keep-dup 1 \
--bdg \
--single-profile \
--diag
peaks=macs14_peaks.bed
summits=macs14_summits.bed
xls=(macs14_diag.xls macs14_negative_peaks.xls)
tag_bed=macs14_MACS_bedGraph/treat/macs14_treat_afterfiting_all.bdg.gz
ctl_bed=macs14_MACS_bedGraph/control/macs14_control_afterfiting_all.bdg.gz
}*
def samtools-sort( bam : File ) -> <sorted : File>
in Bash *{
sorted=sorted.bam
samtools sort -m 2G $bam -o $sorted
}*
def samtools-dedup( bam : File ) -> <dedup : File>
in Bash *{
dedup=dedup.bam
samtools rmdup -s $bam $dedup
}*
def samtools-index( bam : File ) -> <bai : File>
in Bash *{
bai=$bam.bai
samtools index $bam
}*
def samtools-faidx( fa : File ) -> <fai : File>
in Bash *{
samtools faidx $fa
fai=$fa.fai
}*
def bamcoverage( bam : File, bai : File ) -> <bedgraph : File>
in Bash *{
bedgraph=$bam.bedgraph
ln -sf $bai $bam.bai
bamCoverage --bam $bam \
--outFileName $bedgraph \
--outFileFormat bedgraph
}*
def restrict-peaks( bed : File ) -> <restricted : File>
in Bash *{
restricted=$bed.100.bed
perl -lane '$start=$F[1]+100; $end = $F[2]-100 ; print "$F[0]\t$start\t$end"' \
$bed > $restricted
}*
def bedtools-getfasta( fa : File, fai : File, bed : File ) ->
<bed_fa : File>
in Bash *{
bed_fa=$bed.fa
ln -sf $fai $fa.fai
bedtools getfasta -fi $fa -bed $bed -fo $bed_fa
}*
%%===================================================================
%% Input data
%%===================================================================
let ecoli-fa-gz : File = 'GCF_000005845.2_ASM584v2_genomic.fna.gz';
let tag-sra : File = 'SRR576933.sra';
let ctl-sra : File = 'SRR576938.sra';
%%===================================================================
%% Workflow definition
%%===================================================================
%% Data preprocessing -----------------------------------------------
% decompress E.coli reference genome
let <file = ecoli-fa : File> = gunzip( gz = ecoli-fa-gz );
% convert tag and control samples to FastQ
let <fastq = tag-fastq : File> = fastq-dump( sra = tag-sra );
let <fastq = ctl-fastq : File> = fastq-dump( sra = ctl-sra );
% quality control
let <zip = tag-qc : File> = fastqc( fastq = tag-fastq );
let <zip = ctl-qc : File> = fastqc( fastq = ctl-fastq );
% index reference genome
let <idx = ecoli-idx : File> = bowtie-build( fa = ecoli-fa );
% read mapping
let <bam = tag-bam : File> =
bowtie-align( idx = ecoli-idx,
fastq = tag-fastq );
let <bam = ctl-bam : File> =
bowtie-align( idx = ecoli-idx,
fastq = ctl-fastq );
%% Peak calling with deepTools --------------------------------------
% sort
let <sorted = tag-sorted-bam : File> = samtools-sort( bam = tag-bam );
let <sorted = ctl-sorted-bam : File> = samtools-sort( bam = ctl-bam );
% deduplicate
let <dedup = tag-dedup-bam : File> =
samtools-dedup( bam = tag-sorted-bam );
let <dedup = ctl-dedup-bam : File> =
samtools-dedup( bam = ctl-sorted-bam );
% index alignments
let <bai = tag-dedup-bai : File> =
samtools-index( bam = tag-dedup-bam );
let <bai = ctl-dedup-bai : File> =
samtools-index( bam = ctl-dedup-bam );
% call peaks with deepTools
let <bedgraph = tag-deeptools-bedgraph : File> =
bamcoverage( bam = tag-dedup-bam,
bai = tag-dedup-bai );
let <bedgraph = ctl-deeptools-bedgraph : File> =
bamcoverage( bam = ctl-dedup-bam,
bai = ctl-dedup-bai );
%% Peak calling with ḾACS ------------------------------------------
% call peaks with MACS
let <peaks = peaks-macs-bed : File> =
macs( tag = tag-bam,
ctl = ctl-bam );
% restrict peaks
let <restricted = restricted-macs-bed : File> =
restrict-peaks( bed = peaks-macs-bed );
% extract sequence tag from reference genome
let <fai = ecoli-fai : File> = samtools-faidx( fa = ecoli-fa );
% extract peak DNA sequence
let <bed_fa = peaks-macs-fa : File> =
bedtools-getfasta( fa = ecoli-fa,
fai = ecoli-fai,
bed = restricted-macs-bed );
%%===================================================================
%% Query
%%===================================================================
<tag-qc = tag-qc, % quality control
ctl-qc = ctl-qc,
tag-deeptools-bedgraph = tag-deeptools-bedgraph, % deepTools results
ctl-deeptools-bedgraph = ctl-deeptools-bedgraph,
restricted-macs-bed = restricted-macs-bed, % MACS results
peaks-macs-fa = peaks-macs-fa>;