Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
295 lines (218 sloc) 6.23 KB
% VARIANT-CALL
%
% a variant calling workflow
%
%
% Copyright:: 2015-2018 Jörgen Brandt
%
% Licensed under the Apache License, Version 2.0 (the "License");
% you may not use this file except in compliance with the License.
% You may obtain a copy of the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS,
% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
% See the License for the specific language governing permissions and
% limitations under the License.
%
%
% Sample data can be obtained from:
% ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG02025/sequence_read/
%
% The HG38 reference genome can be downloaded from
% http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/
%
% An Annovar HG38 database is expected to reside in
% /opt/data/annodb_hg38
%
% In addition to a Cuneiform interpreter the following tools need to be
% installed to run this analysis:
% - FastQC 0.11.4
% - Bowtie2 2.2.6
% - SAMtools 1.2
% - VarScan 2.3.9
% - Annovar
%
%--------------------------------------------------------------
%% ============================================================
%% Task definitions
%% ============================================================
def split( file : File ) -> <lst : [File]> in Bash *{
split -l 1280000 $file txt
lst=txt*
}*
def untar( tar : File ) ->
<lst : [File]>
in Bash *{
tar xf $tar
lst=`tar tf $tar`
}*
def gunzip( gz : File ) ->
<file : File>
in Bash *{
file=unzipped_${gz%.gz}
gzip -c -d $gz > $file
}*
def fastqc( fastq : File ) ->
<zip : File>
in Bash *{
fastqc -f fastq --noextract -o ./ $fastq
zip=`ls *.zip`
}*
def bowtie2-build( fa : File ) ->
<idx : File>
in Bash *{
bowtie2-build $fa bt2idx
idx=idx.tar
tar cf $idx --remove-files bt2idx.*
}*
def bowtie2-align( idx : File, fastq1 : File, fastq2 : File ) ->
<bam : File>
in Bash *{
tar xf $idx
bam=alignment.bam
bowtie2 -D 20 -R 3 -N 0 -L 20 -i S,1,0.50 --no-unal -x bt2idx \
-1 $fastq1 -2 $fastq2 -S - | samtools view -b - > $bam
rm bt2idx.*
}*
def samtools-faidx( fa : File ) ->
<fai : File>
in Bash *{
samtools faidx $fa
fai=$fa.fai
}*
def samtools-sort( bam : File ) ->
<sorted : File>
in Bash *{
sorted=sorted.bam
samtools sort -m 2G $bam -o $sorted
}*
def samtools-mpileup( bam : File, fa : File, fai : File ) ->
<mpileup : File>
in Bash *{
ln -sf $fai $fa.fai
mpileup=mpileup.csv
samtools mpileup -f $fa $bam > $mpileup
}*
def samtools-merge( bam-lst : [File] ) ->
<merged : File>
{
def samtools-merge( bams : [File] ) ->
<merged : File>
in Bash *{
merged=merged.bam
if [ ${#bams[@]} -eq "1" ]
then
merged=$bam
else
samtools merge -f $merged ${bams[@]}
fi
}*
if isnil bam-lst
then
error "Merge list must not be empty." : <merged : File>
else
samtools-merge( bams = bam-lst )
end
}
def varscan-snp( mpileup : File ) ->
<vcf : File>
in Bash *{
vcf=snp.vcf
varscan mpileup2snp $mpileup --output-vcf > $vcf
}*
def varscan-indel( mpileup : File ) ->
<vcf : File>
in Bash *{
vcf=indel.vcf
varscan mpileup2indel $mpileup --output-vcf > $vcf
}*
def annovar( vcfs : [File], db : File, vsn : Str ) ->
<fun : File, exonic : File>
in Bash *{
fun=table.variant_function
exonic=table.exonic_variant_function
tar xvf $db
cat ${vcfs[@]} | \
convert2annovar.pl -format vcf4 - | \
annotate_variation.pl -buildver $vsn -geneanno -outfile table - db
}*
%% ============================================================
%% Input data
%% ============================================================
let hg38-tar : File =
'hg38/hg38.tar';
let fastq1-lst-gz : [File] =
['kgenomes/SRR359188_1.filt.fastq.gz',
'kgenomes/SRR359195_1.filt.fastq.gz' : File];
let fastq2-lst-gz : [File] =
['kgenomes/SRR359188_2.filt.fastq.gz',
'kgenomes/SRR359195_2.filt.fastq.gz' : File];
let build-vsn : Str =
"hg38";
let db : File =
'annovar/hg38db.tar';
%% ============================================================
%% Workflow definition
%% ============================================================
let <lst = fa-lst : [File]> =
untar( tar = hg38-tar );
let fastq1-lst : [File] =
for gz : File <- fastq1-lst-gz do
( gunzip( gz = gz )|file ) : File
end;
let fastq2-lst : [File] =
for gz : File <- fastq2-lst-gz do
( gunzip( gz = gz )|file ) : File
end;
let qc-lst : [File] =
for fastq : File <- ( fastq1-lst+fastq2-lst ) do
( fastqc( fastq = fastq )|zip ) : File
end;
let mpileup-lst : [File] =
for fa : File <- fa-lst do
let <idx = idx : File> =
bowtie2-build( fa = fa );
let <fai = fai : File> =
samtools-faidx( fa = fa );
let sorted-lst : [File] =
for fastq1 : File <- fastq1-lst, fastq2 : File <- fastq2-lst do
let <lst = split-lst1 : [File]> = split( file = fastq1 );
let <lst = split-lst2 : [File]> = split( file = fastq2 );
let bam-lst : [File] =
for split1 : File <- split-lst1,
split2 : File <- split-lst2 do
let <bam = bam : File> =
bowtie2-align( idx = idx,
fastq1 = split1,
fastq2 = split2 );
( samtools-sort( bam = bam )|sorted ) : File
end;
( samtools-merge( bam-lst = bam-lst )|merged ) : File
end;
let <merged = merged : File> =
samtools-merge( bam-lst = sorted-lst );
( samtools-mpileup( bam = merged,
fa = fa,
fai = fai )|mpileup ) : File
end;
let snp-lst : [File] =
for mpileup : File <- mpileup-lst do
( varscan-snp( mpileup = mpileup )|vcf ) : File
end;
let indel-lst : [File] =
for mpileup : File <- mpileup-lst do
( varscan-indel( mpileup = mpileup )|vcf ) : File
end;
let <fun = fun : File, exonic = exonic : File> =
annovar( vcfs = ( snp-lst+indel-lst ),
db = db,
vsn = build-vsn );
%% ============================================================
%% Query
%% ============================================================
<fun = fun,
exonic = exonic,
qc-lst = qc-lst>;
You can’t perform that action at this time.