Skip to content

Commit

Permalink
Implement #17 (optionnal primer clipping for Illumina with cutadapt).
Browse files Browse the repository at this point in the history
  • Loading branch information
lentendu committed Mar 8, 2018
1 parent d330e74 commit 5c1f372
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 48 deletions.
1 change: 1 addition & 0 deletions bin/variables_definition_Illumina.tsv
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
CLIPPING no Primers at 5´ end of Illumina reads no yes
PALG simple_bayesian PandaSeq algorithm simple_bayesian ea_util flash pear rdp_mle stitch uparse
PTRESH 0.6 PandaSeq threshold ^[0-9]+(.[0-9]+)?$
MIN_OV 1 Minimum overlap ^[0-9]+$
64 changes: 27 additions & 37 deletions src/Illumina_fastq.step
Original file line number Diff line number Diff line change
Expand Up @@ -7,46 +7,36 @@ module load DeltaMP/${VERSION[DELTAMP]}

# Library to analyse
read LIB_NAME FWD_SUF RVS_SUF <<<`sed -n ${ARRAY_TASK}'p' config/lib2.list`
FWD_LIB=${LIB_NAME}${FWD_SUF%.*}
RVS_LIB=${LIB_NAME}${RVS_SUF%.*}
FWD_LIB=${LIB_NAME}${FWD_SUF/.$RAW_EXT.*/}
RVS_LIB=${LIB_NAME}${RVS_SUF/.$RAW_EXT.*/}
cd libraries

# Extract reads with valid forward and/or reverse primer
for i in FWD_REGEX RVS_REGEX
do
j=${i%_*}
declare $i=${!j}
while read A B
do
declare $i="`echo ${!i} | sed "s/$A/$B/g"`"
done < <(awk '{print toupper($0)}' $BIN/IUPAC.txt)
done

FWDSIM=`awk -v F=${#FWD} -v D=$PDIFFS 'BEGIN{printf "%.2g\n", 1-D/F}'`
RVSSIM=`awk -v R=${#RVS} -v D=$PDIFFS 'BEGIN{printf "%.2g\n", 1-D/R}'`
sumaclust -t $FWDSIM -e -O $FWD_LIB.suma.map <(obicut -b 1 -e ${#FWD} --fasta-output $FWD_LIB.fastq | sed 's/_SUB sub.*$//;/^>/!s/n/a/g') | obigrep -a cluster_center:True | obisort -k cluster_weight -r | obiannotate -C > $FWD_LIB.suma.fasta
sumaclust -t $RVSSIM -e -O $RVS_LIB.suma.map <(obicut -b 1 -e ${#RVS} --fasta-output $RVS_LIB.fastq | sed 's/_SUB sub.*$//;/^>/!s/n/a/g') | obigrep -a cluster_center:True | obisort -k cluster_weight -r | obiannotate -C > $RVS_LIB.suma.fasta
cat <(grep "$(grep -m 1 -B 1 -i $FWD_REGEX $FWD_LIB.fastq | awk 'NR==1{sub("@","");print $1}')" $FWD_LIB.suma.map) <(grep "$(grep -m 1 -B 1 -i $RVS_REGEX $RVS_LIB.fastq | awk 'NR==1{sub("@","");print $1}')" $RVS_LIB.suma.map) | cut -f 2- | tr "\t" "\n" | sort -u > $LIB_NAME.primer.accnos
obigrep --uppercase --id-list=$LIB_NAME.primer.accnos $FWD_LIB.fastq | sed 's/ / /' > fastq/$LIB_NAME.$FWD_NAME.fastq
obigrep --uppercase --id-list=$LIB_NAME.primer.accnos $RVS_LIB.fastq | sed 's/ / /' > fastq/$LIB_NAME.$RVS_NAME.fastq
# better use cutadapt

# Convert to fasta + qual and raw reads stat
# Primer clipping
if [ $CLIPPING == "yes" ]
then
FWDDIS=`awk -v F=${#FWD} -v D=$PDIFFS 'BEGIN{printf "%.2g\n", D/F}'`
RVSDIS=`awk -v R=${#RVS} -v D=$PDIFFS 'BEGIN{printf "%.2g\n", D/R}'`
# cutadapt
cutadapt -g $FWD -O ${#FWD} -e $FWDDIS --no-indels --trimmed-only -o fastq/$LIB_NAME.fwd.fastq --info-file=raw_stat/$FWD_LIB.cutadapt $FWD_LIB.fastq > fastq/log.cutadapt.$LIB_NAME.txt
cutadapt -g $RVS -O ${#RVS} -e $RVSDIS --no-indels --trimmed-only -o fastq/$LIB_NAME.rvs.fastq --info-file=raw_stat/$RVS_LIB.cutadapt $RVS_LIB.fastq >> fastq/log.cutadapt.$LIB_NAME.txt
# primer logo
weblogo -c classic -s large -t "$LIB_NAME: primer $FWD_NAME" < <(awk 'BEGIN{FS="\t"} NF==11{print ">"$1"\n"$6}' raw_stat/$FWD_LIB.cutadapt) > raw_stat/weblogo.$LIB_NAME.forward.eps
weblogo -c classic -s large -t "$LIB_NAME: primer $RVS_NAME" < <(awk 'BEGIN{FS="\t"} NF==11{print ">"$1"\n"$6}' raw_stat/$RVS_LIB.cutadapt) > raw_stat/weblogo.$LIB_NAME.reverse.eps
else
ln -s $PWD/$FWD_LIB.fastq $PWD/fastq/$LIB_NAME.fwd.fastq
ln -s $PWD/$RVS_LIB.fastq $PWD/fastq/$LIB_NAME.rvs.fastq
fi

# Convert to fasta + qual for raw reads stat
mothur "#set.dir(input=fastq, output=fasta);
fastq.info(fastq=$LIB_NAME.$FWD_NAME.fastq);
fastq.info(fastq=$LIB_NAME.$RVS_NAME.fastq)"
awk '$0!~"^>"{sum=0;for(i=1;i<=NF;i++){sum+=$i};print int(sum/NF)}' fasta/$LIB_NAME.$FWD_NAME.qual > raw_stat/$LIB_NAME.fwd.meanqual
awk '$0!~"^>"{for(i=1;i<=NF;i++){sum[i]+=$i;nb[i]+=1}}END{for(i=1;sum[i];i++){print int(sum[i]/nb[i])}}' fasta/$LIB_NAME.$FWD_NAME.qual > raw_stat/$LIB_NAME.fwd.meanposqual
awk '$0!~"^>"{sum=0;for(i=1;i<=NF;i++){sum+=$i};print int(sum/NF)}' fasta/$LIB_NAME.$RVS_NAME.qual > raw_stat/$LIB_NAME.rvs.meanqual
awk '$0!~"^>"{for(i=1;i<=NF;i++){sum[i]+=$i;nb[i]+=1}}END{for(i=1;sum[i];i++){print int(sum[i]/nb[i])}}' fasta/$LIB_NAME.$RVS_NAME.qual > raw_stat/$LIB_NAME.rvs.meanposqual

# primer logo
obisample -s 10000 fasta/$LIB_NAME.$FWD_NAME.fasta | obicut -b 1 -e ${#FWD} > fasta/$LIB_NAME.sample.forward
obisample -s 10000 fasta/$LIB_NAME.$RVS_NAME.fasta | obicut -b 1 -e ${#RVS} > fasta/$LIB_NAME.sample.reverse
weblogo -c classic -t "$LIB_NAME forward primer $FWD_NAME" < fasta/$LIB_NAME.sample.forward > raw_stat/weblogo.$LIB_NAME.forward.eps
weblogo -c classic -t "$LIB_NAME reverse primer $RVS_NAME" < fasta/$LIB_NAME.sample.reverse > raw_stat/weblogo.$LIB_NAME.reverse.eps

rm fasta/$LIB_NAME.sample* fasta/$LIB_NAME.$FWD_NAME.fasta fasta/$LIB_NAME.$FWD_NAME.qual fasta/$LIB_NAME.$RVS_NAME.fasta fasta/$LIB_NAME.$RVS_NAME.qual
fastq.info(fastq=$FWD_LIB.fastq);
fastq.info(fastq=$RVS_LIB.fastq)"
awk '$0!~"^>"{sum=0;for(i=1;i<=NF;i++){sum+=$i};print int(sum/NF)}' fasta/$FWD_LIB.qual > raw_stat/$LIB_NAME.fwd.meanqual
awk '$0!~"^>"{for(i=1;i<=NF;i++){sum[i]+=$i;nb[i]+=1}}END{for(i=1;sum[i];i++){print int(sum[i]/nb[i])}}' fasta/$FWD_LIB.qual > raw_stat/$LIB_NAME.fwd.meanposqual
awk '$0!~"^>"{sum=0;for(i=1;i<=NF;i++){sum+=$i};print int(sum/NF)}' fasta/$RVS_LIB.qual > raw_stat/$LIB_NAME.rvs.meanqual
awk '$0!~"^>"{for(i=1;i<=NF;i++){sum[i]+=$i;nb[i]+=1}}END{for(i=1;sum[i];i++){print int(sum[i]/nb[i])}}' fasta/$RVS_LIB.qual > raw_stat/$LIB_NAME.rvs.meanposqual

rm fasta/$FWD_LIB.fasta fasta/$FWD_LIB.qual fasta/$RVS_LIB.fasta fasta/$RVS_LIB.qual

# list files and directories
. $BIN/list_step_files.sh
Expand Down
10 changes: 1 addition & 9 deletions src/Illumina_pair_end.step
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,8 @@ module load DeltaMP/${VERSION[DELTAMP]}
read LIB_NAME FWD_SUF RVS_SUF <<<`sed -n ${ARRAY_TASK}'p' config/lib2.list`
cd libraries

# cutadapt
DISS=`awk -v F=${#FWD} -v R=${#RVS} -v D=$PDIFFS 'BEGIN{DISS=D/F;if(D/R>DISS){DISS=D/R};printf "%.2g\n", DISS}'`
FWD_RC=`echo ">a $FWD" | tr " " "\n" | obicomplement --uppercase | grep -v ">"`
RVS_RC=`echo ">a $RVS" | tr " " "\n" | obicomplement --uppercase | grep -v ">"`
cutadapt -g $FWD -G $RVS -e $DISS --trimmed-only -o fasta/$LIB_NAME.$FWD_NAME.tmp.fastq -p fasta/$LIB_NAME.$RVS_NAME.tmp.fastq fastq/$LIB_NAME.$FWD_NAME.fastq fastq/$LIB_NAME.$RVS_NAME.fastq > fasta/log.cutadapt.$LIB_NAME.txt
cutadapt -a $RVS_RC -A $FWD_RC -e $DISS -q 10 -o fasta/$LIB_NAME.$FWD_NAME.cut.fastq -p fasta/$LIB_NAME.$RVS_NAME.cut.fastq fasta/$LIB_NAME.$FWD_NAME.tmp.fastq fasta/$LIB_NAME.$RVS_NAME.tmp.fastq >> fasta/log.cutadapt.$LIB_NAME.txt
rm fasta/$LIB_NAME.*.tmp.fastq

# Pair end
pandaseq -f fasta/$LIB_NAME.$FWD_NAME.cut.fastq -r fasta/$LIB_NAME.$RVS_NAME.cut.fastq -g fasta/log.pandaseq.$LIB_NAME.txt -F -A $PALG -o $MIN_OV -t $PTRESH -T 1 > fasta/$LIB_NAME.pairend.fastq
pandaseq -f fasta/$LIB_NAME.fwd.fastq -r fasta/$LIB_NAME.rvs.fastq -g fasta/log.pandaseq.$LIB_NAME.txt -F -A $PALG -o $MIN_OV -t $PTRESH -T 1 > fasta/$LIB_NAME.pairend.fastq

# Convert to fasta + qual
cd fasta
Expand Down
2 changes: 0 additions & 2 deletions src/Illumina_quality.step
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,6 @@ cut -f 1-4,$(($QUAL - $MINQUAL + 5)) $SUBPROJECT.sample_counts.tsv > $SUBPROJECT
dd if=$SUBPROJECT.summary.stat.tsv of=../archives/$SUBPROJECT.outputs/$SUBPROJECT.read_counts.tsv bs=1M

# Merge raw stat statistics and weblogos
module unload freetype
module load ghostscript/8.70-1
cd ..
gs -q -sDEVICE=pdfwrite -o archives/$SUBPROJECT.outputs/$SUBPROJECT.raw_and_pair-end_reads_statistics.pdf libraries/raw_stat/$SUBPROJECT.raw_reads_with_primer_quality.pdf libraries/raw_stat/$SUBPROJECT.pair-end_reads_quality.pdf libraries/raw_stat/$SUBPROJECT.legend.pdf
gs -q -sDEVICE=pdfwrite -dEPSCrop -o archives/$SUBPROJECT.outputs/$SUBPROJECT.forward.weblogo.pdf libraries/raw_stat/weblogo.*.forward.eps
Expand Down

0 comments on commit 5c1f372

Please sign in to comment.