Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 570 lines (487 sloc) 24.9 KB
#!/usr/bin/env bash
export LC_ALL=en_US.UTF-8
export SHELL=bash
v="2.9.5"
if [[ -z "$7" ]]; then
echo "Usage is RefMapOpt minK1 maxK1 minK2 maxK2 cluster_similarity Assembly_Type Num_of_Processors optional_list_of_individuals"
exit 1
fi
echo "Checking for required software"
DEP=(freebayes mawk bwa samtools vcftools rainbow gnuplot seqtk cd-hit-est bamToBed bedtools parallel vcfcombine pearRM fastp)
NUMDEP=0
for i in "${DEP[@]}"
do
if which $i &> /dev/null; then
foo=0
else
echo "The dependency" $i "is not installed or is not in your" '$PATH'"."
NUMDEP=$((NUMDEP + 1))
fi
done
SAMV1=$(samtools 2>&1 >/dev/null | grep Ver | sed -e 's/Version://' | cut -f2 -d " " | sed -e 's/-.*//' | cut -f1 -d ".")
SAMV2=$(samtools 2>&1 >/dev/null | grep Ver | sed -e 's/Version://' | cut -f2 -d " " | sed -e 's/-.*//' | cut -f2 -d ".")
if [ "$SAMV1" -ge "1" ]; then
if [ "$SAMV2" -lt "3" ]; then
echo "The version of Samtools installed in your" '$PATH' "is not optimized for dDocent."
echo "Please install at least version 1.3.0"
echo -en "\007"
echo -en "\007"
exit 1
fi
else
echo "The version of Samtools installed in your" '$PATH' "is not optimized for dDocent."
echo "Please install at least version 1.3.0"
echo -en "\007"
echo -en "\007"
exit 1
fi
RAINV=(`rainbow | head -1 | cut -f2 -d' ' `)
if [[ "$RAINV" != "2.0.2" && "$RAINV" != "2.0.3" && "$RAINV" != "2.0.4" ]]; then
echo "The version of Rainbow installed in your" '$PATH' "is not optimized for dDocent."
echo -en "\007"
echo -en "\007"
echo -en "\007"
echo "Is the version of rainbow installed newer than 2.0.2? Enter yes or no."
read TEST
if [ "$TEST" != "yes" ]; then
echo "Please install a version newer than 2.0.2"
exit 1
fi
fi
FREEB=(`freebayes | grep -oh 'v[0-9].*' | cut -f1 -d "." | sed -e 's/v//' `)
if [ "$FREEB" != "1" ]; then
echo "The version of FreeBayes installed in your" '$PATH' "is not optimized for dDocent."
echo "Please install at least version 1.0.0"
exit 1
fi
SEQTK=( `seqtk 2>&1 | grep Version | cut -f2 -d ":" | sed -e 's/1.[0-9]-r//g' | sed -e 's/-dirty//g' `)
if [ "$SEQTK" -lt "102" ]; then
echo "The version of seqtk installed in your" '$PATH' "is not optimized for dDocent."
echo "Please install at least version 1.2-r102-dirty"
exit 1
fi
VCFTV=$(vcftools | grep VCF | grep -oh '[0-9]*[a-z]*)$' | sed -e 's/[a-z)]//')
if [ "$VCFTV" -lt "10" ]; then
echo "The version of VCFtools installed in your" '$PATH' "is not optimized for dDocent."
echo "Please install at least version 0.1.11"
exit 1
elif [ "$VCFTV" == "11" ]; then
VCFGTFLAG="--geno"
elif [ "$VCFTV" -ge "12" ]; then
VCFGTFLAG="--max-missing"
fi
BWAV=$(bwa 2>&1 | mawk '/Versi/' | sed -e 's/Version: //g' | sed -e 's/0.7.//g' | sed -e 's/-.*//g' | cut -c 1-2)
if [ "$BWAV" -lt "13" ]; then
echo "The version of bwa installed in your" '$PATH' "is not optimized for dDocent."
echo "Please install at least version 0.7.13"
exit 1
fi
BTC=$( bedtools --version | mawk '{print $2}' | sed -e 's/v//g' | cut -f1,2 -d"." | sed -e 's/2\.//g' )
if [ "$BTC" -ge "26" ]; then
BEDTOOLSFLAG="NEW"
elif [ "$BTC" == "23" ]; then
BEDTOOLSFLAG="OLD"
elif [ "$BTC" != "23" ]; then
echo "The version of bedtools installed in your" '$PATH' "is not optimized for dDocent."
echo "Please install version 2.23.0 or version 2.26.0 and above"
exit 1
fi
FASTP=$(fastp -v 2>&1 | cut -f2 -d " ")
FASTP1=$(echo $FASTP | cut -f1 -d ".")
FASTP2=$(echo $FASTP | cut -f2 -d ".")
FASTP3=$(echo $FASTP | cut -f3 -d ".")
if [ "$FASTP1" -lt "2" ]; then
if [ "$FASTP2" -lt "20" ]; then
if [ "$FASTP2" -lt "5" ]; then
echo "The version of fastp installed in your" '$PATH' "is not optimized for dDocent."
echo "Please install version 0.19.5 or above"
exit 1
fi
fi
fi
if ! sort --version | fgrep GNU &>/dev/null; then
sort=gsort
else
sort=sort
fi
if [ $NUMDEP -gt 0 ]; then
echo -e "\nPlease install all required software before running RefMapOpt again."
exit 1
else
echo -e "\nAll required software is installed!"
fi
simC=$5
ATYPE=$6
if [[ $ATYPE != "SE" && $ATYPE != "PE" && $ATYPE != "OL" && $ATYPE != "HYB" && $ATYPE != "ROL" && $ATYPE != "RPE" ]]; then
echo "Usage is RefMapOpt minK1 maxK1 minK2 maxK2 cluster_similarity Assembly_Type Num_of_Processors optional_list_of_individuals"
echo "Please make sure to choose assembly type."
exit 1
fi
NUMProc=$7
ls *.F.fq.gz > namelist
sed -i'' -e 's/.F.fq.gz//g' namelist
NAMES=( `cat "namelist" `)
echo -e "\ndDocent RefMapOpt version $v"
#This code checks for trimmed sequence files
TEST=$(ls *.R1.fq.gz 2> /dev/null | wc -l )
if [ "$TEST" -gt 0 ]; then
echo -e "\nTrimmed sequences found, proceeding with optimization."
else
echo -e "\nRefMapOpt.sh requires that you have trimmed sequence files.\nPlease include trimmed sequence files with the .R1.fq.gz and .R2.fq.gz naming convention."
echo "dDocent will create these for you"
echo "Please rerun RefMapOpt.sh after trimming sequence files"
exit 1
fi
Reference(){
CUTOFF=$1
CUTOFF2=$2
simC=$3
AWK1='BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}'
AWK2='!/>/'
AWK3='!/NNN/'
AWK4='{for(i=0;i<$1;i++)print}'
PERLT='while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}'
SED1='s/^[ \t]*//'
SED2='s/\s/\t/g'
FRL=$(gunzip -c ${NAMES[0]}.F.fq.gz | mawk '{ print length() | "sort -rn" }' | head -1)
special_uniq(){
mawk -v x=$1 '$1 >= x' $2 |cut -f2 | sed -e 's/NNNNNNNNNN/ /g' | cut -f1 | uniq
}
export -f special_uniq
if [ ${NAMES[@]:(-1)}.F.fq.gz -nt ${NAMES[@]:(-1)}.uniq.seqs ]; then
if [[ "$ATYPE" == "PE" || "$ATYPE" == "RPE" ]]; then
#If PE assembly, creates a concatenated file of every unique for each individual in parallel
cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.R.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
if [ "$ATYPE" = "RPE" ]; then
cat namelist | parallel --no-notice -j $NUMProc "paste {}.forward {}.reverse | $sort -k1 -S 200M > {}.fr"
cat namelist | parallel --no-notice -j $NUMProc "cut -f1 {}.fr | uniq -c > {}.f.uniq && cut -f2 {}.fr > {}.r"
cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK4' {}.f.uniq > {}.f.uniq.e"
cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.f.uniq.e {}.r | mawk '$AWK3'| sed -e 's/-/NNNNNNNNNN/' | sed -e '$SED1' | sed -e '$SED2'> {}.uniq.seqs"
rm *.f.uniq.e *.f.uniq *.r *.fr
else
cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.forward {}.reverse | mawk '$AWK3'| sed -e 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.seqs"
fi
rm *.forward
rm *.reverse
fi
if [ "$ATYPE" == "SE" ]; then
#if SE assembly, creates files of every unique read for each individual in parallel
cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
fi
if [ "$ATYPE" == "OL" ]; then
#If OL assembly, dDocent assumes that the marjority of PE reads will overlap, so the software PEAR is used to merge paired reads into single reads
for i in "${NAMES[@]}"; do
gunzip -c $i.R.fq.gz | head -2 | tail -1 >> lengths.txt
done
MaxLen=$(mawk '{ print length() | "sort -rn" }' lengths.txt| head -1)
LENGTH=$(( $MaxLen / 3))
for i in "${NAMES[@]}"; do
pearRM -f $i.F.fq.gz -r $i.R.fq.gz -o $i -j $NUMProc -n $LENGTH
done
cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK1' {}.assembled.fastq | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
fi
if [ "$ATYPE" == "HYB" ]; then
#If HYB assembly, dDocent assumes some PE reads will overlap but that some will not, so the OL method performed and remaining reads are then put through PE method
for i in "${NAMES[@]}"; do
gunzip -c $i.R.fq.gz | head -2 | tail -1 >> lengths.txt
done
MaxLen=$(mawk '{ print length() | "sort -rn" }' lengths.txt| head -1)
LENGTH=$(( $MaxLen / 3))
for i in "${NAMES[@]}"; do
pearRM -f $i.F.fq.gz -r $i.R.fq.gz -o $i -j $NUMProc -n $LENGTH &>kopt.log
done
cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK1' {}.assembled.fastq | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
cat namelist | parallel --no-notice -j $NUMProc "cat {}.unassembled.forward.fastq | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
cat namelist | parallel --no-notice -j $NUMProc "cat {}.unassembled.reverse.fastq | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.forward {}.reverse | mawk '$AWK3'| sed -e 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.ua.seqs"
rm *.forward
rm *.reverse
fi
fi
#Create a data file with the number of unique sequences and the number of occurrences
if [ -f "uniq.seqs.gz" ]; then
if [ uniq.seqs.gz -nt uniq.seqs ]; then
gunzip uniq.seqs.gz 2>/dev/null
fi
fi
if [ ! -f "uniq.seqs" ]; then
cat *.uniq.seqs > uniq.seqs
fi
if [[ -z $CUTOFF || -z $CUTOFF2 ]]; then
getAssemblyInfo
fi
if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
parallel --no-notice -j $NUMProc --env special_uniq special_uniq $CUTOFF {} ::: *.uniq.seqs | $sort --parallel=$NUMProc -S 2G | uniq -c > uniqCperindv
else
parallel --no-notice -j $NUMProc mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' > uniqCperindv
fi
#Now that data cutoffs have been chosen, reduce data set to specified set of unique reads, convert to FASTA format,
#and remove reads with substantial amounts of adapters
if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
parallel --no-notice -j $NUMProc mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | sed -e 's/NNNNNNNNNN/-/' > total.uniqs
cut -f 1 -d "-" total.uniqs > total.u.F
cut -f 2 -d "-" total.uniqs > total.u.R
paste total.u.F total.u.R | $sort -k1 --parallel=$NUMProc -S 2G > total.fr
parallel --no-notice --env special_uniq special_uniq $CUTOFF {} ::: *.uniq.seqs | $sort --parallel=$NUMProc -S 2G | uniq -c > total.f.uniq
join -1 2 -2 1 -o 1.1,1.2,2.2 total.f.uniq total.fr | mawk '{print $1 "\t" $2 "NNNNNNNNNN" $3}' | mawk -v x=$CUTOFF2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.seqs
rm total.uniqs total.u.* total.fr total.f.uniq*
else
parallel --no-notice mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | mawk -v x=$CUTOFF2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.seqs
fi
$sort -k1 -r -n --parallel=$NUMProc -S 2G uniq.k.$CUTOFF.c.$CUTOFF2.seqs |cut -f2 > totaluniqseq
mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' totaluniqseq > uniq.full.fasta
LENGTH=$(mawk '!/>/' uniq.full.fasta | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
LENGTH=$(($LENGTH * 3 / 4))
seqtk seq -F I uniq.full.fasta > uniq.fq
if [ "$NUMProc" -gt 8 ]; then
NP=8
else
NP=$NUMProc
fi
MaxLen=$(mawk '!/>/' uniq.full.fasta | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
fastp -i uniq.fq -o uniq.fq1 -w $NP -Q -l $MaxLen &>/dev/null
mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' uniq.fq1 | paste - - | sort -k1,1 -V | tr "\t" "\n" > uniq.fasta
mawk '!/>/' uniq.fasta > totaluniqseq
rm uniq.fq*
if [[ "$ATYPE" == "PE" || "$ATYPE" == "RPE" ]]; then
pmerge(){
num=$( echo $1 | sed -e 's/^0*//g')
if [ "$num" -le 100 ]; then
j=$num
k=$(($num -1))
else
num=$(($num - 99))
j=$(python -c "print ("$num" * 100)")
k=$(python -c "print ("$j" - 100)")
fi
mawk -v x="$j" -v y="$k" '$5 <= x && $5 > y' rbdiv.out > rbdiv.out.$1
if [ -s "rbdiv.out.$1" ]; then
rainbow merge -o rbasm.out.$1 -a -i rbdiv.out.$1 -r 2 -N10000 -R10000 -l 20 -f 0.75
fi
}
export -f pmerge
#Reads are first clustered using only the Forward reads using CD-hit instead of rainbow
if [ "$ATYPE" == "PE" ]; then
sed -e 's/NNNNNNNNNN/ /g' uniq.fasta | cut -f1 > uniq.F.fasta
CDHIT=$(python -c "print (max("$simC" - 0.1,0.8))")
cd-hit-est -i uniq.F.fasta -o xxx -c $CDHIT -T $NUMProc -M 0 -g 1 -d 100 &>cdhit.log
mawk '{if ($1 ~ /Cl/) clus = clus + 1; else print $3 "\t" clus}' xxx.clstr | sed -e 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids
paste sort.contig.cluster.ids totaluniqseq > contig.cluster.totaluniqseq
else
sed -e 's/NNNNNNNNNN/ /g' totaluniqseq | cut -f1 | $sort --parallel=$NUMProc -S 2G| uniq | mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' > uniq.F.fasta
CDHIT=$(python -c "print (max("$simC" - 0.1,0.8))")
cd-hit-est -i uniq.F.fasta -o xxx -c $CDHIT -T $NUMProc -M 0 -g 1 -d 100 &>cdhit.log
mawk '{if ($1 ~ /Cl/) clus = clus + 1; else print $3 "\t" clus}' xxx.clstr | sed -e 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids
paste sort.contig.cluster.ids <(mawk '!/>/' uniq.F.fasta) > contig.cluster.Funiq
sed -e 's/NNNNNNNNNN/ /g' totaluniqseq | $sort --parallel=$NUMProc -k1 -S 2G | mawk '{print $0 "\t" NR}' > totaluniqseq.CN
join -t $'\t' -1 3 -2 1 contig.cluster.Funiq totaluniqseq.CN -o 2.3,1.2,2.1,2.2 > contig.cluster.totaluniqseq
fi
#CD-hit output is converted to rainbow format
$sort -k2,2 -g contig.cluster.totaluniqseq -S 2G --parallel=$NUMProc | sed -e 's/NNNNNNNNNN/ /g' > rcluster
rainbow div -i rcluster -o rbdiv.out -f 0.5 -K 10
CLUST=(`tail -1 rbdiv.out | cut -f5`)
CLUST1=$(( $CLUST / 100 + 1))
CLUST2=$(( $CLUST1 + 100 ))
seq -w 1 $CLUST2 | parallel --no-notice -j $NUMProc --env pmerge pmerge {}
cat rbasm.out.[0-9]* > rbasm.out
rm rbasm.out.[0-9]* rbdiv.out.[0-9]*
#This AWK code replaces rainbow's contig selection perl script
LENGTH=$(cut -f3 rbdiv.out |mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
LENGTH=$(( $LENGTH * 11 / 10 ))
cat rbasm.out <(echo "E") |sed -e 's/[0-9]*:[0-9]*://g' | mawk -v mlen=$LENGTH '{
if (NR == 1) e=$2;
else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_A_Contig_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
else if ($1 ~/C/) clus=$2;
else if ($1 ~/L/) len=$2;
else if ($1 ~/S/) seq=$2;
else if ($1 ~/N/) freq=$2;
else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/ && $0 ~/^R 0/ && len <= mlen) {seq1 = seq; fclus=clus;lenf=len}
else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/ && $0 ~!/^R 0/ && len > mlen) {seq1 = seq; fclus=clus; len1=len}
else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/ && $0 ~!/^R 0/ && len <= mlen) {seq1 = seq; fclus=clus; lenf=len}
else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
}' > rainbow.fasta
seqtk seq -r rainbow.fasta > rainbow.RC.fasta
mv rainbow.RC.fasta rainbow.fasta
#The rainbow assembly is checked for overlap between newly assembled Forward and Reverse reads using the software PEAR
grep -A1 "dDocent_A_Contig_" rainbow.fasta | mawk '!/^--/' | sed -e 's/dDocent_A_Contig_/dDocent_Contig_/g' > rainbow.asm.fasta
grep -A1 "dDocent_Contig_" rainbow.fasta | mawk '!/^--/' > rainbow.n.fasta
sed -e 's/NNNNNNNNNN/ /g' rainbow.asm.fasta | cut -f1 | seqtk seq -F I - > ref.F.fq
sed -e 's/NNNNNNNNNN/ /g' rainbow.asm.fasta | cut -f2 | seqtk seq -F I - > ref.R.fq
seqtk seq -r ref.R.fq > ref.RC.fq
mv ref.RC.fq ref.R.fq
LENGTH=$(mawk '!/>/' rainbow.fasta | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
LENGTH=$(( $LENGTH * 5 / 4))
pearRM -f ref.F.fq -r ref.R.fq -o overlap -p 0.001 -j $NUMProc -n $LENGTH &>kopt.log
rm ref.F.fq ref.R.fq
mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.assembled.fastq > overlap.fasta
mawk '/>/' overlap.fasta > overlap.loci.names
mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.unassembled.forward.fastq > other.F
mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.unassembled.reverse.fastq > other.R
paste other.F other.R | mawk '{if ($1 ~ />/) print $1; else print $0}' | sed -e 's/ /NNNNNNNNNN/g' > other.FR
cat other.FR overlap.fasta rainbow.n.fasta > totalover.fasta
paste <(mawk '{if (NR % 2) print $0}' totalover.fasta) <(mawk '{if (NR % 2 == 0) print $0}' totalover.fasta) | sort -V | sed -e 's/ /\'$'\n/g' > totalover.s.fasta
mv totalover.s.fasta totalover.fasta
rm *.F *.R
fi
if [[ "$ATYPE" == "HYB" ]];then
parallel --no-notice mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.ua.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | mawk -v x=$2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs
AS=$(cat uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs | wc -l)
if [ "$AS" -gt 1 ]; then
cut -f2 uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs > totaluniqseq.ua
mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' totaluniqseq.ua > uniq.full.ua.fasta
LENGTH=$(mawk '!/>/' uniq.full.ua.fasta | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
LENGTH=$(($LENGTH * 3 / 4))
seqtk seq -F I uniq.full.ua.fasta > uniq.ua.fq
if [ "$NUMProc" -gt 8 ]; then
NP=8
else
NP=$NUMProc
fi
MaxLen=$(mawk '!/>/' uniq.full.ua.fasta | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
fastp -i uniq.ua.fq -o uniq.ua.fq1 -w $NP -Q -l $MaxLen &>/dev/null
mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' uniq.ua.fq1 > uniq.ua.fasta
mawk '!/>/' uniq.ua.fasta > totaluniqseq.ua
rm uniq.ua.fq*
#Reads are first clustered using only the Forward reads using CD-hit instead of rainbow
sed -e 's/NNNNNNNNNN/ /g' uniq.ua.fasta | cut -f1 > uniq.F.ua.fasta
CDHIT=$(python -c "print(max("$simC" - 0.1,0.8))")
cd-hit-est -i uniq.F.ua.fasta -o xxx -c $CDHIT -T 0 -M 0 -g 1 -d 100 &>cdhit.log
mawk '{if ($1 ~ /Cl/) clus = clus + 1; else print $3 "\t" clus}' xxx.clstr | sed -e 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids.ua
paste sort.contig.cluster.ids.ua totaluniqseq.ua > contig.cluster.totaluniqseq.ua
$sort -k2,2 -g -S 2G --parallel=$NUMProc contig.cluster.totaluniqseq.ua | sed -e 's/NNNNNNNNNN/ /g' > rcluster.ua
#CD-hit output is converted to rainbow format
rainbow div -i rcluster.ua -o rbdiv.ua.out -f 0.5 -K 10
if [ "$ATYPE" == "PE" ]; then
rainbow merge -o rbasm.ua.out -a -i rbdiv.ua.out -r 2 -N10000 -R10000 -l 20 -f 0.75
else
rainbow merge -o rbasm.ua.out -a -i rbdiv.ua.out -r 2 -N10000 -R10000 -l 20 -f 0.75
fi
#This AWK code replaces rainbow's contig selection perl script
cat rbasm.ua.out <(echo "E") |sed -e 's/[0-9]*:[0-9]*://g' | mawk ' {
if (NR == 1) e=$2;
else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_Contig_UA_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_UA_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
else if ($1 ~/C/) clus=$2;
else if ($1 ~/L/) len=$2;
else if ($1 ~/S/) seq=$2;
else if ($1 ~/N/) freq=$2;
else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/) {seq1 = seq; fclus=clus; len1=len}
else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
}' > rainbow.ua.fasta
seqtk seq -r rainbow.ua.fasta > rainbow.RC.fasta
mv rainbow.RC.fasta rainbow.ua.fasta
cat rainbow.ua.fasta uniq.fasta > totalover.fasta
paste <(mawk '{if (NR % 2) print $0}' totalover.fasta) <(mawk '{if (NR % 2 == 0) print $0}' totalover.fasta) | sort -V | sed -e 's/ /\'$'\n/g' > totalover.s.fasta
mv totalover.s.fasta totalover.fasta
fi
fi
if [[ "$ATYPE" != "PE" && "$ATYPE" != "RPE" && "$ATYPE" != "HYB" ]]; then
cp uniq.fasta totalover.fasta
paste <(mawk '{if (NR % 2) print $0}' totalover.fasta) <(mawk '{if (NR % 2 == 0) print $0}' totalover.fasta) | sort -V | sed -e 's/ /\'$'\n/g' > totalover.s.fasta
mv totalover.s.fasta totalover.fasta
fi
cd-hit-est -i totalover.fasta -o reference.fasta.original -M 0 -T 0 -c $simC &>cdhit2.log
sed -e 's/^C/NC/g' -e 's/^A/NA/g' -e 's/^G/NG/g' -e 's/^T/NT/g' -e 's/T$/TN/g' -e 's/A$/AN/g' -e 's/C$/CN/g' -e 's/G$/GN/g' reference.fasta.original > reference.fasta
if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
sed -i 's/dDocent/dDocentR/g' reference.fasta
fi
samtools faidx reference.fasta &> index.log
bwa index reference.fasta >> index.log 2>&1
SEQS=$(mawk 'END {print NR}' uniq.k.$CUTOFF.c.$CUTOFF2.seqs)
TIGS=$(grep ">" -c reference.fasta)
echo -e "\ndDocent assembled $SEQS sequences (after cutoffs) into $TIGS contigs"
}
ls -S *.F.fq.gz > namelist
sed -i'' -e 's/.F.fq.gz//g' namelist
if [[ -z "$8" ]]; then
NUMIND=$(cat namelist | wc -l)
NUM1=$(($NUMIND / 10))
NUM2=$(($NUMIND - $NUM1))
NUM3=$(($NUM2 - $NUM1))
cat namelist | head -$NUM2 | tail -$NUM3 > newlist
NAMESR=( `cat "newlist" `)
LEN=${#NAMESR[@]}
LEN=$(($LEN - 1))
echo "5" >randlist
for ((rr = 1; rr<=50; rr++));
do
INDEX=$[ 1 + $[ RANDOM % $LEN ]]
if grep -q ${NAMESR[$INDEX]} randlist;
then x=x
else
echo ${NAMESR[$INDEX]} >> randlist
fi
done
RANDNAMES=( `mawk '!/^5$/' "randlist" | head -21 `)
else
RANDNAMES=( `cat "$8" `)
fi
rm lengths.txt &> /dev/null
for k in "${RANDNAMES[@]}";
do
if [ -f "$k.R.fq.gz" ]; then
gunzip -c $k.R.fq.gz | head -2 | tail -1 >> lengths.txt
fi
done
rm rand.proc 2>/dev/null
for k in "${RANDNAMES[@]}"
do
echo $k >> rand.proc
done
echo -e "Cov\tNon0Cov\tContigs\tMeanContigsMapped\tK1\tK2\tSUM_Mapped\tSUM_Properly\tMean_Mapped\tMean_Properly\tMisMatched" > mapping.results
for ((r = $1; r <= $2; r++));
do
for ((j = $3; j <= $4; j++));
do
PP=$(($r + $j))
if [ "$PP" != "2" ]; then
Reference $r $j $simC
#BWA for mapping for all samples
rm $r.$j.results 2>/dev/null
map_reads(){
r=$2;j=$3
if [[ "$ATYPE" == "HYB" || "$ATYPE" == "ROL" || "$ATYPE" == "RPE" ]]; then
if [ -f "$1.R2.fq.gz" ]; then
bwa mem -L 20,5 -t 8 -a -M -T 10 -A1 -B 3 -O 5 -R "@RG\tID:$1\tSM:$1\tPL:Illumina" reference.fasta $1.R1.fq.gz $1.R2.fq.gz 2> bwa.$1.log | mawk '!/\t[2-9].[SH].*/' | mawk '!/[2-9].[SH]\t/' | samtools view -@4 -q 1 -SbT reference.fasta - > $1.bam
else
bwa mem -L 20,5 -t 8 -a -M -T 10 -A1 -B 3 -O 5 -R "@RG\tID:$1\tSM:$1\tPL:Illumina" reference.fasta $1.R1.fq.gz 2> bwa.$1.log | mawk '!/\t[2-9].[SH].*/' | mawk '!/[2-9].[SH]\t/' | samtools view -@4 -q 1 -SbT reference.fasta - > $1.bam
fi
else
if [ -f "$1.R2.fq.gz" ]; then
INSERT=$(mawk '!/^>/' reference.fasta | mawk '{ print length() }' | mawk '{ sum += $1; n++ } END { if (n > 0) print int(sum / n); }')
INSERTH=$(($INSERT + 100 ))
INSERTL=$(($INSERT - 100 ))
SD=$(($INSERT / 5))
bwa mem -L 20,5 -I $INSERT,$SD,$INSERTH,$INSERTL -t 8 -a -M -T 10 -A 1 -B 3 -O 5 -R "@RG\tID:$1\tSM:$1\tPL:Illumina" reference.fasta $1.R1.fq.gz $1.R2.fq.gz 2> bwa.$1.log | mawk '!/\t[2-9].[SH].*/' | mawk '!/[2-9].[SH]\t/' | samtools view -@4 -q 1 -SbT reference.fasta - > $1.bam
else
bwa mem -L 20,5 -t 8 -a -M -T 10 -A 1 -B 3 -O 5 -R "@RG\tID:$1\tSM:$1\tPL:Illumina" reference.fasta $1.R1.fq.gz 2> bwa.$1.log | mawk '!/\t[2-9].[SH].*/' | mawk '!/[2-9].[SH]\t/' | samtools view -@4 -q 1 -SbT reference.fasta - > $1.bam
fi
fi
samtools sort -@4 $1.bam -o $1.bam 2> /dev/null
samtools index $1.bam
MM=$(samtools flagstat $1.bam | grep -E 'mapped \(|properly' | cut -f1 -d '+' | tr -d '\n')
CM=$(samtools idxstats $1.bam | mawk '$3 > 0' | wc -l)
CC=$(samtools idxstats $1.bam | mawk '{ sum += $3; n++ } END { if (n > 0) print sum / n; }')
DD=$(samtools idxstats $1.bam | mawk '$3 >0' | mawk '{ sum += $3; n++ } END { if (n > 0) print sum / n; }')
BM=$(samtools flagstat $1.bam | grep mapQ | cut -f1 -d ' ')
echo -e "$MM\t$CM\t$CC\t$DD\t$BM" >> $r.$j.results
}
export -f map_reads
cat rand.proc | parallel --no-notice -j $NUMProc --env map_reads map_reads {} $r $j
SUMM=$(mawk '{ sum+=$1} END {print sum}' $r.$j.results)
SUMPM=$(mawk '{ sum+=$2} END {print sum}' $r.$j.results)
AVEM=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $r.$j.results)
AVEP=$(mawk '{ sum += $2; n++ } END { if (n > 0) print sum / n; }' $r.$j.results)
AC=$(mawk '{ sum += $3; n++ } END { if (n > 0) print sum / n; }' $r.$j.results)
CON=$(mawk '/>/' reference.fasta | wc -l)
CCC=$(mawk '{ sum += $4; n++ } END { if (n > 0) print sum / n; }' $r.$j.results)
DDD=$(mawk '{ sum += $5; n++ } END { if (n > 0) print sum / n; }' $r.$j.results)
BBM=$(mawk '{ sum += $6; n++ } END { if (n > 0) print sum / n; }' $r.$j.results)
echo -e "$CCC\t$DDD\t$CON\t$AC\t$r\t$j\t$SUMM\t$SUMPM\t$AVEM\t$AVEP\t$BBM" >> mapping.results
fi
done
done