Skip to content

Commit

Permalink
Implements #9, #13
Browse files Browse the repository at this point in the history
  • Loading branch information
lentendu committed Feb 15, 2018
1 parent b83c6b1 commit 3c24669
Show file tree
Hide file tree
Showing 8 changed files with 581 additions and 67 deletions.
10 changes: 8 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,17 @@ deltamp := $(patsubst src/%,bin/%,$(patsubst %.main,%,$(wildcard $(addsuffix *.m
batch_spec := $(addprefix bin/,$(notdir $(shell ls lib/$(batch)/deltamp.*)))
steps := $(patsubst src/%,bin/%,$(patsubst %.step,%.sh,$(wildcard $(addsuffix *.step,src/))))
highmems := $(patsubst %.head,%_highmem.head,$(shell ls lib/$(batch)/*.head | grep -v "_"))
test_config := $(patsubst src/%,test/%,$(patsubst %.config,%.tsv,$(wildcard $(addsuffix *.config,src/))))

# search paths
vpath %.main src
vpath %.step src
vpath %.head lib/$(batch)
vpath %.config src

# main rule
.PHONY: all clean
all: $(deltamp) $(module) $(steps) $(batch_spec)
all: $(deltamp) $(module) $(steps) $(batch_spec) $(test_config)

# rule to build deltamp and pipeline_master
$(deltamp): bin/% : %.main | lib/$(batch)/option_variables
Expand Down Expand Up @@ -61,6 +63,10 @@ endif
# Install the batch-system-specific deltamp.* files: each bin/deltamp.% target is
# copied from lib/$(batch)/deltamp.% and the copy is then made executable.
$(batch_spec): bin/deltamp.% : lib/$(batch)/deltamp.%
cp $^ $@ && chmod +x $@

# rule to build test configuration file
# Each test/%.tsv is generated from the matching %.config template (found in
# src/ through the vpath search path). sed fills in two literal placeholders,
# replacing the first occurrence on each line:
#   USER   -> the invoking shell's $USER ($$ keeps make from expanding it)
#   CURDIR -> make's $(CURDIR)
# '#' is used as the sed delimiter, so '/' in the substituted paths needs no
# escaping.
$(test_config): test/%.tsv : %.config
sed "s#USER#$$USER#;s#CURDIR#$(CURDIR)#" $< > $@

# clean rule
clean :
rm -r $(deltamp) modulefiles $(steps) $(highmems) $(batch_spec)
rm -r $(deltamp) modulefiles $(steps) $(highmems) $(batch_spec) $(test_config)
5 changes: 3 additions & 2 deletions bin/variables_definition.tsv
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
PROJECT # Project name .+
OWNER $USER User name .+
EMAIL $USER@example.com Email address .+@.+
OUT_PATH /home/$USER Path to output location ^/.+(/.+)*$
EXEC_PATH /work/$USER Path to execution location ^/.+(/.+)*$
TECH Illumina Sequencing technology 454 Illumina
Expand All @@ -24,7 +23,9 @@ CLUST mcl Clustering algorithm mcl sumaclust cd-hit-est vsearch swarm
TRESH 97 Clustering similarity threshold ^[0-9]+(.[0-9]+)?$
PREV_PATH no Cluster with previous subproject reference sequences no ^/.+(/.+)*$
DEL_SING no Remove singletons before chimera re-check no yes
REF_DB UNITE Reference database UNITE SILVA PR2 NCBI NCBI_16ranks GBOL GBOL_16ranks
DBFOLD /home/$USER Directory path to database ^/.+(/.+)*$
REF_DB no Database prefix name ^[^ \t]+$
CUT_DB no Reduce database to amplified fragment no yes
ASSIGN_ALL no Assign all reads no yes
MIN_SAMP 1 Minimum number of sample for abundant OTUs ^[0-9]+$
MIN_DOM 4 Minimum number of reads for abundant OTUs ^[0-9]+$
52 changes: 52 additions & 0 deletions src/configuration_PRJEB5170_Cercozoa.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
PROJECT
Project name MultiBarSoilEuk
User name USER
Path to output location CURDIR/test
Path to execution location CURDIR/test

LIBRAIRIES
Sequencing technology 454
Directory path to archives or libraries OR BioProject accession PRJEB5170

TARGET
Target organisms Cercozoa
Target region 18S
Forward primer name Cer2f
Forward primer sequence (5' to 3') ATTTCTGCCCTATCAGCT
Reverse primer name Cer1r
Reverse primer sequence (5' to 3') ATACTAGCACCCCCAACT
Sequencing direction forward

TRIMMING
Number of mismatches allowed on the barcode sequence 1
Number of mismatches allowed on the primer sequence 4
Maximum number of ambiguities allowed in the sequence 0
Maximum homopolymer length allowed 8
Minimum sequence length 300
Maximum sequence length 700
Minimum average quality on the trimmed sequence length 25
Denoising yes

PIPELINE
Minimum number of trimmed reads per sample 700
Clustering algorithm vsearch
Clustering similarity threshold 96
Directory path to database CURDIR/test
Reference database pr2_test.gb203
Minimum number of sample for abundant OTUs 1
Minimum number of reads for abundant OTUs 2

BARCODES
Barcode sequence (454 only) Sample name [Forward] library name OR URL Reverse library name OR URL (Illumina only)
ACGAGTGCGT cerso01a ftp.sra.ebi.ac.uk/vol1/ERA279/ERA279305/sff/AMP12_HV6G38L03-04_cerso01a.sff
ACGCTCGACA cerso01b ftp.sra.ebi.ac.uk/vol1/ERA279/ERA279305/sff/AMP12_HV6G38L03-04_cerso01b.sff
AGACGCACTC cerso01c ftp.sra.ebi.ac.uk/vol1/ERA279/ERA279305/sff/AMP12_HV6G38L03-04_cerso01c.sff
AGCACTGTAG cerso06a ftp.sra.ebi.ac.uk/vol1/ERA279/ERA279305/sff/AMP12_HV6G38L03-04_cerso06a.sff
ATCAGACACG cerso06b ftp.sra.ebi.ac.uk/vol1/ERA279/ERA279305/sff/AMP12_HV6G38L03-04_cerso06b.sff
ATATCGCGAG cerso06c ftp.sra.ebi.ac.uk/vol1/ERA279/ERA279305/sff/AMP12_HV6G38L03-04_cerso06c.sff
CGTGTCTCTA cerso13a ftp.sra.ebi.ac.uk/vol1/ERA279/ERA279305/sff/AMP12_HV6G38L03-04_cerso13a.sff
CTCGCGTGTC cerso13b ftp.sra.ebi.ac.uk/vol1/ERA279/ERA279305/sff/AMP12_HV6G38L03-04_cerso13b.sff
TAGTATCAGC cerso13c ftp.sra.ebi.ac.uk/vol1/ERA279/ERA279305/sff/AMP12_HV6G38L03-04_cerso13c.sff
TCTCTATGCG cerso18a ftp.sra.ebi.ac.uk/vol1/ERA279/ERA279305/sff/AMP12_HV6G38L03-04_cerso18a.sff
TGATACGTCT cerso18b ftp.sra.ebi.ac.uk/vol1/ERA279/ERA279305/sff/AMP12_HV6G38L03-04_cerso18b.sff
TACTGAGCTA cerso18c ftp.sra.ebi.ac.uk/vol1/ERA279/ERA279305/sff/AMP12_HV6G38L03-04_cerso18c.sff
78 changes: 15 additions & 63 deletions src/deltamp.main
Original file line number Diff line number Diff line change
Expand Up @@ -390,79 +390,31 @@ fi
SAMP_SIZE=`sed -n '$=' config/lib4.list`

# Databases
DBFOLD=/data/ecogen/databases/mothur
if [ $REF_DB == "UNITE" ] && [ $TARG == "ITS" ]
if [ $TARG == "ITS" ]
then
VERSION[DB]=v7_2
DB=unite.${VERSION[DB]}
CITATION[DB]="Kõljalg et al., 2013"
FULLCITATION[DB]="Kõljalg U, Nilsson RH, Abarenkov K et al. (2013) Towards a unified paradigm for sequence-based identification of fungi. Molecular Ecology, 22, 5271–5277."
if [ $CUT_DB == "yes" ]
then
echo "Database can only be reduced to amplified fragment for SSU fragments."
show_doc | fmt -s -w $(tput cols) >&2
exit 1
fi
CITATION[FUN]="Nguyen et al., 2016"
FULLCITATION[FUN]="Nguyen, N. H., Song, Z., Bates, S. T., Branco, S., Tedersoo, L., Menke, J., … Kennedy, P. G. (2016). FUNGuild: An open annotation tool for parsing fungal community datasets by ecological guild. Fungal Ecology, 20, 241–248."
VERSION[FUN]=1.1
elif [ $REF_DB == "SILVA" ]
then
VERSION[DB]=128
CITATION[DB]="Quast et al., 2013"
FULLCITATION[DB]="Quast C, Pruesse E, Yilmaz P et al. (2013) The SILVA ribosomal RNA gene database project: improved data processing and web-based tools. Nucleic Acids Research, 41, D590–D596."
if [ $TARG == "16S" ]
then
DB=silva.${VERSION[DB]}.Nr99.prok
elif [ $TARG == "18S" ]
then
DB=silva.${VERSION[DB]}.Nr99.euk
fi
elif [ $REF_DB == "PR2" ] && [ $TARG == "18S" ]
then
VERSION[DB]=gb203
DB=PR2_${VERSION[DB]}
CITATION[DB]="Guillou et al., 2012"
FULLCITATION[DB]="Guillou L, Bachar D, Audic S et al. (2012) The Protist Ribosomal Reference database (PR2): a catalog of unicellular eukaryote Small Sub-Unit rRNA sequences with curated taxonomy. Nucleic Acids Research, 41, D597–D604."
elif [ $REF_DB == "NCBI" ] && [ $TARG == "COI" ]
then
VERSION[DB]=gb211
DB=ncbi_COI
CITATION[DB]="Benson et al., 2013"
FULLCITATION[DB]="Benson DA, Cavanaugh M, Clark K et al. (2013) GenBank. Nucleic Acids Research, 41, D36–42."
elif [ $REF_DB == "NCBI_16ranks" ] && [ $TARG == "COI" ]
then
VERSION[DB]=gb211
DB=ncbi_COI_16ranks
CITATION[DB]="Benson et al., 2013"
FULLCITATION[DB]="Benson DA, Cavanaugh M, Clark K et al. (2013) GenBank. Nucleic Acids Research, 41, D36–42."
elif [ $REF_DB == "GBOL" ] && [ $TARG == "COI" ]
then
DB=ncbi_GBOL_COI
CITATION[DB]="GBOL: http://www.bolgermany.de"
elif [ $REF_DB == "GBOL_16ranks" ] && [ $TARG == "COI" ]
then
DB=ncbi_GBOL_COI_16ranks
CITATION[DB]="GBOL: http://www.bolgermany.de"
else
echo "# The reference database does not match with the target gene."
show_doc | fmt -s -w $(tput cols) >&2
cd .. && rm -r $SUBPROJECT
if [ ! "$(ls -A)" ] ; then cd .. && rmdir $PROJECT ; fi
exit 1
fi

if [[ $DB == "silva"* ]]
if ( [ $TARG == "18S" ] || [ $TARG == "16S" ] ) && [ $CUT_DB == "yes" ]
then
DBALIGN=$(echo $DB | sed 's/^\(.*\)\.\([^\.]*\)$/\1\.align\.\2/')
if [ $TECH == "454" ]
if [ $TECH == "454" ] && [ $FLIP == "true" ]
then
if [ $FLIP == "true" ]
then
DBCHOP=$RVS_NAME.$FWD_NAME.$DBALIGN
DBCUT=$RVS_NAME.$FWD_NAME.$DB
else
DBCHOP=$FWD_NAME.$RVS_NAME.$DBALIGN
DBCUT=$FWD_NAME.$RVS_NAME.$DB
fi
else
DBCHOP=$FWD_NAME.$RVS_NAME.$DBALIGN
DBCUT=$RVS_NAME.$FWD_NAME.$DB
else
DBCUT=$FWD_NAME.$RVS_NAME.$DB
fi
# Derive the name of the chopped alignment (DBCHOP) from the cut database name
# (DBCUT) by replacing the trailing $DB with $DBALIGN.
# This only makes sense when an alignment is configured, i.e. DBALIGN is
# non-empty AND is not the literal "no". The two tests must be joined with &&:
# with || a value of "no" would still pass the first test and enter the branch.
# Expansions are quoted so an empty or unset DBALIGN is a clean "false" instead
# of a "[: unary operator expected" error.
if [ -n "$DBALIGN" ] && [ "$DBALIGN" != "no" ]
then
	DBCHOP=`echo "$DBCUT" | sed "s/$DB$/$DBALIGN/"`
fi
fi

# Save newly set variables to file (avoiding lowercase variables from loops):
Expand Down
Loading

0 comments on commit 3c24669

Please sign in to comment.