diff --git a/src/Algorithm/ErrorCorrectProcess.cpp b/src/Algorithm/ErrorCorrectProcess.cpp index ffa5c54b..ffe87f8a 100644 --- a/src/Algorithm/ErrorCorrectProcess.cpp +++ b/src/Algorithm/ErrorCorrectProcess.cpp @@ -41,7 +41,12 @@ ErrorCorrectProcess::~ErrorCorrectProcess() // ErrorCorrectResult ErrorCorrectProcess::process(const SequenceWorkItem& workItem) { - + ErrorCorrectResult result = correct(workItem); + return result; +} + +ErrorCorrectResult ErrorCorrectProcess::correct(const SequenceWorkItem& workItem) +{ switch(m_algorithm) { case ECA_HYBRID: diff --git a/src/Algorithm/ErrorCorrectProcess.h b/src/Algorithm/ErrorCorrectProcess.h index e808fe95..d6d28992 100644 --- a/src/Algorithm/ErrorCorrectProcess.h +++ b/src/Algorithm/ErrorCorrectProcess.h @@ -60,7 +60,8 @@ class ErrorCorrectProcess ~ErrorCorrectProcess(); ErrorCorrectResult process(const SequenceWorkItem& item); - + ErrorCorrectResult correct(const SequenceWorkItem& item); + private: ErrorCorrectResult kmerCorrection(const SequenceWorkItem& item); diff --git a/src/Algorithm/Makefile.am b/src/Algorithm/Makefile.am index daa0207f..1cde7408 100644 --- a/src/Algorithm/Makefile.am +++ b/src/Algorithm/Makefile.am @@ -16,6 +16,7 @@ libalgorithm_a_SOURCES = \ OverlapBlock.h OverlapBlock.cpp \ SearchHistory.h SearchHistory.cpp \ ErrorCorrectProcess.h ErrorCorrectProcess.cpp \ + QCProcess.h QCProcess.cpp \ OverlapTools.h OverlapTools.cpp \ DPAlignment.h DPAlignment.cpp \ ConnectProcess.h ConnectProcess.cpp \ diff --git a/src/Algorithm/QCProcess.cpp b/src/Algorithm/QCProcess.cpp new file mode 100644 index 00000000..c0374a41 --- /dev/null +++ b/src/Algorithm/QCProcess.cpp @@ -0,0 +1,103 @@ +///----------------------------------------------- +// Copyright 2010 Wellcome Trust Sanger Institute +// Written by Jared Simpson (js18@sanger.ac.uk) +// Released under the GPL +//----------------------------------------------- +// +// QCProcess - Process to perform quality checks +// for a sequence work item +// +#include "QCProcess.h" +#include "BWTAlgorithms.h" + +// +// +// +QCProcess::QCProcess(const BWT* pBWT, const BWT* pRBWT, int kmerLength, int kmerThreshold) : + m_pBWT(pBWT), + m_pRBWT(pRBWT), + m_kmerLength(kmerLength), + m_kmerThreshold(kmerThreshold) +{ + +} + +// +QCProcess::~QCProcess() +{ + +} + +// +QCResult QCProcess::process(const SequenceWorkItem& workItem) +{ + // Perform a kmer-based qc check on the read + QCResult result; + + std::string readSequence = workItem.read.seq.toString(); + int k = m_kmerLength; + int n = readSequence.size(); + int nk = n - m_kmerLength + 1; + int threshold = m_kmerThreshold; + + // Are all kmers in the read well-represented? + bool allSolid = true; + + for(int i = 0; i < nk; ++i) + { + std::string kmer = readSequence.substr(i, k); + int count = BWTAlgorithms::countSequenceOccurrences(kmer, m_pBWT, m_pRBWT); + if(count <= threshold) + { + allSolid = false; + break; + } + } + + if(allSolid) + result.qcPassed = true; + else + result.qcPassed = false; + return result; +} + +// +// +// +QCPostProcess::QCPostProcess(std::ostream* pCorrectedWriter, + std::ostream* pDiscardWriter) : + m_pCorrectedWriter(pCorrectedWriter), + m_pDiscardWriter(pDiscardWriter), + m_readsKept(0), m_readsDiscarded(0) +{ + +} + +// +QCPostProcess::~QCPostProcess() +{ + std::cout << "Reads kept: " << m_readsKept << "\n"; + std::cout << "Reads discarded: " << m_readsDiscarded << "\n"; +} + +// +void QCPostProcess::process(const SequenceWorkItem& item, const QCResult& result) +{ + SeqRecord record = item.read; + if(result.qcPassed) + { + record.write(*m_pCorrectedWriter); + ++m_readsKept; + } + else + { + // To be able to rebuild the index after discarding the read, we need to write + // the rank of the string (its position in the original read file into the read name) + std::stringstream newID; + newID << item.read.id << ",seqrank=" << item.idx; + record.id = newID.str(); + + record.write(*m_pDiscardWriter); + ++m_readsDiscarded; + } +} diff --git a/src/Algorithm/QCProcess.h b/src/Algorithm/QCProcess.h new file mode 100644 index 00000000..79a3d4cd --- /dev/null +++ b/src/Algorithm/QCProcess.h @@ -0,0 +1,60 @@ +///----------------------------------------------- +// Copyright 2010 Wellcome Trust Sanger Institute +// Written by Jared Simpson (js18@sanger.ac.uk) +// Released under the GPL +//----------------------------------------------- +// +// QCProcess - Process to perform quality checks +// for a sequence work item +// +#ifndef QCPROCESS_H +#define QCPROCESS_H + +#include "Util.h" +#include "BWT.h" +#include "SequenceProcessFramework.h" +#include "SequenceWorkItem.h" + +class QCResult +{ + public: + QCResult() : qcPassed(false) {} + + bool qcPassed; +}; + +// +class QCProcess +{ + public: + QCProcess(const BWT* pBWT, const BWT* pRBWT, int kmerLength, int kmerThreshold); + ~QCProcess(); + QCResult process(const SequenceWorkItem& item); + + private: + + const BWT* m_pBWT; + const BWT* m_pRBWT; + const int m_kmerLength; + const int m_kmerThreshold; +}; + +// Write the results from the overlap step to an ASQG file +class QCPostProcess +{ + public: + QCPostProcess(std::ostream* pCorrectedWriter, std::ostream* pDiscardWriter); + ~QCPostProcess(); + + void process(const SequenceWorkItem& item, const QCResult& result); + + private: + + std::ostream* m_pCorrectedWriter; + std::ostream* m_pDiscardWriter; + + size_t m_readsKept; + size_t m_readsDiscarded; +}; + +#endif diff --git a/src/SGA/Makefile.am b/src/SGA/Makefile.am index 83157bbe..5bae47c7 100644 --- a/src/SGA/Makefile.am +++ b/src/SGA/Makefile.am @@ -38,5 +38,6 @@ sga_SOURCES = sga.cpp \ scaffold2fasta.cpp scaffold2fasta.h \ connect.cpp connect.h \ walk.cpp walk.h \ + qc.cpp qc.h \ OverlapCommon.h OverlapCommon.cpp \ SGACommon.h diff --git a/src/SGA/correct.cpp b/src/SGA/correct.cpp index 55633649..60be7d14 100644 --- a/src/SGA/correct.cpp +++ b/src/SGA/correct.cpp @@ -230,6 +230,8 @@ void parseCorrectOptions(int argc, char** argv) case 'a': arg >> algo_str; break; case 'd': arg >> opt::sampleRate; break; case 'c': arg >> opt::conflictCutoff; break; + case 'k': arg >> opt::kmerLength; break; + case 'x': arg >> opt::kmerThreshold; break; case '?': die = true; break; case 'v': opt::verbose++; break; case 'b': arg >> opt::branchCutoff; break; @@ -279,7 +281,7 @@ void parseCorrectOptions(int argc, char** argv) die = true; } - // Determine the correctiona algorithm to use + // Determine the correction algorithm to use if(!algo_str.empty()) { if(algo_str == "hybrid") diff --git a/src/SGA/qc.cpp b/src/SGA/qc.cpp new file mode 100644 index 00000000..03fa7977 --- /dev/null +++ b/src/SGA/qc.cpp @@ -0,0 +1,238 @@ +//----------------------------------------------- +// Copyright 2010 Wellcome Trust Sanger Institute +// Written by Jared Simpson (js18@sanger.ac.uk) +// Released under the GPL +//----------------------------------------------- +// +// qc - Perform a quality check on a set of reads, discarding low quality +// +#include +#include +#include +#include +#include "Util.h" +#include "qc.h" +#include "SuffixArray.h" +#include "BWT.h" +#include "SGACommon.h" +#include "OverlapCommon.h" +#include "Timer.h" +#include "BWTAlgorithms.h" +#include "ASQG.h" +#include "gzstream.h" +#include "SequenceProcessFramework.h" +#include "QCProcess.h" +#include "BWTDiskConstruction.h" + +// Functions + +// +// Getopt +// +#define SUBPROGRAM "qc" +static const char *QC_VERSION_MESSAGE = +SUBPROGRAM " Version " PACKAGE_VERSION "\n" +"Written by Jared Simpson.\n" +"\n" +"Copyright 2010 Wellcome Trust Sanger Institute\n"; + +static const char *QC_USAGE_MESSAGE = +"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] ... READSFILE\n" +"Perform a quality check on a set of reads, discarding low quality reads.\n" +"By default, the quality check looks for a tiled set of high-coverage k-mers across the reads.\n" +"This check is fast and can detect chimeric reads or reads with internal uncorrected mismatches.\n" +"Automatically rebuilds the FM-index without the discarded reads.\n" +"\n" +" --help display this help and exit\n" +" -v, --verbose display verbose output\n" +" -p, --prefix=PREFIX use PREFIX for the names of the index files (default: prefix of the input file)\n" +" -o, --outfile=FILE write the qc-passed reads to FILE (default: READSFILE.ec.fa)\n" +" -t, --threads=NUM use NUM threads to compute the overlaps (default: 1)\n" +" -d, --sample-rate=N use occurrence array sample rate of N in the FM-index. Higher values use significantly\n" +" less memory at the cost of higher runtime. This value must be a power of 2 (default: 128)\n" +" -k, --kmer-size=N The length of the kmer to use. (default: 27)\n" +" -x, --kmer-threshold=N Attempt to correct kmers that are seen less than N times. (default: 3)\n" +"\nReport bugs to " PACKAGE_BUGREPORT "\n\n"; + +static const char* PROGRAM_IDENT = +PACKAGE_NAME "::" SUBPROGRAM; + +namespace opt +{ + static unsigned int verbose; + static int numThreads = 1; + static std::string prefix; + static std::string readsFile; + static std::string outFile; + static std::string discardFile; + static int sampleRate = BWT::DEFAULT_SAMPLE_RATE; + + static int kmerLength = 27; + static int kmerThreshold = 3; + static int gapArrayStorage = 4; +} + +static const char* shortopts = "p:d:t:o:k:x:v"; + +enum { OPT_HELP = 1, OPT_VERSION, OPT_METRICS, OPT_DISCARD }; + +static const struct option longopts[] = { + { "verbose", no_argument, NULL, 'v' }, + { "threads", required_argument, NULL, 't' }, + { "outfile", required_argument, NULL, 'o' }, + { "prefix", required_argument, NULL, 'p' }, + { "sample-rate", required_argument, NULL, 'd' }, + { "kmer-size", required_argument, NULL, 'k' }, + { "kmer-threshold",required_argument, NULL, 'x' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { "metrics", required_argument, NULL, OPT_METRICS }, + { NULL, 0, NULL, 0 } +}; + +// +// Main +// +int qcMain(int argc, char** argv) +{ + parseQCOptions(argc, argv); + Timer* pTimer = new Timer(PROGRAM_IDENT); + + + BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); + BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); + pBWT->printInfo(); + + std::ostream* pWriter = createWriter(opt::outFile); + std::ostream* pDiscardWriter = createWriter(opt::discardFile); + + QCPostProcess postProcessor(pWriter, pDiscardWriter); + if(opt::numThreads <= 1) + { + // Serial mode + QCProcess processor(pBWT, pRBWT, opt::kmerLength, opt::kmerThreshold); + + SequenceProcessFramework::processSequencesSerial(opt::readsFile, &processor, &postProcessor); + } + else + { + // Parallel mode + std::vector processorVector; + for(int i = 0; i < opt::numThreads; ++i) + { + QCProcess* pProcessor = new QCProcess(pBWT, pRBWT, opt::kmerLength, opt::kmerThreshold); + processorVector.push_back(pProcessor); + } + + SequenceProcessFramework::processSequencesParallel(opt::readsFile, processorVector, &postProcessor); + + for(int i = 0; i < opt::numThreads; ++i) + { + delete processorVector[i]; + } + } + + // close filehandles + delete pWriter; + delete pDiscardWriter; + + // Rebuild the FM-index without the discarded reads + std::string out_prefix = stripFilename(opt::outFile); + removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, BWT_EXT, SAI_EXT, false, opt::numThreads, opt::gapArrayStorage); + removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, RBWT_EXT, RSAI_EXT, true, opt::numThreads, opt::gapArrayStorage); + + delete pBWT; + delete pRBWT; + delete pTimer; + + if(opt::numThreads > 1) + pthread_exit(NULL); + + return 0; +} + +// +// Handle command line arguments +// +void parseQCOptions(int argc, char** argv) +{ + std::string algo_str; + bool die = false; + for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) + { + std::istringstream arg(optarg != NULL ? optarg : ""); + switch (c) + { + case 'p': arg >> opt::prefix; break; + case 'o': arg >> opt::outFile; break; + case 't': arg >> opt::numThreads; break; + case 'd': arg >> opt::sampleRate; break; + case 'k': arg >> opt::kmerLength; break; + case 'x': arg >> opt::kmerThreshold; break; + case '?': die = true; break; + case 'v': opt::verbose++; break; + case OPT_HELP: + std::cout << QC_USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + std::cout << QC_VERSION_MESSAGE; + exit(EXIT_SUCCESS); + } + } + + if (argc - optind < 1) + { + std::cerr << SUBPROGRAM ": missing arguments\n"; + die = true; + } + else if (argc - optind > 1) + { + std::cerr << SUBPROGRAM ": too many arguments\n"; + die = true; + } + + if(opt::numThreads <= 0) + { + std::cerr << SUBPROGRAM ": invalid number of threads: " << opt::numThreads << "\n"; + die = true; + } + + if(opt::kmerLength <= 0) + { + std::cerr << SUBPROGRAM ": invalid kmer length: " << opt::kmerLength << ", must be greater than zero\n"; + die = true; + } + + if(opt::kmerThreshold <= 0) + { + std::cerr << SUBPROGRAM ": invalid kmer threshold: " << opt::kmerThreshold << ", must be greater than zero\n"; + die = true; + } + + if (die) + { + std::cout << "\n" << QC_USAGE_MESSAGE; + exit(EXIT_FAILURE); + } + + // Parse the input filenames + opt::readsFile = argv[optind++]; + + if(opt::prefix.empty()) + { + opt::prefix = stripFilename(opt::readsFile); + } + + if(opt::outFile.empty()) + { + opt::outFile = opt::prefix + ".qcpass.fa"; + } + + opt::discardFile = opt::prefix + ".discard.fa"; +} diff --git a/src/SGA/qc.h b/src/SGA/qc.h new file mode 100644 index 00000000..95fca5f8 --- /dev/null +++ b/src/SGA/qc.h @@ -0,0 +1,26 @@ +//----------------------------------------------- +// Copyright 2010 Wellcome Trust Sanger Institute +// Written by Jared Simpson (js18@sanger.ac.uk) +// Released under the GPL +//----------------------------------------------- +// +// qc - Quality check reads +// +#ifndef QC_H +#define QC_H +#include +#include "config.h" +#include "BWT.h" +#include "Match.h" +#include "BWTAlgorithms.h" +#include "OverlapAlgorithm.h" + +// functions + +// +int qcMain(int argc, char** argv); + +// options +void parseQCOptions(int argc, char** argv); + +#endif diff --git a/src/SGA/rmdup.cpp b/src/SGA/rmdup.cpp index 0177f39b..f75d92d9 100644 --- a/src/SGA/rmdup.cpp +++ b/src/SGA/rmdup.cpp @@ -295,7 +295,7 @@ std::string parseDupHits(const StringVector& hitsFilenames, const std::string& o // In the output fasta, we set the reads ID to be the index // and record its old id in the fasta header. std::stringstream newID; - newID << readIdx; + newID << item.id << ",seqrank=" << readIdx; item.id = newID.str(); // Write some metadata with the fasta record diff --git a/src/SGA/sga.cpp b/src/SGA/sga.cpp index 15b79195..2bfd2aa3 100644 --- a/src/SGA/sga.cpp +++ b/src/SGA/sga.cpp @@ -20,6 +20,7 @@ #include "scaffold.h" #include "connect.h" #include "walk.h" +#include "qc.h" #include "scaffold2fasta.h" #define PROGRAM_BIN "sga" @@ -46,7 +47,8 @@ static const char *SGA_USAGE_MESSAGE = " assemble generate contigs from an assembly graph\n" " oview view overlap alignments\n" " subgraph extract a subgraph from a graph\n" -"\nExperimental commands:\n" +"\n\nExperimental commands:\n" +" qc detect and discard reads that could be problematic for the assembly\n" " connect resolve the complete sequence of a paired-end fragment\n" " scaffold generate ordered sets of contigs using distance estimates\n" " scaffold2fasta convert the output of the scaffold subprogram into a fasta file\n" @@ -79,6 +81,8 @@ int main(int argc, char** argv) indexMain(argc - 1, argv + 1); else if(command == "merge") mergeMain(argc - 1, argv + 1); + else if(command == "qc") + qcMain(argc - 1, argv + 1); else if(command == "rmdup") rmdupMain(argc - 1, argv + 1); else if(command == "overlap") diff --git a/src/SuffixTools/RankProcess.cpp b/src/SuffixTools/RankProcess.cpp index 61188c5c..f9f9db02 100644 --- a/src/SuffixTools/RankProcess.cpp +++ b/src/SuffixTools/RankProcess.cpp @@ -45,8 +45,7 @@ RankVector RankProcess::process(const SequenceWorkItem& workItem) if(m_removeMode) { // Parse the read index from the read id - std::stringstream ss(workItem.read.id); - ss >> rank; + rank = parseRankFromID(workItem.read.id); } out.push_back(rank); @@ -82,6 +81,25 @@ RankVector RankProcess::process(const SequenceWorkItem& workItem) return out; } +// Parse the rank of a read from its ID. This must be set by the process +// which discards the read. +int64_t RankProcess::parseRankFromID(const std::string& id) +{ + static const std::string rank_str = "seqrank="; + // Find the position of the rank expression in the string + size_t rank_pos = id.rfind(rank_str); + if(rank_pos == std::string::npos) + { + std::cout << "Error: rank token not found in the discarded read with id: " << id << "\n"; + exit(EXIT_FAILURE); + } + + assert(rank_pos + rank_str.length() < id.length()); + std::stringstream rp(id.substr(rank_pos + rank_str.length())); + int64_t rank; + rp >> rank; + return rank; +} // // // diff --git a/src/SuffixTools/RankProcess.h b/src/SuffixTools/RankProcess.h index 4cc613ff..f6d93b92 100644 --- a/src/SuffixTools/RankProcess.h +++ b/src/SuffixTools/RankProcess.h @@ -27,6 +27,9 @@ class RankProcess RankVector process(const SequenceWorkItem& item); private: + + int64_t parseRankFromID(const std::string& id); + const BWT* m_pBWT; bool m_doReverse; bool m_removeMode;