diff --git a/src/Algorithm/ErrorCorrectProcess.cpp b/src/Algorithm/ErrorCorrectProcess.cpp index 281d2fe6..ffa5c54b 100644 --- a/src/Algorithm/ErrorCorrectProcess.cpp +++ b/src/Algorithm/ErrorCorrectProcess.cpp @@ -17,12 +17,15 @@ // ErrorCorrectProcess::ErrorCorrectProcess(const OverlapAlgorithm* pOverlapper, int minOverlap, int numRounds, - int conflictCutoff, ErrorCorrectAlgorithm algo, + int conflictCutoff, int kmerLength, + int kmerThreshold, ErrorCorrectAlgorithm algo, bool printMO) : m_pOverlapper(pOverlapper), m_minOverlap(minOverlap), m_numRounds(numRounds), m_conflictCutoff(conflictCutoff), + m_kmerLength(kmerLength), + m_kmerThreshold(kmerThreshold), m_algorithm(algo), m_printOverlaps(printMO), m_depthFilter(10000) @@ -39,10 +42,34 @@ ErrorCorrectProcess::~ErrorCorrectProcess() ErrorCorrectResult ErrorCorrectProcess::process(const SequenceWorkItem& workItem) { - if(m_algorithm == ECA_KMER) - return kmerCorrection(workItem); - else - return overlapCorrection(workItem); + switch(m_algorithm) + { + case ECA_HYBRID: + { + ErrorCorrectResult result = kmerCorrection(workItem); + if(!result.kmerQC) + return overlapCorrection(workItem); + else + return result; + break; + } + case ECA_KMER: + { + return kmerCorrection(workItem); + break; + } + case ECA_OVERLAP: + { + return overlapCorrection(workItem); + break; + } + default: + { + assert(false); + } + } + ErrorCorrectResult result; + return result; } ErrorCorrectResult ErrorCorrectProcess::overlapCorrection(const SequenceWorkItem& workItem) @@ -58,9 +85,11 @@ ErrorCorrectResult ErrorCorrectProcess::overlapCorrection(const SequenceWorkItem while(!done) { + // Compute the set of overlap blocks for the read m_blockList.clear(); OverlapResult overlap_result = m_pOverlapper->overlapRead(currRead, m_minOverlap, &m_blockList); int sumOverlaps = 0; + // Sum the spans of the overlap blocks to calculate the total number of overlaps this read has for(OverlapBlockList::iterator iter = m_blockList.begin(); iter != m_blockList.end(); ++iter) { @@ -83,41 +112,8 @@ ErrorCorrectResult ErrorCorrectProcess::overlapCorrection(const SequenceWorkItem result.num_suffix_overlaps = 0; mo.countOverlaps(result.num_prefix_overlaps, result.num_suffix_overlaps); - if(m_printOverlaps) - { - std::cout << "Prefix overlap: " << result.num_prefix_overlaps - << " suffix overlaps: " << result.num_suffix_overlaps << "\n"; - std::cout << "Coverage overlap: " << mo.calculateCoverageOverlap() << "\n"; - mo.print(); - } - - if(m_algorithm == ECA_TRIE) - { - if(mo.isConflicted(m_conflictCutoff)) - { - // Perform simple correction - SeqTrie leftTrie; - SeqTrie rightTrie; - mo.makeSeqTries(p_error, leftTrie, rightTrie); - result.correctSequence = ErrorCorrect::trieCorrect(currRead.seq.toString(), p_error, leftTrie, rightTrie); - } - else - { - result.correctSequence = mo.consensusConflict(p_error, m_conflictCutoff); - } - } - else if(m_algorithm == ECA_CC) - { - result.correctSequence = mo.consensusConflict(p_error, m_conflictCutoff); - } - else if(m_algorithm == ECA_SIMPLE) - { - result.correctSequence = mo.calculateConsensusFromPartition(p_error); - } - else - { - assert(false); - } + // Perform conflict-aware consensus correction on the read + result.correctSequence = mo.consensusConflict(p_error, m_conflictCutoff); ++rounds; if(rounds == m_numRounds || result.correctSequence == currRead.seq) @@ -125,6 +121,16 @@ ErrorCorrectResult ErrorCorrectProcess::overlapCorrection(const SequenceWorkItem else currRead.seq = result.correctSequence; } + + // Quality checks + if(result.num_prefix_overlaps > 0 && result.num_suffix_overlaps > 0) + { + result.overlapQC = true; + } + else + { + result.overlapQC = false; + } if(m_printOverlaps) { @@ -135,64 +141,9 @@ ErrorCorrectResult ErrorCorrectProcess::overlapCorrection(const SequenceWorkItem std::cout << "QS: " << currRead.qual << "\n"; } - // Quality checks - if(result.num_prefix_overlaps > 0 && result.num_suffix_overlaps > 0) - result.passedQC = true; - else - result.passedQC = false; - return result; } -// -MultiOverlap ErrorCorrectProcess::blockListToMultiOverlap(const SeqRecord& record, OverlapBlockList& blockList) -{ - std::string read_idx = record.id; - std::string read_seq = record.seq.toString(); - MultiOverlap out(read_idx, read_seq); - - for(OverlapBlockList::iterator iter = blockList.begin(); iter != blockList.end(); ++iter) - { - std::string overlap_string = iter->getOverlapString(read_seq); - - // Compute the endpoints of the overlap - int s1 = read_seq.length() - iter->overlapLen; - int e1 = s1 + iter->overlapLen - 1; - SeqCoord sc1(s1, e1, read_seq.length()); - - int s2 = 0; // The start of the second hit must be zero by definition of a prefix/suffix match - int e2 = s2 + iter->overlapLen - 1; - SeqCoord sc2(s2, e2, overlap_string.length()); - - // The coordinates are always with respect to the read, so flip them if - // we aligned to/from the reverse of the read - if(iter->flags.isQueryRev()) - sc1.flip(); - if(iter->flags.isTargetRev()) - sc2.flip(); - - bool isRC = false; // since we transformed the original sequence, they are never RC - if(sc1.isContained()) - continue; // skip containments - - // Add an overlap for each member of the block - for(int64_t i = iter->ranges.interval[0].lower; i <= iter->ranges.interval[0].upper; ++i) - { - Overlap o(read_idx, sc1, makeIdxString(i), sc2, isRC, -1); - out.add(overlap_string, o); - } - } - return out; -} - -// make an id string from a read index -std::string ErrorCorrectProcess::makeIdxString(int64_t idx) -{ - std::stringstream ss; - ss << idx; - return ss.str(); -} - // Correct a read with a k-mer based corrector ErrorCorrectResult ErrorCorrectProcess::kmerCorrection(const SequenceWorkItem& workItem) { @@ -204,34 +155,32 @@ ErrorCorrectResult ErrorCorrectProcess::kmerCorrection(const SequenceWorkItem& w std::cout << "Kmer correcting read " << workItem.read.id << "\n"; #endif - size_t count_threshold = 3; - size_t k_size = 27; - size_t n = readSequence.size(); - size_t nk = n - k_size + 1; + int n = readSequence.size(); + int nk = n - m_kmerLength + 1; // Are all kmers in the read well-represented? bool allSolid = false; - bool done = false; int rounds = 0; - int maxAttempts = 4; - while(!done) + int maxAttempts = 2; + + while(!done && nk > 0) { // Compute the kmer counts across the read // and determine the positions in the read that are not covered by any solid kmers // These are the candidate incorrect bases - std::vector countVector(nk, 0); - std::vector solidVector(n, 0); + std::vector countVector(nk, 0); + std::vector solidVector(n, 0); - for(size_t i = 0; i < nk; ++i) + for(int i = 0; i < nk; ++i) { - std::string kmer = readSequence.substr(i, k_size); - size_t count = BWTAlgorithms::countSequenceOccurrences(kmer, m_pOverlapper->getBWT(), m_pOverlapper->getRBWT()); + std::string kmer = readSequence.substr(i, m_kmerLength); + int count = BWTAlgorithms::countSequenceOccurrences(kmer, m_pOverlapper->getBWT(), m_pOverlapper->getRBWT()); countVector[i] = count; - if(count > count_threshold) + if(count > m_kmerThreshold) { - for(size_t j = i; j < i + k_size; ++j) + for(int j = i; j < i + m_kmerLength; ++j) { solidVector[j] = 1; } @@ -239,7 +188,7 @@ ErrorCorrectResult ErrorCorrectProcess::kmerCorrection(const SequenceWorkItem& w } allSolid = true; - for(size_t i = 0; i < n; ++i) + for(int i = 0; i < n; ++i) { #ifdef KMER_TESTING std::cout << "Position[" << i << "] = " << solidVector[i] << "\n"; @@ -258,19 +207,19 @@ ErrorCorrectResult ErrorCorrectProcess::kmerCorrection(const SequenceWorkItem& w // Attempt to correct the leftmost potentially incorrect base bool corrected = false; - for(size_t i = 0; i < n; ++i) + for(int i = 0; i < n; ++i) { if(solidVector[i] != 1) { // Attempt to correct the base using the leftmost covering kmer - size_t left_k_idx = (i + 1 >= k_size ? i + 1 - k_size : 0); - corrected = attemptKmerCorrection(i, left_k_idx, k_size, std::max(countVector[left_k_idx], count_threshold), readSequence); + int left_k_idx = (i + 1 >= m_kmerLength ? i + 1 - m_kmerLength : 0); + corrected = attemptKmerCorrection(i, left_k_idx, std::max(countVector[left_k_idx], m_kmerThreshold), readSequence); if(corrected) break; // base was not corrected, try using the rightmost covering kmer - size_t right_k_idx = std::min(i, n - k_size); - corrected = attemptKmerCorrection(i, right_k_idx, k_size, std::max(countVector[right_k_idx], count_threshold), readSequence); + size_t right_k_idx = std::min(i, n - m_kmerLength); + corrected = attemptKmerCorrection(i, right_k_idx, std::max(countVector[right_k_idx], m_kmerThreshold), readSequence); if(corrected) break; } @@ -287,24 +236,74 @@ ErrorCorrectResult ErrorCorrectProcess::kmerCorrection(const SequenceWorkItem& w if(allSolid) { result.correctSequence = readSequence; - result.passedQC = true; + result.kmerQC = true; } else { - result.passedQC = false; result.correctSequence = workItem.read.seq.toString(); + result.kmerQC = false; } return result; } +// +MultiOverlap ErrorCorrectProcess::blockListToMultiOverlap(const SeqRecord& record, OverlapBlockList& blockList) +{ + std::string read_idx = record.id; + std::string read_seq = record.seq.toString(); + MultiOverlap out(read_idx, read_seq); + + for(OverlapBlockList::iterator iter = blockList.begin(); iter != blockList.end(); ++iter) + { + std::string overlap_string = iter->getOverlapString(read_seq); + + // Compute the endpoints of the overlap + int s1 = read_seq.length() - iter->overlapLen; + int e1 = s1 + iter->overlapLen - 1; + SeqCoord sc1(s1, e1, read_seq.length()); + + int s2 = 0; // The start of the second hit must be zero by definition of a prefix/suffix match + int e2 = s2 + iter->overlapLen - 1; + SeqCoord sc2(s2, e2, overlap_string.length()); + + // The coordinates are always with respect to the read, so flip them if + // we aligned to/from the reverse of the read + if(iter->flags.isQueryRev()) + sc1.flip(); + if(iter->flags.isTargetRev()) + sc2.flip(); + + bool isRC = false; // since we transformed the original sequence, they are never RC + if(sc1.isContained()) + continue; // skip containments + + // Add an overlap for each member of the block + for(int64_t i = iter->ranges.interval[0].lower; i <= iter->ranges.interval[0].upper; ++i) + { + Overlap o(read_idx, sc1, makeIdxString(i), sc2, isRC, -1); + out.add(overlap_string, o); + } + } + return out; +} + +// make an id string from a read index +std::string ErrorCorrectProcess::makeIdxString(int64_t idx) +{ + std::stringstream ss; + ss << idx; + return ss.str(); +} + + // Attempt to correct the base at position idx in readSequence. Returns true if a correction was made // The correction is made only if the count of the corrected kmer is at least minCount -bool ErrorCorrectProcess::attemptKmerCorrection(size_t i, size_t k_idx, size_t k_size, size_t minCount, std::string& readSequence) +bool ErrorCorrectProcess::attemptKmerCorrection(size_t i, size_t k_idx, size_t minCount, std::string& readSequence) { - assert(i >= k_idx && i < k_idx + k_size); + assert(i >= k_idx && i < k_idx + m_kmerLength); size_t base_idx = i - k_idx; char originalBase = readSequence[i]; - std::string kmer = readSequence.substr(k_idx, k_size); + std::string kmer = readSequence.substr(k_idx, m_kmerLength); size_t bestCount = 0; char bestBase = '$'; @@ -354,12 +353,21 @@ ErrorCorrectPostProcess::ErrorCorrectPostProcess(std::ostream* pCorrectedWriter, m_pDiscardWriter(pDiscardWriter), m_bCollectMetrics(bCollectMetrics), m_totalBases(0), m_totalErrors(0), - m_readsKept(0), m_readsDiscarded(0) - + m_readsKept(0), m_readsDiscarded(0), + m_kmerQCPassed(0), m_overlapQCPassed(0), + m_qcFail(0) { } +// +ErrorCorrectPostProcess::~ErrorCorrectPostProcess() +{ + std::cout << "Reads passed kmer QC check: " << m_kmerQCPassed << "\n"; + std::cout << "Reads passed overlap QC check: " << m_overlapQCPassed << "\n"; + std::cout << "Reads failed QC: " << m_qcFail << "\n"; +} + // void ErrorCorrectPostProcess::writeMetrics(std::ostream* pWriter) { @@ -379,10 +387,23 @@ void ErrorCorrectPostProcess::process(const SequenceWorkItem& item, const ErrorC { // Determine if the read should be discarded - bool discardRead = !result.passedQC; + bool readQCPass = true; + if(result.kmerQC) + { + m_kmerQCPassed += 1; + } + else if(result.overlapQC) + { + m_overlapQCPassed += 1; + } + else + { + readQCPass = false; + m_qcFail += 1; + } // Collect metrics for the reads that were actually corrected - if(m_bCollectMetrics && !discardRead) + if(m_bCollectMetrics && readQCPass) { collectMetrics(item.read.seq.toString(), result.correctSequence.toString(), @@ -391,18 +412,15 @@ void ErrorCorrectPostProcess::process(const SequenceWorkItem& item, const ErrorC SeqRecord record = item.read; record.seq = result.correctSequence; - std::stringstream ss; - ss << "PO:" << result.num_prefix_overlaps; - ss << " SO:" << result.num_suffix_overlaps; - if(!discardRead || m_pDiscardWriter == NULL) + if(readQCPass || m_pDiscardWriter == NULL) { - record.write(*m_pCorrectedWriter, ss.str()); + record.write(*m_pCorrectedWriter); ++m_readsKept; } else { - record.write(*m_pDiscardWriter, ss.str()); + record.write(*m_pDiscardWriter); ++m_readsDiscarded; } } diff --git a/src/Algorithm/ErrorCorrectProcess.h b/src/Algorithm/ErrorCorrectProcess.h index 0c393af8..e808fe95 100644 --- a/src/Algorithm/ErrorCorrectProcess.h +++ b/src/Algorithm/ErrorCorrectProcess.h @@ -19,10 +19,9 @@ enum ErrorCorrectAlgorithm { - ECA_TRIE, // aggressive trie-based correction of conflicted sequences - ECA_CC, // conflict-aware consensus - ECA_SIMPLE, // straightforward correct - ECA_KMER // K-mer based correction algorithm + ECA_HYBRID, // hybrid kmer/overlap correction + ECA_KMER, // kmer correction + ECA_OVERLAP, // overlap correction }; enum ECFlag @@ -36,13 +35,16 @@ enum ECFlag class ErrorCorrectResult { public: + ErrorCorrectResult() : num_prefix_overlaps(0), num_suffix_overlaps(0), kmerQC(false), overlapQC(false) {} + DNAString correctSequence; ECFlag flag; // Metrics - bool passedQC; size_t num_prefix_overlaps; size_t num_suffix_overlaps; + bool kmerQC; + bool overlapQC; }; // @@ -51,7 +53,8 @@ class ErrorCorrectProcess public: ErrorCorrectProcess(const OverlapAlgorithm* pOverlapper, int minOverlap, int numRounds, - int conflictCutoff, ErrorCorrectAlgorithm algo, + int conflictCutoff, int kmerLength, + int kmerThreshold, ErrorCorrectAlgorithm algo, bool printMO); ~ErrorCorrectProcess(); @@ -63,7 +66,7 @@ class ErrorCorrectProcess ErrorCorrectResult kmerCorrection(const SequenceWorkItem& item); ErrorCorrectResult overlapCorrection(const SequenceWorkItem& workItem); - bool attemptKmerCorrection(size_t i, size_t k_idx, size_t k_size, size_t minCount, std::string& readSequence); + bool attemptKmerCorrection(size_t i, size_t k_idx, size_t minCount, std::string& readSequence); MultiOverlap blockListToMultiOverlap(const SeqRecord& record, OverlapBlockList& blockList); @@ -75,6 +78,8 @@ class ErrorCorrectProcess const int m_minOverlap; const int m_numRounds; const int m_conflictCutoff; + const int m_kmerLength; + const int m_kmerThreshold; const ErrorCorrectAlgorithm m_algorithm; const bool m_printOverlaps; const int m_depthFilter; @@ -87,6 +92,8 @@ class ErrorCorrectPostProcess ErrorCorrectPostProcess(std::ostream* pCorrectedWriter, std::ostream* pDiscardWriter, bool bCollectMetrics); + ~ErrorCorrectPostProcess(); + void process(const SequenceWorkItem& item, const ErrorCorrectResult& result); void writeMetrics(std::ostream* pWriter); @@ -109,6 +116,9 @@ class ErrorCorrectPostProcess size_t m_readsKept; size_t m_readsDiscarded; + int m_kmerQCPassed; + int m_overlapQCPassed; + int m_qcFail; }; #endif diff --git a/src/SGA/correct.cpp b/src/SGA/correct.cpp index 5f74bad9..285e0a8d 100644 --- a/src/SGA/correct.cpp +++ b/src/SGA/correct.cpp @@ -41,28 +41,34 @@ static const char *CORRECT_USAGE_MESSAGE = "\n" " --help display this help and exit\n" " -v, --verbose display verbose output\n" +" -p, --prefix=PREFIX use PREFIX for the names of the index files (default: prefix of the input file)\n" +" -o, --outfile=FILE write the corrected reads to FILE\n" " -t, --threads=NUM use NUM threads to compute the overlaps (default: 1)\n" -" -e, --error-rate the maximum error rate allowed between two sequences to consider them aligned\n" +" --discard detect and discard low-quality reads\n" +" -d, --sample-rate=N use occurrence array sample rate of N in the FM-index. Higher values use significantly\n" +" less memory at the cost of higher runtime. This value must be a power of 2. Default is 128\n" +" -a, --algorithm=STR specify the correction algorithm to use. STR must be one of hybrid, kmer, overlap.\n" +" The default algorithm is hybrid which first attempts kmer correction, then performs\n" +" overlap correction on the remaining uncorrected reads.\n" +" --metrics=FILE collect error correction metrics (error rate by position in read, etc) and write\n" +" them to FILE\n" +"\nOverlap correction parameters:\n" +" -e, --error-rate the maximum error rate allowed between two sequences to consider them overlapped\n" " -m, --min-overlap=LEN minimum overlap required between two reads\n" -" -r, --rounds=NUM iteratively correct reads up to a maximum of NUM rounds. Default: 1 round of correction\n" -" -p, --prefix=PREFIX use PREFIX instead of the prefix of the reads filename for the input/output files\n" -" -o, --outfile=FILE write the corrected reads to FILE\n" +" -c, --conflict=INT use INT as the threshold to detect a conflicted base in the multi-overlap\n" " -l, --seed-length=LEN force the seed length to be LEN. By default, the seed length in the overlap step\n" " is calculated to guarantee all overlaps with --error-rate differences are found.\n" " This option removes the guarantee but will be (much) faster. As SGA can tolerate some\n" " missing edges, this option may be preferable for some data sets.\n" " -s, --seed-stride=LEN force the seed stride to be LEN. This parameter will be ignored unless --seed-length\n" " is specified (see above). This parameter defaults to the same value as --seed-length\n" -" -a, --algorithm=STR the correction algorithm to use. STR must be one of simple,conflict or trie\n" -" -c, --conflict=INT use INT as the threshold to detect a conflicted base in the multi-overlap\n" -" --metrics=FILE collect error correction metrics (error rate by position in read, etc) and write\n" -" them to FILE\n" -" --no-discard do not discard low-quality reads\n" -" -d, --sample-rate=N sample the symbol counts every N symbols in the FM-index. Higher values use significantly\n" -" less memory at the cost of higher runtime. This value must be a power of 2. Default is 128\n" " -b, --branch-cutoff=N stop the overlap search at N branches. This parameter is used to control the search time for\n" " highly-repetitive reads. If the number of branches exceeds N, the search stops and the read\n" " will not be corrected. This is not enabled by default.\n" +" -r, --rounds=NUM iteratively correct reads up to a maximum of NUM rounds. Default: 1 round of correction\n" +"\nKmer correction parameters:\n" +" -k, --kmer-size=N The length of the kmer to use.\n" +" -x, --kmer-threshold=N Attempt to correct kmers that are seen less than N times.\n" "\nReport bugs to " PACKAGE_BUGREPORT "\n\n"; static const char* PROGRAM_IDENT = @@ -86,12 +92,16 @@ namespace opt static int seedStride = 0; static int conflictCutoff = 5; static int branchCutoff = -1; - static ErrorCorrectAlgorithm algorithm = ECA_CC; + + static int kmerLength = 41; + static int kmerThreshold = 3; + + static ErrorCorrectAlgorithm algorithm = ECA_HYBRID; } -static const char* shortopts = "p:m:d:e:t:l:s:o:r:b:a:c:vi"; +static const char* shortopts = "p:m:d:e:t:l:s:o:r:b:a:c:k:x:vi"; -enum { OPT_HELP = 1, OPT_VERSION, OPT_METRICS, OPT_NODISCARD }; +enum { OPT_HELP = 1, OPT_VERSION, OPT_METRICS, OPT_DISCARD }; static const struct option longopts[] = { { "verbose", no_argument, NULL, 'v' }, @@ -107,7 +117,9 @@ static const struct option longopts[] = { { "sample-rate", required_argument, NULL, 'd' }, { "conflict", required_argument, NULL, 'c' }, { "branch-cutoff", required_argument, NULL, 'b' }, - { "no-discard", no_argument, NULL, OPT_NODISCARD }, + { "kmer-size", required_argument, NULL, 'k' }, + { "kmer-threshold",required_argument, NULL, 'x' }, + { "discard", no_argument, NULL, OPT_DISCARD }, { "help", no_argument, NULL, OPT_HELP }, { "version", no_argument, NULL, OPT_VERSION }, { "metrics", required_argument, NULL, OPT_METRICS }, @@ -140,7 +152,9 @@ int correctMain(int argc, char** argv) if(opt::numThreads <= 1) { // Serial mode - ErrorCorrectProcess processor(pOverlapper, opt::minOverlap, opt::numRounds, opt::conflictCutoff, opt::algorithm, opt::verbose > 1); + ErrorCorrectProcess processor(pOverlapper, opt::minOverlap, opt::numRounds, opt::conflictCutoff, + opt::kmerLength, opt::kmerThreshold, opt::algorithm, opt::verbose > 1); + SequenceProcessFramework::processSequencesSerial processorVector; for(int i = 0; i < opt::numThreads; ++i) { - ErrorCorrectProcess* pProcessor = new ErrorCorrectProcess(pOverlapper, opt::minOverlap, opt::numRounds, opt::conflictCutoff, opt::algorithm, opt::verbose > 1); + ErrorCorrectProcess* pProcessor = new ErrorCorrectProcess(pOverlapper, opt::minOverlap, + opt::numRounds, opt::conflictCutoff, + opt::kmerLength, opt::kmerThreshold, + opt::algorithm, opt::verbose > 1); processorVector.push_back(pProcessor); } @@ -195,7 +212,7 @@ int correctMain(int argc, char** argv) void parseCorrectOptions(int argc, char** argv) { std::string algo_str; - bool bDoNotDiscard = false; + bool bDiscardReads = false; bool die = false; for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { @@ -216,7 +233,7 @@ void parseCorrectOptions(int argc, char** argv) case '?': die = true; break; case 'v': opt::verbose++; break; case 'b': arg >> opt::branchCutoff; break; - case OPT_NODISCARD: bDoNotDiscard = true; break; + case OPT_DISCARD: bDiscardReads = true; break; case OPT_METRICS: arg >> opt::metricsFile; break; case OPT_HELP: std::cout << CORRECT_USAGE_MESSAGE; @@ -250,17 +267,27 @@ void parseCorrectOptions(int argc, char** argv) die = true; } + if(opt::kmerLength <= 0) + { + std::cerr << SUBPROGRAM ": invalid kmer length: " << opt::kmerLength << ", must be greater than zero\n"; + die = true; + } + + if(opt::kmerThreshold <= 0) + { + std::cerr << SUBPROGRAM ": invalid kmer threshold: " << opt::kmerThreshold << ", must be greater than zero\n"; + die = true; + } + // Determine the correctiona algorithm to use if(!algo_str.empty()) { - if(algo_str == "simple") - opt::algorithm = ECA_SIMPLE; - else if(algo_str == "conflict") - opt::algorithm = ECA_CC; - else if(algo_str == "trie") - opt::algorithm = ECA_TRIE; + if(algo_str == "hybrid") + opt::algorithm = ECA_HYBRID; else if(algo_str == "kmer") opt::algorithm = ECA_KMER; + else if(algo_str == "overlap") + opt::algorithm = ECA_OVERLAP; else { std::cerr << SUBPROGRAM << ": unrecognized -a,--algorithm parameter: " << algo_str << "\n"; @@ -303,7 +330,7 @@ void parseCorrectOptions(int argc, char** argv) opt::outFile = opt::prefix + ".ec.fa"; } - if(!bDoNotDiscard) + if(bDiscardReads) opt::discardFile = opt::prefix + ".discard.fa"; else opt::discardFile.clear();