Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implemented the sga qc subprogram. This program looks for, and discards, problematic reads. Right now, the QC check requires each read to have a tiling of high-confidence k-mers (with a short k-mer length).
- Loading branch information
Showing
13 changed files
with
469 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
///----------------------------------------------- | ||
// Copyright 2010 Wellcome Trust Sanger Institute | ||
// Written by Jared Simpson (js18@sanger.ac.uk) | ||
// Released under the GPL | ||
//----------------------------------------------- | ||
// | ||
// QCProcess - Process to perform quality checks | ||
// for a sequence work item | ||
// | ||
#include "QCProcess.h" | ||
#include "BWTAlgorithms.h" | ||
|
||
// | ||
// | ||
// | ||
// Construct a QC process from the forward/reverse BWT pointers and the
// k-mer parameters. The arguments are stored as given; nothing else is done.
QCProcess::QCProcess(const BWT* pBWT,
                     const BWT* pRBWT,
                     int kmerLength,
                     int kmerThreshold)
    : m_pBWT(pBWT)
    , m_pRBWT(pRBWT)
    , m_kmerLength(kmerLength)
    , m_kmerThreshold(kmerThreshold)
{
}
|
||
// | ||
// Nothing to clean up: the destructor body is empty, so the BWT pointers
// held by this object are not deleted here.
QCProcess::~QCProcess()
{
}
|
||
// | ||
// Perform a k-mer based quality check on a single read.
//
// Every k-mer of the read (k = m_kmerLength) is looked up with
// BWTAlgorithms::countSequenceOccurrences using the forward and reverse BWT.
// The read passes only if every k-mer has a count strictly greater than
// m_kmerThreshold; the scan stops at the first k-mer that does not.
//
// A read that contains no complete k-mer (shorter than k, or empty) fails
// the check: there is no k-mer evidence to support it. A non-positive k is
// treated the same way, since no meaningful k-mer can be extracted.
//
// Returns a QCResult whose qcPassed flag holds the outcome.
QCResult QCProcess::process(const SequenceWorkItem& workItem)
{
    QCResult result;

    const std::string readSequence = workItem.read.seq.toString();
    const int k = m_kmerLength;
    const int n = static_cast<int>(readSequence.size());

    // Number of k-mers in the read; non-positive when the read is shorter than k
    const int nk = n - k + 1;

    // Without this guard the loop below would not execute and the read would
    // pass QC without a single k-mer having been checked. A negative k would
    // additionally make substr() read past the intended window.
    if(k <= 0 || nk <= 0)
    {
        result.qcPassed = false;
        return result;
    }

    // Are all kmers in the read well-represented?
    bool allSolid = true;
    for(int i = 0; i < nk; ++i)
    {
        const std::string kmer = readSequence.substr(i, k);
        const int count = BWTAlgorithms::countSequenceOccurrences(kmer, m_pBWT, m_pRBWT);
        if(count <= m_kmerThreshold)
        {
            // One weak k-mer is enough to reject the read
            allSolid = false;
            break;
        }
    }

    result.qcPassed = allSolid;
    return result;
}
|
||
// | ||
// | ||
// | ||
// Construct the post-processor from the two output stream pointers.
// Both read counters start at zero.
QCPostProcess::QCPostProcess(std::ostream* pCorrectedWriter, std::ostream* pDiscardWriter)
    : m_pCorrectedWriter(pCorrectedWriter)
    , m_pDiscardWriter(pDiscardWriter)
    , m_readsKept(0)
    , m_readsDiscarded(0)
{
}
|
||
// | ||
// On destruction, report how many reads were kept and how many were
// discarded to standard output.
QCPostProcess::~QCPostProcess()
{
    std::cout << "Reads kept: " << m_readsKept << "\n"
              << "Reads discarded: " << m_readsDiscarded << "\n";
}
|
||
// | ||
void QCPostProcess::process(const SequenceWorkItem& item, const QCResult& result) | ||
{ | ||
SeqRecord record = item.read; | ||
if(result.qcPassed) | ||
{ | ||
record.write(*m_pCorrectedWriter); | ||
++m_readsKept; | ||
} | ||
else | ||
{ | ||
// To be able to rebuild the index after discarding the read, we need to write | ||
// the rank of the string (its position in the original read file into the read name) | ||
std::stringstream newID; | ||
newID << item.read.id << ",seqrank=" << item.idx; | ||
record.id = newID.str(); | ||
|
||
record.write(*m_pDiscardWriter); | ||
++m_readsDiscarded; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
///----------------------------------------------- | ||
// Copyright 2010 Wellcome Trust Sanger Institute | ||
// Written by Jared Simpson (js18@sanger.ac.uk) | ||
// Released under the GPL | ||
//----------------------------------------------- | ||
// | ||
// QCProcess - Process to perform quality checks | ||
// for a sequence work item | ||
// | ||
#ifndef QCPROCESS_H | ||
#define QCPROCESS_H | ||
|
||
#include "Util.h" | ||
#include "BWT.h" | ||
#include "SequenceProcessFramework.h" | ||
#include "SequenceWorkItem.h" | ||
|
||
// Outcome of the quality check for a single read.
class QCResult
{
    public:
        // A freshly constructed result is marked as not passed.
        QCResult()
            : qcPassed(false)
        {
        }

        // True when the read passed the quality check.
        bool qcPassed;
};
|
||
// | ||
class QCProcess | ||
{ | ||
public: | ||
QCProcess(const BWT* pBWT, const BWT* pRBWT, int kmerLength, int kmerThreshold); | ||
~QCProcess(); | ||
QCResult process(const SequenceWorkItem& item); | ||
|
||
private: | ||
|
||
const BWT* m_pBWT; | ||
const BWT* m_pRBWT; | ||
const int m_kmerLength; | ||
const int m_kmerThreshold; | ||
}; | ||
|
||
// Write the results from the overlap step to an ASQG file | ||
class QCPostProcess | ||
{ | ||
public: | ||
QCPostProcess(std::ostream* pCorrectedWriter, std::ostream* pDiscardWriter); | ||
~QCPostProcess(); | ||
|
||
void process(const SequenceWorkItem& item, const QCResult& result); | ||
|
||
private: | ||
|
||
std::ostream* m_pCorrectedWriter; | ||
std::ostream* m_pDiscardWriter; | ||
|
||
size_t m_readsKept; | ||
size_t m_readsDiscarded; | ||
}; | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.