Skip to content

Commit

Permalink
Implemented sga qc subprogram. This program looks for, and discards, …
Browse files Browse the repository at this point in the history
…problematic reads. Right now, the qc check requires each read to have a tiling of high confidence k-mers (with a short kmer length).
  • Loading branch information
jts committed Oct 21, 2010
1 parent d3ab7c9 commit d07426e
Show file tree
Hide file tree
Showing 13 changed files with 469 additions and 7 deletions.
7 changes: 6 additions & 1 deletion src/Algorithm/ErrorCorrectProcess.cpp
Expand Up @@ -41,7 +41,12 @@ ErrorCorrectProcess::~ErrorCorrectProcess()
//
ErrorCorrectResult ErrorCorrectProcess::process(const SequenceWorkItem& workItem)
{

ErrorCorrectResult result = correct(workItem);
return result;
}

ErrorCorrectResult ErrorCorrectProcess::correct(const SequenceWorkItem& workItem)
{
switch(m_algorithm)
{
case ECA_HYBRID:
Expand Down
3 changes: 2 additions & 1 deletion src/Algorithm/ErrorCorrectProcess.h
Expand Up @@ -60,7 +60,8 @@ class ErrorCorrectProcess
~ErrorCorrectProcess();

ErrorCorrectResult process(const SequenceWorkItem& item);

ErrorCorrectResult correct(const SequenceWorkItem& item);

private:

ErrorCorrectResult kmerCorrection(const SequenceWorkItem& item);
Expand Down
1 change: 1 addition & 0 deletions src/Algorithm/Makefile.am
Expand Up @@ -16,6 +16,7 @@ libalgorithm_a_SOURCES = \
OverlapBlock.h OverlapBlock.cpp \
SearchHistory.h SearchHistory.cpp \
ErrorCorrectProcess.h ErrorCorrectProcess.cpp \
QCProcess.h QCProcess.cpp \
OverlapTools.h OverlapTools.cpp \
DPAlignment.h DPAlignment.cpp \
ConnectProcess.h ConnectProcess.cpp \
Expand Down
103 changes: 103 additions & 0 deletions src/Algorithm/QCProcess.cpp
@@ -0,0 +1,103 @@
///-----------------------------------------------
// Copyright 2010 Wellcome Trust Sanger Institute
// Written by Jared Simpson (js18@sanger.ac.uk)
// Released under the GPL
//-----------------------------------------------
//
// QCProcess - Process to perform quality checks
// for a sequence work item
//
#include "QCProcess.h"
#include "BWTAlgorithms.h"

//
//
//
QCProcess::QCProcess(const BWT* pBWT, const BWT* pRBWT, int kmerLength, int kmerThreshold) :
m_pBWT(pBWT),
m_pRBWT(pRBWT),
m_kmerLength(kmerLength),
m_kmerThreshold(kmerThreshold)
{

}

//
QCProcess::~QCProcess()
{

}

//
QCResult QCProcess::process(const SequenceWorkItem& workItem)
{
// Perform a kmer-based qc check on the read
QCResult result;

std::string readSequence = workItem.read.seq.toString();
int k = m_kmerLength;
int n = readSequence.size();
int nk = n - m_kmerLength + 1;
int threshold = m_kmerThreshold;

// Are all kmers in the read well-represented?
bool allSolid = true;

for(int i = 0; i < nk; ++i)
{
std::string kmer = readSequence.substr(i, k);
int count = BWTAlgorithms::countSequenceOccurrences(kmer, m_pBWT, m_pRBWT);
if(count <= threshold)
{
allSolid = false;
break;
}
}

if(allSolid)
result.qcPassed = true;
else
result.qcPassed = false;
return result;
}

//
//
//
QCPostProcess::QCPostProcess(std::ostream* pCorrectedWriter,
std::ostream* pDiscardWriter) :
m_pCorrectedWriter(pCorrectedWriter),
m_pDiscardWriter(pDiscardWriter),
m_readsKept(0), m_readsDiscarded(0)
{

}

//
QCPostProcess::~QCPostProcess()
{
std::cout << "Reads kept: " << m_readsKept << "\n";
std::cout << "Reads discarded: " << m_readsDiscarded << "\n";
}

//
void QCPostProcess::process(const SequenceWorkItem& item, const QCResult& result)
{
SeqRecord record = item.read;
if(result.qcPassed)
{
record.write(*m_pCorrectedWriter);
++m_readsKept;
}
else
{
// To be able to rebuild the index after discarding the read, we need to write
// the rank of the string (its position in the original read file into the read name)
std::stringstream newID;
newID << item.read.id << ",seqrank=" << item.idx;
record.id = newID.str();

record.write(*m_pDiscardWriter);
++m_readsDiscarded;
}
}
60 changes: 60 additions & 0 deletions src/Algorithm/QCProcess.h
@@ -0,0 +1,60 @@
///-----------------------------------------------
// Copyright 2010 Wellcome Trust Sanger Institute
// Written by Jared Simpson (js18@sanger.ac.uk)
// Released under the GPL
//-----------------------------------------------
//
// QCProcess - Process to perform quality checks
// for a sequence work item
//
#ifndef QCPROCESS_H
#define QCPROCESS_H

#include "Util.h"
#include "BWT.h"
#include "SequenceProcessFramework.h"
#include "SequenceWorkItem.h"

class QCResult
{
public:
QCResult() : qcPassed(false) {}

bool qcPassed;
};

//
class QCProcess
{
public:
QCProcess(const BWT* pBWT, const BWT* pRBWT, int kmerLength, int kmerThreshold);
~QCProcess();
QCResult process(const SequenceWorkItem& item);

private:

const BWT* m_pBWT;
const BWT* m_pRBWT;
const int m_kmerLength;
const int m_kmerThreshold;
};

// Write the results from the overlap step to an ASQG file
class QCPostProcess
{
public:
QCPostProcess(std::ostream* pCorrectedWriter, std::ostream* pDiscardWriter);
~QCPostProcess();

void process(const SequenceWorkItem& item, const QCResult& result);

private:

std::ostream* m_pCorrectedWriter;
std::ostream* m_pDiscardWriter;

size_t m_readsKept;
size_t m_readsDiscarded;
};

#endif
1 change: 1 addition & 0 deletions src/SGA/Makefile.am
Expand Up @@ -38,5 +38,6 @@ sga_SOURCES = sga.cpp \
scaffold2fasta.cpp scaffold2fasta.h \
connect.cpp connect.h \
walk.cpp walk.h \
qc.cpp qc.h \
OverlapCommon.h OverlapCommon.cpp \
SGACommon.h
4 changes: 3 additions & 1 deletion src/SGA/correct.cpp
Expand Up @@ -230,6 +230,8 @@ void parseCorrectOptions(int argc, char** argv)
case 'a': arg >> algo_str; break;
case 'd': arg >> opt::sampleRate; break;
case 'c': arg >> opt::conflictCutoff; break;
case 'k': arg >> opt::kmerLength; break;
case 'x': arg >> opt::kmerThreshold; break;
case '?': die = true; break;
case 'v': opt::verbose++; break;
case 'b': arg >> opt::branchCutoff; break;
Expand Down Expand Up @@ -279,7 +281,7 @@ void parseCorrectOptions(int argc, char** argv)
die = true;
}

// Determine the correctiona algorithm to use
// Determine the correction algorithm to use
if(!algo_str.empty())
{
if(algo_str == "hybrid")
Expand Down

0 comments on commit d07426e

Please sign in to comment.