Skip to content

Commit

Permalink
Merge branch 'small_repeat_rewrite'
Browse files Browse the repository at this point in the history
  • Loading branch information
jts committed Sep 16, 2011
2 parents e0b7c6b + 5b6cbef commit b2de45c
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 43 deletions.
53 changes: 52 additions & 1 deletion src/Bigraph/Vertex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ void Vertex::addEdge(Edge* ep)
std::cout << "Attempted to add duplicate edge with ID: " << ep->getEndID()
<< " to vertex: " << ep->getStartID() << "\n";
std::cout << "Added in desc: " << ep->getDesc() << " curr desc: " << (*iter)->getDesc() << "\n";
assert(false);
//assert(false);
}
}
#endif
Expand Down Expand Up @@ -492,6 +492,30 @@ EdgePtrVec Vertex::findEdgesTo(VertexID id)
return outEdges;
}

// Returns the edge with the longest overlap length in direction dir.
// When several edges tie for the longest overlap, the first one
// encountered in m_edges is returned.
// Returns NULL only if the vertex has no edges in direction dir.
Edge* Vertex::getLongestOverlapEdge(EdgeDir dir) const
{
    Edge* pOut = NULL;
    int maxOL = 0;
    for(EdgePtrVecConstIter iter = m_edges.begin(); iter != m_edges.end(); ++iter)
    {
        if((*iter)->getDir() != dir)
            continue;

        // Always accept the first edge seen in this direction. Comparing
        // against an initial maximum of zero alone would skip edges whose
        // match length is not positive and return NULL even though edges
        // exist, which callers assert on or dereference.
        int currOL = (*iter)->getMatchLength();
        if(pOut == NULL || currOL > maxOL)
        {
            pOut = *iter;
            maxOL = currOL;
        }
    }
    return pOut;
}


//
// Get the edges in a particular direction
// This preserves the ordering of the edges
Expand Down Expand Up @@ -536,6 +560,33 @@ size_t Vertex::countEdges(EdgeDir dir)
return ev.size();
}

// Report how much longer the best overlap in direction dir is than the
// runner-up. Both lengths start at zero, so a vertex with a single edge
// in dir yields that edge's match length and a vertex with no edges in
// dir yields zero.
int Vertex::getOverlapLengthDiff(EdgeDir dir) const
{
    int best = 0;
    int runnerUp = 0;
    for(EdgePtrVecConstIter edgeIt = m_edges.begin(); edgeIt != m_edges.end(); ++edgeIt)
    {
        Edge* pEdge = *edgeIt;
        if(pEdge->getDir() == dir)
        {
            int len = pEdge->getMatchLength();
            if(len > best)
            {
                // New maximum: the previous maximum becomes the runner-up
                runnerUp = best;
                best = len;
            }
            else if(len > runnerUp)
            {
                runnerUp = len;
            }
        }
    }
    return best - runnerUp;
}


// Return the amount of memory this vertex is using, in bytes
size_t Vertex::getMemSize() const
{
Expand Down
6 changes: 6 additions & 0 deletions src/Bigraph/Vertex.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ class Vertex
void removeEdge(const EdgeDesc& ed);
void deleteEdge(Edge* pEdge);
void deleteEdges();

int sweepEdges(GraphColor c);
bool hasEdge(Edge* pEdge) const;
bool hasEdge(const EdgeDesc& ed) const;
Expand All @@ -106,10 +107,15 @@ class Vertex
EdgePtrVec getEdges() const;
EdgePtrVecIter findEdge(const EdgeDesc& ed);
EdgePtrVecConstIter findEdge(const EdgeDesc& ed) const;
Edge* getLongestOverlapEdge(EdgeDir dir) const;

size_t countEdges() const;
size_t countEdges(EdgeDir dir);

// Calculate the difference in overlap lengths between
// the longest and second longest edge
int getOverlapLengthDiff(EdgeDir dir) const;

// Ensure the vertex data is sane
void validate() const;

Expand Down
1 change: 1 addition & 0 deletions src/SGA/assemble.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ void assemble()
// Resolve small repeats
if(opt::resolveSmallRepeatLen > 0)
{
Timer smallTimer("SmallRepeat");
SGSmallRepeatResolveVisitor smallRepeatVisit(opt::resolveSmallRepeatLen);
std::cout << "Resolving small repeats\n";

Expand Down
91 changes: 49 additions & 42 deletions src/StringGraph/SGVisitors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -669,74 +669,81 @@ void SGSmallRepeatResolveVisitor::previsit(StringGraph*)

}

//
// Perform the small repeat resolution algorithm on vertex pX
// The algorithm works as follows.
// We keep only the longest overlap of a vertex pX
// (to pY) under the following conditions.
// 1) pX is the longest overlap for pY
// 2) The overlap between pX and pY is at least
// m_minDiff bases longer than the 2nd longest
//
// Returns true when edges were marked (or, in the older lines, deleted)
// for at least one direction of pX.
//
// NOTE(review): this span appears to come from a rendered commit diff, so
// lines of the previous implementation (remove the shortest edge with
// deleteEdge) are interleaved with the new one (color the non-selected
// edges GC_RED). As shown it contains duplicate declarations and is not
// compilable; separate the two versions before editing the logic.
bool SGSmallRepeatResolveVisitor::visit(StringGraph* /*pGraph*/, Vertex* pX)
{
bool changed = false;
for(size_t idx = 0; idx < ED_COUNT; idx++)
{
EdgeDir dir = EDGE_DIRECTIONS[idx];
EdgePtrVec x_edges = pX->getEdges(dir); // These edges are already sorted
EdgePtrVec x_edges = pX->getEdges(dir);

// Skip this direction if there aren't multiple edges
if(x_edges.size() < 2)
continue;

// Get the edge with the longest overlap from X in this direction
Edge* pXY = pX->getLongestOverlapEdge(dir);
assert(pXY != NULL);
Vertex* pY = pXY->getEnd();

// Skip self overlaps
if(pX == pY)
continue;

// Try to eliminate the shortest edge from this vertex (let this be X->Y)
// If Y has a longer edge than Y->X in the same direction, we remove X->Y
// Get the longest overlap for Y
Edge* pYZ = pY->getLongestOverlapEdge(pXY->getTwinDir());

// Edges are sorted by length so the last edge is the shortest
Edge* pXY = x_edges.back();
size_t xy_len = pXY->getOverlap().getOverlapLength(0);
size_t x_longest_len = x_edges.front()->getOverlap().getOverlapLength(0);
if(xy_len == x_longest_len)
// This vertex is not resolvable if the longest overlap for Y is not X
// NOTE(review): pYZ is dereferenced here without a NULL check
if(pYZ->getEnd() != pX)
continue;

Edge* pYX = pXY->getTwin();
Vertex* pY = pXY->getEnd();
// Calculate the difference in overlap length between the longest
// and second longest overlaps of x and y
int x_diff = pX->getOverlapLengthDiff(dir);
// NOTE(review): Y's longest edge above is looked up in pXY->getTwinDir()
// and Y's edges below are fetched with pYZ->getDir(), but this difference
// is computed with dir -- confirm which direction is intended for Y
int y_diff = pY->getOverlapLengthDiff(dir);

EdgePtrVec y_edges = pY->getEdges(pYX->getDir());
size_t yx_len = pYX->getOverlap().getOverlapLength(0);
//printf("XDIFF: %d n_x: %zu\n", x_diff, x_edges.size());
//printf("YDIFF: %d\n", y_diff);

size_t y_longest_len = 0;
for(size_t i = 0; i < y_edges.size(); ++i)
if(x_diff >= m_minDiff && y_diff >= m_minDiff)
{
Edge* pYZ = y_edges[i];
if(pYZ == pYX)
continue; // skip Y->X

size_t yz_len = pYZ->getOverlap().getOverlapLength(0);
if(yz_len > y_longest_len)
y_longest_len = yz_len;
}


if(y_longest_len > yx_len)
{
// Delete the edge if the difference between the shortest and longest is greater than minDiff
int x_diff = x_longest_len - xy_len;
int y_diff = y_longest_len - yx_len;
// Mark non-selected edges for deletion
// Both the edge and its twin are colored so the pair is marked together
for(size_t i = 0; i < x_edges.size(); ++i)
{
if(x_edges[i] != pXY)
{
x_edges[i]->setColor(GC_RED);
x_edges[i]->getTwin()->setColor(GC_RED);
}
}

if(x_diff > m_minDiff && y_diff > m_minDiff)
EdgePtrVec y_edges = pY->getEdges(pYZ->getDir());
for(size_t i = 0; i < y_edges.size(); ++i)
{
/*
printf("Edge %s -> %s is likely a repeat\n", pX->getID().c_str(), pY->getID().c_str());
printf("Actual overlap lengths: %zu and %zu\n", xy_len, yx_len);
printf("Spanned by longer edges of size: %zu and %zu\n", x_longest_len, y_longest_len);
printf("Differences: %d and %d\n", x_diff, y_diff);
*/
pX->deleteEdge(pXY);
pY->deleteEdge(pYX);
changed = true;
if(y_edges[i] != pYZ)
{
y_edges[i]->setColor(GC_RED);
y_edges[i]->getTwin()->setColor(GC_RED);
}
}
changed = true;
}
}

return changed;
}

//
// Runs after all vertices have been visited: sweeps the edges colored
// GC_RED, the color visit() assigns to non-selected edges (presumably
// sweepEdges removes them from the graph -- confirm in StringGraph).
// NOTE(review): the two signature lines below appear to be the old and
// new versions from a rendered diff; only the one naming pGraph matches
// the body.
void SGSmallRepeatResolveVisitor::postvisit(StringGraph*)
void SGSmallRepeatResolveVisitor::postvisit(StringGraph* pGraph)
{

pGraph->sweepEdges(GC_RED);
}

//
Expand Down

0 comments on commit b2de45c

Please sign in to comment.