/
SGAlgorithms.cpp
353 lines (307 loc) · 12.3 KB
/
SGAlgorithms.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
//-----------------------------------------------
// Copyright 2009 Wellcome Trust Sanger Institute
// Written by Jared Simpson (js18@sanger.ac.uk)
// Released under the GPL
//-----------------------------------------------
//
// SGAlgorithms - Collection of algorithms for operating on string graphs
//
#include "SGAlgorithms.h"
#include "SGUtil.h"
#include "CompleteOverlapSet.h"
#include "RemovalAlgorithm.h"
#include <iterator>
// add edges to the graph for the given overlap
Edge* SGAlgorithms::createEdgesFromOverlap(StringGraph* pGraph, const Overlap& o, bool allowContained, size_t maxEdges)
{
// Initialize data and perform checks
Vertex* pVerts[2];
EdgeComp comp = (o.match.isRC()) ? EC_REVERSE : EC_SAME;
bool isContainment = o.match.isContainment();
assert(allowContained || !isContainment);
(void)allowContained;
for(size_t idx = 0; idx < 2; ++idx)
{
pVerts[idx] = pGraph->getVertex(o.id[idx]);
// If one of the vertices is not in the graph, skip this edge
// This can occur if one of the verts is a strict substring of some other vertex so it will
// never be added to the graph
if(pVerts[idx] == NULL)
return NULL;
}
// Check if this is a substring containment, if so mark the contained read
// but do not create edges
for(size_t idx = 0; idx < 2; ++idx)
{
if(!o.match.coord[idx].isExtreme())
{
size_t containedIdx = 1 - idx;
assert(o.match.coord[containedIdx].isExtreme());
pVerts[containedIdx]->setColor(GC_RED);
pGraph->setContainmentFlag(true);
return NULL;
}
}
// If either vertex has the maximum number of edges,
// do not add any more. This is to protect against ultra-dense
// regions of the graph inflating memory usage. The nodes that reach
// this limit, and nodes connected to them are marked as super repeats.
// After loading the graph, all edges to super repeats are cut to prevent
// misassembly.
size_t num_edges_0 = pVerts[0]->countEdges();
size_t num_edges_1 = pVerts[1]->countEdges();
if(num_edges_0 > maxEdges || num_edges_1 > maxEdges)
{
WARN_ONCE("Edge limit reached for vertex when loading graph");
pVerts[0]->setSuperRepeat(true);
pVerts[1]->setSuperRepeat(true);
return NULL;
}
if(!isContainment)
{
Edge* pEdges[2];
for(size_t idx = 0; idx < 2; ++idx)
{
EdgeDir dir = o.match.coord[idx].isLeftExtreme() ? ED_ANTISENSE : ED_SENSE;
const SeqCoord& coord = o.match.coord[idx];
pEdges[idx] = new(pGraph->getEdgeAllocator()) Edge(pVerts[1 - idx], dir, comp, coord);
}
pEdges[0]->setTwin(pEdges[1]);
pEdges[1]->setTwin(pEdges[0]);
pGraph->addEdge(pVerts[0], pEdges[0]);
pGraph->addEdge(pVerts[1], pEdges[1]);
return pEdges[0];
}
else
{
// Contained edges don't have a direction, they can be travelled from
// one vertex to the other in either direction. Hence, we
// add two edges per vertex. Later during the contain removal
// algorithm this is important to determine transitivity
Edge* pEdges[4];
for(size_t idx = 0; idx < 2; ++idx)
{
const SeqCoord& coord = o.match.coord[idx];
pEdges[idx] = new(pGraph->getEdgeAllocator()) Edge(pVerts[1 - idx], ED_SENSE, comp, coord);
pEdges[idx + 2] = new(pGraph->getEdgeAllocator()) Edge(pVerts[1 - idx], ED_ANTISENSE, comp, coord);
}
// Twin the edges and add them to the graph
pEdges[0]->setTwin(pEdges[1]);
pEdges[1]->setTwin(pEdges[0]);
pEdges[2]->setTwin(pEdges[3]);
pEdges[3]->setTwin(pEdges[2]);
pGraph->addEdge(pVerts[0], pEdges[0]);
pGraph->addEdge(pVerts[0], pEdges[2]);
pGraph->addEdge(pVerts[1], pEdges[1]);
pGraph->addEdge(pVerts[1], pEdges[3]);
// Set containment flags
updateContainFlags(pGraph, pVerts[0], pEdges[0]->getDesc(), o);
return pEdges[0];
}
}
// Find new edges for pVertex that are required if pDeleteEdge is removed from the graph
void SGAlgorithms::remodelVertexForExcision(StringGraph* pGraph, Vertex* pVertex, Edge* pDeleteEdge)
{
assert(pVertex == pDeleteEdge->getStart());
// If the edge is a containment edge, nothing needs to be done. No edges can be transitive
// through containments
if(pDeleteEdge->getOverlap().isContainment())
return;
double maxER = pGraph->getErrorRate();
int minLength = pGraph->getMinOverlap();
EdgeDescOverlapMap addMap = RemovalAlgorithm::computeRequiredOverlaps(pVertex, pDeleteEdge, maxER, minLength);
for(EdgeDescOverlapMap::iterator iter = addMap.begin();
iter != addMap.end(); ++iter)
{
//std::cout << "Adding edge " << iter->second << " during removal of " << pDeleteEdge->getEndID() << "\n";
createEdgesFromOverlap(pGraph, iter->second, false);
}
/*
// Set the contain flags based on newly discovered edges
updateContainFlags(pGraph, pVertex, containMap);
*/
}
// Set containment flags in the graph based on the overlap map
void SGAlgorithms::updateContainFlags(StringGraph* pGraph, Vertex* pVertex, EdgeDescOverlapMap& containMap)
{
for(EdgeDescOverlapMap::iterator iter = containMap.begin(); iter != containMap.end(); ++iter)
{
updateContainFlags(pGraph, pVertex, iter->first, iter->second);
}
}
//
void SGAlgorithms::updateContainFlags(StringGraph* pGraph, Vertex* pVertex, const EdgeDesc& ed, const Overlap& ovr)
{
assert(ovr.isContainment());
// Determine which of the two vertices is contained
Vertex* pOther = ed.pVertex;
if(ovr.getContainedIdx() == 0)
pVertex->setContained(true);
else
pOther->setContained(true);
pGraph->setContainmentFlag(true);
}
// Calculate the error rate between the two vertices
double SGAlgorithms::calcErrorRate(const Vertex* pX, const Vertex* pY, const Overlap& ovrXY)
{
int num_diffs = ovrXY.match.countDifferences(pX->getSeq().toString(), pY->getSeq().toString());
return static_cast<double>(num_diffs) / static_cast<double>(ovrXY.match.getMinOverlapLength());
}
// Infer an overlap from two edges
// The input edges are between X->Y Y->Z
// and the returned overlap is X->Z
Overlap SGAlgorithms::inferTransitiveOverlap(const Overlap& ovrXY, const Overlap& ovrYZ)
{
// Construct the match
Match match_yx = ovrXY.match;
match_yx.swap();
Match match_yz = ovrYZ.match;
// Infer the match_ij based match_i and match_j
Match match_xz = Match::infer(match_yx, match_yz);
match_xz.expand();
// Convert the match to an overlap
Overlap ovr(ovrXY.id[0], ovrYZ.id[1], match_xz);
return ovr;
}
// Returns true if, given overlaps X->Y, X->Z, the overlap X->Z is transitive
bool SGAlgorithms::isOverlapTransitive(const Vertex* pY, const Vertex* pZ, const Overlap& ovrXY,
const Overlap& ovrXZ, const double maxER, const int minOverlap)
{
// Ensure the overlaps are in the correct order, the overlap with Y should be at least
// as large as the overlap with Z
assert(ovrXY.getOverlapLength(0) >= ovrXZ.getOverlapLength(0));
assert(pY != pZ);
// Compute the overlap YZ
Overlap ovrYX = ovrXY;
ovrYX.swap();
Overlap ovrYZ = SGAlgorithms::inferTransitiveOverlap(ovrYX, ovrXZ);
// If ovrYZ is a containment, then Z is not transitive wrt Y
if(ovrYZ.match.isContainment())
return false;
// If the overlap between Y and Z is not long enough then Z is not transitive
if(ovrYZ.getOverlapLength(0) < minOverlap)
return false;
// Finally, check that the error rate is below the threshold
double error_rate = SGAlgorithms::calcErrorRate(pY, pZ, ovrYZ);
if(isErrorRateAcceptable(error_rate, maxER))
return true;
else
return false;
}
// The following algorithms use these local types
// defining an edge/overlap pair.
typedef std::pair<EdgeDesc, Overlap> EdgeDescOverlapPair;
// Compare two edges by their overlap length
struct EDOPairCompare
{
bool operator()(const EdgeDescOverlapPair& edpXY, const EdgeDescOverlapPair& edpXZ) {
return edpXY.second.match.coord[0].length() < edpXZ.second.match.coord[0].length();
}
};
// typedefs
typedef std::priority_queue<EdgeDescOverlapPair,
std::vector<EdgeDescOverlapPair>,
EDOPairCompare> EDOPairQueue;
// Move the transitive edges from pOverlapMap to pTransitive
void SGAlgorithms::partitionTransitiveOverlaps(EdgeDescOverlapMap* pOverlapMap,
EdgeDescOverlapMap* pTransitive,
double maxER, int minLength)
{
EDOPairQueue overlapQueue;
for(SGAlgorithms::EdgeDescOverlapMap::iterator iter = pOverlapMap->begin();
iter != pOverlapMap->end(); ++iter)
{
overlapQueue.push(*iter);
}
// Traverse the list of overlaps in order of length and move elements from
// the irreducible map to the transitive map
while(!overlapQueue.empty())
{
EdgeDescOverlapPair edoPair = overlapQueue.top();
overlapQueue.pop();
EdgeDesc& edXY = edoPair.first;
Overlap& ovrXY = edoPair.second;
assert(!ovrXY.match.isContainment());
SGAlgorithms::EdgeDescOverlapMap::iterator iter = pOverlapMap->begin();
while(iter != pOverlapMap->end())
{
bool move = false;
const EdgeDesc& edXZ = iter->first;
const Overlap& ovrXZ = iter->second;
// Four conditions must be met to mark an edge X->Z transitive through X->Y
// 1) The overlaps must be in the same direction
// 2) The overlap X->Y must be strictly longer than X->Z
// 3) The overlap between Y->Z must not be a containment
// 4) The overlap between Y->Z must be within the error and length thresholds
if(!(edXZ == edXY) && edXY.dir == edXZ.dir && ovrXY.getOverlapLength(0) > ovrXZ.getOverlapLength(0))
{
move = SGAlgorithms::isOverlapTransitive(edXY.pVertex, edXZ.pVertex, ovrXY, ovrXZ, maxER, minLength);
}
if(move)
{
//std::cout << "Marking overlap: " << iter->second << " as trans via " << ovrXY << "\n";
if(pTransitive != NULL)
pTransitive->insert(*iter);
pOverlapMap->erase(iter++);
}
else
{
++iter;
}
}
}
}
void SGAlgorithms::removeSubmaximalOverlaps(EdgeDescOverlapMap* pOverlapMap)
{
EDOPairQueue overlapQueue;
for(SGAlgorithms::EdgeDescOverlapMap::iterator iter = pOverlapMap->begin();
iter != pOverlapMap->end(); ++iter)
{
overlapQueue.push(*iter);
}
// Traverse the list of overlaps in order of length
// Only add the first seen overlap for each vertex
pOverlapMap->clear();
VertexIDSet seenVerts;
// the irreducible map to the transitive map
while(!overlapQueue.empty())
{
EdgeDescOverlapPair edoPair = overlapQueue.top();
overlapQueue.pop();
EdgeDesc& edXY = edoPair.first;
if(seenVerts.count(edXY.pVertex->getID()) == 0)
{
seenVerts.insert(edXY.pVertex->getID());
pOverlapMap->insert(edoPair);
}
}
}
// Return a descriptor of the edge describing ovrXY
EdgeDesc SGAlgorithms::overlapToEdgeDesc(Vertex* pY, const Overlap& ovrXY)
{
EdgeDesc edXY;
edXY.pVertex = pY;
edXY.comp = (ovrXY.match.isRC()) ? EC_REVERSE : EC_SAME;
edXY.dir = ovrXY.match.coord[0].isLeftExtreme() ? ED_ANTISENSE : ED_SENSE; // X -> Y
return edXY;
}
// Return true if XZ has an overlap
bool SGAlgorithms::hasTransitiveOverlap(const Overlap& ovrXY, const Overlap& ovrYZ)
{
Match match_yx = ovrXY.match;
match_yx.swap();
Match match_yz = ovrYZ.match;
return Match::doMatchesIntersect(match_yx, match_yz);
}
//
EdgeDesc SGAlgorithms::getEdgeDescFromEdge(Edge* pEdge)
{
return pEdge->getDesc();
}
void SGAlgorithms::printOverlapMap(const EdgeDescOverlapMap& overlapMap)
{
for(EdgeDescOverlapMap::const_iterator iter = overlapMap.begin(); iter != overlapMap.end(); ++iter)
{
std::cout << " Overlap:" << iter->second << "\n";
}
}