/
GapFillProcess.cpp
198 lines (170 loc) · 6.53 KB
/
GapFillProcess.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
///----------------------------------------------
// Copyright 2011 Wellcome Trust Sanger Institute
// Written by Jared Simpson (js18@sanger.ac.uk)
// Released under the GPL
//-----------------------------------------------
//
// GapFillProcess - Fill in gaps in a scaffold
//
#include <algorithm>
#include "GapFillProcess.h"
#include "BWTAlgorithms.h"
#include "SGAlgorithms.h"
#include "SGSearch.h"
#include "StdAlnTools.h"
#include "HaplotypeBuilder.h"
#include "MultiAlignment.h"
//
GapFillStats::GapFillStats()
{
numGapsAttempted = numGapsFilled = 0;
for(size_t i = 0; i < GFRC_NUM_CODES; ++i)
numFails[i] = 0;
}
//
void GapFillStats::print() const
{
printf("Num gaps attempted: %zu\n", numGapsAttempted);
printf("Num gaps filled: %zu\n", numGapsFilled);
printf(" Failed -- no anchor: %zu\n", numFails[GFRC_NO_ANCHOR]);
printf(" Failed -- no haplotype: %zu\n", numFails[GFRC_NO_HAPLOTYPE]);
printf(" Failed -- ambiguous solution: %zu\n", numFails[GFRC_AMBIGUOUS]);
printf(" Failed -- assembled size differs from estimate: %zu\n", numFails[GFRC_BAD_SIZE]);
}
//
//
//
GapFillProcess::GapFillProcess(const GapFillParameters& params) : m_parameters(params)
{
}
//
GapFillProcess::~GapFillProcess()
{
m_stats.print();
}
// Process the given scaffold, filling in any gaps found
GapFillResult GapFillProcess::processScaffold(const std::string& scaffold) const
{
if(m_parameters.verbose > 0)
std::cout << "Processing scaffold of length " << scaffold.length() << "\n";
GapFillResult result;
size_t len = scaffold.length();
size_t currIdx = 0;
while(currIdx < len)
{
char b = scaffold[currIdx];
if(b != 'N')
{
// Append this non-gap character into the output scaffold
result.scaffold.append(1,b);
currIdx += 1;
}
else
{
// Found the start of a gap
size_t gapLength = 0;
while(scaffold[currIdx + gapLength] == 'N')
gapLength += 1;
if(m_parameters.verbose >= 1)
printf("Constructing gap at position %zu GapLength: %zu\n", currIdx, gapLength);
// Calculate the left-anchor using the sequence of the scaffold already appended
AnchorSequence leftAnchor = findAnchor(result.scaffold, result.scaffold.length() - m_parameters.kmer, true);
// Calculate the right anchor using the sequence of the input scaffold
AnchorSequence rightAnchor = findAnchor(scaffold, currIdx + gapLength, false);
// Attempt to build the gap sequence
std::string gapSequence;
GapFillReturnCode code = processGap(leftAnchor, rightAnchor, gapSequence);
m_stats.numGapsAttempted += 1;
if(code == GFRC_OK)
{
// Successfully resolved the gap. Calculate the amount of the anchor sequence
// that is already present in the scaffold and does not need to be appended in.
size_t trimLength = result.scaffold.length() - leftAnchor.position;
result.scaffold.append(gapSequence.substr(trimLength));
// We need to update currIdx to point to the next base in the
// input scaffold that is not already assembled. This is given
// by the position of the rightAnchor, plus a kmer
currIdx = rightAnchor.position + m_parameters.kmer;
m_stats.numGapsFilled += 1;
}
else
{
// Failed to resolve the gap. Append the gap into the growing scaffold
m_stats.numFails[code] += 1;
while(scaffold[currIdx] == 'N')
{
result.scaffold.append(1, 'N');
currIdx += 1;
}
}
}
}
if(m_parameters.verbose >= 2)
StdAlnTools::globalAlignment(scaffold, result.scaffold, true);
return result;
}
// Fill in the specified gap
GapFillReturnCode GapFillProcess::processGap(const AnchorSequence& startAnchor, const AnchorSequence& endAnchor, std::string& outSequence) const
{
if(m_parameters.verbose > 0)
{
std::cout << "\tSTART: " << startAnchor << "\n";
std::cout << "\tEND: " << endAnchor << "\n";
}
if(startAnchor.sequence.empty() || endAnchor.sequence.empty() || startAnchor.sequence == endAnchor.sequence)
return GFRC_NO_ANCHOR;
HaplotypeBuilder builder;
builder.setTerminals(startAnchor, endAnchor);
builder.setIndex(m_parameters.pBWT, m_parameters.pRevBWT);
builder.setKmerParameters(m_parameters.kmer, m_parameters.kmerThreshold);
builder.run();
HaplotypeBuilderResult result;
builder.parseWalks(result);
if(result.haplotypes.size() > 0)
{
if(m_parameters.verbose > 0)
{
std::cout << "Built " << result.haplotypes.size() << " candidate haplotypes\n";
MultiAlignment haplotypeAlignment = MultiAlignmentTools::alignSequences(result.haplotypes);
haplotypeAlignment.print();
}
// Arbitrarily choosing the first haplotype as the sequence
outSequence = result.haplotypes[0];
return GFRC_OK;
}
else
{
return result.haplotypes.empty() ? GFRC_NO_HAPLOTYPE : GFRC_AMBIGUOUS;
}
}
AnchorSequence GapFillProcess::findAnchor(const std::string& scaffold, int64_t position, bool upstream) const
{
WARN_ONCE("TODO: deduplicate findAnchor code");
AnchorSequence anchor;
int64_t stride = upstream ? -1 : 1;
int MAX_DISTANCE = 50;
int64_t stop = upstream ? position - MAX_DISTANCE : position + MAX_DISTANCE;
// Cap the travel distance to avoid out of bounds
if(stop < 0)
stop = 0;
if(stop > (int64_t)(scaffold.length() - m_parameters.kmer))
stop = scaffold.length() - m_parameters.kmer;
for(; position != stop; position += stride)
{
std::string testSeq = scaffold.substr(position, m_parameters.kmer);
std::transform(testSeq.begin(), testSeq.end(), testSeq.begin(), ::toupper);
if(testSeq.find_first_of('N') != std::string::npos)
continue;
size_t count = BWTAlgorithms::countSequenceOccurrencesWithCache(testSeq, m_parameters.pBWT, m_parameters.pBWTCache);
if(count > m_parameters.kmerThreshold)
{
anchor.sequence = testSeq;
anchor.count = count;
anchor.position = position;
return anchor;
}
}
anchor.sequence = "";
anchor.count = -1;
return anchor;
}