Skip to content

Commit

Permalink
Fixed an issue with FilterAnchors, it should work much better now. Fi…
Browse files Browse the repository at this point in the history
…lterAnchors also removes chains with very small number of anchors/bases (if < 2 anchors && < 50 bases). The function GenerateClusters now also removes the small clusters (the same conditions as for the chains in FilterAnchors). The option '-a' now allows a 'spliced' alignment option which outputs every cluster separately - this is a starting point for RNA-seq support. In this version the Densehash is used. Also, in this version, I'm testing another condition that might speed up slightly on larger references: simply checking if the current position_bin is the same as the previous one, and if so continue the looping through the hits (the hits should be sorted in the index). The script scatterplot8.py now colors the lines as well as the points for simpler debugging of smaller clusters.
  • Loading branch information
isovic committed Mar 27, 2016
1 parent ffef453 commit bd9f11d
Show file tree
Hide file tree
Showing 8 changed files with 220 additions and 68 deletions.
7 changes: 4 additions & 3 deletions scripts/scatterplot8.py
Expand Up @@ -121,16 +121,17 @@ def plot_data(fig, ax, subplot_coords, x, y, c, query_length, ymin, ymax, l_medi

all_colors = 'bgrcmyk';

colors1 = [all_colors[val%len(all_colors)] for val in c[0::2]];
colors2 = [all_colors[val%len(all_colors)] for val in c[1::2]];

i = 0;
while (i < len(x)):
ax.plot(x[i:(i+2)], y[i:(i+2)], 'k');
ax.plot(x[i:(i+2)], y[i:(i+2)], color=all_colors[(c[i]) % len(all_colors)]);
i += 2;

# ax.plot(x, y, 'o');
# ax.scatter(x[0::2], y[0::2], s=10, facecolor='b', lw = 0.2)
# ax.scatter(x[1::2], y[1::2], s=10, facecolor='c', lw = 0.2)
colors1 = [all_colors[val%len(all_colors)] for val in c[0::2]];
colors2 = [all_colors[val%len(all_colors)] for val in c[1::2]];
ax.scatter(x[0::2], y[0::2], s=10, edgecolor=colors1, facecolor=colors1, lw = 0.2)
ax.scatter(x[1::2], y[1::2], s=10, edgecolor=colors2, facecolor=colors2, lw = 0.2)

Expand Down
12 changes: 7 additions & 5 deletions src/alignment/alignment.cc
Expand Up @@ -9,9 +9,11 @@
#include "libs/opal.h"

int AlignRegion(const SingleSequence *read, const Index *index, const ProgramParameters *parameters, const EValueParams *evalue_params, bool extend_to_end, PathGraphEntry *region_results) {
bool align_end_to_end = true;
// bool align_end_to_end = true;
// bool spliced_alignment = true;
bool spliced_alignment = false;
// bool spliced_alignment = false;
bool align_end_to_end = parameters->alignment_algorithm != "spliced";
bool spliced_alignment = parameters->alignment_algorithm == "spliced";

if (parameters->alignment_algorithm == "gotoh") {
LogSystem::GetInstance().Log(VERBOSE_LEVEL_ALL_DEBUG, ((int64_t) read->get_sequence_id()) == parameters->debug_read, "Using semiglobal alignment approach.\n", "Alignment");
Expand All @@ -25,21 +27,21 @@ int AlignRegion(const SingleSequence *read, const Index *index, const ProgramPar

return SemiglobalAlignment(MyersSemiglobalWrapper, read, index, parameters, evalue_params, region_results);

} else if (parameters->alignment_algorithm == "anchor") {
} else if (parameters->alignment_algorithm == "anchor" || parameters->alignment_algorithm == "spliced") {
LogSystem::GetInstance().Log(VERBOSE_LEVEL_ALL_DEBUG, ((int64_t) read->get_sequence_id()) == parameters->debug_read, "Using anchored alignment approach.\n", "Alignment");
bool is_linear = region_results->get_region_data().is_split == false || parameters->is_reference_circular == false;
LogSystem::GetInstance().Log(VERBOSE_LEVEL_ALL_DEBUG, ((int64_t) read->get_sequence_id()) == parameters->debug_read, "Using Myers' bit-vector algorithm for alignment!\n", "Alignment");
return AnchoredAlignmentNew(MyersNWWrapper, MyersSHWWrapper, read, index, parameters, evalue_params, region_results, align_end_to_end, spliced_alignment);

} else if (parameters->alignment_algorithm == "anchorgotoh") {
} else if (parameters->alignment_algorithm == "anchorgotoh" || parameters->alignment_algorithm == "splicedgotoh") {
LogSystem::GetInstance().Log(VERBOSE_LEVEL_ALL_DEBUG, ((int64_t) read->get_sequence_id()) == parameters->debug_read, "Using anchored alignment approach.\n", "Alignment");
bool is_linear = region_results->get_region_data().is_split == false || parameters->is_reference_circular == false;
LogSystem::GetInstance().Log(VERBOSE_LEVEL_ALL_DEBUG, ((int64_t) read->get_sequence_id()) == parameters->debug_read, "Using Gotoh's algorithm for alignment!\n", "Alignment");
return AnchoredAlignmentNew(SeqAnNWWrapper, SeqAnSHWWrapper, read, index, parameters, evalue_params, region_results, align_end_to_end, spliced_alignment);

#ifndef RELEASE_VERSION

} else if (parameters->alignment_algorithm == "anchormex") {
} else if (parameters->alignment_algorithm == "anchormex" || parameters->alignment_algorithm == "splicedmex") {
LogSystem::GetInstance().Log(VERBOSE_LEVEL_ALL_DEBUG, ((int64_t) read->get_sequence_id()) == parameters->debug_read, "Using anchored alignment approach.\n", "Alignment");
bool is_linear = region_results->get_region_data().is_split == false || parameters->is_reference_circular == false;
LogSystem::GetInstance().Log(VERBOSE_LEVEL_ALL_DEBUG, ((int64_t) read->get_sequence_id()) == parameters->debug_read, "Using Match Extend algorithm for alignment!\n", "Alignment");
Expand Down
4 changes: 2 additions & 2 deletions src/alignment/local_realignment_wrappers.cc
Expand Up @@ -504,7 +504,7 @@ int MyersSemiglobalWrapper(const int8_t *read_data, int64_t read_length,

int myers_return_code = myersCalcEditDistance((const unsigned char *) read_data, read_length,
(const unsigned char *) reference_data, reference_length,
alphabet_length, band_width, MYERS_MODE_HW, &score, &positions, &num_positions,
alphabet_length, -1, MYERS_MODE_HW, &score, &positions, &num_positions,
true, &alignment, &alignment_length);

if (myers_return_code == MYERS_STATUS_ERROR || num_positions == 0 || alignment_length == 0) {
Expand Down Expand Up @@ -560,7 +560,7 @@ int MyersNWWrapper(const int8_t *read_data, int64_t read_length,

int myers_return_code = myersCalcEditDistance((const unsigned char *) read_data, read_length,
(const unsigned char *) reference_data, reference_length,
alphabet_length, band_width, MYERS_MODE_NW, &score, &positions, &num_positions,
alphabet_length, -1, MYERS_MODE_NW, &score, &positions, &num_positions,
true, &alignment, &alignment_length);

if (myers_return_code == MYERS_STATUS_ERROR || num_positions == 0 || alignment_length == 0) {
Expand Down
23 changes: 18 additions & 5 deletions src/graphmap/anchored_mapping.cc
Expand Up @@ -43,8 +43,7 @@ bool CheckDistanceStep(const Vertices& registry_entries, int64_t index_first, in

return false;
}

int GenerateClusters(int64_t min_cluster_length, float min_cluster_coverage, std::vector<int> &lcskpp_indices,
int GenerateClusters(int64_t min_num_anchors_in_cluster, int64_t min_cluster_length, float min_cluster_coverage, std::vector<int> &lcskpp_indices,
ScoreRegistry* local_score, MappingData* mapping_data, const std::vector<Index *> indexes,
const SingleSequence* read, const ProgramParameters* parameters, std::vector<ClusterAndIndices *> &ret_clusters,
std::vector<int> &ret_filtered_lcskpp_indices, std::vector<int32_t> *ret_cluster_ids) {
Expand Down Expand Up @@ -124,6 +123,14 @@ int GenerateClusters(int64_t min_cluster_length, float min_cluster_coverage, std
int64_t covered_bases = ret_clusters[i]->coverage;
float cluster_coverage = ((float) covered_bases) / ((float) cluster_length);

// Filter clusters which are too small.
// if ((min_num_anchors_in_cluster > 0 && min_cluster_length > 0 && min_cluster_coverage > 0) && ret_clusters[i]->lcskpp_indices.size() <= min_num_anchors_in_cluster && cluster_length < min_cluster_length) {
if (ret_clusters[i]->lcskpp_indices.size() <= min_num_anchors_in_cluster &&
cluster_length < min_cluster_length &&
ret_clusters[i]->lcskpp_indices.size() <= min_num_anchors_in_cluster) {
ret_clusters[i]->lcskpp_indices.clear();
continue;
}

ret_filtered_lcskpp_indices.insert(ret_filtered_lcskpp_indices.end(), ret_clusters[i]->lcskpp_indices.begin(), ret_clusters[i]->lcskpp_indices.end());
// printf ("ret_filtered_lcskpp_indices.size() = %ld\n", ret_filtered_lcskpp_indices.size());
Expand All @@ -135,14 +142,20 @@ int GenerateClusters(int64_t min_cluster_length, float min_cluster_coverage, std
ret_cluster_ids->insert(ret_cluster_ids->end(), cluster_indices.begin(), cluster_indices.end());
}

LogSystem::GetInstance().Log(VERBOSE_LEVEL_ALL_DEBUG, read->get_sequence_id() == parameters->debug_read, FormatString("[Cluster %ld] cluster_length = %ld, covered_bases = %ld\n", i, cluster_length, covered_bases), "L1-PostProcessRegionWithLCS_");
LogSystem::GetInstance().Log(VERBOSE_LEVEL_ALL_DEBUG, read->get_sequence_id() == parameters->debug_read, FormatString("[Cluster %ld] cluster_length = %ld, covered_bases = %ld\n", i, cluster_length, covered_bases), std::string(__FUNCTION__));
for (int64_t j=0; j<ret_clusters[i]->lcskpp_indices.size(); j++) {
int32_t qpos_start = 0, rpos_start = 0, qpos_end = 0, rpos_end = 0;
GetPositionsFromRegistry2(registry_entries, ret_clusters[i]->lcskpp_indices[j], &qpos_start, &rpos_start, &qpos_end, &rpos_end);
LogSystem::GetInstance().Log(VERBOSE_LEVEL_ALL_DEBUG, read->get_sequence_id() == parameters->debug_read, FormatString(" [%ld] start: [%d, %d], end: [%d, %d]\n", j, qpos_start, rpos_start, qpos_end, rpos_end), "[]");
}
}

for (int64_t i = (ret_clusters.size() - 1); i >= 0; i--) {
if (ret_clusters[i]->lcskpp_indices.size() == 0) {
ret_clusters.erase(ret_clusters.begin() + i);
}
}

return 0;
}

Expand Down Expand Up @@ -462,10 +475,10 @@ int GraphMap::AnchoredPostProcessRegionWithLCS_(ScoreRegistry* local_score, Mapp
std::vector<int32_t> first_filtered_clusters;
// FilterAnchors(read, local_score, parameters, lcskpp_indices, parameters->error_rate/2.0f + 0.01f, 300, 2, 50, 2, first_filtered_lcskpp_indices, NULL);
// FilterAnchors(read, local_score, parameters, lcskpp_indices, parameters->error_rate/2.0f + 0.01f, std::max(read->get_sequence_length() * 0.10f, 200.0f), 2, 50, 2, first_filtered_lcskpp_indices, NULL);
FilterAnchors(read, local_score, parameters, lcskpp_indices, parameters->error_rate/2.0f + 0.01f, 200.0f, 0, 50, 2, first_filtered_lcskpp_indices, NULL);
FilterAnchors(read, local_score, parameters, lcskpp_indices, parameters->error_rate/2 + 0.01f, 200.0f, 0, 50, 2, first_filtered_lcskpp_indices, NULL);

// FilterAnchorBreakpoints(read->get_sequence_length() * MIN_CLUSTER_LENGTH_FACTOR, MIN_CLUSTER_COVERAGE_FACTOR, first_filtered_lcskpp_indices, local_score, mapping_data, indexes, read, parameters, clusters, cluster_indices, &cluster_ids);
GenerateClusters(0.0f, 0.0f, first_filtered_lcskpp_indices, local_score, mapping_data, indexes, read, parameters, clusters, cluster_indices, &cluster_ids);
GenerateClusters(2, 50, 20, first_filtered_lcskpp_indices, local_score, mapping_data, indexes, read, parameters, clusters, cluster_indices, &cluster_ids);

// Find the L1 parameters (median line and the confidence intervals).
float l_diff = read->get_sequence_length() * parameters->error_rate;
Expand Down

0 comments on commit bd9f11d

Please sign in to comment.