Skip to content

Commit

Permalink
Added Hifiasm gaf output..
Browse files Browse the repository at this point in the history
  • Loading branch information
lipi0056 committed May 1, 2023
1 parent c533a82 commit b0ff0c4
Show file tree
Hide file tree
Showing 3 changed files with 251 additions and 3 deletions.
14 changes: 12 additions & 2 deletions GraphCleanupExe/src/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <iostream>
#include <cstdlib>
#include <map>

#include "bioparser/fasta_parser.hpp"
#include "bioparser/fastq_parser.hpp"
Expand Down Expand Up @@ -66,13 +67,22 @@ int main(int argc, char** argv) {

raven::Graph graph;

graph = raven::LoadGfa(input_gfa_path);
std::map<std::string, long> nodeLengths;
std::map<std::uint32_t, std::string> additionalHifiasmEdgeInfo;

graph = raven::LoadHifiasmGfa(input_gfa_path, nodeLengths, additionalHifiasmEdgeInfo);

raven::PrintGfa(graph, output_gfa_path + ".raven.gfa");

raven::RemoveInvalidEdgesFromGraph(graph);

raven::PrintGfa(graph, output_gfa_path + ".removedInvalidEdges.raven.gfa");
raven::PrintHifiasmGfa(graph, output_gfa_path + ".removedInvalidEdges.hifiasm.gfa", nodeLengths, additionalHifiasmEdgeInfo);

raven::RemoveInvalidConnectionsFromGraph(graph);

raven::PrintGfa(graph, output_gfa_path);
raven::PrintGfa(graph, output_gfa_path + ".removedInvalidConnections.raven.gfa");
raven::PrintHifiasmGfa(graph, output_gfa_path + ".removedInvalidConnections.hifiasm.gfa", nodeLengths, additionalHifiasmEdgeInfo);

timer.Stop();
std::cerr << "[graph_cleanup::] " << std::fixed << timer.elapsed_time() << "s"
Expand Down
3 changes: 3 additions & 0 deletions RavenLib/include/raven/graph/serialization/graph_repr.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define RAVEN_GRAPH_SERIALIZATION_GRAPH_REPR_H_

#include <fstream>
#include <map>

#include "cereal/archives/json.hpp"
#include "raven/export.h"
Expand All @@ -13,6 +14,8 @@ RAVEN_EXPORT void PrintGfa(const Graph& graph, const std::string& path);
RAVEN_EXPORT void PrintCsv(const Graph& graph, const std::string& path, bool printSequenceName = false, bool printPileBeginEnd = false, bool printEdgeSimilarity = false);
RAVEN_EXPORT void PrintJson(const Graph& graph, const std::string& path);
RAVEN_EXPORT Graph LoadGfa(const std::string& path);
RAVEN_EXPORT Graph LoadHifiasmGfa(const std::string& path, std::map<std::string, long>& nodeLengths, std::map<std::uint32_t, std::string>& additionalHifiasmEdgeInfo);
RAVEN_EXPORT void PrintHifiasmGfa(const Graph& graph, const std::string& path, std::map<std::string, long>& nodeLengths, std::map<std::uint32_t, std::string>& additionalHifiasmEdgeInfo);
RAVEN_EXPORT std::vector<std::string> getGfa(const Graph& graph);
RAVEN_EXPORT std::vector<std::string> getCsv(const Graph& graph, bool printSequenceName = false, bool printPileBeginEnd = false, bool printEdgeSimilarity = false);

Expand Down
237 changes: 236 additions & 1 deletion RavenLib/src/graph_repr.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "raven/graph/serialization/graph_repr.h"
#include <string>
#include <vector>
#include <map>
#include "edlib.h"
namespace raven {

Expand Down Expand Up @@ -394,14 +395,41 @@ void splitString(const std::string &inputString, char delimiter, std::vector<std
}
}

Node* findNodeForSequenceName(std::string tailSequenceName, std::vector<std::unique_ptr<Node>> &nodes) {
Node* findNodeForSequenceName(std::string tailSequenceName, std::vector<std::unique_ptr<Node>> &nodes) {
for (auto &node : nodes) {
if (node == nullptr) continue;
if (node->sequence.name == tailSequenceName) return node.get();
}
return nullptr;
}

Node* findNodeForSequenceNameAndOrientation(std::string sequenceName, bool isRc, std::vector<std::unique_ptr<Node>> &nodes) {
for (auto &node : nodes) {
if (node == nullptr) continue;
if (node->sequence.name == sequenceName && node->is_rc() == isRc) return node.get();
}
return nullptr;
}

Node* findNodeForSequenceNameAndOrientation(std::string sequenceName, bool isRc, std::map<std::string, Node*>& nodes, std::map<std::string, Node*>& nodesRc) {
if (!isRc) {
auto it = nodes.find(sequenceName);
if (it == nodes.end() ) {
return nullptr;
} else {
return it->second;
}
} else {
auto it = nodesRc.find(sequenceName);
if (it == nodesRc.end() ) {
return nullptr;
} else {
return it->second;
}
}
return nullptr;
}

Graph LoadGfa(const std::string& path) {
Graph graph = Graph();

Expand Down Expand Up @@ -498,4 +526,211 @@ Graph LoadGfa(const std::string& path) {
return graph;
}

Graph LoadHifiasmGfa(const std::string& path, std::map<std::string, long>& nodeLengths, std::map<std::uint32_t, std::string>& additionalHifiasmEdgeInfo) {
Graph graph = Graph();

if (path.empty()) {
return graph;
}

std::cout << "Loading graph from path: " << path << std::endl;

std::map<std::string, Node*> nodes;
std::map<std::string, Node*> nodesRc;

std::uint32_t currentNodeId = 0;
std::uint32_t currentEdgeId = 0;

std::ifstream is(path);
std::string inputLine;

while(getline(is, inputLine)) {

std::vector<std::string> rowValues;
splitString(inputLine, '\t', rowValues);

if (rowValues[0] == "S") { // this is a node

std::string sequenceName = rowValues[1];
std::string lengthString = rowValues[3];

long nodeLength = stol(lengthString.substr(5));

biosoup::NucleicAcid sequence = biosoup::NucleicAcid(sequenceName, "");
std::unique_ptr<Node> newNode(new Node());

sequence.is_reverse_complement = false;

newNode->id = currentNodeId;
newNode->count = 0;
newNode->is_unitig = false;
newNode->is_circular = false;
newNode->is_polished = false;
newNode->sequence = sequence;

biosoup::NucleicAcid sequenceRc = biosoup::NucleicAcid(sequenceName, "");
std::unique_ptr<Node> newNodeRc(new Node());

sequenceRc.is_reverse_complement = true;

newNodeRc->id = currentNodeId + 1;
newNodeRc->count = 0;
newNodeRc->is_unitig = false;
newNodeRc->is_circular = false;
newNodeRc->is_polished = false;
newNodeRc->sequence = sequenceRc;

newNode->pair = newNodeRc.get();
newNodeRc->pair = newNode.get();

nodeLengths[sequenceName] = nodeLength;
nodes[sequenceName] = newNode.get();
nodesRc[sequenceName] = newNodeRc.get();

graph.nodes.push_back(std::move(newNode));
graph.nodes.push_back(std::move(newNodeRc));

currentNodeId += 2; // since node is_rc() method is based on node id and PrintGfa only prints !is_rc() Nodes all ids are set to even numbers

} else if (rowValues[0] == "L") { // this is an egde

std::string tailSequenceName = rowValues[1].substr(0, rowValues[1].find(":"));
bool isTailReverseComplement = rowValues[2] == "-" ? true : false ;
std::string headSequenceName = rowValues[3].substr(0, rowValues[3].find(":"));
bool isHeadReverseComplement = rowValues[4] == "-" ? true : false ;

std::string additionalHifiasmData = rowValues[5] + "\t" + rowValues[6];

Node* tail;
//tail = findNodeForSequenceNameAndOrientation(tailSequenceName, isTailReverseComplement, graph.nodes);
tail = findNodeForSequenceNameAndOrientation(tailSequenceName, isTailReverseComplement, nodes, nodesRc);

Node* head;
//head = findNodeForSequenceNameAndOrientation(headSequenceName, isHeadReverseComplement, graph.nodes);
head = findNodeForSequenceNameAndOrientation(headSequenceName, isHeadReverseComplement, nodes, nodesRc);

std::unique_ptr<Edge> newEdge(new Edge(tail, head, 0));
newEdge->id = currentEdgeId;

std::unique_ptr<Edge> newEdgeRc(new Edge(tail->pair, head->pair, 0));
newEdgeRc->id = currentEdgeId + 1;

newEdge->pair = newEdgeRc.get();
newEdgeRc->pair = newEdge.get();

if (tail != nullptr) {
tail->outedges.push_back(newEdge.get());
}

if (head != nullptr) {
head->inedges.push_back(newEdge.get());
}

if (tail->pair != nullptr) {
tail->pair->outedges.push_back(newEdgeRc.get());
}

if (head->pair != nullptr) {
head->pair->inedges.push_back(newEdgeRc.get());
}

additionalHifiasmEdgeInfo[newEdge->id] = additionalHifiasmData;
additionalHifiasmEdgeInfo[newEdgeRc->id] = additionalHifiasmData;

graph.edges.push_back(std::move(newEdge));
graph.edges.push_back(std::move(newEdgeRc));

currentEdgeId += 2; // since edge is_rc() method is based on edge id and PrintGfa only prints !is_rc() Edges all ids are set to even numbers

} else {
std::cout << "Unknown element: " << inputLine << std::endl;
}

}

graph.stage = -3;

is.close();

std::cout << "Loaded graph from path: " << path << std::endl;

return graph;
}

void PrintHifiasmGfa(const Graph& graph, const std::string& path, std::map<std::string, long>& nodeLengths, std::map<std::uint32_t, std::string>& additionalHifiasmEdgeInfo) {
if (path.empty()) {
return;
}

std::ofstream os(path);
for (const auto& it : graph.nodes) {
if ((it == nullptr) || it->is_rc() ||
(it->count == 1 && it->outdegree() == 0 && it->indegree() == 0)) {
continue;
}

long sequenceLength;
auto jt = nodeLengths.find(it->sequence.name);
if (jt == nodeLengths.end() ) {
sequenceLength = 0;
} else {
sequenceLength = jt->second;
}

os << "S";
os << "\t";
os << it->sequence.name;
os << "\t";
os << "*";
os << "\t";
os << "LN:i:" << sequenceLength;
os << std::endl;
}

for (const auto& it : graph.edges) {
if (it == nullptr || it->is_rc()) {
continue;
}

long tailSequenceLength;
auto jt = nodeLengths.find(it->tail->sequence.name);
if (jt == nodeLengths.end() ) {
tailSequenceLength = 0;
} else {
tailSequenceLength = jt->second;
}

long headSequenceLength;
auto kt = nodeLengths.find(it->head->sequence.name);
if (kt == nodeLengths.end() ) {
headSequenceLength = 0;
} else {
headSequenceLength = kt->second;
}

std::string additionalEdgeInfo;
auto lt = additionalHifiasmEdgeInfo.find(it->id);
if (lt == additionalHifiasmEdgeInfo.end() ) {
additionalEdgeInfo = "";
} else {
additionalEdgeInfo = lt->second;
}

os << "L";
os << "\t";
os << it->tail->sequence.name << ":1-" << tailSequenceLength;
os << "\t";
os << (it->tail->is_rc() ? '-' : '+'); // NOLINT
os << "\t";
os << it->head->sequence.name << ":1-" << headSequenceLength;
os << "\t";
os << (it->head->is_rc() ? '-' : '+'); // NOLINT
os << "\t";
os << additionalEdgeInfo;
os << std::endl;
}

os.close();
}

} // namespace raven

0 comments on commit b0ff0c4

Please sign in to comment.