In [71]:
import json
import time
import sys
import os

_TAXA_TYPES = {
    'd': 'domain',
    'p': 'phylum',
    'c': 'class',
    'o': 'order',
    'f': 'family',
    'g': 'genus',
    's': 'species',
}

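# Graph schema looks like:
# - vertex gtdb_taxon (one JSON document per line in gtdb_taxon.json)
#   - taxon: _key (eg. "g:escherichia"), release, rank (eg. "genus"),
#     name (list of lowercased name parts), plus one field per rank from the
#     domain down to the taxon itself
#   - genome: _key (accession, eg. "RS_GCF_001245025.1"), release,
#     rank "genome", name (name parts of the last taxon in its lineage)
# - edge gtdb_child_of_taxon (one JSON document per line in
#   gtdb_child_of_taxon.json), always child -> parent
#   - taxon -> parent taxon (child_type "t")
#   - genome -> last taxon of its lineage (child_type "o")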


def bac_taxonomy_to_json(tsv_path):
    """Convert a taxonomy TSV into newline-delimited JSON vertices and edges.

    Args:
        tsv_path: Path of the input table. Every non-blank line must hold an
            accession and a lineage separated by a single tab. The lineage is
            a ';'-separated list of '<rank letter>__<name>' entries whose rank
            letters are keys of _TAXA_TYPES.

    Two files are written into the current working directory, replacing any
    existing copies:

    * gtdb_taxon.json -- one document per distinct taxon ('_key', 'release',
      'rank', 'name' and one field per rank from the top of the lineage down
      to the taxon itself) and one document per accession (rank 'genome').
    * gtdb_child_of_taxon.json -- child -> parent edges: taxon -> parent taxon
      (child_type 't') and accession -> last taxon of its lineage
      (child_type 'o').

    Raises:
        OSError: if the input or an output file cannot be opened.
        ValueError: if a line or lineage entry does not have the expected
            shape.
        KeyError: if a rank letter is not a key of _TAXA_TYPES.
    """
    # str.strip('.tsv') would remove any of the characters '.', 't', 's', 'v'
    # from BOTH ends of the path; only a literal trailing suffix must go.
    suffix = '.tsv'
    release = tsv_path[:-len(suffix)] if tsv_path.endswith(suffix) else tsv_path
    gtdb_vertices_path = 'gtdb_taxon.json'
    gtdb_edges_path = 'gtdb_child_of_taxon.json'
    # Keys of the taxa whose vertex has already been written
    found_taxon_names = set()
    # The input is opened first so that a missing TSV raises before the
    # previous output files are truncated; 'w' replaces earlier output.
    with open(tsv_path) as input_file, \
            open(gtdb_vertices_path, 'w') as gtdb_vertices_output, \
            open(gtdb_edges_path, 'w') as gtdb_edges_output:
        for line in input_file:
            line = line.rstrip('\r\n')
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            accession, lineage = line.split('\t')
            # Parse the lineage into (rank, name parts, vertex key) triples
            taxa = []
            for taxon in lineage.split(';'):
                taxon_type_abbrev, raw_name = taxon.split('__', 1)
                taxa_type = _TAXA_TYPES[taxon_type_abbrev]
                raw_name = raw_name.lower()
                # Only species names are split into their space-separated parts
                name_parts = raw_name.split(' ') if taxa_type == 'species' else [raw_name]
                key = taxon_type_abbrev + ':' + '_'.join(name_parts)
                taxa.append((taxa_type, name_parts, key))
            prev_taxon_key = None
            # Rank fields of the ancestors seen so far on this line; copied
            # into each new vertex so it carries its lineage down to itself.
            lineage_fields = {}
            for taxa_type, name_parts, full_name in taxa:
                lineage_fields[taxa_type] = '_'.join(name_parts)
                if full_name in found_taxon_names:
                    # Already recorded; it is still the parent of what follows
                    prev_taxon_key = full_name
                    continue
                vertex_doc = {
                    '_key': full_name,
                    'release': release,
                    'rank': taxa_type,
                    'name': name_parts,
                }
                vertex_doc.update(lineage_fields)
                gtdb_vertices_output.write(json.dumps(vertex_doc) + '\n')
                if prev_taxon_key is not None:
                    # _from is the child and _to is the parent
                    edge_doc = {
                        '_from': 'gtdb_taxon/' + full_name,
                        'child_type': 't',
                        '_to': 'gtdb_taxon/' + prev_taxon_key,
                    }
                    gtdb_edges_output.write(json.dumps(edge_doc) + '\n')
                prev_taxon_key = full_name
                found_taxon_names.add(full_name)
            # Attach the accession to the last (most specific) taxon of its lineage
            _, leaf_name_parts, leaf_key = taxa[-1]
            edge_doc = {
                '_from': 'gtdb_taxon/' + accession,
                'child_type': 'o',
                '_to': 'gtdb_taxon/' + leaf_key,
            }
            vertex_doc = {
                '_key': accession,
                'release': release,
                'rank': 'genome',
                'name': leaf_name_parts,
            }
            gtdb_edges_output.write(json.dumps(edge_doc) + '\n')
            gtdb_vertices_output.write(json.dumps(vertex_doc) + '\n')


if __name__ == '__main__':
    # Command-line entry point: <script> <command> <tsv_path>
    commands = {
        'bac_taxonomy_to_json': bac_taxonomy_to_json
    }
    # Every command needs the TSV path, so both the command name and the path
    # are required; calling the command without a path raises TypeError.
    if len(sys.argv) != 3:
        sys.stderr.write(
            f'Usage: {sys.argv[0]} <command> <tsv_path>\n'
            f'Valid options: {list(commands.keys())}\n'
        )
        sys.exit(1)
    option = sys.argv[1]
    if option not in commands:
        sys.stderr.write(f'Invalid option: {option}. Valid option: {list(commands.keys())}\n')
        sys.exit(1)
    commands[option](sys.argv[2])
    print('-- done --')



Valid options: ['bac_taxonomy_to_json']

SystemExit: 1

In [72]:
# Convert bac120_taxonomy_r89.tsv; the output goes to gtdb_taxon.json and
# gtdb_child_of_taxon.json in the current working directory.
bac_taxonomy_to_json('bac120_taxonomy_r89.tsv')

In [38]:
import json
import time
import sys


_TAXA_TYPES = {
    'd': 'Domain',
    'p': 'Phylum',
    'c': 'Class',
    'o': 'Order',
    'f': 'Family',
    'g': 'Genus',
    's': 'Species',
}

# Graph schema looks like:
# - vertex gtdb_taxon
#   - type (eg. "Domain"), name (eg. "Bacteria")
# - vertex gtdb_organism
#   - _key (accession, eg. "RS_GCF_001245025.1")
# - edge gtdb_child_of_taxon
#   - gtdb_taxon -> gtdb_taxon
#   - gtdb_organism -> gtdb_taxon


def bac_taxonomy_to_json():
    """Convert bac120_taxonomy_r89.tsv into newline-delimited JSON files.

    The input is read from the current working directory. Every non-blank
    line must hold an accession and a lineage separated by a single tab; the
    lineage is a ';'-separated list of '<rank letter>__<name>' entries whose
    rank letters are keys of _TAXA_TYPES.

    Three files are created, each suffixed with the current time in
    milliseconds:

    * gtdb_taxon-<ts>.json -- one document per distinct taxon ('_key', 'type',
      'name' and one field per rank from the top of the lineage down to the
      taxon itself).
    * gtdb_organism-<ts>.json -- one document per accession ('_key').
    * gtdb_child_of_taxon-<ts>.json -- child -> parent edges: taxon -> parent
      taxon and accession -> last taxon of its lineage.

    Raises:
        OSError: if the input or an output file cannot be opened.
        ValueError: if a line or lineage entry does not have the expected
            shape.
        KeyError: if a rank letter is not a key of _TAXA_TYPES.
    """
    path = 'bac120_taxonomy_r89.tsv'
    timestamp = str(int(time.time() * 1000))
    gtdb_taxon_path = f'gtdb_taxon-{timestamp}.json'
    gtdb_organism_path = f'gtdb_organism-{timestamp}.json'
    gtdb_child_of_taxon_path = f'gtdb_child_of_taxon-{timestamp}.json'
    # Keys of the taxa whose document has already been written
    found_taxon_names = set()
    # The input is opened first so a missing TSV does not leave empty outputs
    with open(path) as input_file, \
            open(gtdb_taxon_path, 'w') as gtdb_taxon_output, \
            open(gtdb_organism_path, 'w') as gtdb_organism_output, \
            open(gtdb_child_of_taxon_path, 'w') as gtdb_child_of_taxon_output:
        for line in input_file:
            line = line.rstrip('\r\n')
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            accession, lineage = line.split('\t')
            # Write the gtdb_organism doc
            gtdb_organism_output.write(json.dumps({'_key': accession}) + '\n')
            # Parse the lineage into (rank name, name, document key) triples
            taxa = []
            for taxon in lineage.split(';'):
                short_type, name = taxon.split('__', 1)
                taxa.append((_TAXA_TYPES[short_type], name, short_type + ':' + name))
            prev_taxon_key = None
            # Rank fields of the ancestors seen so far on this line. A taxon
            # document only carries ranks at or above its own: lower ranks
            # belong to whichever organism happened to introduce the taxon.
            lineage_fields = {}
            for type_name, name, full_name in taxa:
                lineage_fields[type_name] = name
                if full_name in found_taxon_names:
                    # Already recorded; it is still the parent of what follows,
                    # so remember it or the next new taxon loses its parent edge.
                    prev_taxon_key = full_name
                    continue
                taxon_doc = {'_key': full_name, 'type': type_name, 'name': name}
                taxon_doc.update(lineage_fields)
                gtdb_taxon_output.write(json.dumps(taxon_doc) + '\n')
                # The first taxon of a lineage has no parent and gets no edge
                # (in particular no edge pointing back at itself).
                if prev_taxon_key is not None:
                    # _from is child and _to is parent
                    child_doc = {
                        '_from': full_name,
                        '_to': prev_taxon_key,
                    }
                    gtdb_child_of_taxon_output.write(json.dumps(child_doc) + '\n')
                prev_taxon_key = full_name
                found_taxon_names.add(full_name)
            # Attach the accession to the last (most specific) taxon of its lineage
            child_doc = {
                '_from': accession,
                '_to': taxa[-1][2],
            }
            gtdb_child_of_taxon_output.write(json.dumps(child_doc) + '\n')


if __name__ == '__main__':
    # Dispatch table: command-line option -> zero-argument function to run
    commands = {'bac_taxonomy_to_json': bac_taxonomy_to_json}
    # Exactly one argument (the option) is expected after the script name
    if len(sys.argv) != 2:
        print(f'Valid options: {list(commands)}', end='', file=sys.stderr)
        raise SystemExit(1)
    option = sys.argv[1]
    if option not in commands:
        print(f'Invalid option: {option}. Valid option: {list(commands)}', end='', file=sys.stderr)
        raise SystemExit(1)
    commands[option]()
    print('-- done --')


Valid options: ['bac_taxonomy_to_json']

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [39]:
# Run the conversion; the input path is hard-coded inside the function and the
# output files are timestamped.
bac_taxonomy_to_json()

In [210]:
import json
import time 
import sys 

_TAXA_TYPES = {
    'd': 'domain',
    'p': 'phylum',
    'c': 'class',
    'o': 'order',
    'f': 'family',
    'g': 'genus',
    's': 'species',
}

def GTDB_data_to_json(tsv_path):
    """Convert a taxonomy TSV into newline-delimited JSON vertices and edges.

    Args:
        tsv_path: Path of the input table. Every non-blank line must hold an
            accession and a lineage separated by a single tab. The lineage is
            a ';'-separated list of '<rank letter>__<name>' entries whose rank
            letters are keys of _TAXA_TYPES.

    With <base> being tsv_path minus a trailing '.tsv', two files are written,
    replacing any existing copies:

    * <base>_gtdb_vertices.json -- one document per distinct taxon ('_key',
      'release', 'rank', 'name' and one field per rank from the top of the
      lineage down to the taxon itself).
    * <base>_gtdb_edges.json -- child -> parent edges: taxon -> parent taxon
      and accession -> last taxon of its lineage.

    Raises:
        OSError: if the input or an output file cannot be opened.
        ValueError: if a line or lineage entry does not have the expected
            shape.
        KeyError: if a rank letter is not a key of _TAXA_TYPES.
    """
    # str.strip(".tsv") would remove any of the characters '.', 't', 's', 'v'
    # from BOTH ends of the path; only a literal trailing suffix must go.
    suffix = '.tsv'
    release = tsv_path[:-len(suffix)] if tsv_path.endswith(suffix) else tsv_path
    gtdb_vertices_path = release + '_gtdb_vertices.json'
    gtdb_edges_path = release + '_gtdb_edges.json'
    # Keys of the taxa whose vertex has already been written
    found_taxon_names = set()
    # The input is opened first so that a missing TSV raises before any output
    # is truncated. 'w' (not 'a') keeps a rerun on the same input from
    # appending a second copy of every document.
    with open(tsv_path) as input_file, \
            open(gtdb_vertices_path, 'w') as gtdb_vertices_output, \
            open(gtdb_edges_path, 'w') as gtdb_edges_output:
        for line in input_file:
            line = line.rstrip('\r\n')
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            accession_id, lineage = line.split('\t')
            # Parse the lineage into (rank, name parts, vertex key) triples
            taxa = []
            for taxon in lineage.split(';'):
                taxon_type_abbrev, raw_name = taxon.split('__', 1)
                taxa_type = _TAXA_TYPES[taxon_type_abbrev]
                raw_name = raw_name.lower()
                # Only species names are split into their space-separated parts
                name_parts = raw_name.split(' ') if taxa_type == 'species' else [raw_name]
                # NOTE(review): species keys keep the space between name parts
                # while the 'species' field below joins them with '_'; confirm
                # the target database accepts spaces in _key.
                taxa.append((taxa_type, name_parts, taxon_type_abbrev + ':' + raw_name))
            prev_taxon_key = None
            # Rank fields of the ancestors seen so far on this line; copied
            # into each new vertex so it carries its lineage down to itself.
            lineage_fields = {}
            for taxa_type, name_parts, full_name in taxa:
                lineage_fields[taxa_type] = '_'.join(name_parts)
                if full_name in found_taxon_names:
                    # Already recorded; it is still the parent of what follows,
                    # so remember it or the next new taxon loses its parent edge.
                    prev_taxon_key = full_name
                    continue
                taxon_doc = {
                    '_key': full_name,
                    'release': release,
                    'rank': taxa_type,
                    'name': name_parts,
                }
                taxon_doc.update(lineage_fields)
                gtdb_vertices_output.write(json.dumps(taxon_doc) + '\n')
                if prev_taxon_key is not None:
                    # _from is the child and _to is the parent
                    edges_doc = {
                        '_from': full_name,
                        '_to': prev_taxon_key,
                    }
                    gtdb_edges_output.write(json.dumps(edges_doc) + '\n')
                prev_taxon_key = full_name
                found_taxon_names.add(full_name)
            # Attach the accession to the last (most specific) taxon of its
            # lineage; _to must equal that taxon's _key exactly.
            edges_doc = {
                '_from': accession_id,
                '_to': taxa[-1][2],
            }
            gtdb_edges_output.write(json.dumps(edges_doc) + '\n')
# Convert bac120_taxonomy_r89.tsv; the output goes to
# bac120_taxonomy_r89_gtdb_vertices.json and bac120_taxonomy_r89_gtdb_edges.json.
GTDB_data_to_json('bac120_taxonomy_r89.tsv')