In [59]:
using Printf

using FilePathsBase
using FilePathsBase: /
using DataFrames
import CSV
using PyCall
using PlotlyJS
using AbstractTrees

In [2]:
using Revise

using MidasPlots
using ClusterAnalysis

## Setup

In [3]:
# Input locations used by this notebook, keyed by a short symbolic name.
# NOTE(review): the two table paths are relative, so they presumably resolve against the
# notebook's working directory — confirm before running from elsewhere.
infiles = Dict(
    :genomes_table => p"../../data/processed/211129-update-external-data-sets/211201-konstantinidis-2005/211201-konstantinidis-2005-genomes.csv",
    :taxa_table => p"../../data/processed/211129-update-external-data-sets/211201-konstantinidis-2005/211201-konstantinidis-2005-taxa.csv",
    # NOTE(review): absolute path under one user's home directory; adjust on other machines.
    :genomes_dir => p"/home/jared/projects/gambit/data/external/konstantinidis-2005/genomes/",
)

Dict{Symbol, PosixPath} with 3 entries:
  :genomes_dir   => /home/jared/projects/gambit/data/external/konstantinidis-20…
  :genomes_table => ../../data/processed/211129-update-external-data-sets/21120…
  :taxa_table    => ../../data/processed/211129-update-external-data-sets/21120…

## Load data

In [4]:
# Parse the genomes table into a DataFrame and record how many rows it has.
genomes_df = infiles[:genomes_table] |> CSV.File |> DataFrame
ngenomes = size(genomes_df, 1)

70

In [5]:
# Parse the taxa table into a DataFrame (trailing `;` suppresses the cell output).
taxa_df = infiles[:taxa_table] |> CSV.File |> DataFrame;

## Taxonomy tree

In [6]:
using AbstractTreesDev.ParentAPI

┌ Info: Precompiling AbstractTreesDev [37b2292a-bdae-4e31-9b7d-2183452072cc]
└ @ Base loading.jl:1423


In [7]:
using AbstractTreesDev.IDTrees
using AbstractTreesDev.IDTrees: addnode!, finalize!

### Assemble

In [8]:
# Node payload type: the concrete type of a single row of the taxa table.
V = typeof(first(eachrow(taxa_df)))

# Tree keyed by integer taxon ID, with the table row itself stored as each node's value.
tree = IDTree{Int}(V)

for row in eachrow(taxa_df)
    if row.parent_taxid == 0
        # A parent ID of 0 is passed on as `nothing`, i.e. no parent.
        addnode!(tree, row.taxid, nothing, row)
    else
        addnode!(tree, row.taxid, row.parent_taxid, row)
    end
end

finalize!(tree)

In [10]:
# Render one node of `tree` as "(id) name [rank]", reading `:name` and `:rank` from the
# row stored as the node's value.
function AbstractTrees.printnode(io::IO, node::typeof(rootnode(tree)))
    taxon = node.val
    @printf(io, "(%d) %s [%s]", node.id, taxon[:name], taxon[:rank])
end

In [None]:
# Print the full (uncontracted) taxonomy tree, allowing up to 99 levels of depth.
print_tree(rootnode(tree); maxdepth=99)

### Contract

In [55]:
# Copy the subtree rooted at `node` into `tree2`, attached under `parent_id`, skipping
# every node that has exactly one child (that child takes its place instead).
#
# - `tree2`: destination tree, mutated via `addnode!`.
# - `node`: root of the subtree to copy.
# - `parent_id`: ID of the node in `tree2` to attach to, or `nothing` for no parent.
#
# Returns `nothing`.
function _contract!(tree2::IDTree{K, N}, node::N, parent_id::Union{K, Nothing}) where {K, N}
    kids = collect(N, children(node))

    # Walk down through single-child links until reaching a node that has
    # zero or several children; only that node is kept.
    while length(kids) == 1
        node = first(kids)
        kids = collect(N, children(node))
    end

    addnode!(tree2, node.id::K, parent_id, node.val)

    for kid in kids
        _contract!(tree2, kid, node.id)
    end
    return nothing
end

_contract! (generic function with 1 method)

In [58]:
# Build the contracted copy of `tree`: start from an empty tree of the same type, fill it
# from the original root with no parent, finalize it, and keep it as `tree2`.
tree2 = let contracted = typeof(tree)()
    _contract!(contracted, tree.root, nothing)
    finalize!(contracted)
    contracted
end

IDTree{Int64} with 113 nodes

In [57]:
# Print the contracted taxonomy tree, allowing up to 99 levels of depth.
print_tree(rootnode(tree2); maxdepth=99)

(2) Bacteria [superkingdom]
├─ (1224) Proteobacteria [phylum]
│  ├─ (210) Helicobacter pylori [species]
│  │  ├─ (85962) Helicobacter pylori 26695 [strain]
│  │  └─ (85963) Helicobacter pylori J99 [strain]
│  ├─ (28216) Betaproteobacteria [class]
│  │  ├─ (80840) Burkholderiales [order]
│  │  │  ├─ (111527) pseudomallei group [species group]
│  │  │  │  ├─ (13373) Burkholderia mallei [species]
│  │  │  │  └─ (28450) Burkholderia pseudomallei [species]
│  │  │  └─ (517) Bordetella [genus]
│  │  │     ├─ (257310) Bordetella bronchiseptica RB50 [strain]
│  │  │     ├─ (1208660) Bordetella parapertussis Bpp5 [strain]
│  │  │     └─ (257313) Bordetella pertussis Tohama I [isolate]
│  │  └─ (482) Neisseria [genus]
│  │     ├─ (487) Neisseria meningitidis [species]
│  │     │  ├─ (272831) Neisseria meningitidis FAM18 [strain]
│  │     │  ├─ (122587) Neisseria meningitidis Z2491 [strain]
│  │     │  └─ (122586) Neisseria meningitidis MC58 [strain]
│  │     └─ (242231) Neisseria gonorrhoeae FA 