# 210718 calculate additional data

In [1]:
using Mmap
using Statistics

In [2]:
using JSON
using DataFrames
using FilePathsBase
using FilePathsBase: /
using CSV
using Arrow
using HDF5
using ProgressMeter

In [3]:
using Midas.Pairwise: npairs, iterpairs
using TriMatrices

## Setup

In [4]:
# Override the terminal size reported through the environment.
# Presumably this lets wide/long tables display in the notebook without truncation.
ENV["COLUMNS"] = 400
ENV["LINES"] = 100

100

In [5]:
# Date prefix of this notebook and the notebook name derived from it.
# NBNAME is used below as the name of the output directory.
DATESTR = "210718"
NBNAME = "$DATESTR-calculate-additional-data"

"210718-calculate-additional-data"

In [6]:
# Input locations, keyed by role:
#   :distances - raw Float32 file of pairwise genome distances (memory-mapped below)
#   :db        - directory from which taxa.csv and genome-assignments.json are read
infiles = Dict(
    :distances => p"../../data/intermediate/200727-find-overlaps/genome-pw-distances.raw-float32",
    :db => p"data-intermediate/210718-compile-edits/",
);

In [7]:
# Directory that receives this notebook's outputs; created on first run if it is absent.
# NOTE(review): `mkdir` is presumably non-recursive here, so the parent
# `data-intermediate` directory must already exist - confirm.
intermediate_out = p"data-intermediate" / NBNAME
isdir(intermediate_out) || mkdir(intermediate_out)

true

## Load data

### Database

In [8]:
# Parse taxa.csv from the database directory and materialize it as a DataFrame.
taxa = CSV.File(string(infiles[:db] / "taxa.csv")) |> DataFrame;

In [9]:
# Per-genome taxon assignment, read from JSON and converted to a vector of Int IDs.
genome_assignments = open(infiles[:db] / "genome-assignments.json") do io
    Vector{Int}(JSON.parse(io))
end
ngenomes = length(genome_assignments)

50752

In [10]:
# Map each taxon ID to the row number it occupies in `taxa`.
tid_to_tidx = Dict(tid => rownum for (rownum, tid) in pairs(taxa.id))

# Row number in `taxa` for a single taxon ID (a `KeyError` is raised for unknown IDs).
function taxon_index(tid::Integer)
    return tid_to_tidx[tid]
end

# Row numbers in `taxa` for a vector of taxon IDs, in the same order.
function taxon_index(tids::AbstractVector{<:Integer})
    return taxon_index.(tids)
end

# Fetch the row(s) of `taxa` for the given taxon ID(s), optionally restricted to `cols`.
function lookup_tid(tid, cols=:)
    return taxa[taxon_index(tid), cols]
end

lookup_tid (generic function with 2 methods)

### Distances

In [11]:
# Memory-map the flat file of pairwise genome distances.
#
# The file is opened with the do-block form so the stream is closed as soon as the
# mapping is established (the mapped array remains usable after the stream is closed).
# The file size is checked first: it must hold exactly one Float32 per genome pair,
# otherwise the genome list and the distance data are out of sync and every distance
# looked up afterwards would be wrong.
pw_data = open(infiles[:distances]) do io
    expected_bytes = npairs(ngenomes) * sizeof(Float32)
    actual_bytes = filesize(io)
    actual_bytes == expected_bytes || error(
        "distance file holds $actual_bytes bytes, expected $expected_bytes " *
        "for $ngenomes genomes"
    )
    Mmap.mmap(io, Vector{Float32}, (npairs(ngenomes),))
end;

# Symmetric triangular-matrix view over the mapped data, indexable by genome pair.
pw_dists = TriMatrix(TriSymmetric{false}(), ngenomes, pw_data);

## Taxonomy tree

### Child list

In [12]:
# Start every taxon with its own empty child list ...
taxa[!, :children] = map(_ -> Int[], 1:nrow(taxa))

# ... then register each taxon that has a parent in that parent's list.
for node in eachrow(taxa)
    if !ismissing(node.parent_id)
        push!(lookup_tid(node.parent_id, :children), node.id)
    end
end

### Leaves

In [13]:
# Row numbers of the taxa flagged as leaves, their taxon IDs, and the leaf count.
leaf_tidxs = findall(taxa[!, :is_leaf])
leaf_tids = taxa.id[leaf_tidxs]
nleaves = length(leaf_tids)

1445

In [14]:
# Every genome must carry either 0 (skipped when genomes are grouped by leaf further
# down, so presumably "unassigned") or the ID of a leaf taxon. Membership is tested
# against a Set so the check does not rescan the whole leaf vector for every genome.
let leaf_id_set = Set(leaf_tids)
    @assert(
        all(id -> id == 0 || id ∈ leaf_id_set, genome_assignments),
        "genome_assignments contains IDs that are neither 0 nor a leaf taxon ID",
    )
end

In [15]:
# For every taxon, collect the positions (within `leaf_tids`) of all leaves beneath it.
# Results are memoized per taxon ID; the cache is seeded with each leaf mapping to itself.
let
    cache = Dict(tid => [pos] for (pos, tid) in enumerate(leaf_tids))

    function collect_leaves(tid)
        known = get(cache, tid, nothing)
        isnothing(known) || return known

        # Concatenate the leaf positions of all children, in child order.
        gathered = Int[]
        for child in lookup_tid(tid, :children)
            append!(gathered, collect_leaves(child))
        end
        cache[tid] = gathered
        return gathered
    end

    taxa[!, :leaf_idxs] = collect_leaves.(taxa.id)
end;

In [16]:
# Translate each taxon's leaf positions into the corresponding leaf taxon IDs.
taxa[!, :leaves] = map(positions -> leaf_tids[positions], taxa.leaf_idxs);

In [17]:
# Sanity checks: leaf lists only contain known leaf IDs, and no taxon is without leaves.
@assert all(members -> members ⊆ leaf_tids, taxa.leaves)
@assert all(!isempty, taxa.leaves)

In [18]:
# Number of leaves beneath each taxon.
taxa[!, :nleaves] = map(length, taxa.leaves);

### Genomes

In [19]:
# Attach to every taxon the indices of all genomes assigned to any of its leaves.
let
    # Group genome indices by the leaf they are assigned to; assignment 0 is skipped.
    by_leaf = Dict(tid => Int[] for tid in leaf_tids)
    for (genome_idx, assigned) in enumerate(genome_assignments)
        if assigned != 0
            push!(by_leaf[assigned], genome_idx)
        end
    end

    # A taxon's genomes are the concatenation of its leaves' genomes, in leaf order.
    taxa[!, :genomes] = map(eachrow(taxa)) do node
        collect(Iterators.flatten(by_leaf[tid] for tid in node.leaves))
    end
end;

In [20]:
# Number of genomes beneath each taxon.
taxa[!, :ngenomes] = map(length, taxa.genomes);

## Distance data

### Leaf distances

In [21]:
# Leaf-by-leaf summary matrices of genome distances: for each pair of leaves, the
# minimum, maximum and mean over all genome pairs drawn from the two leaves.
leaf_pw_min_dists = Matrix{eltype(pw_dists)}(undef, nleaves, nleaves)
leaf_pw_max_dists = similar(leaf_pw_min_dists)
leaf_pw_mean_dists = similar(leaf_pw_min_dists)

@showprogress for (a, b) in iterpairs(nleaves, true)
    # Sub-block of the genome distance matrix spanned by the genomes of the two leaves.
    block = view(
        pw_dists,
        lookup_tid(leaf_tids[a], :genomes),
        lookup_tid(leaf_tids[b], :genomes),
    )

    # Each statistic is written to both (a, b) and (b, a) to keep the matrices symmetric.
    lo = minimum(block)
    leaf_pw_min_dists[a, b] = lo
    leaf_pw_min_dists[b, a] = lo

    hi = maximum(block)
    leaf_pw_max_dists[a, b] = hi
    leaf_pw_max_dists[b, a] = hi

    avg = mean(block)
    leaf_pw_mean_dists[a, b] = avg
    leaf_pw_mean_dists[b, a] = avg
end

[32mProgress: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| Time: 0:00:55[39m:51[39m


### Diameters

In [22]:
# Diameter of a taxon: the largest leaf-to-leaf maximum distance among its own leaves.
taxa[!, :diameter] = map(taxa.leaf_idxs) do members
    maximum(@view leaf_pw_max_dists[members, members])
end;

### Min-inter values

In [23]:
# For every taxon, find the closest pair of leaves with one leaf inside the taxon and
# one outside it, judged by `leaf_pw_min_dists`. The inside leaf ID, outside leaf ID and
# distance are stored in the `min_inter_src`, `min_inter_dst` and `min_inter_dist` columns.
#
# A taxon that contains every leaf has no outside leaf to compare against. Such rows get
# `missing` in all three columns instead of aborting the whole computation.
let
    src_vals = Union{Missing,Int}[]
    dst_vals = Union{Missing,Int}[]
    dist_vals = Union{Missing,eltype(pw_dists)}[]

    # Drop `Missing` from the element type when no row actually needed it, so the
    # resulting columns keep plain integer / float element types in the usual case.
    narrow(vals) = any(ismissing, vals) ? vals : collect(skipmissing(vals))

    for row in eachrow(taxa)
        others = setdiff(1:nleaves, row.leaf_idxs)

        # Strict `<` keeps the first minimum encountered (inside leaf outer, outside
        # leaf inner), which fixes how ties are broken.
        best = nothing
        for src_i in row.leaf_idxs, dst_i in others
            d = leaf_pw_min_dists[src_i, dst_i]
            if isnothing(best) || d < best[1]
                best = (d, src_i, dst_i)
            end
        end

        if isnothing(best)
            # No (inside, outside) pair exists for this taxon.
            push!(dist_vals, missing)
            push!(src_vals, missing)
            push!(dst_vals, missing)
        else
            push!(dist_vals, best[1])
            push!(src_vals, leaf_tids[best[2]])
            push!(dst_vals, leaf_tids[best[3]])
        end
    end

    taxa[!, :min_inter_src] = narrow(src_vals)
    taxa[!, :min_inter_dst] = narrow(dst_vals)
    taxa[!, :min_inter_dist] = narrow(dist_vals)
end;

## Write output

In [24]:
# Persist the augmented taxon table in Arrow format inside the output directory.
let destination = string(intermediate_out / "taxa.arrow")
    Arrow.write(destination, taxa)
end

"data-intermediate/210718-calculate-additional-data/taxa.arrow"

In [25]:
# Store the leaf IDs, their row numbers in `taxa`, and the three leaf-by-leaf distance
# matrices as datasets of a freshly created HDF5 file.
h5open(string(intermediate_out / "leaf-data.h5"), "w") do h5
    for (dataset, data) in (
        "ids" => leaf_tids,
        "idxs" => leaf_tidxs,
        "pw_min_dists" => leaf_pw_min_dists,
        "pw_max_dists" => leaf_pw_max_dists,
        "pw_mean_dists" => leaf_pw_mean_dists,
    )
        h5[dataset] = data
    end
end;