# 210511 Extra overlap data

In [1]:
using Mmap
using Printf
using Statistics

In [2]:
using JSON
using CSV
using DataFrames
using FilePathsBase
using FilePathsBase: /
using ProgressMeter
using Clustering
using Arrow
using HDF5

In [3]:
using Midas.Pairwise: npairs, iterpairs
using TriMatrices

## Setup

In [4]:
# Set the COLUMNS environment variable to a large value; presumably so that wide
# tables displayed later in the notebook are not truncated to the default width.
ENV["COLUMNS"] = 400

400

In [5]:
# Date prefix identifying this notebook.
DATESTR = "210511"
# Full notebook name; also used below as the name of the intermediate output directory.
NBNAME = string(DATESTR, "-extra-overlap-data")

"210511-extra-overlap-data"

In [6]:
# Input locations used by this notebook, keyed by role.
infiles = Dict(
    # Raw Float32 file of pairwise genome distances (memory-mapped in the "Distances" section).
    :distances => p"../../data/intermediate/200727-find-overlaps/genome-pw-distances.raw-float32",
    # Directory from which taxa.csv and genome-taxon-assignments.json are read.
    :db => p"../../data/intermediate/210401-database-v2-fix-species-overlaps/210424-compile-fixes/",
    # Directory from which overlaps.json and calculated.json are read.
    :overlaps => p"data-intermediate/210428-find-remaining-overlaps/",
);

In [7]:
# Directory receiving this notebook's output files; created if it does not exist yet.
intermediate_out = p"data-intermediate" / NBNAME
if !isdir(intermediate_out)
    mkdir(intermediate_out)
end;

## Load data

### Database

In [8]:
# Taxon table, read from taxa.csv in the database directory.
taxa = CSV.File(infiles[:db] / "taxa.csv") |> DataFrame;

In [9]:
# One integer per genome, parsed from genome-taxon-assignments.json.
# NOTE(review): values look like taxon IDs with 0 meaning "unassigned" (0 is excluded
# when these are compared against the leaf taxon IDs further down) — confirm.
genome_assignments = open(infiles[:db] / "genome-taxon-assignments.json") do io
    Vector{Int}(JSON.parse(io))
end
ngenomes = length(genome_assignments)

50752

In [10]:
# Taxon ID => row number of that taxon in `taxa`.
tid_to_tidx = Dict(taxa[i, :id] => i for i in 1:nrow(taxa))

# Row number in `taxa` for a single taxon ID (a KeyError is thrown for unknown IDs).
function taxon_index(tid::Integer)
    return tid_to_tidx[tid]
end

# Row numbers in `taxa` for a vector of taxon IDs, element by element.
function taxon_index(tids::AbstractVector{<:Integer})
    return taxon_index.(tids)
end

# Fetch data from `taxa` by taxon ID (or vector of IDs).
# `cols` selects the column(s) to return and defaults to all of them.
function lookup_tid(tid, cols=:)
    return taxa[taxon_index(tid), cols]
end

lookup_tid (generic function with 2 methods)

### Distances

In [11]:
# Memory-map the flat Float32 vector holding one distance per genome pair.
# The file is opened in a do-block so the handle is closed as soon as the mapping has
# been created (previously the handle returned by `open` was never closed). The mapped
# array stays valid after the stream is closed; this is the same pattern that
# `Mmap.mmap(filename, ...)` uses internally.
pw_data = open(infiles[:distances]) do io
    Mmap.mmap(io, Vector{Float32}, (npairs(ngenomes),))
end;

# Wrap the flat pair data as an ngenomes × ngenomes triangular-storage matrix.
# NOTE(review): `TriSymmetric{false}` presumably means "symmetric, diagonal not stored" —
# confirm against TriMatrices.
pw_dists = TriMatrix(TriSymmetric{false}(), ngenomes, pw_data);

### Overlaps

In [12]:
# Source taxon ID => list of (source leaf taxid, destination leaf taxid, distance)
# tuples, built from overlaps.json.
overlaps = let
    records = open(JSON.parse, infiles[:overlaps] / "overlaps.json")

    entries = map(records) do rec
        triples = zip(rec["src_leaf_taxids"], rec["dst_leaf_taxids"], rec["distances"])
        rec["src_taxid"] => collect(triples)
    end

    Dict(entries)
end

# Number of overlap entries recorded per taxon (0 when the taxon has none),
# plus a convenience flag for "has at least one".
taxa[!, :noverlaps] = [length(get(overlaps, id, ())) for id in taxa[!, :id]]
taxa[!, :has_overlaps] = [n > 0 for n in taxa.noverlaps];

In [13]:
# Attach the per-taxon values stored in calculated.json to `taxa` as typed columns.
let
    records = open(JSON.parse, infiles[:overlaps] / "calculated.json")
    # Index the records by taxon ID so they can be matched to the rows of `taxa`
    by_id = Dict(rec["id"] => rec for rec in records)

    # JSON field name (also used as the column name) => element type to convert to
    column_types = [
        :is_root => Bool,
        :leaves => Vector{Int},
        :genomes => Vector{Int},
        :diameter => Float32,
        :threshold => Float32,
        :max_leaf_threshold => Float32,
        :min_inter_src => Int,
        :min_inter_dst => Int,
        :min_inter_dist => Float32,
    ]

    for (name, T) in column_types
        field = string(name)
        taxa[!, name] = T[by_id[id][field] for id in taxa[!, :id]]
    end
end

## Calculated

In [14]:
# Sizes of each taxon's leaf list and genome list
taxa[!, :nleaves] = map(length, taxa.leaves)
taxa[!, :ngenomes] = map(length, taxa.genomes)
# Text label of the form "<id> <name>"
taxa[!, :plot_label] = [string(id, " ", name) for (id, name) in zip(taxa.id, taxa.name)];

### Leaf data

In [15]:
# Rows of `taxa` flagged as leaves, and the taxon IDs found on those rows
leaf_tidxs = findall(taxa.is_leaf)
leaf_tids = taxa.id[leaf_tidxs]
nleaves = length(leaf_tidxs)

# Sanity check: the leaf taxa are exactly the nonzero values in the genome assignments
@assert issetequal(leaf_tids, setdiff(Set(genome_assignments), 0))

# Leaf taxon ID => position of that leaf within `leaf_tids`
leaf_id_to_index = Dict(leaf_tids[i] => i for i in eachindex(leaf_tids))
# For every taxon, its leaves expressed as positions in `leaf_tids`
taxa[!, :leaf_idxs] = map(taxa.leaves) do leaves
    [leaf_id_to_index[id] for id in leaves]
end
# `genomes` entries of the leaf taxa, in `leaf_tids` order
leaf_gidxs = taxa.genomes[leaf_tidxs]

nleaves

1445

In [16]:
# Leaf-by-leaf summaries (minimum, maximum, mean) of the genome-level distances.
# Each matrix is nleaves × nleaves and is filled symmetrically; rows and columns
# follow the order of `leaf_tids`.
leaf_pw_min_dists, leaf_pw_max_dists, leaf_pw_mean_dists =
    ntuple(_ -> zeros(eltype(pw_dists), nleaves, nleaves), 3)

# NOTE(review): the `true` argument presumably makes `iterpairs` include the (i, i)
# pairs, so the diagonals are filled too — confirm against Midas.Pairwise.
@showprogress for (a, b) in iterpairs(nleaves, true)
    # Distances between the genomes of leaf `a` and the genomes of leaf `b`
    block = view(pw_dists, leaf_gidxs[a], leaf_gidxs[b])

    for (out, stat) in (
        (leaf_pw_min_dists, minimum),
        (leaf_pw_max_dists, maximum),
        (leaf_pw_mean_dists, mean),
    )
        out[a, b] = out[b, a] = stat(block)
    end
end

Progress: 100%|█████████████████████████████████████████| Time: 0:00:35


In [17]:
# Reorder each taxon's leaves by the leaf order of an average-linkage hierarchical
# clustering of their mean pairwise distances, so that similar leaves end up next to
# each other when plotting. Taxa with a single leaf (or none) are left untouched.
for taxon in eachrow(taxa)
    if length(taxon.leaves) > 1
        idxs = taxon.leaf_idxs
        order = hclust(leaf_pw_mean_dists[idxs, idxs], linkage=:average).order

        # Apply the same permutation to both columns so they stay aligned
        permute!(taxon.leaves, order)
        permute!(taxon.leaf_idxs, order)
    end
end

## Write output

In [18]:
# Save the augmented taxon table in Arrow format; the path is passed as a plain String.
let dest = string(intermediate_out / "taxa.arrow")
    Arrow.write(dest, taxa)
end

"data-intermediate/210511-extra-overlap-data/taxa.arrow"

In [19]:
# Save the leaf row indices (into `taxa`) and the three leaf-by-leaf distance
# matrices as datasets of a new HDF5 file (mode "w").
h5open(string(intermediate_out / "leaf-data.h5"), "w") do h5
    for (name, dataset) in (
        "idxs" => leaf_tidxs,
        "pw_min_dists" => leaf_pw_min_dists,
        "pw_max_dists" => leaf_pw_max_dists,
        "pw_mean_dists" => leaf_pw_mean_dists,
    )
        h5[name] = dataset
    end
end;