# 201017 Simple overlap component resolutions

In [None]:
using ProgressMeter
using FilePathsBase; using FilePathsBase: /
using GZip
using JSON
using Mmap
using DataFrames
using CSV
using StatsBase
using CategoricalArrays
using HDF5
using DataStructures: counter
using AbstractTrees

In [None]:
using PlotlyJS
using Colors
using ColorSchemes

In [None]:
using Revise

using Midas
using Midas.Pairwise: npairs

using ClusterAnalysis
using ClusterAnalysis: hclusttree_from_json

using MidasPlots
using MidasPlots: Dendrogram, dgleaf_edges, ClassValues
using MidasPlots.Plotly: PlotlyDendrogram, subplot_axes, subplot_axes!, gettraces, setaxes!, vector_attrs!, class_color_strip, make_colorscale

using TriMatrices

## Function definitions

In [None]:
"""
    findclass(a::CategoricalArray, cls)

Return the indices of the elements of `a` selected by `cls`.

`cls` may be a single `CategoricalValue`, an integer index into `a.pool`, a vector of
`CategoricalValue`s, or a vector of integers that is matched against `a.refs`.
"""
function findclass(a::CategoricalArray, cls::CategoricalValue)
    return findall(x -> x == cls, a)
end

# Integer form: look the value up in the pool, then defer to the value-based method.
function findclass(a::CategoricalArray, i::Integer)
    return findclass(a, a.pool[i])
end

# Several values at once: keep every element that is a member of `clss`.
function findclass(a::CategoricalArray, clss::AbstractVector{<:CategoricalValue})
    return findall(in(clss), a)
end

# Several integers at once: membership is tested on the raw reference codes.
function findclass(a::CategoricalArray, is::AbstractVector{<:Integer})
    return findall(in(is), a.refs)
end

"""
    selectclass(a::AbstractVector, c::CategoricalArray, cls)

Return the elements of `a` at the positions where `c` matches `cls`
(see `findclass` for the accepted forms of `cls`).
"""
function selectclass(a::AbstractVector, c::CategoricalArray, cls)
    positions = findclass(c, cls)
    return a[positions]
end

In [None]:
"""
    Namespace(; kw...)

Simple bag of named values backed by a `Dict{Symbol, Any}`.

Entries are read with `ns.name` and written with `ns.name = value`. The special
name `_fields` always returns the backing dictionary itself.
"""
struct Namespace
    _fields::Dict{Symbol, Any}
end

# Build the dictionary with its final type directly so that an empty or
# homogeneously-typed keyword list does not depend on an implicit conversion.
Namespace(; kw...) = Namespace(Dict{Symbol, Any}(kw))

"""
    getproperty(ns::Namespace, p::Symbol)

Return the value stored under `p`, or the backing dictionary when `p` is `:_fields`.
Throws an `ErrorException` when no value is stored under `p`.
"""
function Base.getproperty(ns::Namespace, p::Symbol)
    f = getfield(ns, :_fields)
    p === :_fields && return f
    # An explicit lookup with a fallback replaces the former `catch KeyError`,
    # which bound *any* exception to a variable named `KeyError` instead of
    # filtering by exception type.
    return get(f, p) do
        error("Namespace has no field $p")
    end
end

"""
    setproperty!(ns::Namespace, p::Symbol, v)

Store `v` under the name `p`, replacing any previous value, and return `v`.
"""
function Base.setproperty!(ns::Namespace, p::Symbol, v)
    f = getfield(ns, :_fields)
    f[p] = v
    return v
end

"""
    @unpack_ns container name1 name2 ...

Assign `container.name1`, `container.name2`, ... to variables of the same names in
the caller's scope. The `container` expression is evaluated once.
"""
macro unpack_ns(e, symbols::Symbol...)
    # Left-hand side: the caller's variables, escaped so they are not renamed.
    targets = Expr(:tuple, map(esc, symbols)...)
    # Right-hand side: one property access per requested name on the temporary `ns`.
    accessors = map(s -> Expr(:., :ns, QuoteNode(s)), symbols)
    values = Expr(:tuple, accessors...)
    rhs = quote
        let ns = $(esc(e))
            $values
        end
    end
    return Expr(:(=), targets, rhs)
end

# List the names currently stored in the namespace (the keys of its backing dictionary).
function Base.propertynames(ns::Namespace)
    fields = getfield(ns, :_fields)
    return collect(keys(fields))
end

## File paths

In [None]:
# Input locations used throughout this notebook, keyed by a short symbolic name.
infiles = Dict(
    # CSV table of taxonomy assignments; loaded into `taxdf`.
    :taxonomy => p"/home/jared/notebooks/midas/midas-notebooks-2019/build-v1-database/out/3-curated-taxonomy-assignments.csv",
    # Raw Float32 file that is memory-mapped as the pairwise distance data.
    :distances => p"../../data/intermediate/200727-find-overlaps/genome-pw-distances.raw-float32",
    # HDF5 file with the species overlap results.
    :overlaps => p"../../data/intermediate/200727-find-overlaps/200729-refseq-curated-1.1_beta-species-overlaps.h5",
    # Directory holding the per-component `tree-<i>.json` and `genomes-<i>.json` files.
    :trees => p"../../data/intermediate/201013-overlap-clustering-analysis/201017-overlap-component-clustering-reports/",
);

## Load data

### Taxonomy

In [None]:
# Parse the taxonomy CSV and materialise it as a DataFrame.
taxdf = CSV.File(infiles[:taxonomy]) |> DataFrame;

In [None]:
# Genus of every row as a categorical vector, plus its distinct levels.
sig_genera = categorical(taxdf.genus)
genera = levels(sig_genera)
ngenera = length(genera)

# A species is identified by its (genus, species) pair.
sig_species = categorical([(r.genus, r.species) for r in eachrow(taxdf)])
species = levels(sig_species)
speciesnames = [join(pair, " ") for pair in species]
nspecies = length(species)

(ngenera, nspecies)

In [None]:
# Tally how often each integer reference code occurs for genera and for species.
genus_counts, species_counts = map(counts, (sig_genera.refs, sig_species.refs));

In [None]:
# The accession is whatever follows the final "/" of each genome key.
genome_accs = [split(key, "/")[end] for key in taxdf.key];

In [None]:
# For each species level, the position of its genus within `genera`.
species_to_genus = [
    findfirst(g -> g == genus, genera)
    for (genus, _) in species
];

In [None]:
# Total number of rows in the taxonomy table.
nsigs = nrow(taxdf)

### Distances

In [None]:
# Memory-map the raw Float32 distance file as a flat vector of `npairs(nsigs)` values.
# The stream is only needed while the mapping is being created, so it is opened in a
# do-block and closed immediately afterwards instead of leaking the file handle.
pw_data = open(infiles[:distances], "r") do io
    Mmap.mmap(io, Vector{Float32}, (npairs(nsigs),))
end;

# Wrap the flat data in a triangular-matrix view over `nsigs` items.
pw_dists = TriMatrix(TriSymmetric{false}(), nsigs, pw_data);

In [None]:
# Sanity check: no stored distance may be negative.
@assert !any(d -> d < 0, pw_data)

### Overlaps

In [None]:
h5open(string(infiles[:overlaps])) do h5
    # Small local shorthand for reading one named dataset from the open file.
    dataset(name) = read(h5, name)

    # The stored genome keys must equal the taxonomy table's keys, element for element.
    @assert dataset("genome_keys") == taxdf[!, :key]

    # Pair each stored index with its distance.
    inter_pairs = zip(dataset("min_inter_indices"), dataset("min_inter_dists"))
    global min_inter = collect(inter_pairs)
    intra_pairs = zip(dataset("max_intra_indices"), dataset("max_intra_dists"))
    global max_intra = collect(intra_pairs)

    global has_overlap = dataset("has_overlap")
    global overlap_components_vec = dataset("overlap_components")
end;

In [None]:
# Invert the component labels: entry `k` lists every position labelled `k`.
overlap_components = map(1:maximum(overlap_components_vec)) do k
    findall(label -> label == k, overlap_components_vec)
end;

In [None]:
"""
    componentdata(i::Integer) -> Namespace

Collect everything needed to inspect overlap component `i` into one `Namespace`.

Reads the notebook globals `overlap_components`, `sig_species`, `species`,
`speciesnames`, `species_counts`, `pw_dists`, `genome_accs` and `infiles`, and loads
`tree-\$i.json` and `genomes-\$i.json` from the `infiles[:trees]` directory.

The returned namespace has the fields `i`, `species`, `idxs`, `idxs_flat`, `genera`,
`singlegenus`, `genus`, `groupnames`, `classnames`, `ngenomes`, `nspecies`,
`genome_labels`, `dmat` and `tree`.

Throws an `AssertionError` when the accession list stored in `genomes-\$i.json` differs
from the accessions selected here.
"""
function componentdata(i::Integer)
    comp = overlap_components[i]
    nspecies = length(comp)

    # Genome positions per species, and the same positions concatenated in species order.
    idxs = [findclass(sig_species, c) for c in comp]
    # Fold rather than splat the list into varargs; `init` keeps the result a
    # `Vector{Int}` even if the component is empty.
    idxs_flat = reduce(vcat, idxs; init=Int[])

    genera = unique(g for (g, s) in species[comp])
    singlegenus = length(genera) == 1

    # With a single genus the bare species names are used as group names;
    # otherwise the full "genus species" names are used.
    if singlegenus
        genus = only(genera)
        groupnames = [s for (g, s) in species[comp]]
    else
        genus = nothing
        groupnames = speciesnames[comp]
    end

    classnames = ClassValues(groupnames, "(mixed)")
    ngenomes = length(idxs_flat)
    # Label `l` (position of the species within the component) repeated once per genome.
    genome_labels = [l for (l, n) in enumerate(species_counts[comp]) for _ in 1:n]
    dmat = pw_dists[idxs_flat, idxs_flat]

    treedata = open(JSON.parse, infiles[:trees] / "tree-$i.json")
    tree = hclusttree_from_json(treedata)

    # The stored tree is only usable if its genomes match the ones selected above.
    accs = open(JSON.parse, infiles[:trees] / "genomes-$i.json")
    @assert accs == genome_accs[idxs_flat]

    return Namespace(;
        i, species=comp, idxs, idxs_flat, genera, singlegenus,
        genus, groupnames, classnames, ngenomes, nspecies,
        genome_labels, dmat, tree
    )
end

# Inspect a single component

In [None]:
# Build the data bundle for overlap component 3 and show the names of its species.
comp = componentdata(3)
speciesnames[comp.species]

In [None]:
# Display the clustering tree that was loaded for the selected component.
comp.tree