# 210520 Min inter outliers

In [1]:
using Mmap
using Statistics

In [2]:
using JSON
using DataFrames
using FilePathsBase
using FilePathsBase: /
using ProgressMeter
using Arrow
using HDF5
using PlotlyJS

In [3]:
using Midas.Pairwise: npairs, iterpairs
using TriMatrices
using MidasPlots.Plotly: vector_attrs!, axisname, setaxes!

## Plotting code

In [4]:
include("src/GridAxes.jl")
using .GridAxesModule

In [5]:
# Lazily yield the (row, col) index of every cell of `ga`: rows are visited from `ga.nrow`
# down to 1 and, within each row, columns from 1 up to `ga.ncol`.
function itercells(ga::GridAxes)
    rows = reverse(1:ga.nrow)
    cols = 1:ga.ncol
    return ((r, c) for r in rows for c in cols)
end

itercells (generic function with 1 method)

In [6]:
# Build a Plotly annotation that acts as the title of one subplot. The text is centred
# horizontally on the x axis' `:domain` and anchored by its bottom edge to the upper end of
# the y axis' `:domain`, both in paper coordinates. Any extra keyword arguments are merged
# into the annotation with `merge!`.
function subplot_title(text, xaxis, yaxis; kw...)
    xlo, xhi = xaxis[:domain]
    _, yhi = yaxis[:domain]

    title = attr(
        text=text,
        xref=:paper,
        x=(xhi + xlo) / 2,
        xanchor=:center,
        yref=:paper,
        y=yhi,
        yanchor=:bottom,
        showarrow=false,
    )

    if !isempty(kw)
        merge!(title, attr(; kw...))
    end

    return title
end

subplot_title (generic function with 1 method)

In [7]:
# Partition the elements of `itr` by the value `by` returns for each of them.
# Returns a Dict mapping every distinct key to the vector of elements that produced it;
# within a group the elements keep the order in which they appear in `itr`.
# Non-array iterables are collected first so they are traversed only once.
function groupby(by::Function, itr)
    elems = itr isa AbstractArray ? itr : collect(itr)
    labels = [by(x) for x in elems]

    E = eltype(elems)
    groups = Dict{eltype(labels), Vector{E}}()

    for (label, x) in zip(labels, elems)
        bucket = get!(groups, label) do
            E[]
        end
        push!(bucket, x)
    end

    return groups
end

groupby (generic function with 1 method)

## Setup

In [8]:
# Set the COLUMNS environment variable to 400 (presumably to widen displayed output — confirm).
ENV["COLUMNS"] = 400

400

In [9]:
# Date prefix and notebook name. NBNAME names the report directory created below and
# DATESTR prefixes the HTML files written into it.
DATESTR = "210520"
NBNAME = "$DATESTR-min-inter-outliers"

"210520-min-inter-outliers"

In [10]:
# Input locations (FilePathsBase path literals, all relative paths).
infiles = Dict(
    # Raw Float32 file of pairwise genome distances; memory-mapped below
    :distances => p"../../data/intermediate/200727-find-overlaps/genome-pw-distances.raw-float32",
    # Directory that contains genome-taxon-assignments.json
    :db => p"../../data/intermediate/210401-database-v2-fix-species-overlaps/210424-compile-fixes/",
    # Directory that contains overlaps.json
    :overlaps => p"data-intermediate/210428-find-remaining-overlaps/",
    # Directory that contains taxa.arrow and leaf-data.h5
    :data => p"data-intermediate/210511-extra-overlap-data/",
);

In [11]:
# Directory that receives the HTML reports written by this notebook.
reports_out = p"reports" / NBNAME
# Create the directory together with any missing parents: a plain `mkdir` errors when the
# parent "reports" directory does not exist yet (e.g. on a fresh checkout).
isdir(reports_out) || mkdir(reports_out; recursive=true);

## Load data

### Database

In [12]:
# Load the taxon table from its Arrow file into a DataFrame. Columns read later in this
# notebook: id, genomes, ngenomes, min_inter_dist, diameter, plot_label.
taxa = DataFrame(Arrow.Table(string(infiles[:data] / "taxa.arrow")));

In [13]:
# One integer per genome, parsed from JSON (presumably the taxon each genome is assigned
# to — confirm whether these are taxon IDs or indices). Its length is the number of genomes.
genome_assignments = Vector{Int}(open(JSON.parse, infiles[:db] / "genome-taxon-assignments.json"))
ngenomes = length(genome_assignments)

50752

In [14]:
# Map from taxon ID to the row number of that taxon in `taxa`
tid_to_tidx = Dict(tid => row for (row, tid) in enumerate(taxa[!, :id]))

# Row number in `taxa` for a single taxon ID
function taxon_index(tid::Integer)
    return tid_to_tidx[tid]
end

# Row numbers in `taxa` for a vector of taxon IDs
function taxon_index(tids::AbstractVector{<:Integer})
    return taxon_index.(tids)
end

# Row(s) of `taxa` for the given taxon ID(s), optionally restricted to the columns `cols`
function lookup_tid(tid, cols=:)
    return taxa[taxon_index(tid), cols]
end

lookup_tid (generic function with 2 methods)

### Distances

In [15]:
# Memory-map the raw Float32 file that stores one distance per genome pair. The stream is
# only needed while the mapping is being created, so it is closed as soon as `mmap` returns
# instead of being left open for the rest of the session.
pw_data = open(infiles[:distances]) do io
    Mmap.mmap(io, Vector{Float32}, (npairs(ngenomes),))
end;

# Wrap the flat buffer in a TriMatrix over `ngenomes` items with the TriSymmetric{false} layout.
pw_dists = TriMatrix(TriSymmetric{false}(), ngenomes, pw_data);

### Overlaps

In [16]:
# Read the overlap records and index them by their "src_taxid". Each value is the list of
# (src leaf taxon ID, dst leaf taxon ID, distance) triples stored in that record.
overlaps = let
    records = open(JSON.parse, infiles[:overlaps] / "overlaps.json")

    entries = map(records) do rec
        triples = zip(rec["src_leaf_taxids"], rec["dst_leaf_taxids"], rec["distances"])
        rec["src_taxid"] => collect(triples)
    end

    Dict(entries)
end

# Number of overlap triples per taxon (0 for taxa without an entry) and a has-any flag
taxa[!, :noverlaps] = [length(get(overlaps, id, ())) for id in taxa[!, :id]]
taxa[!, :has_overlaps] = map(n -> n > 0, taxa.noverlaps)
;

### Leaf data

In [17]:
# Read the per-leaf datasets from the HDF5 file. The do-block form of `h5open` returns the
# value of the block, so the four arrays are handed back as a tuple and bound here.
leaf_tidxs, leaf_pw_min_dists, leaf_pw_max_dists, leaf_pw_mean_dists =
    h5open(string(infiles[:data] / "leaf-data.h5"), "r") do h5
        (
            read(h5, "idxs"),
            read(h5, "pw_min_dists"),
            read(h5, "pw_max_dists"),
            read(h5, "pw_mean_dists"),
        )
    end;

In [18]:
# Taxon IDs of the leaves (the rows of `taxa` selected by `leaf_tidxs`), a reverse map from
# leaf taxon ID to its position in `leaf_tids`, and the number of leaves.
leaf_tids = taxa[leaf_tidxs, :id]
leaf_id_to_index = Dict(id => i for (i, id) in enumerate(leaf_tids))
nleaves = length(leaf_tidxs)
;

## Calculations

In [19]:
# For every taxon with overlaps, compute the percentiles (0%, 1%, ..., 100%) of the pairwise
# distances among the taxon's own genomes and keep only the tail that starts at the last
# percentile not exceeding the taxon's `min_inter_dist`.
# Result: taxon ID => (probabilities, quantile values).
quantiles = let
    p_base = LinRange(0., 1., 101)

    @showprogress map(collect(keys(overlaps))) do id
        taxon = lookup_tid(id)
        vals = reshape(view(pw_dists, taxon.genomes, taxon.genomes), :)
        q = quantile(vals, p_base)
        # `searchsortedlast` returns 0 when `min_inter_dist` is smaller than every quantile,
        # which would make `i:end` an out-of-bounds range; clamp so the whole curve is kept.
        i = max(1, searchsortedlast(q, taxon.min_inter_dist))
        id => (p_base[i:end], q[i:end])
    end |> Dict
end;

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:22[39m


In [20]:
# Regroup each taxon's overlap triples by source leaf:
# taxon ID => (src leaf ID => (dst leaf ID => distance)).
overlap_groups_by_src = Dict(
    map(collect(overlaps)) do (id, triplets)
        by_src = Dict{Int, Dict{Int, Float32}}()

        for (src, dst, d) in triplets
            inner = get!(by_src, src) do
                Dict{Int, Float32}()
            end
            inner[dst] = d
        end

        id => by_src
    end
);

In [21]:
# Regroup each taxon's overlap triples by destination leaf:
# taxon ID => (dst leaf ID => (src leaf ID => distance)).
overlap_groups_by_dst = Dict(
    map(collect(overlaps)) do (id, triplets)
        by_dst = Dict{Int, Dict{Int, Float32}}()

        for (src, dst, d) in triplets
            inner = get!(by_dst, dst) do
                Dict{Int, Float32}()
            end
            inner[src] = d
        end

        id => by_dst
    end
);

## Plots

### By src

In [22]:
# For every taxon with overlaps, list the genomes of its overlapping source leaves together
# with each genome's minimum distance to any genome of the destination leaves paired with
# that source leaf. Rows are sorted by distance and only the `max_genomes` closest are kept.
# Result: taxon ID => [(src leaf ID, genome index, min distance), ...].
by_src_data = let
    max_genomes = 30

    Dict(
        map(collect(overlap_groups_by_src)) do (id, group)
            rows = Tuple{Int, Int, Float32}[]

            for (src_id, dsts) in group
                # Genomes of all destination leaves recorded for this source leaf
                dst_genomes = collect(
                    Iterators.flatten(lookup_tid(dst, :genomes) for dst in keys(dsts))
                )
                for g in lookup_tid(src_id, :genomes)
                    nearest = minimum(view(pw_dists, g, dst_genomes))
                    push!(rows, (src_id, g, nearest))
                end
            end

            sort!(rows, by=last)
            if length(rows) > max_genomes
                resize!(rows, max_genomes)
            end

            id => rows
        end
    )
end;

In [23]:
# Draw one subplot per taxon with overlaps and save the grid as an HTML report.
# Each subplot shows, on a shared 0..1 x axis:
#   * the tail of the taxon's intra-taxon distance quantiles (light gray, filled),
#   * the taxon's diameter (dashed red horizontal line),
#   * one line+marker trace per source leaf with the closest genomes from `by_src_data`.
let
    # Taxa to plot
    to_plot = collect(keys(overlaps))
    sort!(to_plot)

    # Grid
    ncells = length(to_plot)
    ncol = 4
    nrow = cld(ncells, ncol)  # enough rows to hold every cell
    cellwidth = 400
    cellheight = 300

    ga = GridAxes(
        nrow,
        ncol,
        sharex=true,
        xbase=attr(
            visible=false,
        ),
        ybase=attr(
            zeroline=false,
            showspikes=true,
            spikethickness=1,
            # Plotly's spikemode flags are "toaxis", "across" and "marker"; the previous
            # value "taxis+across" contained a misspelled flag.
            spikemode="toaxis+across",
        ),
        xsep=.25 / ncol,
        ysep=.1 / nrow,
    )

    # Traces
    traces = GenericTrace[]
    annotations = []

    # Taxa are assigned to grid cells in the order produced by `itercells`
    for (id, (gr, gc)) in zip(to_plot, itercells(ga))
        taxon = lookup_tid(id)
        data = by_src_data[id]

        # Group rows by source leaf; leaves with the smallest minimum distance come first
        data_gb = collect(groupby(first, data))
        sort!(data_gb, by=p -> minimum(last, p[2]))

        # Quantile trace
        let (p, q) = quantiles[id]
            trace = scatter(
                x=p,
                y=q,
                mode=:lines,
                line_color="lightgray",
                fill=:tozerox,
                hoverinfo="x+y",
            )
            setaxes!(trace, ga, gr, gc)
            push!(traces, trace)
        end

        # Diameter trace
        let
            trace = scatter(
                x=[0, 1],
                y=[taxon.diameter, taxon.diameter],
                mode=:lines,
                line_dash=:dash,
                line_color=:red,
                hoverinfo=:skip,
            )
            setaxes!(trace, ga, gr, gc)
            push!(traces, trace)
        end

        # Distance traces: groups are laid out left to right, `x0` counts the points
        # already placed so that all points of the cell are evenly spaced inside (0, 1)
        x0 = 0
        for (src_id, items) in data_gb
            src_ng = lookup_tid(src_id, :ngenomes)
            ii = 1:length(items)

            trace = scatter(
                name=lookup_tid(src_id, :plot_label),
                x=(x0 .+ ii) ./ (length(data) + 1),
                y=last.(items),
                text=[g for (src, g, d) in items],
                # Circle while `src_ng - i >= 2`, i.e. presumably while at least two of the
                # leaf's genomes would remain after the i closest ones — confirm intent
                marker_symbol=[src_ng - i >= 2 ? :circle : :x for i in ii],
                mode="lines+markers",
                hoverinfo="y+text+name",
            )
            setaxes!(trace, ga, gr, gc)
            push!(traces, trace)
            x0 += length(items)
        end

        # Title
        push!(annotations, subplot_title(
            taxon.plot_label,
            ga.xaxes_grid[gr, gc],
            ga.yaxes_grid[gr, gc],
        ))
    end

    # Layout
    layout = Layout(
        annotations=annotations,
        showlegend=false,
        width=cellwidth * ncol,
        height=cellheight * nrow,
        hovermode=:closest,
    )
    setaxes!(layout, ga)

    # Save
    plt = plot(traces, layout)

    savehtml(plt, string(reports_out / "$DATESTR-min-inter-outliers-by-src.html"), :remote)
end

### By dst

In [24]:
# For every taxon with overlaps, list the genomes of its overlapping destination leaves
# together with each genome's minimum distance to any genome of the taxon itself. Rows are
# sorted by distance and only the `max_genomes` closest are kept.
# Result: taxon ID => [(dst leaf ID, genome index, min distance), ...].
by_dst_data = let
    max_genomes = 30

    Dict(
        map(collect(overlap_groups_by_dst)) do (id, group)
            own_genomes = lookup_tid(id).genomes
            rows = Tuple{Int, Int, Float32}[]

            for dst_id in keys(group)
                for g in lookup_tid(dst_id, :genomes)
                    nearest = minimum(view(pw_dists, g, own_genomes))
                    push!(rows, (dst_id, g, nearest))
                end
            end

            sort!(rows, by=last)
            if length(rows) > max_genomes
                resize!(rows, max_genomes)
            end

            id => rows
        end
    )
end;

In [25]:
# Draw one subplot per taxon with overlaps and save the grid as an HTML report.
# Each subplot shows, on a shared 0..1 x axis:
#   * the tail of the taxon's intra-taxon distance quantiles (light gray, filled),
#   * the taxon's diameter (dashed red horizontal line),
#   * one line+marker trace per destination leaf with the closest genomes from `by_dst_data`.
let
    # Taxa to plot
    to_plot = collect(keys(overlaps))
    sort!(to_plot)

    # Grid
    ncells = length(to_plot)
    ncol = 4
    nrow = cld(ncells, ncol)  # enough rows to hold every cell
    cellwidth = 400
    cellheight = 300

    ga = GridAxes(
        nrow,
        ncol,
        sharex=true,
        xbase=attr(
            visible=false,
        ),
        ybase=attr(
            zeroline=false,
            showspikes=true,
            spikethickness=1,
            # Plotly's spikemode flags are "toaxis", "across" and "marker"; the previous
            # value "taxis+across" contained a misspelled flag.
            spikemode="toaxis+across",
        ),
        xsep=.25 / ncol,
        ysep=.1 / nrow,
    )

    # Traces
    traces = GenericTrace[]
    annotations = []

    # Taxa are assigned to grid cells in the order produced by `itercells`
    for (id, (gr, gc)) in zip(to_plot, itercells(ga))
        taxon = lookup_tid(id)
        data = by_dst_data[id]

        # Group rows by destination leaf; leaves with the smallest minimum distance first
        data_gb = collect(groupby(first, data))
        sort!(data_gb, by=p -> minimum(last, p[2]))

        # Quantile trace
        let (p, q) = quantiles[id]
            trace = scatter(
                x=p,
                y=q,
                mode=:lines,
                line_color="lightgray",
                fill=:tozerox,
                hoverinfo="x+y",
            )
            setaxes!(trace, ga, gr, gc)
            push!(traces, trace)
        end

        # Diameter trace
        let
            trace = scatter(
                x=[0, 1],
                y=[taxon.diameter, taxon.diameter],
                mode=:lines,
                line_dash=:dash,
                line_color=:red,
                hoverinfo=:skip,
            )
            setaxes!(trace, ga, gr, gc)
            push!(traces, trace)
        end

        # Distance traces: groups are laid out left to right, `x0` counts the points
        # already placed so that all points of the cell are evenly spaced inside (0, 1)
        x0 = 0
        for (dst_id, items) in data_gb
            dst_ng = lookup_tid(dst_id, :ngenomes)
            ii = 1:length(items)

            trace = scatter(
                name=lookup_tid(dst_id, :plot_label),
                x=(x0 .+ ii) ./ (length(data) + 1),
                y=last.(items),
                text=[g for (dst, g, d) in items],
                # Circle while `dst_ng - i >= 2`, i.e. presumably while at least two of the
                # leaf's genomes would remain after the i closest ones — confirm intent
                marker_symbol=[dst_ng - i >= 2 ? :circle : :x for i in ii],
                mode="lines+markers",
                hoverinfo="y+text+name",
            )
            setaxes!(trace, ga, gr, gc)
            push!(traces, trace)
            x0 += length(items)
        end

        # Title
        push!(annotations, subplot_title(
            taxon.plot_label,
            ga.xaxes_grid[gr, gc],
            ga.yaxes_grid[gr, gc],
        ))
    end

    # Layout
    layout = Layout(
        annotations=annotations,
        showlegend=false,
        width=cellwidth * ncol,
        height=cellheight * nrow,
        hovermode=:closest,
    )
    setaxes!(layout, ga)

    # Save
    plt = plot(traces, layout)

    savehtml(plt, string(reports_out / "$DATESTR-min-inter-outliers-by-dst.html"), :remote)
end