# 210511 Overlap PW heatmaps

In [1]:
# using Mmap

In [2]:
using JSON
using DataFrames
using FilePathsBase
using FilePathsBase: /
# using ProgressMeter
using Arrow
using HDF5
import Clustering
using PlotlyJS

In [3]:
using Midas.Pairwise: npairs, iterpairs
# using TriMatrices
using MidasPlots.Plotly: vector_attrs!, axisname, setaxes!

In [4]:
include("src/GridAxes.jl")
using .GridAxesModule

## Setup

In [5]:
# Set the COLUMNS environment variable for this session.
# NOTE(review): presumably so that wide tables are displayed without column truncation
# (Julia's `displaysize` consults COLUMNS) — confirm.
ENV["COLUMNS"] = 400

400

In [6]:
# Date prefix used in this notebook's output names.
DATESTR = "210513"
# Notebook name; `reports_out` uses it as the reports subdirectory name.
NBNAME = string(DATESTR, "-overlap-pw-heatmaps")

"210513-overlap-pw-heatmaps"

In [7]:
# Input locations keyed by role. All paths are relative, so they resolve against the
# current working directory.
infiles = Dict(
    # Presumably raw Float32 pairwise genome distances (judging by the file name); not read
    # by any of the cells visible here.
    :distances => p"../../data/intermediate/200727-find-overlaps/genome-pw-distances.raw-float32",
    # Directory providing "genome-taxon-assignments.json".
    :db => p"../../data/intermediate/210401-database-v2-fix-species-overlaps/210424-compile-fixes/",
    # Directory providing "overlaps.json".
    :overlaps => p"data-intermediate/210428-find-remaining-overlaps/",
    # Directory providing "taxa.arrow" and "leaf-data.h5".
    :data => p"data-intermediate/210511-extra-overlap-data/",
);

In [8]:
# Directory that receives this notebook's report files.
reports_out = p"reports" / NBNAME
# Create the directory together with any missing parents (e.g. "reports" itself); this is a
# no-op when it already exists, so the cell can be re-run safely.
mkdir(reports_out; recursive=true, exist_ok=true);

## Load data

### Database

In [9]:
# Load the taxa table from its Arrow file and wrap it in a DataFrame.
taxa = infiles[:data] / "taxa.arrow" |> string |> Arrow.Table |> DataFrame;

In [10]:
# Parse the per-genome taxon assignments from JSON and convert them to a `Vector{Int}`.
genome_assignments = open(infiles[:db] / "genome-taxon-assignments.json") do io
    Vector{Int}(JSON.parse(io))
end
# Number of genomes, i.e. the number of assignment entries.
ngenomes = length(genome_assignments)

50752

In [11]:
# Map every taxon ID to the row number it occupies in `taxa`.
tid_to_tidx = Dict(tid => row for (row, tid) in enumerate(taxa[!, :id]))

# Row number in `taxa` for a single taxon ID; throws a `KeyError` for an unknown ID.
function taxon_index(tid::Integer)
    return tid_to_tidx[tid]
end

# Row numbers in `taxa` for a vector of taxon IDs, element by element.
function taxon_index(tids::AbstractVector{<:Integer})
    return taxon_index.(tids)
end

# Fetch the row(s) of `taxa` belonging to taxon ID(s) `tid`, restricted to `cols`
# (all columns by default).
function lookup_tid(tid, cols=:)
    return taxa[taxon_index(tid), cols]
end

lookup_tid (generic function with 2 methods)

### Distances

### Overlaps

In [12]:
# Source taxon ID => list of (source leaf taxon ID, destination leaf taxon ID, distance)
# triples, built from the records in "overlaps.json".
overlaps = let
    records = open(JSON.parse, infiles[:overlaps] / "overlaps.json")

    entries = map(records) do rec
        triples = zip(rec["src_leaf_taxids"], rec["dst_leaf_taxids"], rec["distances"])
        return rec["src_taxid"] => collect(triples)
    end

    Dict(entries)
end

# Number of overlap triples recorded for each taxon (zero when the taxon has no entry).
taxa[!, :noverlaps] = [length(get(overlaps, id, ())) for id in taxa[!, :id]]
# Flag the taxa that have at least one overlap.
taxa[!, :has_overlaps] = [n > 0 for n in taxa.noverlaps]
;

### Leaf data

In [13]:
# Read the leaf datasets from "leaf-data.h5"; the file is closed when the `do` block exits.
leaf_tidxs, leaf_pw_min_dists, leaf_pw_max_dists, leaf_pw_mean_dists =
    h5open(string(infiles[:data] / "leaf-data.h5"), "r") do f
        return (
            read(f, "idxs"),
            read(f, "pw_min_dists"),
            read(f, "pw_max_dists"),
            read(f, "pw_mean_dists"),
        )
    end;

In [14]:
# Taxon IDs of the leaves, in the order given by `leaf_tidxs`.
leaf_tids = taxa[leaf_tidxs, :id]
nleaves = length(leaf_tidxs)
# Position of each leaf taxon ID within `leaf_tids`.
leaf_id_to_index = Dict(zip(leaf_tids, eachindex(leaf_tids)))
;

## Plots

In [15]:
# Build a paper-referenced annotation showing `text` as a subplot title: horizontally
# centered on the x axis domain and sitting on top of the y axis domain.
#
# `xaxis[:domain]` and `yaxis[:domain]` must each destructure into two values (start, stop).
# Any extra keyword arguments are merged into the annotation after it is built.
function subplot_title(text, xaxis, yaxis; kw...)
    xlo, xhi = xaxis[:domain]
    _, yhi = yaxis[:domain]

    title = attr(;
        text=text,
        xref=:paper,
        x=(xhi + xlo) / 2,
        xanchor=:center,
        yref=:paper,
        y=yhi,
        yanchor=:bottom,
        showarrow=false,
    )

    if !isempty(kw)
        merge!(title, attr(; kw...))
    end

    return title
end

subplot_title (generic function with 1 method)

In [16]:
# Draw one clustered heatmap per taxon that has overlaps and more than two leaves, lay the
# heatmaps out on a grid of subplots, and save the figure as an HTML file in `reports_out`.
let
    to_plot = [taxon.id for taxon in eachrow(taxa) if taxon.has_overlaps && taxon.nleaves > 2]

    ncells = length(to_plot)
    ncol = 6
    # Keep at least one row: with nothing to plot, a zero-row grid would make `ysep` below
    # divide by zero.
    nrow = max(1, cld(ncells, ncol))

    cellsize = (250, 250)

    ga = GridAxes(
        nrow,
        ncol,
        xbase=attr(
            visible=false,
        ),
        ybase=attr(
            visible=false,
        ),
        xsep=0,
        ysep=.2 / nrow,
    )

    traces = GenericTrace[]
    annotations = Any[]

    # NOTE(review): `CartesianIndices((nrow, ncol))` advances its first index fastest, so
    # grid cells are assigned down each column before moving to the next column — confirm
    # this is the intended fill order.
    for (tid, gi) in zip(to_plot, CartesianIndices((nrow, ncol)))
        taxon = lookup_tid(tid)

        # Distance matrix restricted to this taxon's leaves
        dmat = leaf_pw_max_dists[taxon.leaf_idxs, taxon.leaf_idxs]

        # Color range taken from the off-diagonal entries only
        zmin, zmax = extrema(v for (idx, v) in pairs(IndexCartesian(), dmat) if idx[1] != idx[2])

        # Order the leaves by complete-linkage hierarchical clustering
        leaf_perm = Clustering.hclust(dmat, linkage=:complete).order
        leaf_labels = [lookup_tid(id, :plot_label) for id in taxon.leaves]
        ordered_labels = leaf_labels[leaf_perm]

        # Heatmap
        hm = heatmap(
            z=dmat[leaf_perm, leaf_perm],
            x=ordered_labels,
            y=ordered_labels,
            colorscale="Viridis",
            showscale=false,
            zmin=zmin,
            zmax=zmax,
        )
        setaxes!(hm, ga, gi)
        push!(traces, hm)

        # Dots marking the entries that are at least the taxon's `min_inter_dist`;
        # the marker size shrinks once a taxon has more than five leaves.
        dots = scatter(
            mode=:markers,
            marker_color=:red,
            marker_size=max(1, round(Int, 6 * min(1, 5 / taxon.nleaves))),
            hoverinfo=:skip,
        )
        vector_attrs!(dots, findall(>=(taxon.min_inter_dist), dmat)) do idx
            Dict(
                :x => leaf_labels[idx[1]],
                :y => leaf_labels[idx[2]],
            )
        end
        setaxes!(dots, ga, gi)
        push!(traces, dots)

        # X axis, with its scale anchored to the matching y axis
        xax = ga.xaxes_grid[gi]
        xax[:scaleanchor] = axisname(:y, ga.yindices[gi])

        # Y axis
        yax = ga.yaxes_grid[gi]

        # Title
        push!(annotations, subplot_title(taxon.plot_label, xax, yax))
    end

    layout = Layout(
        annotations=annotations,
        width=ncol * cellsize[1],
        height=nrow * cellsize[2],
        showlegend=false,
    )
    setaxes!(layout, ga)

    plt = plot(traces, layout)

    # The file is named after the notebook, like the directory it is written to.
    savehtml(plt, string(reports_out / "$NBNAME.html"), :remote)
end