# 210712 Find max intra outliers

In [1]:
using Mmap

In [2]:
using JSON
using DataFrames
using FilePathsBase
using FilePathsBase: /
using Arrow
using HDF5
using PlotlyJS

In [3]:
using Midas.Pairwise: npairs, iterpairs
using TriMatrices
using ClusterAnalysis
using MidasPlots
using MidasPlots.Plotly
using MidasPlots.Plotly: subplot_axes!

In [4]:
include("src/GridAxes.jl")
using .GridAxesModule

## Setup

In [5]:
# Set the COLUMNS environment variable to 400; presumably this widens the display so
# that wide tables shown below are not truncated — TODO confirm against the front end.
ENV["COLUMNS"] = 400

400

In [6]:
# Date stamp (YYMMDD) that prefixes this notebook's name.
DATESTR = "210712"
# Full notebook name: the date stamp followed by a descriptive slug.
NBNAME = "$(DATESTR)-find-max-intra-outliers"

"210712-find-max-intra-outliers"

In [7]:
# Input locations used by this notebook, keyed by role.
infiles = Dict(
    # Raw file that is memory-mapped below as a flat Float32 vector of pairwise distances.
    :distances => p"../../data/intermediate/200727-find-overlaps/genome-pw-distances.raw-float32",
    # Directory containing "genome-taxon-assignments.json".
    :db => p"../../data/intermediate/210401-database-v2-fix-species-overlaps/210424-compile-fixes/",
    # Directory containing "overlaps.json".
    :overlaps => p"data-intermediate/210428-find-remaining-overlaps/",
    # Directory containing "taxa.arrow" and "leaf-data.h5".
    :data => p"data-intermediate/210511-extra-overlap-data/",
);

In [8]:
# Columns of `taxa` shown when summarizing taxa (see the candidate table below).
SUMMARY_COLS = [
    :id,
    :name,
    :rank,
    :is_leaf,
    :is_root,
    :nleaves,
    :ngenomes,
    :noverlaps,
    :threshold,
    :max_leaf_threshold,
    :min_inter_dist,
];

## Load data

### Database

In [9]:
# Read the taxa table from its Arrow file and materialize it as a DataFrame.
taxa = infiles[:data] / "taxa.arrow" |> string |> Arrow.Table |> DataFrame;

In [10]:
# Parse the JSON array of per-genome taxon assignments and convert it to a Vector{Int}.
genome_assignments = open(infiles[:db] / "genome-taxon-assignments.json") do io
    convert(Vector{Int}, JSON.parse(io))
end
# Total number of genomes; also fixes the size of the pairwise distance data below.
ngenomes = length(genome_assignments)

50752

In [11]:
# Taxon ID -> row number of that taxon in `taxa`.
tid_to_tidx = Dict(taxid => rownum for (rownum, taxid) in enumerate(taxa[!, :id]))

# Row number in `taxa` for a single taxon ID (a `KeyError` is raised for unknown IDs).
function taxon_index(tid::Integer)
    return tid_to_tidx[tid]
end

# Row numbers in `taxa` for a vector of taxon IDs, element by element.
function taxon_index(tids::AbstractVector{<:Integer})
    return taxon_index.(tids)
end

# Row(s) of `taxa` for the taxon ID(s) `tid`, restricted to `cols` (all columns by default).
function lookup_tid(tid, cols=:)
    return taxa[taxon_index(tid), cols]
end

lookup_tid (generic function with 2 methods)

### Distances

In [12]:
# Memory-map the raw Float32 pairwise distances as a flat vector with one entry per
# genome pair. The stream is only needed while the mapping is being created, so it is
# opened in a `do` block and closed immediately instead of being left open for the rest
# of the session; the mapped array stays valid after the stream is closed.
pw_data = open(infiles[:distances]) do io
    Mmap.mmap(io, Vector{Float32}, (npairs(ngenomes),))
end;

# Wrap the flat data as an ngenomes-sized symmetric triangular matrix so distances can
# be indexed by genome pair.
# NOTE(review): the meaning of the `false` parameter of TriSymmetric is defined in
# TriMatrices — confirm it matches the on-disk layout.
pw_dists = TriMatrix(TriSymmetric{false}(), ngenomes, pw_data);

### Overlaps

In [13]:
# Overlaps keyed by source taxon ID. Each value is a vector of
# (source leaf taxon ID, destination leaf taxon ID, distance) triples built from the
# three parallel arrays of one JSON record.
overlaps = let records = open(JSON.parse, infiles[:overlaps] / "overlaps.json")
    entries = map(records) do rec
        triples = zip(rec["src_leaf_taxids"], rec["dst_leaf_taxids"], rec["distances"])
        rec["src_taxid"] => collect(triples)
    end
    Dict(entries)
end

# Per-taxon overlap count (0 for taxa without an entry) and a convenience flag.
taxa[!, :noverlaps] = [length(get(overlaps, id, ())) for id in taxa[!, :id]]
taxa[!, :has_overlaps] = [n > 0 for n in taxa.noverlaps];

### Leaf data

In [14]:
# Read the four leaf-level datasets from the HDF5 file in one pass; the file is closed
# when the `do` block returns and the datasets are bound to notebook-level variables.
leaf_tidxs, leaf_pw_min_dists, leaf_pw_max_dists, leaf_pw_mean_dists =
    h5open(string(infiles[:data] / "leaf-data.h5"), "r") do h5
        (
            read(h5, "idxs"),
            read(h5, "pw_min_dists"),
            read(h5, "pw_max_dists"),
            read(h5, "pw_mean_dists"),
        )
    end;

In [15]:
# Number of leaves, as given by the leaf row-index vector read from the HDF5 file.
nleaves = length(leaf_tidxs)
# Taxon ID of every leaf: `leaf_tidxs` holds row numbers into `taxa`.
leaf_tids = taxa[leaf_tidxs, :id]
# Inverse lookup: leaf taxon ID -> position of that leaf within `leaf_tids`.
leaf_id_to_index = Dict(lid => pos for (pos, lid) in enumerate(leaf_tids));

## Candidate taxa

In [16]:
# Hand-picked candidates as `taxon ID => [leaf taxon IDs]` pairs. The key is looked up
# with `lookup_tid` and the leaf IDs become the rows of that taxon's plot grid below.
# Commented-out entries are excluded from the summary table and the plots.
candidates = [
#     244 => [1420],
#     315 => [812],
    403 => [1712],
    309 => [950],
#     462 => [1374],
#     451 => [528, 530],
    53 => [1470],  # !!
    145 => [1525],
#     302 => [548],
#     363 => [522],
#     440 => [1021],
#     355 => [1590],
#     439 => [1286, 1740],
    205 => [1786],
    437 => [1405],
#     200 => [1576],
#     263 => [1234],
#     329 => [1337],
    20 => [1803],
#     429 => [1060],
    14 => [1733],
    110 => [1326],
    326 => [1678],
    97 => [1826],
    164 => [1597],
    252 => [763],
    2 => [947],
    90 => [1732],
];

In [17]:
# Summary table for the candidate taxa: `first.` takes the taxon ID out of each pair and
# only the SUMMARY_COLS columns are shown.
lookup_tid(first.(candidates), SUMMARY_COLS)

Unnamed: 0_level_0,id,name,rank,is_leaf,is_root,nleaves,ngenomes,noverlaps,threshold,max_leaf_threshold,min_inter_dist
Unnamed: 0_level_1,Int64,String,String?,Bool,Bool,Int64,Int64,Int64,Float32,Float32,Float32
1,403,Actinomyces,genus,0,1,5,10,1495,0.994288,0.601804,0.981125
2,309,Exiguobacterium,genus,0,1,3,6,1,0.991162,0.438699,0.99111
3,53,Brucella,genus,0,1,10,276,8,0.959696,0.496397,0.887919
4,145,Enterobacter,genus,0,1,9,260,544,0.988478,0.792222,0.924833
5,205,Veillonella,genus,0,1,3,11,1,0.995047,0.669333,0.993536
6,437,Cronobacter,genus,0,1,7,70,97,0.97345,0.6,0.95314
7,20,Pantoea,genus,0,1,5,50,6034,0.999228,0.597647,0.942451
8,14,Lysobacter,genus,0,1,3,7,21,0.983625,0.480067,0.973654
9,110,Gluconobacter,genus,0,1,3,14,1,0.980024,0.642851,0.976356
10,326,Lysinibacillus,genus,0,1,3,15,1,0.992058,0.7,0.989381


## Plots

In [18]:
# Build a grid of pairwise-distance heatmaps with one cell per (row leaf, column leaf)
# pair.
#
# Arguments:
#   row_leaves, col_leaves  Taxon IDs of the leaves placed along the rows and columns of
#                           the grid. Each ID is resolved to its row of `taxa` with
#                           `lookup_tid`. Any integer element type is accepted, matching
#                           `taxon_index`.
#   mark_threshold          When not `nothing`, every genome pair whose distance is
#                           `>= mark_threshold` gets a red marker on top of its heatmap.
#   kw...                   Forwarded unchanged to `GridAxes`.
#
# Returns a `Plot` whose heatmaps share a single "Viridis" color axis, with the legend
# hidden.
function leaf_pw_grid(
    row_leaves::AbstractVector{<:Integer},
    col_leaves::AbstractVector{<:Integer},
    mark_threshold=nothing;
    kw...
)
    # Keep the ID arguments as they are and bind the looked-up table rows to new names.
    row_taxa = lookup_tid.(row_leaves)
    col_taxa = lookup_tid.(col_leaves)

    # Genome counts per leaf are handed to GridAxes for both directions
    # (presumably to size the cells proportionally — confirm in GridAxes.jl).
    ga = GridAxes(
        [leaf.ngenomes for leaf in row_taxa],
        [leaf.ngenomes for leaf in col_taxa];
        sharex=true,
        sharey=true,
        kw...
    )
    for (leaf, ax) in zip(row_taxa, ga.yaxes)
        ax[:title_text] = leaf.plot_label
    end
    for (leaf, ax) in zip(col_taxa, ga.xaxes)
        ax[:title_text] = leaf.plot_label
    end

    traces = GenericTrace[]

    for (r, row_leaf) in enumerate(row_taxa)
        for (c, col_leaf) in enumerate(col_taxa)
            # Distances between the genomes of the row leaf and those of the column leaf.
            dmat = view(pw_dists, row_leaf.genomes, col_leaf.genomes)

            # z is the transpose of dmat; the marker coordinates below use the column
            # index for x and the row index for y so that they line up with it.
            hm = heatmap(
                z=dmat',
                coloraxis="coloraxis",
                name="$(row_leaf.name)<br>$(col_leaf.name)",
                hoverlabel=attr(namelength=-1)
            )
            setaxes!(hm, ga, r, c)
            push!(traces, hm)

            # Dots
            if mark_threshold !== nothing
                dots = scatter(
                    mode=:markers,
                    marker_color=:red,
                    hoverinfo=:skip,
                )
                # Zero-based coordinates of every entry at or above the threshold.
                vector_attrs!(dots, findall(>=(mark_threshold), dmat)) do idx
                    Dict(
                        :x => idx[2] - 1,
                        :y => idx[1] - 1,
                    )
                end
                setaxes!(dots, ga, r, c)
                push!(traces, dots)
            end
        end
    end

    layout = Layout(
        coloraxis=attr(
            colorscale="Viridis",
        ),
        showlegend=false,
    )
    setaxes!(layout, ga)

    return Plot(traces, layout)
end

leaf_pw_grid (generic function with 2 methods)

In [19]:
# For every candidate taxon, plot the pairwise distances between its hand-picked row
# leaves and those leaves of the taxon that lie far enough away from them.
for (tid, row_leaf_ids) in candidates
    taxon = lookup_tid(tid)
    row_leaf_idxs = [leaf_id_to_index[lid] for lid in row_leaf_ids]

    # Keep the taxon's leaves whose largest `leaf_pw_max_dists` entry against any row
    # leaf exceeds the taxon's `min_inter_dist`.
    col_leaf_idxs = filter(taxon.leaf_idxs) do idx
        maximum(leaf_pw_max_dists[row_leaf_idxs, idx]) > taxon.min_inter_dist
    end

    # `leaf_tids` already holds the taxon ID of each leaf index, so no table lookup is
    # needed just to get the IDs back.
    col_leaf_ids = [leaf_tids[i] for i in col_leaf_idxs]

    # Nothing to plot; an empty column list would also fail to match leaf_pw_grid's
    # typed signature, so skip this candidate explicitly.
    if isempty(col_leaf_ids)
        @warn "No leaves exceed min_inter_dist for candidate taxon; skipping plot" tid
        continue
    end

    plt = leaf_pw_grid(
        row_leaf_ids,
        col_leaf_ids,
#         taxon.min_inter_dist,
        xsep=.2,
        ysep=.2,
    )
    relayout!(
        plt,
        title=taxon.plot_label,
#         coloraxis=attr(
#             cmin=taxon.min_inter_dist,
#             cmax=taxon.diameter,
#         )
    )

    display(plt)
end

data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, xaxis3, xaxis4, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, xaxis3, xaxis4, xaxis5, xaxis6, xaxis7, xaxis8, xaxis9, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, xaxis3, xaxis4, xaxis5, xaxis6, xaxis7, xaxis8, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, xaxis3, xaxis4, xaxis5, xaxis6, xaxis7, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, xaxis3, xaxis4, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, xaxis3, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, xaxis3, xaxis4, and yaxis"



data: [
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z",
  "heatmap with fields coloraxis, hoverlabel, name, type, xaxis, yaxis, and z"
]

layout: "layout with fields coloraxis, margin, showlegend, title, xaxis, xaxis2, xaxis3, and yaxis"

