# 210428 Find remaining overlaps

In [1]:
using Mmap
using LinearAlgebra

In [2]:
using JSON
using CSV
using DataFrames
using FilePathsBase
using FilePathsBase: /
using ProgressMeter

In [3]:
using Midas.Pairwise: npairs, iterpairs
using TriMatrices

## Setup

In [4]:
# Set the COLUMNS environment variable to a large value.
# NOTE(review): presumably so that wide tables are not truncated when displayed — confirm.
ENV["COLUMNS"] = 400

400

In [5]:
# Date prefix shared by this notebook's name and its output files.
DATESTR = "210428"
# Notebook name; used below as the name of the output subdirectories.
NBNAME = string(DATESTR, "-find-remaining-overlaps")

"210428-find-remaining-overlaps"

In [6]:
# Input locations, keyed by role.
infiles = Dict(
    # Raw file that is memory-mapped below as a Vector{Float32} of length npairs(ngenomes)
    :distances => p"../../data/intermediate/200727-find-overlaps/genome-pw-distances.raw-float32",
    # Directory from which taxa.csv and genome-taxon-assignments.json are read below
    :db => p"../../data/intermediate/210401-database-v2-fix-species-overlaps/210424-compile-fixes/",
);

In [7]:
# Output directories for this notebook, named after NBNAME.
# `recursive=true` also creates a missing parent directory (the plain `mkdir` used
# previously raised an error when e.g. `data-intermediate/` did not exist yet), and
# `exist_ok=true` makes re-running the cell a no-op.
intermediate_out = p"data-intermediate" / NBNAME
mkdir(intermediate_out; recursive=true, exist_ok=true)

processed_out = p"data-processed" / NBNAME
mkdir(processed_out; recursive=true, exist_ok=true);

## Load data

### Database

In [8]:
# Taxon table; columns read later in this notebook include :id, :parent_id, :name,
# :is_leaf and :manual_threshold.
taxa = CSV.File(infiles[:db] / "taxa.csv") |> DataFrame;

In [9]:
# One taxon ID per genome, parsed from JSON and converted to Vector{Int}.
# NOTE(review): 0 appears to mean "unassigned" (it is excluded in the leaf check below) — confirm.
genome_assignments = open(infiles[:db] / "genome-taxon-assignments.json") do io
    Vector{Int}(JSON.parse(io))
end
ngenomes = length(genome_assignments)

50752

In [10]:
# Taxon ID => row number of that taxon in `taxa`.
tid_to_tidx = Dict(zip(taxa[!, :id], eachindex(taxa[!, :id])))

# Row number in `taxa` of the taxon with ID `tid`. Throws a KeyError for unknown IDs.
function taxon_index(tid::Integer)
    return tid_to_tidx[tid]
end

# Look up row(s) of `taxa` by taxon ID. `tid` may be a single ID or a collection of IDs
# (it is broadcast over); `cols` selects the column(s) to return and defaults to all.
function lookup_tid(tid, cols=:)
    row_idxs = taxon_index.(tid)
    return taxa[row_idxs, cols]
end

lookup_tid (generic function with 2 methods)

### Distances

In [11]:
# Memory-map the distances file as a flat Float32 vector with npairs(ngenomes) entries
# instead of reading it into memory.
pw_data = Mmap.mmap(open(infiles[:distances]), Vector{Float32}, (npairs(ngenomes),));

# Wrap the flat data as a TriMatrix of size ngenomes, indexable by pairs of genome indices.
# NOTE(review): presumably TriSymmetric{false} is a symmetric layout without a stored
# diagonal — confirm against the TriMatrices documentation.
pw_dists = TriMatrix(TriSymmetric{false}(), ngenomes, pw_data);

## Setup

### Taxonomy forest

In [12]:
using AbstractTrees


# A collection of trees whose nodes (of type `N`) are addressed by integer ID.
struct SimpleForest{N}
    root_ids::Vector{Int}  # IDs of the nodes that have no parent
    nodes::Dict{Int, N}    # all nodes of the forest, keyed by ID
end

# Create a forest with no roots and no nodes.
function SimpleForest{N}() where {N}
    return SimpleForest{N}(Int[], Dict{Int, N}())
end


# A node of a SimpleForest. The parent and children are stored as IDs and are resolved
# through `forest.nodes`.
struct SimpleNode
    forest::SimpleForest  # forest this node is looked up in
    id::Int
    parent_id::Int  # 0 means "no parent" (this is what `rootnode` tests for)
    child_ids::Vector{Int}
end

# AbstractTrees interface: child nodes are resolved by ID through the owning forest.
function AbstractTrees.children(node::SimpleNode)
    return [node.forest.nodes[child_id] for child_id in node.child_ids]
end

function AbstractTrees.childcount(node::SimpleNode)
    return length(node.child_ids)
end

# Short one-line representation of a node.
function Base.show(io::IO, node::SimpleNode)
    return print(io, "SimpleNode with ID $(node.id)")
end

# Label used for a node when a tree is printed.
function AbstractTrees.printnode(io::IO, node::SimpleNode)
    return print(io, "#", node.id)
end

# Follow parent links upwards from `node` and return the first node whose `parent_id` is 0.
function rootnode(node::SimpleNode)
    current = node
    while true
        current.parent_id == 0 && return current
        current = current.forest.nodes[current.parent_id]
    end
end

rootnode (generic function with 1 method)

In [13]:
forest = SimpleForest{SimpleNode}()

# First pass: one childless node per row of `taxa`. A missing parent ID is stored as 0.
for taxon in eachrow(taxa)
    new_node = SimpleNode(forest, taxon[:id], coalesce(taxon[:parent_id], 0), Int[])
    forest.nodes[new_node.id] = new_node
end

# Second pass, now that every node exists: record each node either as a root or as a
# child of its parent.
for (node_id, node) in forest.nodes
    if iszero(node.parent_id)
        push!(forest.root_ids, node_id)
    else
        parent_node = forest.nodes[node.parent_id]
        push!(parent_node.child_ids, node_id)
    end
end

### Additional calculated taxon properties

In [14]:
# A taxon is a root when it has no parent ID.
taxa.is_root = ismissing.(taxa.parent_id);

In [15]:
# For every taxon, the IDs of the leaf nodes of the subtree rooted at it.
taxa[!, :leaves] = map(taxa.id) do id
    [leaf.id for leaf in Leaves(forest.nodes[id])]
end;

In [16]:
# For every taxon, the indices of the genomes assigned to any of its leaves.
taxa[!, :genomes] = map(taxa.leaves) do leaves
    findall(in(leaves), genome_assignments)
end;

### Leaf data

In [17]:
# Rows of `taxa` flagged as leaves, and the taxon IDs in those rows.
leaf_tidxs = findall(taxa.is_leaf)
leaf_tids = taxa.id[leaf_tidxs]
nleaves = length(leaf_tidxs)

# The leaf taxa must be exactly the set of nonzero taxon IDs that genomes are assigned to.
@assert issetequal(leaf_tids, setdiff(Set(genome_assignments), 0))

# Leaf taxon ID => position within `leaf_tids`.
leaf_id_to_index = Dict(zip(leaf_tids, eachindex(leaf_tids)))
# Genome indices of each leaf, in `leaf_tids` order.
leaf_gidxs = taxa.genomes[leaf_tidxs]

# For every taxon, the positions (within `leaf_tids`) of its leaves.
taxa[!, :leaf_idxs] = map(taxa.leaves) do leaves
    [leaf_id_to_index[id] for id in leaves]
end

length(leaf_tids)

1445

In [18]:
# Symmetric nleaves x nleaves matrices holding, for every pair of leaves, the smallest
# and the largest distance between a genome of one leaf and a genome of the other.
leaf_pw_min_dists = zeros(eltype(pw_dists), nleaves, nleaves)
leaf_pw_max_dists = zeros(eltype(pw_dists), nleaves, nleaves)

# NOTE(review): presumably the `true` flag makes iterpairs include the a == b pairs — confirm.
@showprogress for (a, b) in iterpairs(nleaves, true)
    # Distances between all genomes of leaf a and all genomes of leaf b
    block = view(pw_dists, leaf_gidxs[a], leaf_gidxs[b])
    dmin = minimum(block)
    dmax = maximum(block)

    leaf_pw_min_dists[b, a] = dmin
    leaf_pw_min_dists[a, b] = dmin
    leaf_pw_max_dists[b, a] = dmax
    leaf_pw_max_dists[a, b] = dmax
end

Progress: 100%|█████████████████████████████████████████| Time: 0:00:46


### Diameters and thresholds

In [19]:
# Diameter of a taxon: the largest genome distance between any two of its leaves
# (including a leaf with itself).
taxa[!, :diameter] = map(taxa.leaf_idxs) do idxs
    maximum(view(leaf_pw_max_dists, idxs, idxs))
end;

In [20]:
# Threshold of a taxon: its manual threshold when one is set (not NaN), else its diameter.
# NOTE(review): the summary output shows an abstract element type for this column, which
# suggests :diameter and :manual_threshold have different float types — confirm.
taxa[!, :threshold] = map(eachrow(taxa)) do row
    manual = row[:manual_threshold]
    isnan(manual) ? row[:diameter] : manual
end;

In [21]:
# Largest threshold among the leaves of each taxon.
taxa[!, :max_leaf_threshold] = map(taxa.leaf_idxs) do lidxs
    maximum(taxa.threshold[leaf_tidxs[k]] for k in lidxs)
end;

## Overlaps

In [22]:
# One overlap record: (source leaf taxon ID, destination leaf taxon ID, distance).
const OverlapTuple = Tuple{Int, Int, Float32}

# Taxon ID => all (leaf inside the taxon, leaf outside the taxon) pairs whose minimum
# genome distance is at most the taxon's threshold. Taxa without such pairs get no entry.
overlaps = Dict{Int, Vector{OverlapTuple}}()

# For each row of `taxa`, in order: the closest (inside leaf, outside leaf) pair.
# Stays at (0, 0, Inf32) when no pair with a distance below Inf was found.
min_inters = OverlapTuple[]

for row in eachrow(taxa)
    taxon_overlaps = OverlapTuple[]
    # Float32 sentinel, so that `mi` is an OverlapTuple from the start
    mi = (0, 0, Inf32)

    for src_idx in row.leaf_idxs
        for dst_idx in 1:nleaves
            # Only leaves outside of this taxon are candidates
            dst_idx ∈ row.leaf_idxs && continue

            d = leaf_pw_min_dists[src_idx, dst_idx]
            item = (leaf_tids[src_idx], leaf_tids[dst_idx], d)

            d <= row.threshold && push!(taxon_overlaps, item)
            d < mi[3] && (mi = item)
        end
    end

    isempty(taxon_overlaps) || (overlaps[row.id] = taxon_overlaps)
    push!(min_inters, mi)
end

length(overlaps)

119

In [23]:
# Sanity check: none of the taxa with recorded overlaps may be a leaf taxon
@assert !any(lookup_tid(tid, :is_leaf) for tid in keys(overlaps))

In [24]:
# Split the (src, dst, distance) tuples in `min_inters` into one column per component
taxa[!, :min_inter_src] = getindex.(min_inters, 1)
taxa[!, :min_inter_dst] = getindex.(min_inters, 2)
taxa[!, :min_inter_dist] = getindex.(min_inters, 3);

## Summary

In [25]:
# Rows of `taxa` that are not leaves
internal_tidxs = findall(.!taxa.is_leaf);

In [26]:
# Start the summary table from the non-leaf rows of `taxa`, keeping only the listed columns
summary_df = taxa[
    internal_tidxs,
    [:id, :name, :is_root, :diameter, :threshold, :max_leaf_threshold]
];

In [27]:
# Genome and leaf counts are inserted in front of the :diameter column
insertcols!(
    summary_df,
    :diameter,
    :ngenomes => length.(view(taxa.genomes, internal_tidxs)),
    :nleaves => length.(view(taxa.leaves, internal_tidxs)),
)

# Closest outside pair of each taxon: leaf IDs plus their names for readability
for (taxa_col, id_col, name_col) in (
    (:min_inter_src, :min_inter_src_id, :min_inter_src_name),
    (:min_inter_dst, :min_inter_dst_id, :min_inter_dst_name),
)
    summary_df[!, id_col] = taxa[internal_tidxs, taxa_col]
    summary_df[!, name_col] = [lookup_tid(id, :name) for id in summary_df[!, id_col]]
end

summary_df[!, :min_inter_dist] = taxa[internal_tidxs, :min_inter_dist];

In [28]:
# Counts stay at zero for taxa that have no recorded overlaps
for col in (:noverlaps, :dst_taxa, :dst_genera)
    summary_df[:, col] .= 0
end

for (rowidx, tid) in enumerate(summary_df.id)
    hits = get(overlaps, tid, nothing)
    hits === nothing && continue

    # Distinct destination leaves, and the distinct root taxa those leaves belong to
    targets = unique(map(hit -> hit[2], hits))
    target_roots = unique(map(id -> rootnode(forest.nodes[id]).id, targets))

    summary_df[rowidx, :noverlaps] = length(hits)
    summary_df[rowidx, :dst_taxa] = length(targets)
    summary_df[rowidx, :dst_genera] = length(target_roots)
end

In [29]:
# Display the summary table of non-leaf taxa
summary_df

Unnamed: 0_level_0,id,name,is_root,ngenomes,nleaves,diameter,threshold,max_leaf_threshold,min_inter_src_id,min_inter_src_name,min_inter_dst_id,min_inter_dst_name,min_inter_dist,noverlaps,dst_taxa,dst_genera
Unnamed: 0_level_1,Int64,String,Bool,Int64,Int64,Float32,Abstrac…,Abstrac…,Int64,String,Int64,String,Float32,Int64,Int64,Int64
1,1,Mobiluncus,1,7,2,0.991801,0.991801,0.58688,1106,Mobiluncus mulieris,1688,Actinobaculum massiliense,0.987895,3,2,2
2,2,Gordonia,1,12,5,0.978936,0.978936,0.443014,1794,Gordonia amicalis,1478,Nocardia farcinica,0.974594,72,29,9
3,3,Meiothermus,1,4,1,0.207928,0.207928,0.207928,540,Meiothermus ruber,1868,Thermus oshimai,0.985355,0,0,0
4,4,Kitasatospora,1,6,2,0.947083,0.947083,0.479394,1079,Kitasatospora aureofaciens,1206,Streptomyces pyridomyceticus,0.861725,4,2,1
5,5,Shewanella,1,18,4,0.995506,0.995506,0.593445,467,Shewanella putrefaciens,1460,Salmonella enterica,0.99323,77,72,24
6,6,Sporolactobacillus,1,4,2,0.995221,0.995221,0.332251,1118,Sporolactobacillus vineae,577,Neorhizobium galegae,0.995424,0,0,0
7,7,Dyella,1,2,1,0.00625489,0.00625489,0.00625489,1110,Dyella japonica,1276,Rhodanobacter denitrificans,0.982042,0,0,0
8,8,Myxococcus,1,3,1,0.0126297,0.0126297,0.0126297,470,Myxococcus xanthus,471,Stigmatella aurantiaca,0.963857,0,0,0
9,9,Stigmatella,1,2,1,0.0125714,0.0125714,0.0125714,471,Stigmatella aurantiaca,470,Myxococcus xanthus,0.963857,0,0,0
10,10,Mycoplasma,1,161,23,1.0,1.0,0.679269,1111,Mycoplasma arginini,722,Chlamydia trachomatis,0.788957,32706,1422,453


## Write output

In [30]:
# Save the summary table of non-leaf taxa as CSV in the processed-output directory
CSV.write(processed_out / "$DATESTR-internal-taxa-overlaps-summary.csv", summary_df)

p"data-processed/210428-find-remaining-overlaps/210428-internal-taxa-overlaps-summary.csv"

In [31]:
let
    # One record per taxon that has overlaps; the (src, dst, distance) tuples are split
    # into three parallel arrays.
    data = [
        (
            src_taxid=tid,
            src_leaf_taxids=getindex.(leaf_overlaps, 1),
            dst_leaf_taxids=getindex.(leaf_overlaps, 2),
            distances=getindex.(leaf_overlaps, 3),
        )
        for (tid, leaf_overlaps) in overlaps
    ]

    open(intermediate_out / "overlaps.json", "w") do io
        JSON.print(io, data)
    end
end

In [32]:
let
    # Columns of `taxa` to export: the input flags plus everything calculated above
    cols = [
        :id, :is_leaf, :is_root, :leaves, :genomes,
        :diameter, :threshold, :max_leaf_threshold,
        :min_inter_src, :min_inter_dst, :min_inter_dist,
    ]

    # One Dict (column name => value) per taxon
    data = map(row -> Dict(pairs(row)), eachrow(taxa[!, cols]))

    open(intermediate_out / "calculated.json", "w") do io
        JSON.print(io, data)
    end
end