# 210710 Remove min inter outliers

In [1]:
using Mmap
using Statistics

In [2]:
using JSON
using DataFrames
using FilePathsBase
using FilePathsBase: /
import CSV
using Arrow
using HDF5
using DataStructures: Accumulator

In [3]:
using Midas.Pairwise: npairs, iterpairs
using TriMatrices

## Setup

In [4]:
ENV["COLUMNS"] = 400
ENV["LINES"] = 100

100

In [5]:
DATESTR = "210710"
NBNAME = "$DATESTR-remove-min-inter-outliers"

"210710-remove-min-inter-outliers"

In [6]:
infiles = Dict(
    :distances => p"../../data/intermediate/200727-find-overlaps/genome-pw-distances.raw-float32",
    :db => p"../../data/intermediate/210401-database-v2-fix-species-overlaps/210424-compile-fixes/",
    :overlaps => p"data-intermediate/210428-find-remaining-overlaps/",
    :data => p"data-intermediate/210511-extra-overlap-data/",
);

In [7]:
processed_out = p"data-processed" / NBNAME
isdir(processed_out) || mkdir(processed_out);

## Load data

### Database

In [8]:
taxa = DataFrame(Arrow.Table(string(infiles[:data] / "taxa.arrow")));

In [9]:
genome_assignments = Vector{Int}(open(JSON.parse, infiles[:db] / "genome-taxon-assignments.json"))
ngenomes = length(genome_assignments)

50752

In [10]:
tid_to_tidx = Dict(id => i for (i, id) in enumerate(taxa[!, :id]))

taxon_index(tid::Integer) = tid_to_tidx[tid]
taxon_index(tids::AbstractVector{<:Integer}) = taxon_index.(tids)

lookup_tid(tid, cols=:) = taxa[taxon_index(tid), cols]

lookup_tid (generic function with 2 methods)

### Distances

In [11]:
pw_data = Mmap.mmap(open(infiles[:distances]), Vector{Float32}, (npairs(ngenomes),));

pw_dists = TriMatrix(TriSymmetric{false}(), ngenomes, pw_data);

### Overlaps

In [12]:
overlaps = let
    data = open(JSON.parse, infiles[:overlaps] / "overlaps.json")
    
    map(data) do item
        item["src_taxid"] => collect(zip(item["src_leaf_taxids"], item["dst_leaf_taxids"], item["distances"]))
    end |> Dict
end

taxa[!, :noverlaps] = [haskey(overlaps, id) ? length(overlaps[id]) : 0 for id in taxa[!, :id]]
taxa[!, :has_overlaps] = map(>(0), taxa.noverlaps)
;

### Leaf data

In [13]:
h5open(string(infiles[:data] / "leaf-data.h5"), "r") do f
    global leaf_tidxs = read(f, "idxs")
    global leaf_pw_min_dists = read(f, "pw_min_dists")
    global leaf_pw_max_dists = read(f, "pw_max_dists")
    global leaf_pw_mean_dists = read(f, "pw_mean_dists")
end;

In [14]:
leaf_tids = taxa[leaf_tidxs, :id]
leaf_id_to_index = Dict(id => i for (i, id) in enumerate(leaf_tids))
nleaves = length(leaf_tidxs)
;

## Candidates for removal

### src

In [15]:
# Src internal taxid => n closest src genomes to consider
src_candidates = Dict([
    38 => 1,
    45 => 2,
    47 => 6,
    48 => 5,
    52 => 1,
    54 => 1,
    75 => 2,
    80 => 4,
    152 => 7,
    165 => 1,
    172 => 5,
    188 => 8,
    197 => 1,
    205 => 1,
    209 => 3,
    213 => 1,
    305 => 1,
    315 => 1,
    341 => 1,
    342 => 4,
    398 => 2,
    431 => 1,
    439 => 1,
]);

In [16]:
# Src internal taxid => n closest dst genomes to check
dst_candidates = Dict([
    10 => 4,
    47 => 1,
    152 => 4,
    156 => 1,
    247 => 1,
    262 => 2,
    267 => 4,
    319 => 1,
    350 => 18,
    431 => 7,
    434 => 1,
]);

## Evaluate

In [17]:
src_stats = let
    rows = []
    
    for (tid, n) in src_candidates
        taxon = lookup_tid(tid)
        
        # Group overlaps by src
        dsts_by_src = Dict{Int, Vector{Int}}()
        for (src, dst, d) in overlaps[tid]
            push!(get!(dsts_by_src, src, Int[]), dst)
        end
        
        # Min inters by overlapping src genomes
        items = Pair{Int, Float32}[]
        
        for (src, dsts) in dsts_by_src
            src_genomes = lookup_tid(src, :genomes)
            dst_genomes = vcat([lookup_tid(dst, :genomes) for dst in dsts]...)
            append!(items, [
                gi => minimum(view(pw_dists, gi, dst_genomes))
                for gi in src_genomes
            ])
        end
        
        sort!(items, by=last)
        @assert items[1][2] == taxon.min_inter_dist
        
        for i in 1:n
            gi = items[i][1]
            new_min_inter = items[i + 1][2]
            leaf = lookup_tid(genome_assignments[gi])
            
            push!(rows, (
                id=taxon.id,
                name=taxon.name,
                diameter=taxon.diameter,
                min_inter=taxon.min_inter_dist,
                i=i,
                new_min_inter=new_min_inter,
                leaf_id=leaf.id,
                leaf_name=leaf.name,
                leaf_ngenomes=leaf.ngenomes,
                genome=gi,
            ))
        end
    end
    
    df = DataFrame(rows)
    sort!(df, :id)
end

Unnamed: 0_level_0,id,name,diameter,min_inter,i,new_min_inter,leaf_id,leaf_name,leaf_ngenomes,genome
Unnamed: 0_level_1,Int64,String,Float32,Float32,Int64,Float32,Int64,String,Int64,Int64
1,38,Leptospira,0.998561,0.989358,1,0.995792,1365,Leptospira noguchii,15,35369
2,45,Campylobacter,0.999048,0.984083,1,0.985537,505,Campylobacter coli,252,41117
3,45,Campylobacter,0.999048,0.984083,2,0.99083,505,Campylobacter coli,252,41115
4,47,Mannheimia,0.961796,0.960099,1,0.962002,1153,Mannheimia haemolytica,30,45103
5,47,Mannheimia,0.961796,0.960099,2,0.963155,1153,Mannheimia haemolytica,30,45104
6,47,Mannheimia,0.961796,0.960099,3,0.964066,1153,Mannheimia haemolytica,30,7182
7,47,Mannheimia,0.961796,0.960099,4,0.964141,1153,Mannheimia haemolytica,30,8644
8,47,Mannheimia,0.961796,0.960099,5,0.966769,1153,Mannheimia haemolytica,30,8642
9,47,Mannheimia,0.961796,0.960099,6,0.972554,1153,Mannheimia haemolytica,30,45101
10,48,Helicobacter,0.999056,0.979619,1,0.984083,511,Helicobacter pylori,467,31677


In [18]:
dst_stats = let
    rows = []
    
    for (tid, n) in dst_candidates
        taxon = lookup_tid(tid)
        
        # Group overlaps by dst
        srcs_by_dst = Dict{Int, Vector{Int}}()
        for (src, dst, d) in overlaps[tid]
            push!(get!(srcs_by_dst, dst, Int[]), src)
        end
        
        # Min inters by overlapping dst genomes
        items = Pair{Int, Float32}[]
        
        for (dst, srcs) in srcs_by_dst
            dst_genomes = lookup_tid(dst, :genomes)
            src_genomes = vcat([lookup_tid(src, :genomes) for src in srcs]...)
            append!(items, [
                gi => minimum(view(pw_dists, gi, src_genomes))
                for gi in dst_genomes
            ])
        end
        
        sort!(items, by=last)
        @assert items[1][2] == taxon.min_inter_dist
        
        for i in 1:n
            gi = items[i][1]
            new_min_inter = items[i + 1][2]
            leaf = lookup_tid(genome_assignments[gi])
            
            push!(rows, (
                id=taxon.id,
                name=taxon.name,
                diameter=taxon.diameter,
                min_inter=taxon.min_inter_dist,
                i=i,
                new_min_inter=new_min_inter,
                leaf_id=leaf.id,
                leaf_name=leaf.name,
                leaf_ngenomes=leaf.ngenomes,
                genome=gi,
            ))
        end
    end
    
    df = DataFrame(rows)
    sort!(df, :id)
end

Unnamed: 0_level_0,id,name,diameter,min_inter,i,new_min_inter,leaf_id,leaf_name,leaf_ngenomes,genome
Unnamed: 0_level_1,Int64,String,Float32,Float32,Int64,Float32,Int64,String,Int64,Int64
1,10,Mycoplasma,1.0,0.788957,1,0.8266,722,Chlamydia trachomatis,95,36486
2,10,Mycoplasma,1.0,0.788957,2,0.918298,722,Chlamydia trachomatis,95,36484
3,10,Mycoplasma,1.0,0.788957,3,0.954319,722,Chlamydia trachomatis,95,40173
4,10,Mycoplasma,1.0,0.788957,4,0.98396,1460,Salmonella enterica,4049,29617
5,47,Mannheimia,0.961796,0.960099,1,0.976871,1748,Bibersteinia trehalosi,4,10875
6,152,Klebsiella,0.955573,0.928217,1,0.928614,1914,Enterobacter hormaechei subgroup 1,115,41910
7,152,Klebsiella,0.955573,0.928217,2,0.928661,1914,Enterobacter hormaechei subgroup 1,115,42039
8,152,Klebsiella,0.955573,0.928217,3,0.928965,1914,Enterobacter hormaechei subgroup 1,115,41873
9,152,Klebsiella,0.955573,0.928217,4,0.932194,1914,Enterobacter hormaechei subgroup 1,115,41920
10,156,Providencia,0.983623,0.960965,1,0.975101,635,Proteus mirabilis,48,41753


## Remove outliers

### Pick

In [19]:
# Src internal taxid => num genomes to remove
toremove_src = Dict(
    209 => 3,
    213 => 1,
    341 => 1,
    398 => 2
)

Dict{Int64,Int64} with 4 entries:
  213 => 1
  341 => 1
  398 => 2
  209 => 3

In [20]:
# Src internal taxid => num genomes to remove
toremove_dst = Dict(
    10 => 4,
    247 => 1,
    350 => 8,
)

Dict{Int64,Int64} with 3 entries:
  10  => 4
  247 => 1
  350 => 8

### Summarize

In [21]:
# Assumption in next cell
@assert isempty(intersect(keys(toremove_src), keys(toremove_dst)))

In [22]:
nremoved_by_leaf = Accumulator{Int, Int}()
new_min_inters = Dict{Int, Float32}()

removed_summary = let
    rows = []

    for (side, stats, toremove) in [(:src, src_stats, toremove_src), (:dst, dst_stats, toremove_dst)]
        for (tid, n) in toremove
            taxon = lookup_tid(tid)
            sdf = stats[stats.id .== tid, :]
            @assert sdf.i == 1:nrow(sdf)

            for i in 1:n
                row = sdf[i, :]
                nremoved_by_leaf[row.leaf_id] += 1

                push!(rows, (
                    genome_index=row.genome,
                    taxon_id=row.leaf_id,
                    taxon_name=row.leaf_name,
                    reason="min_inter outlier for $(taxon.id) $(taxon.name) on $side side",
                ))
            end
            
            new_min_inters[tid] = sdf[n, :new_min_inter]
        end
    end
    
    DataFrame(rows)
end

Unnamed: 0_level_0,genome_index,taxon_id,taxon_name,reason
Unnamed: 0_level_1,Int64,Int64,String,String
1,32825,1424,Mycobacteroides abscessus,min_inter outlier for 213 Mycobacteroides on src side
2,44077,878,Enterococcus faecium,min_inter outlier for 341 Enterococcus on src side
3,47760,999,Listeria monocytogenes,min_inter outlier for 398 Listeria on src side
4,44726,999,Listeria monocytogenes,min_inter outlier for 398 Listeria on src side
5,36486,722,Chlamydia trachomatis,min_inter outlier for 209 Chlamydia on src side
6,36484,722,Chlamydia trachomatis,min_inter outlier for 209 Chlamydia on src side
7,40173,722,Chlamydia trachomatis,min_inter outlier for 209 Chlamydia on src side
8,36486,722,Chlamydia trachomatis,min_inter outlier for 10 Mycoplasma on dst side
9,36484,722,Chlamydia trachomatis,min_inter outlier for 10 Mycoplasma on dst side
10,40173,722,Chlamydia trachomatis,min_inter outlier for 10 Mycoplasma on dst side


In [23]:
removed_by_leaf_summary = map(collect(nremoved_by_leaf)) do (leaf_id, nremoved)
    leaf = lookup_tid(leaf_id)
    return (
        id=leaf.id,
        name=leaf.name,
        ngenomes=leaf.ngenomes,
        nremoved=nremoved,
    )
end |> DataFrame

Unnamed: 0_level_0,id,name,ngenomes,nremoved
Unnamed: 0_level_1,Int64,String,Int64,Int64
1,1424,Mycobacteroides abscessus,221,1
2,878,Enterococcus faecium,411,1
3,722,Chlamydia trachomatis,95,6
4,1742,Lactobacillus rhamnosus,54,1
5,999,Listeria monocytogenes,522,2
6,859,Streptococcus pneumoniae,6809,8
7,1460,Salmonella enterica,4049,1


In [24]:
min_inter_summary = map(collect(new_min_inters)) do (tid, new_min_inter)
    taxon = lookup_tid(tid)
    (
        id=taxon.id,
        name=taxon.name,
        diameter=taxon.diameter,
        old_min_inter=taxon.min_inter_dist,
        new_min_inter=new_min_inter,
    )
end |> DataFrame

Unnamed: 0_level_0,id,name,diameter,old_min_inter,new_min_inter
Unnamed: 0_level_1,Int64,String,Float32,Float32,Float32
1,213,Mycobacteroides,0.929199,0.830563,0.999291
2,10,Mycoplasma,1.0,0.788957,0.98396
3,341,Enterococcus,0.996376,0.916719,0.983002
4,398,Listeria,0.994244,0.966109,0.986778
5,247,Lactiplantibacillus,0.986148,0.933174,0.980729
6,350,Bacillus,0.998517,0.891198,0.943174
7,209,Chlamydia,0.998077,0.788957,0.983753


## Write output

In [25]:
CSV.write(processed_out / "$DATESTR-removed-genomes.csv", removed_summary)

p"data-processed/210710-remove-min-inter-outliers/210710-removed-genomes.csv"

In [26]:
CSV.write(processed_out / "$DATESTR-removed-genomes-by-taxon.csv", removed_by_leaf_summary)

p"data-processed/210710-remove-min-inter-outliers/210710-removed-genomes-by-taxon.csv"

In [27]:
CSV.write(processed_out / "$DATESTR-min-inter-updates.csv", min_inter_summary)

p"data-processed/210710-remove-min-inter-outliers/210710-min-inter-updates.csv"