# 210424 Compile fixes

In [1]:
using Mmap
using Printf

In [2]:
using JSON
using CSV
using DataFrames
using FilePathsBase
using FilePathsBase: /
using ProgressMeter

In [3]:
using Midas.Pairwise: npairs, iterpairs
using TriMatrices

## Setup

In [4]:
# Set the COLUMNS environment variable to 400 (presumably so that wide tables are
# displayed without truncating columns — TODO confirm).
ENV["COLUMNS"] = 400

400

In [5]:
# Date stamp of this notebook; also used as a prefix for the summary output file.
DATESTR = "210424"
# Notebook name; used as the name of the output subdirectories.
NBNAME = string(DATESTR, "-compile-fixes")

"210424-compile-fixes"

In [6]:
# Input locations (relative paths), keyed by the role each one plays below.
infiles = Dict(
    # Raw Float32 file, memory-mapped below as the pairwise genome distance values
    :distances => p"../../data/intermediate/200727-find-overlaps/genome-pw-distances.raw-float32",
    # Directory read below for "taxa.csv" and "genome-taxon-assignments.json"
    :db => p"../../data/intermediate/210303-database-v2-overlaps/210328-compile-edits/",
    # Directory read below for "components.json"
    :overlaps => p"../../data/intermediate/210303-database-v2-overlaps/210328-find-species-overlaps-2/",
    # Directory read below for per-component "$i.json" and "$i-subgroup-thresholds.json" files
    :fixes => p"data-intermediate/component-fixes/",
);

In [7]:
# Output directories, both named after this notebook. `recursive=true` also creates a
# missing parent directory (e.g. on a fresh checkout) instead of failing; an existing
# directory is left alone by the `isdir` guard.
intermediate_out = p"data-intermediate" / NBNAME
isdir(intermediate_out) || mkdir(intermediate_out; recursive=true)

processed_out = p"data-processed" / NBNAME
isdir(processed_out) || mkdir(processed_out; recursive=true)

true

## Load data

### Database

In [8]:
# Taxon table loaded from the database directory. Columns read or written below include
# :id, :name, :parent_id, :manual_threshold, :report and :is_leaf.
taxa_df = DataFrame(CSV.File(infiles[:db] / "taxa.csv"));

In [9]:
# Taxon ID assigned to each genome, indexed by genome number.
genome_assignments = open(infiles[:db] / "genome-taxon-assignments.json") do io
    Vector{Int}(JSON.parse(io))
end
ngenomes = length(genome_assignments)

50752

In [10]:
# Row index in `taxa_df` of the taxon with the given ID, or `nothing` if there is none.
function taxon_idx(id)
    return findfirst(tid -> tid == id, taxa_df.id)
end

taxon_idx (generic function with 1 method)

In [11]:
# Find the first row of `df` whose column `key` is `isequal` to `val` and return the
# selection `df[row, cols]` (the whole row by default). Returns `nothing` when no row
# matches.
function lookup(df::DataFrame, (key, val)::Pair{Symbol, <:Any}, cols=:)
    row = findfirst(isequal(val), df[!, key])
    row === nothing && return nothing
    return df[row, cols]
end

# Look up a taxon in `taxa_df` by its ID; `cols` selects which columns to return.
function lookup_taxon(id, cols=:)
    return lookup(taxa_df, :id => id, cols)
end

lookup_taxon (generic function with 2 methods)

### Distances

In [12]:
# Memory-map the flat Float32 array of pairwise genome distances (one value per genome
# pair). The stream is only needed while the mapping is created, so open it in a `do`
# block: that closes the handle right away instead of leaking it for the session. The
# mapped array stays usable after the stream is closed.
pw_data = open(infiles[:distances]) do io
    Mmap.mmap(io, Vector{Float32}, (npairs(ngenomes),))
end;

# Matrix-style view over the flat pair data.
# NOTE(review): `TriSymmetric{false}` presumably means a symmetric layout without a
# stored diagonal — confirm against the TriMatrices documentation.
pw_dists = TriMatrix(TriSymmetric{false}(), ngenomes, pw_data);

### Overlaps

In [13]:
# Taxon IDs belonging to each overlap component, in the order stored in the file.
components = open(infiles[:overlaps] / "components.json") do io
    [Vector{Int}(entry["taxon_ids"]) for entry in JSON.parse(io)]
end

ncomps = length(components)

34

### Fixes

In [14]:
# Fix record for every taxon that has one, keyed by taxon ID. Each record is the parsed
# JSON object with its "taxon_id" entry removed.
fixes = Dict{Int, Any}()

for i in 1:ncomps
    fix_path = infiles[:fixes] / "$i.json"

    if exists(fix_path)
        # Main data: one record per taxon, which must belong to component i
        for record in open(JSON.parse, fix_path)
            taxon_id = pop!(record, "taxon_id")
            @assert taxon_id ∈ components[i]
            fixes[taxon_id] = record
        end

        # Subgroup thresholds live in an optional second file
        sg_path = infiles[:fixes] / "$i-subgroup-thresholds.json"
        exists(sg_path) || continue

        println("Component $i has subgroup threshold data")

        for entry in open(JSON.parse, sg_path)
            parent_tid = entry["parent_id"]
            @assert parent_tid ∈ components[i]

            # Attach the threshold to the parent's fix record, creating the table on first use
            sg_table = get!(fixes[parent_tid], "subgroup_thresholds", Dict{Int, Float64}())
            sg_table[entry["subgroup_index"]] = entry["threshold"]
        end
    else
        @warn "No fixes for component $i"
    end
end;

Component 2 has subgroup threshold data
Component 9 has subgroup threshold data
Component 15 has subgroup threshold data
Component 20 has subgroup threshold data


└ @ Main In[14]:7


## Apply fixes

In [15]:
# Work on copies so that the loaded `taxa_df` and `genome_assignments` stay untouched;
# both originals are read again when the summary table is built.
taxa_edited = deepcopy(taxa_df);
assignments_edited = copy(genome_assignments);

### Main fixes

In [16]:
# Records of what was changed, filled in while the fixes are applied:
#   removed_taxa      - IDs of taxa deleted outright
#   split_taxa        - split taxon ID => IDs of the subgroup taxa created for it
#   removed_genomes   - indices of genomes unassigned from their taxon
#   manual_thresholds - taxon ID => manually specified threshold
removed_taxa = Int[]
split_taxa = Dict{Int, Vector{Int}}()
removed_genomes = Int[]
manual_thresholds = Dict{Int, Float64}()


# Visit taxa in ascending ID order. Iterating the Dict directly follows its internal hash
# order, which would make the IDs handed out to new subgroups depend on Dict internals
# rather than on the input data.
for tid in sort!(collect(keys(fixes)))
    item = fixes[tid]

    ti = findfirst(==(tid), taxa_edited[!, :id])
    isnothing(ti) && error("Fix given for taxon $tid, which is not in the taxon table")

    # Taxon removed
    if item["removed"]
        @assert isempty(item["removed_genomes"])
        @assert isnothing(item["split"])
        @assert isnothing(item["manual_threshold"])

        delete!(taxa_edited, ti)
        # 0 marks a genome as not assigned to any taxon
        assignments_edited[assignments_edited .== tid] .= 0
        push!(removed_taxa, tid)

        continue
    end

    # Manual threshold
    if !isnothing(item["manual_threshold"])
        taxa_edited[ti, :manual_threshold] = item["manual_threshold"]
        manual_thresholds[tid] = item["manual_threshold"]
    end

    # Removed genomes
    @assert all(assignments_edited[gi] == tid for gi in item["removed_genomes"])
    assignments_edited[item["removed_genomes"]] .= 0
    append!(removed_genomes, item["removed_genomes"])

    # Split
    if !isnothing(item["split"])
        # The split taxon becomes an internal node. Write through the table directly and
        # copy the name out now, rather than holding a row view across the `push!` calls
        # below that grow the table.
        taxa_edited[ti, :manual_threshold] = 0.
        taxa_edited[ti, :is_leaf] = false
        parent_name = taxa_edited[ti, :name]

        sg_thresholds = get(item, "subgroup_thresholds", Dict{Int, Float64}())
        @assert issubset(keys(sg_thresholds), 1:length(item["split"]))

        # Start above every ID in the original table as well as the edited one, so the ID
        # of a taxon removed earlier is never reused for a different taxon.
        nextid = max(maximum(taxa_df[!, :id]), maximum(taxa_edited[!, :id])) + 1
        subgroup_ids = Int[]

        for (i, gidxs) in enumerate(item["split"])
            @assert all(assignments_edited[gi] == tid for gi in gidxs)
            assignments_edited[gidxs] .= nextid

            push!(taxa_edited, (
                id=nextid,
                ncbi_id=missing,
                name="$parent_name subgroup $i",
                rank=missing,
                parent_id=tid,
                in_v12=false,
                # NaN = no manual threshold; the leaf-threshold step further down
                # substitutes the subgroup's diameter for NaN
                manual_threshold=get(sg_thresholds, i, NaN),
                report=false,
                is_leaf=true,
            ))

            push!(subgroup_ids, nextid)
            nextid += 1
        end

        # Every genome of the split taxon must now be in a subgroup or unassigned
        @assert tid ∉ assignments_edited

        split_taxa[tid] = subgroup_ids
    end
end

### Taxon merges

These taxa groups could not be disentangled and will instead be merged:

In [17]:
# Each entry maps the name of the new merged taxon to the names of the existing taxa that
# are placed underneath it.
merge_groups = [
    "Lacticaseibacillus casei/paracasei" => ["Lactobacillus casei", "Lacticaseibacillus paracasei"],  # Despite names, these are both in genus Lacticaseibacillus
];

In [18]:
# ID of each newly created merged taxon => IDs of the taxa merged under it. Taxon IDs are
# stored rather than row indices, because row indices stop being valid as soon as rows
# are deleted from `taxa_edited`.
merges = Dict{Int, Vector{Int}}()

for (merged_name, member_names) in merge_groups
    # Row index of each member taxon; fail with a clear message on an unknown name
    tidxs = map(member_names) do name
        idx = findfirst(==(name), taxa_edited[!, :name])
        isnothing(idx) && error("Cannot merge \"$name\": no taxon with that name")
        idx
    end

    # All members must share one parent, which becomes the parent of the merged taxon
    parent_id = only(unique(taxa_edited[i, :parent_id] for i in tidxs))

    # Start above every ID in the original table as well as the edited one, so the ID of
    # a previously removed taxon is never reused.
    merged_id = max(maximum(taxa_df[!, :id]), maximum(taxa_edited[!, :id])) + 1
    push!(taxa_edited, (
        id=merged_id,
        ncbi_id=missing,
        name=merged_name,
        rank=missing,
        parent_id=parent_id,
        in_v12=false,
        manual_threshold=NaN,
        report=true,
        is_leaf=false,
    ))

    # Re-parent the members under the merged taxon; reporting moves to the merged taxon
    taxa_edited[tidxs, :parent_id] .= merged_id
    taxa_edited[tidxs, :manual_threshold] .= 0.0
    taxa_edited[tidxs, :report] .= false

    merges[merged_id] = taxa_edited[tidxs, :id]
end

### Remove empty internal taxa

Remove any taxa not marked as leaves which do not have any children. This occurs when all of a taxon's children were removed.

Taxa were also removed in the previous experiment (where some taxa were removed due to all of their 2/3 genomes being identical) without this cleanup being applied afterwards, so some of these empty taxa already exist in the taxon list loaded at the beginning of this notebook. They are removed here as well.

In [19]:
# IDs of internal (non-leaf) taxa dropped because no remaining taxon names them as parent
extra_removed = Set{Int}()

# Repeat until stable: deleting an empty taxon can leave its own parent empty in turn
while true
    referenced = Set(skipmissing(taxa_edited[!, :parent_id]))
    row_ids = taxa_edited[!, :id]
    leaf_flags = taxa_edited[!, :is_leaf]

    remove_idxs = [
        i for i in eachindex(row_ids)
        if !leaf_flags[i] && row_ids[i] ∉ referenced
    ]

    isempty(remove_idxs) && break

    union!(extra_removed, row_ids[remove_idxs])
    delete!(taxa_edited, remove_idxs)
end

# Show the removed taxa as they appeared in the original table
taxa_df[in.(taxa_df[!, :id], Ref(extra_removed)), :]

Unnamed: 0_level_0,id,ncbi_id,name,rank,parent_id,in_v12,manual_threshold,report,is_leaf
Unnamed: 0_level_1,Int64,Int64?,String,String?,Int64?,Bool,Float64,Bool,Bool
1,56,119044,Filomicrobium,genus,missing,1,,1,0
2,62,33055,Candidatus Kinetoplastibacterium,genus,missing,1,,1,0
3,78,59732,Chryseobacterium,genus,missing,1,,1,0
4,98,135575,Idiomarina,genus,missing,1,,1,0
5,127,2767353,Lancefieldella,genus,missing,1,,1,0
6,141,2675231,Mesobacillus,genus,missing,1,,1,0
7,195,41707,Lawsonia,genus,missing,1,,1,0
8,238,939,Hydrogenobacter,genus,missing,1,,1,0
9,393,282198,Nereida,genus,missing,1,,1,0


### Root taxa parent IDs

One taxon has a `0` in the `:parent_id` column instead of `missing`; replace it with `missing` for consistency.

In [20]:
# Replace parent ID of 0 with missing (means same thing)
# Print how many taxa have a parent ID of 0, then rewrite those entries as `missing`
# in place.
let parent_ids = taxa_edited[!, :parent_id]
    println(count(pid -> isequal(pid, 0), parent_ids))
    replace!(parent_ids, 0 => missing)
end;

1


## Verify all overlaps of leaf taxa removed

### Setup

In [21]:
# Row indices and IDs of all leaf taxa in the edited table
leaf_tidxs = findall(identity, taxa_edited[!, :is_leaf])
leaf_tids = taxa_edited[!, :id][leaf_tidxs]
nleaves = length(leaf_tidxs)

# The leaf taxa must be exactly the taxa that still have genomes assigned (0 = unassigned)
@assert issetequal(leaf_tids, Set(Iterators.filter(!iszero, assignments_edited)))

nleaves

1445

In [22]:
# Position of each leaf taxon ID within `leaf_tids`
leaf_id_to_index = Dict(zip(leaf_tids, eachindex(leaf_tids)))

# Genome indices assigned to each leaf, in the same order as `leaf_tids`
leaf_gidxs = [Int[] for _ in leaf_tids]

for (gi, tid) in pairs(assignments_edited)
    iszero(tid) && continue  # genome not assigned to any taxon
    push!(leaf_gidxs[leaf_id_to_index[tid]], gi)
end

@assert all(length(gidxs) >= 2 for gidxs in leaf_gidxs)

### Leaf distances

In [23]:
# Diameter of each leaf: the largest pairwise distance among its own genomes
leaf_diameters = map(gidxs -> maximum(@view pw_dists[gidxs, gidxs]), leaf_gidxs);

In [24]:
# Threshold of each leaf: its manual threshold if one is set, otherwise (NaN) its diameter
leaf_thresholds = map(taxa_edited[leaf_tidxs, :manual_threshold], leaf_diameters) do thresh, diam
    if isnan(thresh)
        diam
    else
        thresh
    end
end;

In [25]:
# Smallest genome-to-genome distance between every pair of leaves, stored symmetrically.
# Only the pairs produced by `iterpairs` are filled in; other entries keep their initial zero.
leaf_pw_min_dists = zeros(eltype(pw_dists), nleaves, nleaves)

@showprogress for (i, j) in iterpairs(nleaves)
    closest = minimum(@view pw_dists[leaf_gidxs[i], leaf_gidxs[j]])
    leaf_pw_min_dists[i, j] = closest
    leaf_pw_min_dists[j, i] = closest
end

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:13[39m2:05[39m


### Overlaps

In [26]:
# Directed overlaps `a => b`: the closest genome of leaf b lies within the threshold of
# leaf a. None may remain once all fixes are applied.
overlap_pairs = Pair{Int, Int}[]

for (i, j) in iterpairs(nleaves)
    d = leaf_pw_min_dists[i, j]

    # Check both directions, since each leaf has its own threshold
    for (a, b) in ((i, j), (j, i))
        d <= leaf_thresholds[a] && push!(overlap_pairs, a => b)
    end
end

@assert isempty(overlap_pairs)

## Summarize fixes

In [27]:
# One summary row per original taxon that was removed, split, or given a manual
# threshold; all other taxa are left out. Rows are sorted by overlap component.
summary_df = let
    # Component number of every taxon that belongs to an overlap component
    tid_to_ci = Dict(tid => ci for (ci, tids) in enumerate(components) for tid in tids)

    # Largest pairwise distance within a set of genomes
    diameter(gidxs) = maximum(view(pw_dists, gidxs, gidxs))

    rows = Any[]

    for row in eachrow(taxa_df)
        tid = row[:id]
        was_split = haskey(split_taxa, tid)

        removed_reason = if tid ∈ removed_taxa
            "manual"
        elseif tid ∈ extra_removed
            "child taxa removed"
        else
            missing
        end
        removed = !ismissing(removed_reason)

        # Taxa that were kept unchanged do not appear in the summary
        removed || haskey(manual_thresholds, tid) || was_split || continue

        # State before the fixes, from the original assignments
        orig_gidxs = findall(==(tid), genome_assignments)
        initial_ngenomes = length(orig_gidxs)
        initial_diameter = isempty(orig_gidxs) ? missing : diameter(orig_gidxs)

        # State after the fixes: per subgroup for split taxa, otherwise for the taxon itself
        if removed
            final_diameter = missing
            final_ngenomes = 0
            split_str = missing
        else
            member_ids = was_split ? split_taxa[tid] : [tid]
            subgroup_gidxs = [findall(==(sid), assignments_edited) for sid in member_ids]

            split_str = was_split ? join(length.(subgroup_gidxs), "/") : missing
            final_diameter = join(
                (@sprintf("%.4f", diameter(gidxs)) for gidxs in subgroup_gidxs),
                "/",
            )
            final_ngenomes = sum(length.(subgroup_gidxs))
        end

        push!(rows, (
            id=tid,
            ncbi_id=row[:ncbi_id],
            name=row[:name],
            rank=row[:rank],
            component=get(tid_to_ci, tid, missing),
            taxon_removed=removed,
            taxon_removed_reason=removed_reason,
            split=split_str,
            initial_ngenomes=initial_ngenomes,
            final_ngenomes=final_ngenomes,
            initial_diameter=initial_diameter,
            final_diameter=final_diameter,
            manual_threshold=get(manual_thresholds, tid, missing),
        ))
    end

    sort!(DataFrame(rows), :component)
end

Unnamed: 0_level_0,id,ncbi_id,name,rank,component,taxon_removed,taxon_removed_reason,split,initial_ngenomes,final_ngenomes,initial_diameter,final_diameter,manual_threshold
Unnamed: 0_level_1,Int64,Int64?,String,String?,Int64?,Bool,String?,String?,Int64,Int64,Float32?,String?,Float64?
1,1061,67351,Streptomyces californicus,species,1,0,missing,missing,5,5,0.316231,0.3162,0.2
2,1076,1888,Streptomyces albus,species,1,0,missing,missing,6,6,0.956074,0.9561,0.6
3,1086,1912,Streptomyces hygroscopicus,species,1,0,missing,missing,6,6,0.960201,0.9602,0.6
4,551,294,Pseudomonas fluorescens,species,2,0,missing,missing,38,38,0.976624,0.9766,0.8
5,555,303,Pseudomonas putida,species,2,0,missing,18/4/6,28,28,0.886751,0.7105/0.5615/0.7793,missing
6,557,317,Pseudomonas syringae,species,2,0,missing,51/19/33/7,114,110,0.915752,0.4567/0.3568/0.7149/0.3883,missing
7,1135,157782,Pseudomonas parafulva,species,2,0,missing,missing,4,4,0.941948,0.9419,0.6
8,1158,43263,Pseudomonas alcaligenes,species,2,0,missing,missing,4,4,0.961824,0.9618,0.8
9,1165,43306,Pseudomonas denitrificans (nom. rej.),species,2,1,manual,missing,9,0,0.398932,missing,missing
10,1530,29438,Pseudomonas savastanoi,species,2,0,missing,9/15,24,24,0.489886,0.1716/0.2872,missing


## Write output

In [28]:
# Save the edited taxon table to the intermediate output directory.
CSV.write(intermediate_out / "taxa.csv", taxa_edited);

In [29]:
# Save the edited genome => taxon assignments (0 = unassigned) as a JSON array.
open(
    io -> JSON.print(io, assignments_edited),
    intermediate_out / "genome-taxon-assignments.json",
    "w",
)

In [30]:
# Save the fix summary table to the processed output directory, prefixed with the date stamp.
CSV.write(processed_out / "$DATESTR-species-overlap-fixes-summary.csv", summary_df);