# 200729 Find species overlaps

In [1]:
using ProgressMeter
using GZip
using JSON
using Mmap
using DataFrames
using CSV
using StatsBase
using CategoricalArrays
using HDF5

In [2]:
using Midas
using Midas.SignatureFiles
using Midas.Pairwise: npairs
using TriMatrices

## Func defs

In [3]:
"""
    findclass(a::CategoricalArray, cls::CategoricalValue)
    findclass(a::CategoricalArray, i::Integer)

Return the indices of the elements of `a` that compare equal to `cls`.
When an integer `i` is given, the class is looked up as `a.pool[i]` first.
"""
function findclass(a::CategoricalArray, cls::CategoricalValue)
    return findall(==(cls), a)
end

function findclass(a::CategoricalArray, i::Integer)
    cls = a.pool[i]
    return findclass(a, cls)
end

"""
    selectclass(a::AbstractVector, c::CategoricalArray, cls)

Return the elements of `a` at the positions where `c` equals class `cls`
(`cls` is anything accepted by `findclass`).
"""
function selectclass(a::AbstractVector, c::CategoricalArray, cls)
    idxs = findclass(c, cls)
    return a[idxs]
end

selectclass (generic function with 1 method)

## File paths

In [4]:
# Input file locations, keyed by the role each file plays in this notebook.
infiles = Dict{Symbol, String}(
    :taxonomy => "/home/jared/notebooks/midas-notebooks-2019/build-v1-database/out/3-curated-taxonomy-assignments.csv",
#     :signatures => "/home/jared/data/2019_20/refseq_curated_1.1beta_200604.midas-signatures.gz",
    :distances => "../../data/intermediate/200727-find-overlaps/genome-pw-distances.raw-float32",
);

In [5]:
# Directory that receives this notebook's output files (joined with file names below).
outdir = "../../data/intermediate/200727-find-overlaps/"

"../../data/intermediate/200727-find-overlaps/"

In [6]:
# Output file locations, keyed by role; all live under `outdir`.
outfiles = Dict{Symbol, String}(
    :overlaps => joinpath(outdir, "200729-refseq-curated-1.1_beta-species-overlaps.h5"),
);

## Load taxonomy

In [7]:
# Read the taxonomy assignment table into a DataFrame.
taxdf = CSV.File(infiles[:taxonomy]) |> DataFrame;

In [8]:
# Genus label of every genome, as a categorical array, plus its distinct levels.
sig_genera = categorical(taxdf.genus)
genera = levels(sig_genera)

# Species label of every genome; a species is identified by its (genus, species) pair.
sig_species = categorical([(r.genus, r.species) for r in eachrow(taxdf)])
species = levels(sig_species)

ngenera = length(genera)
nspecies = length(species)

(ngenera, nspecies)

(419, 1438)

In [9]:
# Tally genomes per genus and per species from the integer level codes (`.refs`).
# NOTE(review): presumably entry k is the number of genomes whose level code is k —
# confirm against the StatsBase `counts` documentation for integer arrays.
genus_counts = counts(sig_genera.refs)
species_counts = counts(sig_species.refs)
;

In [10]:
# Last "/"-separated component of each genome key.
genome_accs = [split(key, "/")[end] for key in taxdf.key];

In [11]:
# For each species level (a (genus, species) tuple), the position of its genus in `genera`.
species_to_genus = [findfirst(==(first(sp)), genera) for sp in species];

In [12]:
# Number of genomes: `sig_genera` holds one genus label per row of `taxdf`.
nsigs = length(sig_genera)

50752

## Open distances

In [13]:
# Memory-map the raw Float32 distance file as a flat vector of `npairs(nsigs)` values.
# The stream is only needed while the mapping is created, so it is opened in a do-block
# and closed right afterwards instead of being left open for the rest of the session.
pw_data = open(infiles[:distances], "r") do io
    Mmap.mmap(io, Vector{Float32}, (npairs(nsigs),))
end;

# Wrap the flat vector as an nsigs × nsigs matrix without copying it.
# NOTE(review): `TriSymmetric{false}` presumably means a symmetric layout that does not
# store the diagonal — confirm against the TriMatrices documentation.
pw_dists = TriMatrix(TriSymmetric{false}(), nsigs, pw_data);

In [14]:
# Sanity check on the loaded data: no stored distance value is negative.
@assert !any(<(0), pw_data)

## Find overlaps

In [15]:
"""
    intra_inter_extrema(dists, classes, i, ignore=nothing)

Find, for the item with index `i`, its nearest item of a different class and its
farthest item of the same class.

# Arguments
- `dists`: square matrix of pairwise distances; `dists[i, j]` is read for every `j`.
- `classes`: class label of every item, indexed like the rows of `dists`.
- `i`: index of the item of interest.
- `ignore`: optional Boolean mask; items `j` with `ignore[j] == true` are skipped.

# Returns
A tuple `(min_inter, max_intra)` where each element is a `(distance, index)` tuple.
If no item of another class is seen, `min_inter` is `(typemax(T), 0)`; if no item of the
same class has a distance greater than zero, `max_intra` is `(zero(T), 0)`.

Item `i` itself is not skipped; it is compared like any other same-class item.
"""
function intra_inter_extrema(
    dists::AbstractMatrix{T},
    classes::AbstractVector,
    i::Integer,
    ignore::Union{Nothing,AbstractVector{Bool}}=nothing,
) where T
    ci = classes[i]

    min_inter = (typemax(T), 0)
    max_intra = (zero(T), 0)

    for j in axes(dists, 1)
        # `ignore` is a mask vector and must be indexed; calling it like a function
        # raised a MethodError whenever a mask was actually supplied.
        !isnothing(ignore) && ignore[j] && continue
        cj = classes[j]
        d = dists[i, j]
        if ci == cj
            d > max_intra[1] && (max_intra = (d, j))
        else
            d < min_inter[1] && (min_inter = (d, j))
        end
    end

    return min_inter, max_intra
end

intra_inter_extrema (generic function with 2 methods)

In [16]:
# For every genome, record (index, distance) of
#   - min_inter: its closest genome belonging to a different species, and
#   - max_intra: its farthest genome belonging to the same species.
# An index of 0 means `intra_inter_extrema` found no such genome.
min_inter = Tuple{Int, Float32}[]
max_intra = Tuple{Int, Float32}[]
sizehint!(min_inter, nsigs)
sizehint!(max_intra, nsigs)

@showprogress for i in 1:nsigs
    # `intra_inter_extrema` returns (distance, index) tuples; they are stored flipped as
    # (index, distance). Distinct names keep the loop variable `i` from being overwritten
    # by the destructuring.
    (inter_dist, inter_idx), (intra_dist, intra_idx) = intra_inter_extrema(pw_dists, sig_species, i)
    push!(min_inter, (inter_idx, inter_dist))
    push!(max_intra, (intra_idx, intra_dist))
end

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:02:00[39m


In [17]:
# Indices of genomes whose closest other-species genome is no farther away than their
# farthest same-species genome.
has_overlap = [g for g in 1:nsigs if min_inter[g][2] <= max_intra[g][2]]
length(has_overlap)

23786

## Species components

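Find the connected components of the graph whose nodes are species and whose edges link two species whenever a genome of one has its closest other-species genome in the other.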

In [18]:
# Pair every overlapping genome with the index of its closest other-species genome.
overlap_pairs = map(g -> g => first(min_inter[g]), has_overlap);

In [19]:
# Connected components over species: each entry of `overlap_pairs` links the species of
# its two genomes. Every component is a Set of species level codes.
overlap_components = Set{Int}[]

let
    # species code => the component (shared Set object) that currently contains it
    member_of = Dict{Int, Set{Int}}()

    for (gi, gj) in overlap_pairs
        sp_a = Int(sig_species.refs[gi])
        sp_b = Int(sig_species.refs[gj])

        comp_a = get(member_of, sp_a, nothing)
        comp_b = get(member_of, sp_b, nothing)

        if comp_a === nothing
            if comp_b === nothing
                # Neither species has been seen yet: start a new component.
                fresh = Set([sp_a, sp_b])
                member_of[sp_b] = fresh
                member_of[sp_a] = fresh
                push!(overlap_components, fresh)
            else
                # Only the second species is known: the first joins its component.
                push!(comp_b, sp_a)
                member_of[sp_a] = comp_b
            end
        elseif comp_b === nothing
            # Only the first species is known: the second joins its component.
            push!(comp_a, sp_b)
            member_of[sp_b] = comp_a
        elseif comp_a !== comp_b
            # Both are known but in different components: fold the second component into
            # the first, drop it from the list, and re-point its members.
            union!(comp_a, comp_b)
            filter!(c -> c !== comp_b, overlap_components)

            for sp in comp_b
                member_of[sp] = comp_a
            end
        end
    end
end

length(overlap_components), sum(length.(overlap_components))

(41, 147)

In [20]:
# Component number of every species (position of its component in `overlap_components`);
# species that belong to no component keep the value 0.
overlap_components_vec = zeros(Int, nspecies)
for (component_id, members) in enumerate(overlap_components), sp in members
    overlap_components_vec[sp] = component_id
end

## Save

In [22]:
# Write the per-genome extrema and the species overlap components to the HDF5 output file.
# The do-block form of `h5open` closes the file even when one of the writes throws,
# whereas a trailing `close(h5)` is skipped on error and leaves the file open.
# NOTE(review): mode "cw" keeps the contents of an existing file, so re-running this cell
# against a file that already has these dataset names presumably fails — confirm.
h5open(outfiles[:overlaps], "cw") do h5
    h5["genome_keys"] = taxdf[!, :key]

    # min_inter / max_intra hold (index, distance) tuples; store each part separately.
    h5["min_inter_indices"] = first.(min_inter)
    h5["min_inter_dists"] = last.(min_inter)
    h5["max_intra_indices"] = first.(max_intra)
    h5["max_intra_dists"] = last.(max_intra)

    h5["has_overlap"] = has_overlap
    h5["overlap_components"] = overlap_components_vec
end;