# 200722 Find intra-taxon distances

In [None]:
using GZip
using ProgressMeter
using DataFrames
using StatsBase
using CSV
using CategoricalArrays
using JSON
using Serialization
using VPTrees

In [2]:
using Midas
using Midas.Distances
using Midas.SignatureFiles

In [3]:
# Alias for the distance function handed to the VP-tree build in the
# "Overlap detection" section. jaccard_dist_sorted is not defined in this
# notebook — presumably exported by Midas.Distances; confirm.
const metric = jaccard_dist_sorted

jaccard_dist_sorted (generic function with 1 method)

## Function definitions

In [4]:
"""
    findclass(a::CategoricalArray, cls::CategoricalValue)
    findclass(a::CategoricalArray, i::Integer)

Return the indices of all elements of `a` that are equal to the class `cls`.

The integer form looks up entry `i` of `a.pool` and delegates to the
`CategoricalValue` method.
"""
function findclass(a::CategoricalArray, cls::CategoricalValue)
    return findall(x -> x == cls, a)
end

function findclass(a::CategoricalArray, i::Integer)
    return findclass(a, a.pool[i])
end

findclass (generic function with 2 methods)

## File paths

In [5]:
# Input files, given as absolute paths on the original author's machine;
# adjust before re-running elsewhere.
# taxonomy_file is read as CSV into `taxdf`; signature_file_name is opened as a
# gzip stream and wrapped in a SignatureFile.
taxonomy_file = "/Users/student/notebooks/midas/midas-notebooks-2019/build-v1-database/out/3-curated-taxonomy-assignments.csv"
signature_file_name = "/Users/student/projects/midas/data/2019_20/refseq_curated_1.1beta_200604.midas-signatures.gz"
# The lone `;` keeps the notebook from echoing the last assigned value.
;

In [6]:
# Scratch directory, relative to the notebook's working directory.
tmpdir = "tmp/"
# mkpath does nothing when the directory already exists and creates any missing
# parents, so no separate isdir check is needed.
mkpath(tmpdir);

In [None]:
# Output directory for this notebook's results, relative to the working directory.
outdir = "../../data/processed/200722-detect-overlaps/"
# The path is nested: plain mkdir throws when "../../data/processed" does not
# exist yet. mkpath creates all missing intermediate directories and does
# nothing when the directory is already there.
mkpath(outdir);

## Load taxonomy

In [7]:
# Parse the curated taxonomy assignments CSV into a DataFrame (one row per genome).
taxdf = taxonomy_file |> CSV.File |> DataFrame;

In [8]:
# Genus label of every taxonomy row as a categorical array; its levels are the
# distinct genera.
sig_genera = categorical(taxdf.genus)
genera = levels(sig_genera)
ngenera = length(genera)

# Species labels are (genus, species) tuples built row by row.
sig_species = categorical([(row.genus, row.species) for row in eachrow(taxdf)])
species = levels(sig_species)
nspecies = length(species)

# Displayed as the cell result.
(ngenera, nspecies)

(419, 1438)

In [9]:
# Number of rows per genus / per species, indexed by level reference code.
# The level count is passed explicitly so the counted range is always 1:k and
# entry i corresponds to level i. A bare counts(refs) spans minimum:maximum of
# the data, which would shift every index if a reference code of 0 (missing
# label) were present.
genus_counts = counts(sig_genera.refs, length(levels(sig_genera)))
species_counts = counts(sig_species.refs, length(levels(sig_species)))
;

UndefVarError: UndefVarError: counts not defined

In [10]:
# Keep only the text after the final '/' of each taxonomy key.
genome_accs = [last(split(key, '/')) for key in taxdf.key];

## Load signatures

In [11]:
# Open the gzip-compressed signature file and wrap the stream in a SignatureFile.
# The stream is not closed in this cell; a later cell reads from it again
# through sigfile.stream.
sigfile = SignatureFile(GZip.open(signature_file_name))

SignatureFile{UInt32,GZipStream} with 50752 elements

In [12]:
# Read the file's metadata from the underlying stream and pretty-print it as
# JSON with an indent of 2.
# NOTE(review): _read_metadata is underscore-prefixed (presumably an internal
# helper of SignatureFiles) and header.o_metadata is splatted into its remaining
# arguments — confirm what those values mean before relying on this call.
metadata = SignatureFiles._read_metadata(sigfile.stream, sigfile.header.o_metadata...)
JSON.print(metadata, 2)

{
  "date_created": "2020-06-04",
  "genome_set": {
    "key": "midas/assembly/curated",
    "name": "refseq_curated_2020",
    "meta": {
      "date_created": "2020-05-26",
      "parent": {
        "key": "midas/assembly/curated",
        "key_version": "0.9"
      }
    },
    "description": "Created 2020-05-26 by filtering version 0.9 by inclusion in refseq/assembly/all 1.1",
    "key_version": "1.1"
  },
  "kmer_spec": {
    "k": 11,
    "prefix": "ATGAC"
  },
  "description": "Signatures for version 1.1 of curated genome set"
}


In [13]:
# The signature IDs must equal the taxonomy keys element for element, so that
# row i of taxdf describes signature i. (Both are expected to be stored in the
# same sorted order; the assertion checks equality, not sortedness itself.)
@assert sigfile.ids == taxdf[:, :key]

In [None]:
# Build a SignatureArray from the signature file, timing the call.
# Presumably this reads every signature into memory — confirm against Midas.
@time sigs = SignatureArray(sigfile);

## Build metric tree

In [None]:
# Build the VP-tree over all signatures using the notebook-wide `metric`
# (bound to jaccard_dist_sorted above), timing the construction.
@time tree = VPTree(collect(sigs), metric);

In [None]:
# Time find_diameter on the signatures in gsigs.
# NOTE(review): gsigs is only assigned in a later cell (the one starting with
# `g = 52`), so that cell must be run before this one. find_diameter is not
# defined in the visible cells — presumably it comes from Midas and returns the
# largest pairwise distance; confirm.
@time find_diameter(gsigs, jaccard_dist_sorted)

In [None]:
# Position in genus_counts of the 21st-largest entry (sortperm is ascending, so
# `end-20` is the 21st from the top).
sortperm(genus_counts)[end-20]

In [None]:
# Select one genus by its level reference code and gather its signatures.
# NOTE(review): 52 is hard-coded; presumably it was taken from the sortperm
# lookup in the previous cell — confirm.
g = 52
# Indices of all entries of sig_genera belonging to genus g.
gidxs = findclass(sig_genera, g);
# The matching subset of signatures.
gsigs = sigs[gidxs];

In [None]:
# Maps a do-block over every genus index 1:ngenera via progress_pmap
# (presumably ProgressMeter's progress-reporting pmap wrapper — confirm).
# TODO: the do-block body is empty, so it returns `nothing` for every genus;
# the per-genus computation still has to be written.
genera_max_intra = progress_pmap(1:ngenera) do i
    
end

## Overlap detection

In [None]:
# Time a find_nearest query on the tree for the first signature.
# NOTE(review): filter_species is not defined in the visible cells, and the
# meaning of the literal `1` arguments (neighbour count? species index?) cannot
# be determined from here — confirm against their definitions.
@time find_nearest(tree, sigs[1], 1, filter_species(1))

In [None]:
# Rebuild the VP-tree over all signatures (timed). Since `metric` is bound to
# jaccard_dist_sorted, this is the same construction as in the
# "Build metric tree" section and overwrites `tree`.
@time tree = VPTree(collect(sigs), metric);

In [None]:
"""
    find_pure_subtrees(root::VPTrees.Node, labels::CategoricalArray)

Find the maximal subtrees below `root` whose nodes all carry the same label.

`labels.refs` is indexed with each node's `index` field, so `labels` is assumed
to be aligned with the data the tree was built from (TODO confirm for callers).

Returns a vector with one entry per level of `labels`; entry `i` is a vector of
the root nodes of the maximal pure subtrees whose label reference code is `i`.
"""
function find_pure_subtrees(root::N, labels::CategoricalArray) where {N<:VPTrees.Node}
    subtrees = [N[] for _ in levels(labels)]
    root_label = _find_pure_subtrees(root, labels.refs, subtrees)
    # The recursive helper only records pure children of impure nodes and reports
    # a pure node to its parent via the return value. A tree that is pure all the
    # way up therefore has to be recorded here, or it would be silently dropped.
    isnothing(root_label) || push!(subtrees[root_label], root)
    return subtrees
end

# Recursive worker for find_pure_subtrees.
#
# Returns the label of `node` when the whole subtree rooted at `node` carries
# that single label, and `nothing` otherwise (also for a `nothing` node).
# When `node` itself is impure, each child that is pure is appended to
# `subtrees[label_of_child]`; the left child is handled before the right one.
function _find_pure_subtrees(node, labels, subtrees)
    node === nothing && return nothing

    own = labels[node.index]
    left, right = node.left_child, node.right_child

    # Visit both children first so deeper pure subtrees are collected even when
    # this node turns out to be impure.
    left_label = _find_pure_subtrees(left, labels, subtrees)
    right_label = _find_pure_subtrees(right, labels, subtrees)

    # A missing child never breaks purity; an existing one must match our label.
    left_pure = left === nothing || left_label == own
    right_pure = right === nothing || right_label == own
    if left_pure && right_pure
        return own
    end

    # This node is impure: any child that reported a label is a maximal pure subtree.
    left_label === nothing || push!(subtrees[left_label], left)
    right_label === nothing || push!(subtrees[right_label], right)
    return nothing
end

In [None]:
# Pure subtrees of the VP-tree by genus: entry i holds the subtree roots
# recorded for genus level i.
pure_subtrees = find_pure_subtrees(tree.root, sig_genera);

In [None]:
# For each genus, the largest n_data among its recorded pure subtree roots,
# or 0 when no pure subtree was recorded for that genus.
largest_pure_subtrees = [
    isempty(sts) ? 0 : maximum(n -> n.n_data, sts) for sts in pure_subtrees
]

In [None]:
# Scatter plot of genus size (x) against the size of its largest pure subtree
# (y), on log-log axes, with a red y = x reference line.
# Zero sizes are raised to 1 via max.(…, 1), presumably so they can be drawn on
# the logarithmic axis.
# NOTE(review): `plt` is not imported in the visible cells — presumably PyPlot
# (or PyCall's matplotlib.pyplot); confirm and load it before running.
plt.scatter(genus_counts, max.(largest_pure_subtrees, 1))
plt.plot([1, maximum(genus_counts)], [1, maximum(genus_counts)], color="red")
plt.xscale("log")
plt.yscale("log")
;

In [None]:
# (largest pure subtree size, genus count, genus name) per genus, ordered by
# ascending largest pure subtree size. The collected vector is fresh, so it is
# sorted in place.
sort!(collect(zip(largest_pure_subtrees, genus_counts, genera)); by=first)

In [None]:
# Sanity check: every recorded subtree root must itself carry the genus level
# under which it was filed.
for (i, sts) in enumerate(pure_subtrees), n in sts
    @assert sig_genera.refs[n.index] == i
end

In [None]:
# Genus level indices ordered by ascending largest-pure-subtree size.
sp = sortperm(largest_pure_subtrees);

In [None]:
# Pure subtree roots of the genus ranked second from the top in `sp`.
nodes = pure_subtrees[sp[end-1]];

In [None]:
# Second-to-last subtree root recorded for that genus.
node = nodes[end-1];

In [None]:
# Show the node's n_data field (presumably the number of data points in the
# subtree rooted here — confirm against VPTrees).
node.n_data

In [None]:
# Total count for the genus this node is labelled with, for comparison with
# node.n_data above.
genus_counts[sig_genera.refs[node.index]]