In [1]:
using GZip
using ProgressMeter
using StatsBase
using DataFrames
using CSV
using CategoricalArrays
using JSON
using Serialization

In [2]:
using Midas
using Midas.Pairwise
using Midas.Distances
using Midas.SignatureFiles

In [3]:
# Distance function used for every pairwise comparison in this notebook; it is passed to
# `find_diameter` in the genus and species loops below.
# NOTE(review): presumably the Jaccard distance on sorted signatures from `Midas.Distances` — confirm.
const metric = jaccard_dist_sorted

jaccard_dist_sorted (generic function with 1 method)

## Function definitions

In [4]:
"""
    findclass(a::CategoricalArray, cls::CategoricalValue)
    findclass(a::CategoricalArray, i::Integer)

Return the indices of all elements of `a` that compare equal (`==`) to `cls`.

The integer form first looks up entry `i` of the array's pool (`a.pool[i]`) and then
searches for that value.
"""
function findclass(a::CategoricalArray, cls::CategoricalValue)
    isclass = ==(cls)
    return findall(isclass, a)
end

# Integer form: resolve the pool entry, then defer to the value-based method.
function findclass(a::CategoricalArray, i::Integer)
    cls = a.pool[i]
    return findclass(a, cls)
end

findclass (generic function with 2 methods)

In [5]:
"""
    find_diameter(data, metric; progress=length(data) > 100, msg="Finding diameter")

Find the diameter of a set under the given metric, i.e. the largest value of
`metric(data[i], data[j])` over the index pairs produced by `iterpairs(length(data))`.

# Keywords
- `progress::Bool`: whether to show a progress bar (0.5 s update interval). Defaults to
  `true` when `data` has more than 100 elements.
- `msg::String`: label passed to the progress bar.

# Returns
A tuple `(d, i, j)`: the largest distance found and the indices of the first pair (in
iteration order) that attains it.

- A single-element `data` returns `(metric(data[1], data[1]), 1, 1)`.
- If no visited pair has a distance greater than `-Inf` (e.g. `data` is empty),
  the sentinel `(-Inf, 0, 0)` is returned.
"""
function find_diameter(data, metric; progress::Bool=length(data) > 100, msg::String="Finding diameter")
    n = length(data)
    # A singleton has no pair of distinct elements. If `iterpairs` only yields distinct
    # pairs, the search below would leave the `(-Inf, 0, 0)` sentinel in place and callers
    # that index with the returned positions would fail on index 0. The diameter of a
    # singleton is its self-distance, attained at (1, 1); if `iterpairs` does yield
    # `(1, 1)`, this is the same pair the search would have visited.
    n == 1 && return (metric(data[1], data[1]), 1, 1)
    best = (-Inf, 0, 0)
    # An infinite update interval is used to keep the progress bar from being drawn.
    dt = progress ? .5 : Inf
    @showprogress dt msg for (i, j) in iterpairs(n)
        d = metric(data[i], data[j])
        # Strict `>` keeps the first pair among ties.
        d > best[1] && (best = (d, i, j))
    end
    return best
end

find_diameter

## File paths

In [6]:
# Input files, given as absolute paths on the original author's machine; adjust before re-running.
#   taxonomy_file       - CSV of curated taxonomy assignments (loaded into `taxdf` below)
#   signature_file_name - gzip-compressed signature file (opened with `GZip.open` below)
taxonomy_file = "/Users/student/notebooks/midas/midas-notebooks-2019/build-v1-database/out/3-curated-taxonomy-assignments.csv"
signature_file_name = "/Users/student/projects/midas/data/2019_20/refseq_curated_1.1beta_200604.midas-signatures.gz"
;

In [7]:
# Scratch directory (relative to the working directory) for resumable intermediate results.
tmpdir = "tmp/"
# `mkpath` does nothing when the directory already exists, so no separate `isdir` check is needed.
mkpath(tmpdir);

In [8]:
# Directory that receives the final CSV tables.
outdir = "../../data/processed/200722-detect-overlaps/"
# The path is nested, and `mkdir` fails when a parent directory is missing. `mkpath` creates
# every missing component and does nothing when the directory already exists.
mkpath(outdir);

In [9]:
# Destination CSV for each result table, keyed by table name; all files live in `outdir`.
out_files = Dict(
    table => joinpath(outdir, filename) for (table, filename) in (
        (:species_diameters, "200722-species-diameters.csv"),
        (:genus_diameters, "200722-genus-diameters.csv"),
    )
);

## Load taxonomy

In [10]:
# Taxonomy table read from `taxonomy_file`; the cells below use its `key`, `genus` and
# `species` columns.
taxdf = CSV.File(taxonomy_file) |> DataFrame;

In [11]:
# Genus of every row of `taxdf` as a categorical vector; `genera` lists its distinct levels.
sig_genera = categorical(taxdf.genus)
genera = levels(sig_genera)
ngenera = length(genera)

# A species is identified by its (genus, species) pair rather than by the species column alone.
sig_species = categorical([(row.genus, row.species) for row in eachrow(taxdf)])
species = levels(sig_species)
nspecies = length(species)

# Display both class counts as the cell's value.
(ngenera, nspecies)

(419, 1438)

In [12]:
# Number of genomes per genus / species, indexed by the class's reference code.
# The code range is given explicitly: `counts(x)` alone spans `minimum(x):maximum(x)`, so a
# code outside `1:n` (e.g. 0, which CategoricalArrays uses for `missing`) would shift every
# entry and misalign the counts with `genera` / `species`. Codes outside the range are ignored.
genus_counts = counts(sig_genera.refs, 1:ngenera)
species_counts = counts(sig_species.refs, 1:nspecies)
;

In [13]:
# Accession of each genome: the last "/"-separated component of its `key`, in row order.
genome_accs = [last(split(key, "/")) for key in taxdf.key];

## Load signatures

In [14]:
# Open the gzip-compressed signature file. The stream is deliberately left open here because
# later cells read from `sigfile` again (metadata via `sigfile.stream`, then `SignatureArray(sigfile)`).
# NOTE(review): no cell visible here ever closes the GZip stream — confirm whether that matters.
sigfile = SignatureFile(GZip.open(signature_file_name))

SignatureFile{UInt32,GZipStream} with 50752 elements

In [15]:
# Read the signature file's metadata and pretty-print it as JSON with a 2-space indent.
# NOTE(review): `_read_metadata` is an underscore-prefixed (presumably internal) function of
# `SignatureFiles`, called with the splatted `header.o_metadata` (presumably the location of
# the metadata section) — confirm whether a public accessor exists.
metadata = SignatureFiles._read_metadata(sigfile.stream, sigfile.header.o_metadata...)
JSON.print(metadata, 2)

{
  "date_created": "2020-06-04",
  "genome_set": {
    "key": "midas/assembly/curated",
    "name": "refseq_curated_2020",
    "meta": {
      "date_created": "2020-05-26",
      "parent": {
        "key": "midas/assembly/curated",
        "key_version": "0.9"
      }
    },
    "description": "Created 2020-05-26 by filtering version 0.9 by inclusion in refseq/assembly/all 1.1",
    "key_version": "1.1"
  },
  "kmer_spec": {
    "k": 11,
    "prefix": "ATGAC"
  },
  "description": "Signatures for version 1.1 of curated genome set"
}


In [16]:
# The signature file and the taxonomy table must list the same keys in the same order
# (both are expected to be sorted): later cells index `sigs` with row positions taken from
# `taxdf`. An explicit check is used instead of `@assert`, which may be disabled at some
# optimization levels and would then let misaligned inputs through silently.
if sigfile.ids != taxdf[:, :key]
    error("Signature file IDs do not match the `key` column of the taxonomy table (same order required)")
end

In [17]:
# Build a `SignatureArray` from the open signature file, timing the call with `@time`.
# `sigs` is indexed below with row positions from `taxdf` (see the ID check above).
@time sigs = SignatureArray(sigfile);

 58.624861 seconds (53.71 k allocations: 1.355 GiB, 0.03% gc time)


## Genus diameters

In [18]:
# Cache file for (possibly partial) genus diameters.
# Despite the `.jld` extension, this file is written and read with the `Serialization`
# standard library (`serialize` / `deserialize`) in the cells below.
genus_tmpfile = joinpath(tmpdir, "genus-diameters.jld")

"tmp/genus-diameters.jld"

In [19]:
# Resume from the serialized (possibly partial) results of an earlier run when the cache
# file exists; otherwise start with an empty table of genus index => (diameter, i, j).
genus_diameters = if isfile(genus_tmpfile)
    deserialize(genus_tmpfile)
else
    Dict{Int, Tuple{Float64, Int, Int}}()
end
# Display how many genera are already done out of the total.
(length(genus_diameters), ngenera)

(419, 419)

The following loop can be interrupted mid-run; run the cell after it to save the partial progress:

In [20]:
# Compute the diameter of every genus that is not already in the cache.
for gi in 1:ngenera
    if !haskey(genus_diameters, gi)
        n = genus_counts[gi]
        # Signatures of the genomes assigned to this genus.
        members = sigs[findclass(sig_genera, gi)]
        # Only large genera get a progress bar, labelled with the genus name and genome count.
        genus_diameters[gi] = find_diameter(
            members, metric; progress=(n > 200), msg="$(genera[gi]) ($n) "
        )
    end
end

In [21]:
# Save the (possibly partial) genus results with `Serialization`, so that a later run can
# load them from `genus_tmpfile` and skip the genera that are already done.
serialize(genus_tmpfile, genus_diameters)

## Species diameters

In [22]:
# Cache file for (possibly partial) species diameters.
# Despite the `.jld` extension, this file is written and read with the `Serialization`
# standard library (`serialize` / `deserialize`) in the cells below.
species_tmpfile = joinpath(tmpdir, "species-diameters.jld")

"tmp/species-diameters.jld"

In [23]:
# Resume from the serialized (possibly partial) results of an earlier run when the cache
# file exists; otherwise start with an empty table of species index => (diameter, i, j).
species_diameters = if isfile(species_tmpfile)
    deserialize(species_tmpfile)
else
    Dict{Int, Tuple{Float64, Int, Int}}()
end
# Display how many species are already done out of the total.
(length(species_diameters), nspecies)

(1438, 1438)

The following loop can be interrupted mid-run; run the cell after it to save the partial progress:

In [24]:
# Compute the diameter of every species that is not already in the cache.
for si in 1:nspecies
    if !haskey(species_diameters, si)
        n = species_counts[si]
        # Signatures of the genomes assigned to this species.
        members = sigs[findclass(sig_species, si)]
        # Only large species get a progress bar, labelled with the species and genome count.
        species_diameters[si] = find_diameter(
            members, metric; progress=(n > 200), msg="$(species[si]) ($n) "
        )
    end
end

In [25]:
# Save the (possibly partial) species results with `Serialization`, so that a later run can
# load them from `species_tmpfile` and skip the species that are already done.
serialize(species_tmpfile, species_diameters)

## Save dataframes

In [26]:
# One row per genus: its diameter and the two genomes (row index and accession) attaining it.
genus_df = DataFrame(map(1:ngenera) do gi
    diam, a, b = genus_diameters[gi]

    # `find_diameter` reported positions within the genus subset; map them back to
    # positions in the full genome list.
    members = findclass(sig_genera, gi)
    row1 = members[a]
    row2 = members[b]

    return (
        genus_index=gi,
        genus=genera[gi],
        diameter=diam,
        index1=row1,
        acc1=genome_accs[row1],
        index2=row2,
        acc2=genome_accs[row2],
    )
end);

In [27]:
# One row per species: its diameter and the two genomes (row index and accession) attaining it.
species_df = DataFrame(map(1:nspecies) do si
    diam, a, b = species_diameters[si]

    # `find_diameter` reported positions within the species subset; map them back to
    # positions in the full genome list.
    members = findclass(sig_species, si)
    row1 = members[a]
    row2 = members[b]

    # Each species level is a (genus, species) pair.
    genus_name, species_name = species[si]

    return (
        species_index=si,
        genus=genus_name,
        species=species_name,
        diameter=diam,
        index1=row1,
        acc1=genome_accs[row1],
        index2=row2,
        acc2=genome_accs[row2],
    )
end);

In [28]:
# Write both result tables to the paths registered in `out_files`.
# The cell's displayed value is the path returned by the last `CSV.write` call.
CSV.write(out_files[:genus_diameters], genus_df)
CSV.write(out_files[:species_diameters], species_df)

"../../data/processed/200722-detect-overlaps/200722-species-diameters.csv"