# 200727 Find overlaps

In [1]:
using GZip
using ProgressMeter
using DataFrames
using CSV
using StatsBase
using CategoricalArrays
using JLD
using Serialization
using JSON

┌ Info: Precompiling JLD [4138dd39-2aa7-5051-a626-17a0bb65d9c8]
└ @ Base loading.jl:1260


In [2]:
using Midas
using Midas.Distances
using Midas.SignatureFiles
using TriMatrices

In [3]:
metric = jaccard_dist_sorted

jaccard_dist_sorted (generic function with 1 method)

## Func defs

In [4]:
findclass(a::CategoricalArray, cls::CategoricalValue) = findall(==(cls), a)
findclass(a::CategoricalArray, i::Integer) = findclass(a, a.pool[i])
selectclass(a::AbstractVector, c::CategoricalArray, cls) = a[findclass(c, cls)]

selectclass (generic function with 1 method)

## File paths

In [5]:
outdir = "../../data/processed/200722-detect-overlaps/"

"../../data/processed/200722-detect-overlaps/"

In [6]:
infiles = Dict(
    :taxonomy => "/Users/student/notebooks/midas/midas-notebooks-2019/build-v1-database/out/3-curated-taxonomy-assignments.csv",
    :signatures => "/Users/student/projects/midas/data/2019_20/refseq_curated_1.1beta_200604.midas-signatures.gz",
    :species_min_dists => joinpath(outdir, "200726-species-min-dists.jld"),
    :genus_min_dists => joinpath(outdir, "200726-genus-min-dists.jld"),
);

In [7]:
tmpdir = "tmp/"
!isdir(tmpdir) && mkdir(tmpdir);

## Load taxonomy

In [8]:
taxdf = DataFrame(CSV.File(infiles[:taxonomy]));

In [9]:
sig_genera = categorical(taxdf[!, :genus])
genera = levels(sig_genera)
ngenera = length(genera)

sig_species = categorical([(row[:genus], row[:species]) for row in eachrow(taxdf)])
species = levels(sig_species)
nspecies = length(species)

ngenera, nspecies

(419, 1438)

In [10]:
genus_counts = counts(sig_genera.refs)
species_counts = counts(sig_species.refs)
;

In [11]:
genome_accs = [last(split(k, "/")) for k in taxdf[!, :key]];

In [12]:
species_to_genus = [findfirst(==(genus), genera) for (genus, spname) in species];

## Load signatures

In [13]:
sigfile = SignatureFile(GZip.open(infiles[:signatures]))

SignatureFile{UInt32,GZipStream} with 50752 elements

In [14]:
metadata = SignatureFiles.read_metadata(sigfile)
JSON.print(metadata, 2)

{
  "date_created": "2020-06-04",
  "genome_set": {
    "key": "midas/assembly/curated",
    "name": "refseq_curated_2020",
    "meta": {
      "date_created": "2020-05-26",
      "parent": {
        "key": "midas/assembly/curated",
        "key_version": "0.9"
      }
    },
    "description": "Created 2020-05-26 by filtering version 0.9 by inclusion in refseq/assembly/all 1.1",
    "key_version": "1.1"
  },
  "kmer_spec": {
    "k": 11,
    "prefix": "ATGAC"
  },
  "description": "Signatures for version 1.1 of curated genome set"
}


In [15]:
# Should both be sorted:
@assert sigfile.ids == taxdf[:, :key]

In [16]:
@time sigs = SignatureArray(sigfile);

 98.139548 seconds (53.71 k allocations: 1.355 GiB, 0.03% gc time)


In [17]:
nsigs = length(sigs)

50752

## Load distance lower bounds and taxon diameters

In [19]:
species_diameters = deserialize("tmp/species-diameters.jld")
genus_diameters = deserialize("tmp/genus-diameters.jld");

In [20]:
species_pw_mindists = let
    data = load(infiles[:species_min_dists], "data")
    TriMatrix(TriSymmetric(), nspecies, data)
end;

In [21]:
genus_pw_mindists = let
    data = load(infiles[:genus_min_dists], "data")
    TriMatrix(TriSymmetric(), ngenera, data)
end;

## Find species overlaps

In [22]:
species_possible_overlaps = [
    (i, j) for i in 1:nspecies for j in 1:(i-1)
    if species_pw_mindists[i, j] <= max(species_diameters[i][1], species_diameters[j][1])
];
length(species_possible_overlaps), nspecies*(nspecies-1)

(13344, 2066406)

In [23]:
species_tmpfile = joinpath(tmpdir, "species-has-overlaps.jls")

# Load partial result if they exist
species_has_overlaps = if isfile(species_tmpfile)
    deserialize(species_tmpfile)
else
    Dict{Tuple{Int, Int}, Bool}()
end

Dict{Tuple{Int64,Int64},Bool} with 0 entries

The following can be aborted mid-loop and partial progress can be saved in the following cell, to be resumed later.

In [29]:
@showprogress for (si, sj) in filter(k -> !haskey(species_has_overlaps, k), species_possible_overlaps)
    thresh = min(species_diameters[si][1], species_diameters[sj][1])
    has_overlap = false
    
    for i in findclass(sig_species, si), j in findclass(sig_species, sj)
        metric(sigs[i], sigs[j]) <= thresh && (has_overlap = true; break)
    end
    
    species_has_overlaps[(si, sj)] = has_overlap
end

[32mProgress:   4%|█▌                                       |  ETA: 2 days, 2:32:00[39mm

InterruptException: InterruptException:

In [30]:
# Save (partial) progress
serialize(species_tmpfile, species_has_overlaps)

In [28]:
[
    (species[si], species[sj])
    for ((si, sj), o) in pairs(species_has_overlaps)
    if o
]

18-element Array{Tuple{Tuple{String,String},Tuple{String,String}},1}:
 (("Bacillus", "velezensis"), ("Bacillus", "amyloliquefaciens"))
 (("Lactobacillus", "paracasei"), ("Lactobacillus", "casei"))
 (("Bacillus", "mycoides"), ("Bacillus", "cereus"))
 (("Brucella", "suis"), ("Brucella", "ceti"))
 (("Enterobacter", "hormaechei"), ("Enterobacter", "cloacae"))
 (("Pseudomonas", "savastanoi"), ("Pseudomonas", "amygdali"))
 (("Neisseria", "meningitidis"), ("Neisseria", "lactamica"))
 (("Clostridium", "sporogenes"), ("Clostridium", "botulinum"))
 (("Bacillus", "thuringiensis"), ("Bacillus", "cereus"))
 (("Pseudomonas", "syringae"), ("Pseudomonas", "savastanoi"))
 (("Corynebacterium", "jeikeium"), ("Corynebacterium", "aurimucosum"))
 (("Mycobacterium", "tuberculosis"), ("Mycobacterium", "bovis"))
 (("Brucella", "pinnipedialis"), ("Brucella", "ceti"))
 (("Pseudomonas", "syringae"), ("Pseudomonas", "amygdali"))
 (("Prochlorococcus", "marinus"), ("Gardnerella", "vaginalis"))
 (("Burkholderia", "mu

## Find genus overlaps

In [None]:
genus_possible_overlaps = [
    (i, j) for i in 1:ngenera for j in 1:(i-1)
    if genus_pw_mindists[i, j] <= max(genus_diameters[i][1], genus_diameters[j][1])
];
length(genus_possible_overlaps), ngenera*(ngenera-1)

In [None]:
genus_tmpfile = joinpath(tmpdir, "genus-overlaps.jls")

# Load partial result if they exist
genus_overlaps = if isfile(genus_tmpfile)
    deserialize(genus_tmpfile)
else
    Dict{Tuple{Int, Int}, Vector{Tuple{Int, Int}}}()
end

The following can be aborted mid-loop and partial progress can be saved in the following cell, to be resumed later.

In [None]:
@showprogress for (gi, gj) in genus_posgible_overlaps
    haskey(genus_overlaps, (gi, gj)) && continue
    pair_overlaps = Tuple{Int, Int}[]
    thresh = min(genus_diameters[gi][1], genus_diameters[gj][1])
    
    for i in findclass(gig_genera, gi), j in findclass(gig_genera, gj)
        metric(gigs[i], gigs[j]) <= thresh && push!(pair_overlaps, (i, j))
    end
    
    genus_overlaps[(gi, gj)] = pair_overlaps
end

In [None]:
# Save (partial) progress
serialize(genus_tmpfile, genus_overlaps)