# 200723 K-mer frequency inspection

In [1]:
using GZip
using ProgressMeter
using DataFrames
using StatsBase
using CSV
using CategoricalArrays
using JSON
using Serialization

In [2]:
using Midas
using Midas.Distances
using Midas.SignatureFiles

## Func defs

In [3]:
# Indices of the elements of `a` that compare equal to the categorical value `cls`.
function findclass(a::CategoricalArray, cls::CategoricalValue)
    return findall(x -> x == cls, a)
end

# Same lookup, with the class given as an integer index into `a.pool`.
function findclass(a::CategoricalArray, i::Integer)
    return findclass(a, a.pool[i])
end

# Elements of `a` at the positions where the categorical array `c` equals `cls`
# (`cls` may be anything `findclass` accepts).
function selectclass(a::AbstractVector, c::CategoricalArray, cls)
    return a[findclass(c, cls)]
end

selectclass (generic function with 1 method)

## File paths

In [4]:
# CSV file read into `taxdf` further down.
taxonomy_file = "/Users/student/notebooks/midas/midas-notebooks-2019/build-v1-database/out/3-curated-taxonomy-assignments.csv"
# Gzip-compressed signature file opened further down as `sigfile`.
signature_file_name = "/Users/student/projects/midas/data/2019_20/refseq_curated_1.1beta_200604.midas-signatures.gz";

In [5]:
# Scratch directory, relative to the current working directory; created if it
# does not exist yet.
tmpdir = "tmp/"
mkpath(tmpdir);

## Load taxonomy

In [6]:
# Parse the taxonomy CSV and materialise it as a DataFrame.
taxdf = CSV.File(taxonomy_file) |> DataFrame;

In [7]:
# Genus of every taxonomy row as a categorical vector, with its distinct levels.
sig_genera = categorical(taxdf.genus)
genera = levels(sig_genera)
ngenera = length(genera)

# A species is identified by its (genus, species) pair.
sig_species = categorical(map(row -> (row.genus, row.species), eachrow(taxdf)))
species = levels(sig_species)
nspecies = length(species)

(ngenera, nspecies)

(419, 1438)

In [8]:
# Number of rows per categorical reference code, for genera and for species.
# The code range 1:nlevels is passed explicitly: `counts(refs)` on its own only
# spans minimum(refs):maximum(refs), so the result would be shifted by one position
# if a code of 0 were present. With the explicit range, entry `i` is always the
# count for code `i`.
# NOTE(review): later cells index `species` with positions derived from these
# counts, which presumes level order matches reference-code order — confirm for
# the CategoricalArrays version in use.
genus_counts = counts(sig_genera.refs, 1:length(levels(sig_genera)))
species_counts = counts(sig_species.refs, 1:length(levels(sig_species)));

In [9]:
# Last "/"-separated component of every taxonomy key.
genome_accs = map(taxdf[!, :key]) do key
    last(split(key, "/"))
end;

## Load signatures

In [10]:
# Open the gzip-compressed signature file and wrap the stream in a SignatureFile.
# The stream is left open here; `sigfile` is read from in the cells that follow.
sigfile = GZip.open(signature_file_name) |> SignatureFile

SignatureFile{UInt32,GZipStream} with 50752 elements

In [11]:
# Read the metadata stored in the signature file and pretty-print it as JSON
# with a 2-space indent.
metadata = SignatureFiles.read_metadata(sigfile)
JSON.print(stdout, metadata, 2)

{
  "date_created": "2020-06-04",
  "genome_set": {
    "key": "midas/assembly/curated",
    "name": "refseq_curated_2020",
    "meta": {
      "date_created": "2020-05-26",
      "parent": {
        "key": "midas/assembly/curated",
        "key_version": "0.9"
      }
    },
    "description": "Created 2020-05-26 by filtering version 0.9 by inclusion in refseq/assembly/all 1.1",
    "key_version": "1.1"
  },
  "kmer_spec": {
    "k": 11,
    "prefix": "ATGAC"
  },
  "description": "Signatures for version 1.1 of curated genome set"
}


In [12]:
# The signature IDs and the taxonomy keys must match element for element (both are
# expected to be sorted), because later cells use taxonomy row positions to select
# signatures. An explicit check is used instead of `@assert`, which may be disabled
# and gives no explanation when it fails. `taxdf[!, :key]` avoids copying the column
# just to compare it.
if sigfile.ids != taxdf[!, :key]
    error("signature file IDs do not match the `key` column of the taxonomy table")
end

In [None]:
# Build the signature array from the signature file, timing the construction.
sigs = @time SignatureArray(sigfile);

In [None]:
# Number of elements in `sigs`.
nsigs = length(sigs)

## Funcs

In [None]:
# Entropy, in bits, of a Bernoulli distribution with success probability `p`.
# Any `p` outside the open interval (0, 1) yields `zero(p)`.
function entropy(p)
    if !(0 < p < 1)
        return zero(p)
    end
    q = 1 - p
    weighted_logs = p * log2(p) + q * log2(q)
    return -weighted_logs
end

In [None]:
# Add the weight `w` to `a[i + 1]` for every value `i` in `sig` (the values are
# treated as zero-based positions into `a`). Returns `nothing`.
#
# - `a`:   accumulator vector, modified in place.
# - `sig`: iterable of integer values.
# - `w`:   weight added per occurrence; converted to the element type of `a`, so a
#          plain literal such as `1` works for a `Float32` accumulator. Defaults to
#          `one(T)`.
#
# Indexing is bounds-checked: a value in `sig` that does not fit in `a` raises a
# `BoundsError` instead of silently writing outside the array.
function accumulate_sig!(a::AbstractVector{T}, sig, w=one(T)) where T
    weight = convert(T, w)
    for i in sig
        a[i + 1] += weight
    end
    return nothing
end

## Species k-mer frequencies

In [None]:
using DataStructures: accumulate

In [None]:
# Number of distinct k-mers: alphabet size raised to the k-mer length.
nkmers = let alphabet_size = 4, k = 11
    alphabet_size^k
end

In [None]:
# Index of the entry ranked `nspecies - 3` when `species_counts` is sorted in
# ascending order, shown together with its count and the matching `species` entry.
si = partialsortperm(species_counts, nspecies - 3)
(species_counts[si], species[si])

In [None]:
# Fraction of the signatures in `sigs` that hit each position of `freqs`, returned
# as a Dict containing only the positions with a fraction greater than zero.
# Keys are 1-based positions in `freqs` (signature value + 1, as written by
# `accumulate_sig!`).
#
# `freqs` is a work buffer: it is zeroed and then overwritten on every call.
function kmer_freqs_dict(sigs, freqs=zeros(Float32, nkmers))
    fill!(freqs, 0)
    foreach(sig -> accumulate_sig!(freqs, sig), sigs)
    freqs .= freqs ./ length(sigs)

    return Dict(idx => frac for (idx, frac) in enumerate(freqs) if frac > 0)
end

In [None]:
# K-mer frequency dictionary for every species index, computed via `progress_pmap`.
# `kmer_freqs_dict` is called without an explicit buffer so that each call allocates
# its own: the previous code passed a `freqs` variable that is not defined anywhere
# in the visible notebook, and a single shared mutable buffer is unsafe to hand to
# tasks that `pmap` may run concurrently.
species_freqs = progress_pmap(1:nspecies) do si
    sidxs = findclass(sig_species, si)
    kmer_freqs_dict(sigs[sidxs])
end;

In [None]:
# Number of entries in each species' frequency dictionary.
species_nkmers = map(length, species_freqs);

In [None]:
# Zero-initialised nspecies × nspecies tables for the pairwise union and
# intersection sizes.
species_pw_nunion = zeros(Int32, nspecies, nspecies);
species_pw_nintersect = zeros(Int32, nspecies, nspecies);

In [None]:
# Fill both pairwise tables symmetrically from the key sets of `species_freqs`.
# Only off-diagonal entries are written; the diagonal keeps its initial value.
@showprogress for a in 1:nspecies
    keys_a = keys(species_freqs[a])
    for b in (a + 1):nspecies
        keys_b = keys(species_freqs[b])
        # Size of the intersection, counted without building the intersection set
        nshared = count(in(keys_a), keys_b)
        ncombined = length(keys_a) + length(keys_b) - nshared
        species_pw_nintersect[b, a] = nshared
        species_pw_nintersect[a, b] = nshared
        species_pw_nunion[b, a] = ncombined
        species_pw_nunion[a, b] = ncombined
    end
end

In [None]:
# Pick two *distinct* random species indices. Sampling without replacement rules
# out i == j, which would address the diagonal of the pairwise tables — the
# pairwise loop never writes those entries.
i, j = sample(1:nspecies, 2; replace=false)

In [None]:
# Pairwise distance 1 - |A ∩ B| / min(|A|, |B|), clamped below at 0.
# NOTE(review): `sp_min_kmers` had no visible definition in this notebook; it is
# assumed to be the element-wise minimum of the two species' k-mer counts — confirm.
sp_min_kmers = min.(species_nkmers, species_nkmers')
# The denominator is floored at 1 so a species with no k-mers gives distance 1
# rather than NaN from 0 / 0. The previous `max(sp_min_kmers, sp_min_kmers)` was a
# no-op.
# NOTE(review): diagonal entries come out as 1 when the diagonal of
# `species_pw_nintersect` is still zero — confirm that is intended.
ps_pw_mindist = @. max(1 - Float32(species_pw_nintersect) / max(sp_min_kmers, 1), 0)