# 210718 Validate signatures

In [1]:
using Mmap
using Random

In [2]:
using ProgressMeter
using GZip
using JSON

In [3]:
using Midas
using Midas.Distances
using Midas.SignatureFiles
using Midas.Pairwise: npairs
using TriMatrices
using TriMatrices.Indexing: lin2car

In [4]:
# Distance function used in the check at the end of this notebook to recompute
# pairwise values from the signatures and compare them with the stored distances.
# NOTE(review): presumably the Jaccard distance for sorted signatures (inferred
# from the name only) — confirm it is the same metric that produced the stored file.
const metric = jaccard_dist_sorted

jaccard_dist_sorted (generic function with 1 method)

## Setup

In [5]:
# Locations of the input files, keyed by role:
#   :distances  - raw file of precomputed pairwise distances (memory-mapped below)
#   :signatures - gzip-compressed signature file
# NOTE(review): the :signatures entry is an absolute, machine-specific path while
# :distances is relative — adjust before running on another machine.
infiles = Dict{Symbol,String}(
    :distances => "../../data/intermediate/200727-find-overlaps/genome-pw-distances.raw-float32",
    :signatures => "/home/jared/projects/midas/data/databases/refseq-curated/midas-1.1/refseq_curated-1.1beta-210718.midas-signatures.gz",
);

## Load data

### Signatures

In [6]:
# Open the gzip-compressed signature file and wrap the resulting stream in a
# SignatureFile. The stream is left open here because the next cell reads from it.
sigfile = infiles[:signatures] |> GZip.open |> SignatureFile

SignatureFile{UInt32,GZipStream} with 50752 elements

In [7]:
# Build a SignatureArray from the opened signature file.
# NOTE(review): presumably this reads every signature into memory — confirm.
sigs = SignatureArray(sigfile)
# Number of signatures; used below to size the pairwise distance data.
nsigs = length(sigs)

50752

### Distances

In [8]:
# Memory-map the stored pairwise distances as a flat Float32 vector holding
# npairs(nsigs) values.
#
# Guard against a wrong path first: when given a file name that does not exist,
# Mmap.mmap opens it for writing and grows it to the requested size, which here
# would silently create a multi-gigabyte file of zeros instead of failing.
isfile(infiles[:distances]) ||
    error("pairwise distance file not found: $(infiles[:distances])")
pw_data = Mmap.mmap(infiles[:distances], Vector{Float32}, (npairs(nsigs),));

In [9]:
# Layout used to translate between linear indices into the flat distance vector
# and (row, column) pairs of the pairwise matrix.
# NOTE(review): the `false` parameter presumably means the diagonal is not
# stored — confirm against the TriMatrices documentation.
pw_layout = TriSymmetric{false}()

TriSymmetric{false}()

In [10]:
# Matrix view over the memory-mapped distance data, using the layout above and
# nsigs as its size argument.
# NOTE(review): not referenced by the visible validation cells, which index
# pw_data directly.
pw_dists = TriMatrix(pw_layout, nsigs, pw_data);

## Check distances

In [11]:
# Randomly sample linear indices into the stored distances, keeping each index
# independently with probability 0.01 (about 1 in 100).
# An explicitly seeded RNG makes the sample repeatable, so a failing check can be
# re-run against exactly the same pairs (on the same Julia version).
indices = randsubseq(MersenneTwister(210718), eachindex(pw_data), 0.01);

In [12]:
"""
    check_distances(sigs, layout, stored, indices)

Recompute the distance for every sampled linear index in `indices` and compare
it with the corresponding entry of `stored`.

For each index, `lin2car(layout, idx)` gives the pair of signature positions,
`metric` is evaluated on those two signatures, and the result must be
approximately equal (`≈`) to `stored[idx]`.

Throws an error describing the first mismatching pair. Returns `nothing` when
every sampled entry agrees.
"""
function check_distances(sigs, layout, stored, indices)
    @showprogress for idx in indices
        i, j = lin2car(layout, idx)
        computed = metric(sigs[i], sigs[j])
        expected = stored[idx]
        # Explicit error instead of @assert: assertions may be disabled, and the
        # message here says which pair disagreed and by how much.
        computed ≈ expected || error(
            "distance mismatch at linear index $idx (pair $i, $j): " *
            "computed $computed, stored $expected",
        )
    end
    return nothing
end

# Run inside a function so the loop does not operate on non-constant globals.
check_distances(sigs, pw_layout, pw_data, indices)

Progress: 100%|█████████████████████████████████████████| Time: 0:11:17
