# 200727 Calculate pairwise distances

In [21]:
using ProgressMeter
using GZip
using JSON
using Mmap
using Random

In [2]:
using Midas
using Midas.Distances
using Midas.SignatureFiles
using Midas.Pairwise: npairs, iterpairs
using TriMatrices

In [3]:
# Distance function applied to every pair of signatures in the calculation loop.
const metric = jaccard_dist_sorted

jaccard_dist_sorted (generic function with 1 method)

In [4]:
# Number of threads available to this session; the Threads.@threads loop in the
# calculation cell spreads its iterations across them.
Threads.nthreads()

4

## File paths

In [5]:
# Input file paths, keyed by role.
infiles = Dict{Symbol,String}(
    :signatures =>
        "/home/jared/data/2019_20/refseq_curated_1.1beta_200604.midas-signatures.gz",
);

In [6]:
# Output file paths, keyed by role.
outfiles = Dict{Symbol,String}(
    :distances =>
        "../../data/intermediate/200727-find-overlaps/genome-pw-distances.raw-float32",
)

Dict{Symbol,String} with 1 entry:
  :distances => "../../data/intermediate/200727-find-overlaps/genome-pw-distanc…

## Load signatures

In [7]:
# Open the gzip-compressed signature file and wrap the stream in a SignatureFile.
# NOTE(review): the GZip stream is not closed in the cells shown — confirm whether it
# should be closed once the signatures have been loaded.
sigfile = SignatureFile(GZip.open(infiles[:signatures]))

SignatureFile{UInt32,GZipStream} with 50752 elements

In [8]:
# Read the metadata stored in the signature file and pretty-print it as JSON
# (the second argument to JSON.print is the indent width).
metadata = SignatureFiles.read_metadata(sigfile)
JSON.print(metadata, 2)

{
  "date_created": "2020-06-04",
  "genome_set": {
    "key": "midas/assembly/curated",
    "name": "refseq_curated_2020",
    "meta": {
      "date_created": "2020-05-26",
      "parent": {
        "key": "midas/assembly/curated",
        "key_version": "0.9"
      }
    },
    "description": "Created 2020-05-26 by filtering version 0.9 by inclusion in refseq/assembly/all 1.1",
    "key_version": "1.1"
  },
  "kmer_spec": {
    "k": 11,
    "prefix": "ATGAC"
  },
  "description": "Signatures for version 1.1 of curated genome set"
}


In [9]:
# Build a SignatureArray from the signature file. @time reports how long the load
# takes; the trailing semicolon suppresses display of the array itself.
@time sigs = SignatureArray(sigfile);

 64.270878 seconds (53.57 k allocations: 1.355 GiB, 0.01% gc time)


In [10]:
# Number of signatures; used as the dimension of the pairwise distance matrix and to
# size the backing file.
nsigs = length(sigs)

50752

## Open mmapped file

In [17]:
# Memory-map the output file as a flat Float32 vector with npairs(nsigs) entries.
# The file is opened read/write without truncation (and created if missing), so values
# written by an earlier run are kept. The do-block closes the file handle as soon as the
# mapping exists instead of leaking it; the mapped array remains valid and writable
# after the stream is closed.
# NOTE(review): a newly created file presumably reads back as zeros, which the
# calculation loop treats as "not yet computed" — confirm.
pw_data = open(outfiles[:distances], read=true, write=true, create=true) do io
    Mmap.mmap(io, Vector{Float32}, (npairs(nsigs),))
end;

In [18]:
# Wrap the flat memory-mapped vector in a TriMatrix of size nsigs so distances can be
# read and written by pair index [i, j]; writes go straight through to pw_data.
# NOTE(review): TriSymmetric{false} presumably means a symmetric layout that omits the
# diagonal, which would match the npairs(nsigs) backing length — confirm against
# TriMatrices.
pw_dists = TriMatrix(TriSymmetric{false}(), nsigs, pw_data);

In [34]:
# Fraction of entries in the distance file that are negative.
count(<(0), pw_data) / length(pw_data)

0.0

## Calculate distances

In [24]:
# Split the set of (i, j) pairs with j < i into work units of at most `chunksize`
# pairs. Each unit is (row index i, contiguous range of column indices j).
chunksize = 10000
chunks = Tuple{Int, UnitRange{Int}}[]

for row in 2:nsigs
    ncols = row - 1                 # columns paired with this row: 1:(row - 1)
    nparts = cld(row, chunksize)    # how many pieces this row is cut into
    width = cld(ncols, nparts)      # columns per piece (the last piece may be shorter)
    append!(chunks, [(row, lo:min(lo + width - 1, ncols)) for lo in 1:width:ncols])
end

# Randomize the order in which the work units are processed.
shuffle!(chunks)

length(chunks)

154511

In [31]:
# Compute the pairwise distances chunk by chunk, in parallel across threads.
#
# The notebook globals are rebound as locals in the `let` header: non-const globals are
# untyped, so reading them directly inside the threaded loop forces dynamic dispatch on
# every matrix access. Rebinding lets the loop body work with concretely typed values.
let sigs = sigs, pw_dists = pw_dists, pw_data = pw_data, chunks = chunks
    pbar = Progress(length(chunks))

    Threads.@threads for (i, jrange) in chunks
        si = sigs[i]
        for j in jrange
            # Only entries <= 0 are (re)computed — presumably so a previous, partially
            # completed run can be resumed. A true distance of exactly 0 is simply
            # recomputed each time.
            pw_dists[i, j] <= 0 && (pw_dists[i, j] = metric(si, sigs[j]))
        end
        # Flush the mapping to disk after each chunk, but only from thread 1.
        Threads.threadid() == 1 && Mmap.sync!(pw_data)
        next!(pbar)
    end
end

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:01:42[39m


In [26]:
# Flush the memory-mapped distance array to the file on disk.
Mmap.sync!(pw_data)

In [32]:
# Sanity check: true if any stored distance is negative.
any(<(0), pw_data)

false