# 200812 Raw-read k-mer counts vs. assembly k-mers

In [None]:
using Revise
using StatsBase
using DataFrames
using HDF5
using ProgressMeter

In [None]:
using Midas: ArraySet, vec_in

In [None]:
using PyPlot, PyCall
sns = pyimport("seaborn")

## Configuration and directories

In [None]:
# Directory that is scanned for the input `.h5` files.
tmpdir = "tmp/"

In [None]:
# Collect the `.h5` files directly inside `dir`, keyed by the part of the file name that
# precedes the `_L001_R1_001.h5` suffix.
#
# Files ending in `.h5` whose name does not carry that suffix are skipped with a warning
# (previously they caused a field access on `nothing`). The pattern escapes the dot and is
# anchored at the end so that only the exact suffix is accepted.
function find_datafiles(dir)
    found = Dict{String,String}()
    for path in readdir(dir, join=true)
        endswith(path, ".h5") || continue
        m = match(r"^(.*)_L001_R1_001\.h5$", basename(path))
        if m === nothing
            @warn "Skipping .h5 file whose name lacks the expected suffix" path
            continue
        end
        found[m.captures[1]] = path
    end
    return found
end

datafiles = find_datafiles(tmpdir)
length(datafiles)

In [None]:
# Output directory; created below if it does not exist yet.
outdir = "../../data/processed/200811-fastq-kmer-counts/"
# `mkpath` also creates missing parent directories and does nothing if the directory exists.
mkpath(outdir);

## Defs

In [None]:
"""
    myhist(ax, x, color; alpha=.25, lw=2, kw...)

Draw `x` on `ax` as two overlaid histograms in the same `color`: a filled one with
transparency `alpha`, then a `:step` outline of line width `lw`. Remaining keyword
arguments are forwarded to both `ax.hist` calls.

Returns the tuple `(filled, outline)` of the two `ax.hist` results.
"""
function myhist(ax, x, color; alpha=.25, lw=2, kw...)
    fill_kw = (; fc=color, alpha=alpha, kw...)
    edge_kw = (; histtype=:step, ec=color, lw=lw, kw...)
    filled = ax.hist(x; fill_kw...)
    outline = ax.hist(x; edge_kw...)
    return (filled, outline)
end

In [None]:
"""
    counts_hist(ax::PyObject, data::AbstractVector, range=1:maximum(data); kw...)

Bar chart on `ax` with one bar per integer in `range`, whose height is the number of
times that integer occurs in `data` (as computed by `counts(data, range)`). The x ticks
are set to `range`; extra keyword arguments are forwarded to `ax.bar`.

Returns the result of `ax.bar`.
"""
function counts_hist(ax::PyObject, data::AbstractVector, range=1:maximum(data); kw...)
    bars = ax.bar(range, counts(data, range); kw...)
    ax.set_xticks(range)
    return bars
end

# Convenience method: same as the `ax`-taking method, drawing on the axes from `plt.gca()`.
function counts_hist(data::AbstractVector, range=1:maximum(data); kw...)
    return counts_hist(plt.gca(), data, range; kw...)
end

## Read data

In [None]:
# Read every file in `datafiles` into a NamedTuple and collect them in a `Dict` keyed like
# `datafiles`.
data = Dict(progress_map(collect(datafiles)) do (key, f)
    # The do-block form of `h5open` closes the file even when one of the reads throws.
    d = h5open(f) do h5
        assm_kmers = read(h5, "assembly/kmers")
        assm_size = sum(read(h5, "assembly/contig_lengths"))
        raw_kmers = read(h5, "raw/kmers")
        raw_attrs = attrs(h5["raw"])

        (
            assm_kmers=assm_kmers,
            assm_counts=read(h5, "assembly/counts"),
            raw_kmers=raw_kmers,
            raw_counts=read(h5, "raw/counts"),
            score_thresholds=read(h5, "raw/score_thresholds"),
            assm_size=assm_size,
            # Mean read length times number of reads, divided by the summed contig lengths.
            est_coverage=read(raw_attrs, "mean_read_length") * read(raw_attrs, "nreads") / assm_size,
            # Used further down to index the second dimension of `raw_counts`; presumably a
            # Bool mask marking raw k-mers that also occur in the assembly — TODO confirm
            # against Midas' `vec_in`.
            raw_kmer_in_assm=vec_in(ArraySet(raw_kmers), ArraySet(assm_kmers)),
        )
    end

    return key => d
end);

## Count distributions of raw k-mers in vs. not in the assembly

In [None]:
# One subplot row per row of `raw_counts`, comparing the positive counts of raw k-mers
# selected by `raw_kmer_in_assm` with those of the remaining k-mers.
#   left column:  bar chart of the counts 1-9 (log-scaled y axis)
#   right column: histogram of log10(count) from 10 up to the overall maximum, with a red
#                 line at log10(est_coverage)
#
# NOTE(review): `raw_counts`, `raw_kmer_in_assm` and `est_coverage` are read as globals here
# but are not defined anywhere in the visible notebook (they only appear as fields of the
# entries of `data`). Presumably they must be bound from one entry of `data` before this
# cell is run — TODO confirm which sample was intended.
let
    n = size(raw_counts, 1)
    # `squeeze=false` keeps `axs` two-dimensional even for n == 1, so `axs[i, :]` always
    # yields exactly two axes.
    _, axs = plt.subplots(n, 2, sharey=:col, sharex=:col, squeeze=false)
    range1 = 1:9
    range2 = (10, maximum(raw_counts))

    for i in 1:n
        ax1, ax2 = axs[i, :]

        # Zero counts are dropped so that log10 is defined for every value.
        in_assm = filter(>(0), raw_counts[i, raw_kmer_in_assm])
        not_in_assm = filter(>(0), raw_counts[i, .!raw_kmer_in_assm])

        counts_hist(ax1, in_assm, range1)
        counts_hist(ax1, not_in_assm, range1)

        ax2.hist(log10.(in_assm), range=log10.(range2), bins=40)
        ax2.hist(log10.(not_in_assm), range=log10.(range2), bins=40)

        ax2.axvline(log10(est_coverage), color=:red)
    end

    # The left column shares its y axis, so setting the scale on one axes is enough.
    axs[1, 1].set_yscale(:log)
end

In [None]:
# NOTE(review): the original of this cell is an exact duplicate of the preceding cell;
# consider deleting one of the two.
#
# One subplot row per row of `raw_counts`, comparing the positive counts of raw k-mers
# selected by `raw_kmer_in_assm` with those of the remaining k-mers.
#   left column:  bar chart of the counts 1-9 (log-scaled y axis)
#   right column: histogram of log10(count) from 10 up to the overall maximum, with a red
#                 line at log10(est_coverage)
#
# NOTE(review): `raw_counts`, `raw_kmer_in_assm` and `est_coverage` are read as globals here
# but are not defined anywhere in the visible notebook (they only appear as fields of the
# entries of `data`). Presumably they must be bound from one entry of `data` before this
# cell is run — TODO confirm which sample was intended.
let
    n = size(raw_counts, 1)
    # `squeeze=false` keeps `axs` two-dimensional even for n == 1, so `axs[i, :]` always
    # yields exactly two axes.
    _, axs = plt.subplots(n, 2, sharey=:col, sharex=:col, squeeze=false)
    range1 = 1:9
    range2 = (10, maximum(raw_counts))

    for i in 1:n
        ax1, ax2 = axs[i, :]

        # Zero counts are dropped so that log10 is defined for every value.
        in_assm = filter(>(0), raw_counts[i, raw_kmer_in_assm])
        not_in_assm = filter(>(0), raw_counts[i, .!raw_kmer_in_assm])

        counts_hist(ax1, in_assm, range1)
        counts_hist(ax1, not_in_assm, range1)

        ax2.hist(log10.(in_assm), range=log10.(range2), bins=40)
        ax2.hist(log10.(not_in_assm), range=log10.(range2), bins=40)

        ax2.axvline(log10(est_coverage), color=:red)
    end

    # The left column shares its y axis, so setting the scale on one axes is enough.
    axs[1, 1].set_yscale(:log)
end

In [None]:
# One small panel per entry of `data` (keys in sorted order): a KDE of log10(count) for
# each row of `raw_counts`, restricted to the positive counts of k-mers selected by
# `raw_kmer_in_assm`, plus a dotted black line at log10(est_coverage).
let
    sample_keys = sort(collect(keys(data)))
    ncols = 4
    # Size the grid from the number of samples instead of a fixed 20x4, so no sample is
    # silently dropped; 80 samples still give the original 20 rows and 16x30 figure.
    nrows = max(1, cld(length(sample_keys), ncols))
    _, axs = plt.subplots(
        nrows, ncols, figsize=(16, 1.5 * nrows), sharex=true, squeeze=false
    )

    # `axs` is iterated in column-major order, so panels fill each column top to bottom.
    @showprogress for (key, ax) in zip(sample_keys, axs)
        d = data[key]

        for i in axes(d.raw_counts, 1)
            tp = filter(>(0), d.raw_counts[i, d.raw_kmer_in_assm])
            # A density estimate needs at least two observations.
            length(tp) > 1 || continue
            sns.kdeplot(log10.(tp), ax=ax, shade=true)
        end

        ax.axvline(log10(d.est_coverage), lw=2, color=:black, ls=:dotted)
        ax.set_title(key, fontsize="small")
        ax.yaxis.set_visible(false)
    end

    plt.tight_layout()
end

In [None]:
# NOTE(review): the original of this cell is an exact duplicate of the preceding cell;
# consider deleting one of the two.
#
# One small panel per entry of `data` (keys in sorted order): a KDE of log10(count) for
# each row of `raw_counts`, restricted to the positive counts of k-mers selected by
# `raw_kmer_in_assm`, plus a dotted black line at log10(est_coverage).
let
    sample_keys = sort(collect(keys(data)))
    ncols = 4
    # Size the grid from the number of samples instead of a fixed 20x4, so no sample is
    # silently dropped; 80 samples still give the original 20 rows and 16x30 figure.
    nrows = max(1, cld(length(sample_keys), ncols))
    _, axs = plt.subplots(
        nrows, ncols, figsize=(16, 1.5 * nrows), sharex=true, squeeze=false
    )

    # `axs` is iterated in column-major order, so panels fill each column top to bottom.
    @showprogress for (key, ax) in zip(sample_keys, axs)
        d = data[key]

        for i in axes(d.raw_counts, 1)
            tp = filter(>(0), d.raw_counts[i, d.raw_kmer_in_assm])
            # A density estimate needs at least two observations.
            length(tp) > 1 || continue
            sns.kdeplot(log10.(tp), ax=ax, shade=true)
        end

        ax.axvline(log10(d.est_coverage), lw=2, color=:black, ls=:dotted)
        ax.set_title(key, fontsize="small")
        ax.yaxis.set_visible(false)
    end

    plt.tight_layout()
end