In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from fainder.typing import Histogram
from fainder.utils import configure_run, load_input
from utils.plotting_defaults import set_style

# Run-wide setup. Both helpers are project-local; their effects are not visible from this notebook.
configure_run("WARNING")  # presumably sets the log level for this run — confirm in fainder.utils
set_style()  # presumably applies the shared plotting defaults — confirm in utils.plotting_defaults
# Make sure the directory the figures are saved into exists (parents created, no error if present).
Path("plots/misc").mkdir(parents=True, exist_ok=True)

In [2]:
# Load the histogram collection of each dataset; per the annotations, every entry is an
# (id, histogram) pair. Paths are relative to the notebook's working directory.
open_data: list[tuple[np.uint32, Histogram]] = load_input("../data/open_data_usa/histograms.zst")
gittables: list[tuple[np.uint32, Histogram]] = load_input("../data/gittables/histograms.zst")

In [3]:
# Dataset name -> loaded histogram collection; the name later becomes part of the plot filename.
data = {"open_data": open_data, "gittables": gittables}

In [4]:
# Per-dataset feature matrix with one row per histogram and three columns:
#   0: minimum of hist[1], 1: maximum of hist[1], 2: mean difference of consecutive hist[1] entries.
# hist[1] is treated as the bin-edge array: the plotting cell labels column 2 "Mean bin width"
# and (column 1 - column 0) "Histogram value range".
features = {}
for k, v in data.items():
    # np.longdouble is the portable name for NumPy's extended-precision float; the np.float128
    # alias only exists on some platforms and raises AttributeError elsewhere.
    features[k] = np.zeros((len(v), 3), dtype=np.longdouble)
    for id_, hist in v:
        bin_edges = hist[1]
        # Rows are addressed by histogram id, so ids are assumed to lie in range(len(v)).
        # NOTE(review): confirm against the loader; a missing id would leave an all-zero row.
        features[k][id_] = (bin_edges.min(), bin_edges.max(), np.diff(bin_edges).mean())

In [5]:
# For each dataset, scatter the mean bin width (feature column 2) against the histogram value
# range (column 1 minus column 0) on log-log axes and save one PNG per dataset.
# A single seeded generator keeps the random subsample, and therefore the saved figure,
# identical across kernel restarts.
rng = np.random.default_rng(42)
for k, v in features.items():
    # Plot at most 100,000 rows, drawn without replacement, to bound figure size and render time.
    subset = rng.choice(v, size=min(100000, len(v)), axis=0, replace=False)
    fig, ax = plt.subplots(figsize=(1.7, 1.2))
    ax.scatter(subset[:, 1] - subset[:, 0], subset[:, 2])
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_ylabel("Mean bin width")
    ax.set_xlabel("Histogram value range")
    sns.despine(fig=fig)
    # Operate on the figure object explicitly instead of pyplot's implicit "current figure".
    fig.tight_layout(pad=1.02)
    fig.savefig(
        f"plots/misc/histogram_stats_{k}.png", dpi=600, bbox_inches="tight", pad_inches=0.01
    )
    # Release the figure so repeated iterations do not accumulate open figures.
    plt.close(fig)