# Chapter 5 - Type A (solo HLL e HLL++)

Grafici Type A: ascissa `F0` reale (`f0_mean_t`), ordinata `F0_hat` (`f0_hat_mean_t`) con banda `± stddev`, a `rho` fissato.
Versione con soli HyperLogLog e HyperLogLog++.


In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Matplotlib theme applied to every figure created after this point.
plt.style.use('seaborn-v0_8-whitegrid')

# Filesystem layout: result CSVs are read from RESULTS_ROOT, figures are
# written to OUT_DIR (created on the spot if it does not exist yet).
REPO = Path('/Users/daniele/CLionProjects/satp-cpp')
RESULTS_ROOT = REPO.joinpath('results', 'prefix_constant_rho', 'streaming')
OUT_DIR = REPO.joinpath('thesis', 'figures', 'results')
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Only rows matching this sample size and seed are selected.
TARGET_N = 10_000_000
TARGET_SEED = 21_041_998

# Algorithms shown in the figures, in drawing/legend order.
ALGO_ORDER = ['HyperLogLog++', 'HyperLogLog']

# Parameter string that a row of each algorithm must carry to be kept.
TARGET_PARAMS = {'HyperLogLog++': 'k=14', 'HyperLogLog': 'k=14,L=32'}

# Line/band colour per algorithm.
ALGO_COLORS = {'HyperLogLog++': '#1f77b4', 'HyperLogLog': '#ff7f0e'}


In [2]:
# Gather every streaming result file located two directory levels below
# RESULTS_ROOT; sorting makes the concatenation order deterministic.
paths = sorted(RESULTS_ROOT.glob('*/*/results_streaming.csv'))
if not paths:
    raise FileNotFoundError(f'Nessun CSV trovato in: {RESULTS_ROOT}')

frames = [pd.read_csv(csv_path) for csv_path in paths]
df = pd.concat(frames, ignore_index=True)

# Keep the streaming rows for the target sample size and seed, restricted
# to the algorithms that will be plotted.
run_mask = (
    (df['mode'] == 'streaming')
    & (df['sample_size'] == TARGET_N)
    & (df['seed'] == TARGET_SEED)
    & df['algorithm'].isin(ALGO_ORDER)
)
sel = df[run_mask].copy()

# For each algorithm listed in TARGET_PARAMS, drop the rows whose parameter
# string differs from the requested one.
for algo, params in TARGET_PARAMS.items():
    sel = sel[~((sel['algorithm'] == algo) & (sel['params'] != params))]

# rho = sample_size / f0, truncated to an integer. `assign` returns a new
# frame, so no column is written into a filtered slice (which could trigger
# pandas' SettingWithCopyWarning).
sel = sel.assign(rho=(sel['sample_size'] / sel['f0']).astype(int))

# Keep only strictly positive x/y values.
# NOTE(review): the estimate column is read as `f0_heat_mean_t`, while the
# axis labels say `f0_hat_mean_t` — confirm the spelling against the CSV header.
sel = sel[(sel['f0_mean_t'] > 0) & (sel['f0_heat_mean_t'] > 0)]

# Validate only after *all* filters: a selection emptied by the positivity
# filter would otherwise reach the plotting cell and fail there.
if sel.empty:
    raise ValueError('Filtro vuoto: nessun dato per i parametri richiesti')

rho_values = sorted(sel['rho'].unique().tolist())
print('Righe selezionate:', len(sel))
print('Rho disponibili:', rho_values)
print('Algoritmi:', sorted(sel['algorithm'].unique().tolist()))


Righe selezionate: 2800
Rho disponibili: [1, 2, 5, 10, 20, 50, 100]
Algoritmi: ['HyperLogLog', 'HyperLogLog++']


In [3]:
def plot_type_a(loglog: bool):
    """Draw the Type A grid (one panel per rho value) and save it as a PNG.

    Every panel shows, for each algorithm in ``ALGO_ORDER``, the
    ``f0_heat_mean_t`` column against ``f0_mean_t`` with a shaded band built
    from ``stddev``, plus a dashed ``y = x`` reference line. The function reads
    the module-level ``sel``, ``rho_values``, ``ALGO_ORDER``, ``ALGO_COLORS``,
    ``OUT_DIR``, ``TARGET_N`` and ``TARGET_SEED``.

    Args:
        loglog: If true, both axes use a logarithmic scale and the band is
            ``mean * exp(±sigma)`` with ``sigma = sqrt(log1p((std / mean)**2))``;
            otherwise the axes are linear and the band is ``mean ± std`` with
            the lower edge clamped to ``1e-12``.

    Returns:
        The path of the PNG file written inside ``OUT_DIR``.
    """
    n_cols = 3
    n_rows = int(np.ceil(len(rho_values) / n_cols))
    fig, grid = plt.subplots(
        n_rows,
        n_cols,
        figsize=(18, 5.2 * n_rows),
        constrained_layout=True,
    )
    axes = np.asarray(grid).ravel()

    def multiplicative_band(center: np.ndarray, spread: np.ndarray, floor: float = 1e-12):
        # Clamp the centre away from zero, then scale it up and down by the
        # same factor so the band is symmetric on a logarithmic axis.
        base = np.maximum(center, floor)
        rel = np.divide(spread, base, out=np.zeros_like(base), where=base > 0)
        log_sigma = np.sqrt(np.log1p(rel * rel))
        return base * np.exp(-log_sigma), base * np.exp(log_sigma)

    for ax, rho in zip(axes, rho_values):
        panel = sel[sel['rho'] == rho]

        # Reference diagonal spanning the observed range of the x column.
        true_f0 = panel['f0_mean_t']
        diagonal = np.array([float(true_f0.min()), float(true_f0.max())], dtype=float)
        ax.plot(diagonal, diagonal, color='black', linestyle='--', linewidth=1.2, label='y = F0')

        for algo in ALGO_ORDER:
            rows = panel[panel['algorithm'] == algo]
            if rows.empty:
                continue
            # Sort by x so the line and the band are drawn left to right.
            rows = rows.sort_values('f0_mean_t')

            xs = rows['f0_mean_t'].to_numpy(dtype=float)
            estimate = rows['f0_heat_mean_t'].to_numpy(dtype=float)
            deviation = rows['stddev'].to_numpy(dtype=float)

            color = ALGO_COLORS.get(algo)
            ax.plot(xs, estimate, color=color, linewidth=2.0, label=algo)

            if loglog:
                lower, upper = multiplicative_band(estimate, deviation)
            else:
                lower = np.maximum(estimate - deviation, 1e-12)
                upper = estimate + deviation
            ax.fill_between(xs, lower, upper, color=color, alpha=0.22)

        ax.set_title(f'rho = {rho}')
        ax.set_xlabel('F0 reale (f0_mean_t)')
        ax.set_ylabel('F0 stimata (f0_hat_mean_t)')

        if loglog:
            ax.set_xscale('log')
            ax.set_yscale('log')

    # Hide the grid cells that have no rho value assigned.
    for unused in axes[len(rho_values):]:
        unused.axis('off')

    # One shared legend, taken from the first panel.
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc='upper center', ncol=3, bbox_to_anchor=(0.5, 1.02), frameon=True)

    if loglog:
        suffix = 'log-log'
        out_name = 'typeA_prefix_constant_rho_hll_hllpp_loglog.png'
    else:
        suffix = 'lineare'
        out_name = 'typeA_prefix_constant_rho_hll_hllpp_linear.png'

    fig.suptitle(f'Type A (solo HLL/HLL++) - prefix_constant_rho (n={TARGET_N}, seed={TARGET_SEED}, scala {suffix})', y=1.06)

    out_path = OUT_DIR / out_name
    fig.savefig(out_path, dpi=160, bbox_inches='tight')
    plt.close(fig)  # release the figure once it is on disk
    return out_path

# Render the linear figure first, then the log-log one, and report both paths.
out_linear, out_loglog = plot_type_a(loglog=False), plot_type_a(loglog=True)
for saved_path in (out_linear, out_loglog):
    print('saved:', saved_path)



saved: /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/typeA_prefix_constant_rho_hll_hllpp_linear.png
saved: /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/typeA_prefix_constant_rho_hll_hllpp_loglog.png
