# HLL vs HLL++: differenza della stima su `n=10.000.000`

Questo notebook confronta **HLL** e **HLL++** sul dataset più grande (`sample_size = 10.000.000`) e disegna:

- un grafico per ogni coppia **(seed, d)**;
- asse x: `element_index` (checkpoint nello stream);
- asse y: `f0_hat_mean(HLL) - f0_hat_mean(HLL++)`.

I grafici sono interattivi (zoom, pan, legenda attiva/disattiva).

In [None]:
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go


def resolve_repo_root() -> Path:
    """Locate the repository root by walking up from the current directory.

    A directory qualifies as the root when it contains both a ``results``
    and a ``notebooks`` sub-directory. The current working directory is
    checked first, then each ancestor in turn, so the nearest match wins
    no matter how deeply nested the launch directory is.

    Returns:
        The resolved path of the first matching directory.

    Raises:
        RuntimeError: If neither the cwd nor any of its ancestors qualifies.
    """
    cwd = Path.cwd().resolve()
    # `Path.parents` lists ancestors nearest-first, so the search order for
    # cwd, parent and grandparent is the same as a fixed three-level check.
    for candidate in (cwd, *cwd.parents):
        # Require real directories: a stray file named 'results' is not a root.
        if (candidate / 'results').is_dir() and (candidate / 'notebooks').is_dir():
            return candidate
    raise RuntimeError('Impossibile trovare la root del repository partendo dalla cwd corrente.')


# Anchor every input path at the detected repository root.
REPO_ROOT = resolve_repo_root()
RESULTS_DIR = REPO_ROOT.joinpath('results')

# Streaming result files of the two estimators being compared.
hll_path = RESULTS_DIR.joinpath('HyperLogLog', 'k_16_L_32', 'results_streaming.csv')
hllpp_path = RESULTS_DIR.joinpath('HyperLogLog++', 'k_16', 'results_streaming.csv')

# Fail early (HLL file first) when either input is absent.
for required_path in (hll_path, hllpp_path):
    if not required_path.exists():
        raise FileNotFoundError(f'File mancante: {required_path}')

# Load both streaming result tables.
hll = pd.read_csv(hll_path)
hllpp = pd.read_csv(hllpp_path)

# Normalise dtypes in place so the join keys compare equal across the two
# tables; estimates that cannot be parsed as numbers become NaN.
for df in (hll, hllpp):
    for int_col in ('sample_size', 'element_index', 'distinct_count', 'seed'):
        df[int_col] = df[int_col].astype(int)
    df['f0_hat_mean'] = pd.to_numeric(df['f0_hat_mean'], errors='coerce')

# Keep only the rows of the target stream size.
TARGET_N = 10_000_000
hll_n = hll.loc[hll['sample_size'].eq(TARGET_N)].copy()
hllpp_n = hllpp.loc[hllpp['sample_size'].eq(TARGET_N)].copy()

join_keys = ['sample_size', 'element_index', 'distinct_count', 'seed']
merge_cols = [*join_keys, 'f0_hat_mean']

# Inner join: only checkpoints present in both tables survive.
merged = pd.merge(
    hll_n[merge_cols],
    hllpp_n[merge_cols],
    on=join_keys,
    how='inner',
    suffixes=('_hll', '_hllpp'),
)

# Signed difference of the two estimates at each shared checkpoint.
merged['delta_hll_minus_hllpp'] = merged['f0_hat_mean_hll'].sub(merged['f0_hat_mean_hllpp'])
merged = (
    merged
    .sort_values(by=['seed', 'distinct_count', 'element_index'])
    .reset_index(drop=True)
)

seeds = sorted(merged['seed'].unique().tolist())
d_values = sorted(merged['distinct_count'].unique().tolist())

# Quick summary of what was loaded and matched.
print(f'Repository: {REPO_ROOT}')
print(f'Righe HLL (n={TARGET_N}):', len(hll_n))
print(f'Righe HLL++ (n={TARGET_N}):', len(hllpp_n))
print('Righe merge:', len(merged))
print('Seeds:', seeds)
print('d values:', d_values)

In [None]:
# One interactive figure per (seed, d) pair.
#
# A single groupby replaces re-scanning the whole `merged` frame with a
# boolean mask for every pair. Group keys are sorted, so figures still come
# out seed-major / d-minor, and pairs without rows never form a group.

# Loop-invariant pieces, built once.
hover_template = (
    'seed: %{customdata[0]}<br>'
    'd: %{customdata[1]}<br>'
    'element_index: %{x}<br>'
    'delta (HLL-HLL++): %{y:.6f}<extra></extra>'
)
show_config = {'scrollZoom': True, 'displaylogo': False}

for (seed, d), sdf in merged.groupby(['seed', 'distinct_count'], sort=True):
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=sdf['element_index'],
        y=sdf['delta_hll_minus_hllpp'],
        mode='lines+markers',
        name='HLL - HLL++',
        marker=dict(size=4),
        line=dict(width=2),
        hovertemplate=hover_template,
        # Per-point (seed, d) values referenced by the hover template.
        customdata=sdf[['seed', 'distinct_count']].to_numpy(),
    ))

    # Zero reference line: where the two estimates coincide.
    fig.add_hline(y=0.0, line_dash='dash', line_width=1)

    fig.update_layout(
        title=f'Differenza stima HLL - HLL++ | n={TARGET_N}, seed={seed}, d={d}',
        xaxis_title='element_index',
        yaxis_title='f0_hat_mean(HLL) - f0_hat_mean(HLL++)',
        template='plotly_white',
        dragmode='zoom',
        height=430,
        width=1100,
        legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='left', x=0),
        # Both axes stay zoomable/pannable.
        xaxis=dict(fixedrange=False),
        yaxis=dict(fixedrange=False),
    )

    fig.show(config=show_config)