# Grafico 2 - #Campioni vs Varianza della stima

## Setup esperimento

- Modalita: **streaming**
- Parametri: HLL++ `k=14`, HLL `k=14,L=32`, LL `k=14,L=32`, PC `L=31`
- Dataset: `n=10^7`, `p=50`, `seed=21041998`, `d in {1%,10%,50%,100%}`

Per ogni valore di `d` il notebook mostra:
- vista **linear-linear**
- vista **log-log**


In [5]:
from pathlib import Path
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import display, Markdown

REPO_ROOT = Path('/Users/daniele/CLionProjects/satp-cpp')
CSV_FILES = ['results/HyperLogLog++/k_14/results_streaming.csv', 'results/HyperLogLog/k_14_L_32/results_streaming.csv', 'results/LogLog/k_14_L_32/results_streaming.csv', 'results/ProbabilisticCounting/L_31/results_streaming.csv']

TARGET_SEED = 21041998
TARGET_SAMPLE_SIZE = 10_000_000
TARGET_F0 = None

ALGO_ALIAS = {
    'HyperLogLog++': 'HLL++',
    'HyperLogLog': 'HLL',
    'LogLog': 'LL',
    'Probabilistic Counting': 'PC',
}
COLORS = {
    'HLL++': '#1f77b4',
    'HLL': '#ff7f0e',
    'LL': '#2ca02c',
    'PC': '#d62728',
    'REAL': '#111111',
}

REQ = {
    'algorithm','params','sample_size','number_of_elements_processed',
    'f0','seed','f0_mean_t','f0_heat_mean_t','variance'
}


In [6]:
frames = []
for rel in CSV_FILES:
    p = REPO_ROOT / rel
    if not p.exists():
        raise FileNotFoundError(f'Missing CSV: {p}')
    d = pd.read_csv(p)
    missing = REQ.difference(d.columns)
    if missing:
        raise ValueError(f'{p}: missing cols {sorted(missing)}')
    d['source_file'] = str(p)
    frames.append(d)

df = pd.concat(frames, ignore_index=True)
for c in ['sample_size','number_of_elements_processed','f0','seed','f0_mean_t','f0_heat_mean_t','variance']:
    df[c] = pd.to_numeric(df[c], errors='coerce')

df = df.dropna(subset=['sample_size','number_of_elements_processed','f0','seed','f0_mean_t','f0_heat_mean_t'])
df['sample_size'] = df['sample_size'].astype('int64')
df['number_of_elements_processed'] = df['number_of_elements_processed'].astype('int64')
df['f0'] = df['f0'].astype('int64')
df['seed'] = df['seed'].astype('int64')
df['algo'] = df['algorithm'].astype(str).map(lambda x: ALGO_ALIAS.get(x, x))

print('Rows:', len(df))
print('Seeds:', sorted(df['seed'].unique().tolist()))
print('sample_size:', sorted(df['sample_size'].unique().tolist()))
print('f0:', sorted(df['f0'].unique().tolist()))


Rows: 3200
Seeds: [21041998]
sample_size: [10000000]
f0: [100000, 1000000, 5000000, 10000000]


In [7]:
sel = df[(df['seed'] == int(TARGET_SEED)) & (df['sample_size'] == int(TARGET_SAMPLE_SIZE))].copy()
if TARGET_F0 is not None:
    sel = sel[sel['f0'] == int(TARGET_F0)]
if sel.empty:
    raise ValueError('Filtro vuoto')
print('Filtered rows:', len(sel))
print('f0 presenti:', sorted(sel['f0'].unique().tolist()))
print('algoritmi:', sorted(sel['algo'].unique().tolist()))


Filtered rows: 3200
f0 presenti: [100000, 1000000, 5000000, 10000000]
algoritmi: ['HLL', 'HLL++', 'LL', 'PC']


In [8]:
for f0_value in sorted(sel['f0'].unique().tolist()):
    d = sel[sel['f0'] == f0_value].sort_values(['algo','number_of_elements_processed'])
    fig = make_subplots(rows=1, cols=2, subplot_titles=('Linear-Linear', 'Log-Log'), horizontal_spacing=0.1)

    for algo in [a for a in ['HLL++','HLL','LL','PC'] if a in d['algo'].unique()]:
        a = d[d['algo'] == algo]
        c = COLORS.get(algo, '#444444')

        fig.add_trace(go.Scatter(
            x=a['number_of_elements_processed'], y=a['variance'], mode='lines', name=algo,
            legendgroup=algo, line=dict(color=c, width=2),
            hovertemplate='algoritmo: %{fullData.name}<br>campioni: %{x}<br>variance: %{y:.6f}<extra></extra>'
        ), row=1, col=1)

        fig.add_trace(go.Scatter(
            x=a['number_of_elements_processed'], y=a['variance'].where(a['variance'] > 0, np.nan),
            mode='lines', name=algo, legendgroup=algo, showlegend=False, line=dict(color=c, width=2),
            hovertemplate='algoritmo: %{fullData.name}<br>campioni: %{x}<br>variance: %{y:.6f}<extra></extra>'
        ), row=1, col=2)

    fig.update_xaxes(title_text='Numero campioni processati', row=1, col=1)
    fig.update_yaxes(title_text='Varianza', row=1, col=1)
    fig.update_xaxes(type='log', title_text='Numero campioni processati (log)', row=1, col=2)
    fig.update_yaxes(type='log', title_text='Varianza (log)', row=1, col=2)

    fig.update_layout(
        title=f'Varianza della stima | seed={TARGET_SEED} | n={TARGET_SAMPLE_SIZE} | d={f0_value}',
        template='plotly_white', hovermode='x unified', dragmode='zoom', height=550,
        legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='left', x=0),
        margin=dict(l=50, r=30, t=90, b=50)
    )
    fig.show()


In [9]:
candidate = [1_000_000, 10_000_000]
available = set(sel['f0'].unique().tolist())
chosen = [x for x in candidate if x in available]
if not chosen and len(available) > 0:
    chosen = sorted(available)[-2:]

for f0_value in chosen:
    d = sel[sel['f0'] == f0_value]
    tmax = d['number_of_elements_processed'].max()
    end = d[d['number_of_elements_processed'] == tmax][['algo','variance']].drop_duplicates('algo').sort_values('variance')

    leader = end.iloc[0]['algo'] if not end.empty else 'N/A'
    worst = end.iloc[-1]['algo'] if not end.empty else 'N/A'

    lines = []
    lines.append(f"### Commento varianza (n=10^7, d={f0_value})")
    lines.append(f"- A fine stream la varianza minore e di **{leader}**.")
    lines.append(f"- A fine stream la varianza maggiore e di **{worst}**.")
    lines.append("- La scala log-log aiuta a visualizzare come la varianza evolve nei primi campioni e nelle code.")

    display(Markdown("\n".join(lines)))
    display(end.rename(columns={'algo':'algoritmo','variance':'varianza_finale'}))


### Commento varianza (n=10^7, d=1000000)
- A fine stream la varianza minore e di **HLL++**.
- A fine stream la varianza maggiore e di **PC**.
- La scala log-log aiuta a visualizzare come la varianza evolve nei primi campioni e nelle code.

Unnamed: 0,algoritmo,varianza_finale
399,HLL++,108312200.0
1199,HLL,108312500.0
1999,LL,151792900.0
2799,PC,1437510000000.0


### Commento varianza (n=10^7, d=10000000)
- A fine stream la varianza minore e di **HLL++**.
- A fine stream la varianza maggiore e di **PC**.
- La scala log-log aiuta a visualizzare come la varianza evolve nei primi campioni e nelle code.

Unnamed: 0,algoritmo,varianza_finale
799,HLL++,0.0
1599,HLL,0.0
2399,LL,0.0
3199,PC,0.0
