# Risultati Normal (One-shot)

Questo notebook analizza i risultati `results_oneshot.csv` per tutti gli algoritmi.

Obiettivi:
- confronto accuratezza al variare di `n` e `d/n`;
- confronto tra metrica osservata e teorica (RSE);
- sensibilita ai seed;
- ranking sintetico per metrica.


In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

PLOT_CONFIG = {'scrollZoom': True, 'displaylogo': False}

def resolve_results_root() -> Path:
    candidates = [Path('results'), Path('../results'), Path('../../results')]
    for c in candidates:
        if c.exists():
            return c.resolve()
    raise FileNotFoundError('Cartella results non trovata. Esegui dalla root progetto o da notebooks/.')

RESULTS_ROOT = resolve_results_root()
print('RESULTS_ROOT =', RESULTS_ROOT)


RESULTS_ROOT = /Users/daniele/CLionProjects/satp-cpp/results


In [2]:
oneshot_files = sorted(RESULTS_ROOT.glob('*/*/results_oneshot.csv'))
if not oneshot_files:
    raise FileNotFoundError(f'Nessun results_oneshot.csv trovato in {RESULTS_ROOT}')

frames = []
for f in oneshot_files:
    df = pd.read_csv(f)
    df['source_file'] = str(f)
    frames.append(df)

df = pd.concat(frames, ignore_index=True)

num_cols = [
    'runs', 'sample_size', 'element_index', 'distinct_count', 'seed',
    'f0_mean', 'f0_hat_mean', 'mean', 'variance', 'stddev',
    'rse_theoretical', 'rse_observed', 'bias', 'difference',
    'bias_relative', 'mean_relative_error', 'rmse', 'mae'
]
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

df['d_over_n'] = df['distinct_count'] / df['sample_size']
df['d_over_n_label'] = (100.0 * df['d_over_n']).round(2).astype(str) + '%'
df['algorithm_label'] = df['algorithm'] + ' [' + df['params'] + ']'

print('rows:', len(df))
print('algorithms:', sorted(df['algorithm'].unique().tolist()))
print('sample_size:', sorted(df['sample_size'].unique().tolist()))
print('d_over_n:', sorted(df['d_over_n'].dropna().unique().tolist()))
print('seed count:', df['seed'].nunique())


rows: 480
algorithms: ['HyperLogLog', 'HyperLogLog++', 'LogLog', 'Probabilistic Counting']
sample_size: [100, 1000, 10000, 100000, 1000000, 10000000]
d_over_n: [0.01, 0.1, 0.5, 1.0]
seed count: 5


In [3]:
coverage = (
    df.groupby(['algorithm', 'sample_size', 'd_over_n_label'])
      .size()
      .reset_index(name='rows')
      .sort_values(['algorithm', 'sample_size', 'd_over_n_label'])
)
coverage.head(20)


Unnamed: 0,algorithm,sample_size,d_over_n_label,rows
0,HyperLogLog,100,1.0%,5
1,HyperLogLog,100,10.0%,5
2,HyperLogLog,100,100.0%,5
3,HyperLogLog,100,50.0%,5
4,HyperLogLog,1000,1.0%,5
5,HyperLogLog,1000,10.0%,5
6,HyperLogLog,1000,100.0%,5
7,HyperLogLog,1000,50.0%,5
8,HyperLogLog,10000,1.0%,5
9,HyperLogLog,10000,10.0%,5


In [4]:
fig = px.density_heatmap(
    coverage,
    x='sample_size',
    y='d_over_n_label',
    z='rows',
    facet_col='algorithm',
    color_continuous_scale='Viridis',
    title='Copertura esperimenti one-shot (righe per combinazione)'
)
fig.update_layout(
    template='plotly_white',
    dragmode='zoom',
    xaxis_title='sample_size',
    yaxis_title='d/n',
)
fig.update_xaxes(type='log', fixedrange=False)
fig.update_yaxes(fixedrange=False)
fig.show(config=PLOT_CONFIG)


In [5]:
metrics = ['mean_relative_error', 'bias_relative', 'rmse', 'mae', 'rse_observed']

agg = (
    df.groupby(['algorithm_label', 'sample_size', 'd_over_n_label'], as_index=False)[metrics]
      .mean()
)

for metric in metrics:
    fig = px.line(
        agg,
        x='sample_size',
        y=metric,
        color='algorithm_label',
        facet_row='d_over_n_label',
        markers=True,
        title=f'{metric} vs sample_size (one-shot, media sui seed)'
    )
    fig.update_layout(
        template='plotly_white',
        dragmode='zoom',
        legend=dict(itemclick='toggle', itemdoubleclick='toggleothers')
    )
    fig.update_xaxes(type='log', fixedrange=False)
    fig.update_yaxes(fixedrange=False)
    fig.show(config=PLOT_CONFIG)


In [6]:
rse_df = df[np.isfinite(df['rse_theoretical'])].copy()
if rse_df.empty:
    print('Nessun rse_theoretical disponibile')
else:
    rr = (
        rse_df.groupby(['algorithm_label', 'sample_size'], as_index=False)
             [['rse_observed', 'rse_theoretical']]
             .mean()
    )

    fig = go.Figure()
    for algo, g in rr.groupby('algorithm_label'):
        g = g.sort_values('sample_size')
        fig.add_trace(go.Scatter(
            x=g['sample_size'], y=g['rse_observed'], mode='lines+markers',
            name=f'{algo} observed',
            hovertemplate='algo=%{fullData.name}<br>n=%{x}<br>RSE obs=%{y:.6f}<extra></extra>'
        ))
        fig.add_trace(go.Scatter(
            x=g['sample_size'], y=g['rse_theoretical'], mode='lines+markers',
            line=dict(dash='dash'),
            name=f'{algo} theory',
            hovertemplate='algo=%{fullData.name}<br>n=%{x}<br>RSE th=%{y:.6f}<extra></extra>'
        ))

    fig.update_layout(
        title='RSE osservata vs teorica (one-shot, media sui seed)',
        xaxis_title='sample_size', yaxis_title='RSE',
        template='plotly_white', dragmode='zoom',
        legend=dict(itemclick='toggle', itemdoubleclick='toggleothers')
    )
    fig.update_xaxes(type='log', fixedrange=False)
    fig.update_yaxes(fixedrange=False)
    fig.show(config=PLOT_CONFIG)


In [7]:
cal = (
    df.groupby(['algorithm_label', 'sample_size', 'd_over_n_label'], as_index=False)
      [['f0_mean', 'f0_hat_mean']]
      .mean()
)

fig = px.scatter(
    cal,
    x='f0_mean',
    y='f0_hat_mean',
    color='algorithm_label',
    symbol='d_over_n_label',
    title='Calibrazione one-shot: f0_hat_mean vs f0_mean (media sui seed)',
    hover_data=['sample_size', 'd_over_n_label']
)
mn = float(min(cal['f0_mean'].min(), cal['f0_hat_mean'].min()))
mx = float(max(cal['f0_mean'].max(), cal['f0_hat_mean'].max()))
fig.add_trace(go.Scatter(x=[mn, mx], y=[mn, mx], mode='lines', name='y=x', line=dict(color='black', dash='dash')))
fig.update_layout(template='plotly_white', dragmode='zoom', legend=dict(itemclick='toggle', itemdoubleclick='toggleothers'))
fig.update_xaxes(type='log', fixedrange=False)
fig.update_yaxes(type='log', fixedrange=False)
fig.show(config=PLOT_CONFIG)


In [8]:
seed_box = df.copy()
fig = px.box(
    seed_box,
    x='algorithm_label',
    y='mean_relative_error',
    color='algorithm_label',
    points='all',
    hover_data=['sample_size', 'distinct_count', 'seed', 'd_over_n_label'],
    title='Sensibilita ai seed: distribuzione MRE (one-shot)'
)
fig.update_layout(template='plotly_white', dragmode='zoom', showlegend=False)
fig.update_yaxes(fixedrange=False)
fig.show(config=PLOT_CONFIG)


In [9]:
summary = (
    df.groupby('algorithm_label', as_index=False)
      .agg(
          MRE_mean=('mean_relative_error', 'mean'),
          MRE_median=('mean_relative_error', 'median'),
          MRE_p95=('mean_relative_error', lambda s: np.quantile(s, 0.95)),
          RMSE_mean=('rmse', 'mean'),
          BiasAbs_mean=('difference', 'mean'),
          RSE_obs_mean=('rse_observed', 'mean')
      )
      .sort_values('MRE_mean')
)
summary


Unnamed: 0,algorithm_label,MRE_mean,MRE_median,MRE_p95,RMSE_mean,BiasAbs_mean,RSE_obs_mean
1,HyperLogLog++ [k=16],0.00136,9.7e-05,0.00447,2207.649029,1530.954833,0.000849
0,"HyperLogLog [k=16,L=32]",0.001992,0.002113,0.003599,2202.617897,1521.117833,0.001639
3,Probabilistic Counting [L=16],0.641996,0.635035,1.175279,717426.275075,714904.084833,0.587998
2,"LogLog [k=16,L=32]",1360.796638,3.47662,2601.3634,19131.171378,18376.790333,0.045222


In [10]:
fig = go.Figure(data=[go.Table(
    header=dict(values=list(summary.columns), fill_color='lightgrey', align='left'),
    cells=dict(values=[summary[c] for c in summary.columns], align='left')
)])
fig.update_layout(title='Ranking sintetico one-shot (ordinato per MRE_mean)')
fig.show(config=PLOT_CONFIG)
