# Risultati Streaming

Questo notebook analizza i risultati `results_streaming.csv` per tutti gli algoritmi.

Obiettivi:
- andamento delle metriche lungo lo stream (`element_index` / progress);
- confronto tra algoritmi nel regime grande (`n=10^7`);
- confronto per diversi rapporti `d/n`.


In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

PLOT_CONFIG = {'scrollZoom': True, 'displaylogo': False}

def resolve_results_root() -> Path:
    candidates = [Path('results'), Path('../results'), Path('../../results')]
    for c in candidates:
        if c.exists():
            return c.resolve()
    raise FileNotFoundError('Cartella results non trovata. Esegui dalla root progetto o da notebooks/.')

RESULTS_ROOT = resolve_results_root()
print('RESULTS_ROOT =', RESULTS_ROOT)


RESULTS_ROOT = /Users/daniele/CLionProjects/satp-cpp/results


In [2]:
stream_files = sorted(RESULTS_ROOT.glob('*/*/results_streaming.csv'))
if not stream_files:
    raise FileNotFoundError(f'Nessun results_streaming.csv trovato in {RESULTS_ROOT}')

frames = []
for f in stream_files:
    d = pd.read_csv(f)
    d['source_file'] = str(f)
    frames.append(d)

s = pd.concat(frames, ignore_index=True)

num_cols = [
    'runs', 'sample_size', 'element_index', 'distinct_count', 'seed',
    'f0_mean', 'f0_hat_mean', 'mean', 'variance', 'stddev',
    'rse_theoretical', 'rse_observed', 'bias', 'difference',
    'bias_relative', 'mean_relative_error', 'rmse', 'mae'
]
for c in num_cols:
    if c in s.columns:
        s[c] = pd.to_numeric(s[c], errors='coerce')

s['progress'] = s['element_index'] / s['sample_size']
s['d_over_n'] = s['distinct_count'] / s['sample_size']
s['d_over_n_label'] = (100.0 * s['d_over_n']).round(2).astype(str) + '%'
s['algorithm_label'] = s['algorithm'] + ' [' + s['params'] + ']'

print('rows:', len(s))
print('algorithms:', sorted(s['algorithm'].unique().tolist()))
print('sample_size:', sorted(s['sample_size'].unique().tolist()))
print('d_over_n:', sorted(s['d_over_n'].dropna().unique().tolist()))
print('seed count:', s['seed'].nunique())


rows: 88000
algorithms: ['HyperLogLog', 'HyperLogLog++', 'LogLog', 'Probabilistic Counting']
sample_size: [100, 1000, 10000, 100000, 1000000, 10000000]
d_over_n: [0.01, 0.1, 0.5, 1.0]
seed count: 5


In [3]:
checkpoints = (
    s.groupby(['sample_size', 'd_over_n_label'], as_index=False)
     .agg(n_checkpoints=('element_index', 'nunique'))
)
checkpoints.sort_values(['sample_size', 'd_over_n_label'])


Unnamed: 0,sample_size,d_over_n_label,n_checkpoints
0,100,1.0%,100
1,100,10.0%,100
2,100,100.0%,100
3,100,50.0%,100
4,1000,1.0%,200
5,1000,10.0%,200
6,1000,100.0%,200
7,1000,50.0%,200
8,10000,1.0%,200
9,10000,10.0%,200


In [4]:
fig = px.scatter(
    checkpoints,
    x='sample_size',
    y='n_checkpoints',
    color='d_over_n_label',
    title='Numero checkpoint per sample_size e d/n',
    hover_data=['sample_size', 'd_over_n_label', 'n_checkpoints']
)
fig.update_layout(template='plotly_white', dragmode='zoom', legend=dict(itemclick='toggle', itemdoubleclick='toggleothers'))
fig.update_xaxes(type='log', fixedrange=False)
fig.update_yaxes(fixedrange=False)
fig.show(config=PLOT_CONFIG)


In [5]:
# Focus principale: n=10^7, medie sui seed
N = 10_000_000
focus = s[s['sample_size'] == N].copy()
if focus.empty:
    raise ValueError('Nessun dato streaming per n=10^7')

f_agg = (
    focus.groupby(['algorithm_label', 'd_over_n_label', 'element_index'], as_index=False)
         [['f0_mean', 'f0_hat_mean', 'variance', 'mean_relative_error', 'bias_relative', 'rse_observed']]
         .mean()
)

sorted(f_agg['d_over_n_label'].unique().tolist())


['1.0%', '10.0%', '100.0%', '50.0%']

In [6]:
# Per ogni d/n: F0 vero vs F0 stimato lungo lo stream
for dlab in sorted(f_agg['d_over_n_label'].unique().tolist()):
    g = f_agg[f_agg['d_over_n_label'] == dlab].copy()
    truth = g.groupby('element_index', as_index=False)['f0_mean'].mean().sort_values('element_index')

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=truth['element_index'], y=truth['f0_mean'], mode='lines',
        name='F0 reale', line=dict(color='black', dash='dash', width=3),
        hovertemplate='t=%{x}<br>F0=%{y}<extra></extra>'
    ))

    for algo, a in g.groupby('algorithm_label'):
        a = a.sort_values('element_index')
        fig.add_trace(go.Scatter(
            x=a['element_index'], y=a['f0_hat_mean'], mode='lines', name=algo,
            hovertemplate='algo=%{fullData.name}<br>t=%{x}<br>F0_hat=%{y:.6f}<extra></extra>'
        ))

    fig.update_layout(
        title=f'Streaming n={N}, d/n={dlab}: F0 reale vs stima media (seed-avg)',
        xaxis_title='element_index', yaxis_title='F0',
        template='plotly_white', dragmode='zoom', hovermode='x unified',
        legend=dict(itemclick='toggle', itemdoubleclick='toggleothers')
    )
    fig.update_xaxes(fixedrange=False, rangeslider=dict(visible=True))
    fig.update_yaxes(fixedrange=False)
    fig.show(config=PLOT_CONFIG)


In [7]:
# Per ogni d/n: varianza lungo lo stream
for dlab in sorted(f_agg['d_over_n_label'].unique().tolist()):
    g = f_agg[f_agg['d_over_n_label'] == dlab].copy()

    fig = px.line(
        g, x='element_index', y='variance', color='algorithm_label',
        title=f'Streaming n={N}, d/n={dlab}: varianza stimata',
        hover_data=['algorithm_label', 'element_index', 'variance']
    )
    fig.update_layout(template='plotly_white', dragmode='zoom', hovermode='x unified', legend=dict(itemclick='toggle', itemdoubleclick='toggleothers'))
    fig.update_xaxes(fixedrange=False, rangeslider=dict(visible=True))
    fig.update_yaxes(fixedrange=False)
    fig.show(config=PLOT_CONFIG)


In [8]:
# Per ogni d/n: MRE e bias relativo lungo lo stream
for metric, ytitle in [('mean_relative_error', 'MRE'), ('bias_relative', 'Bias relativo')]:
    for dlab in sorted(f_agg['d_over_n_label'].unique().tolist()):
        g = f_agg[f_agg['d_over_n_label'] == dlab].copy()
        fig = px.line(
            g, x='element_index', y=metric, color='algorithm_label',
            title=f'Streaming n={N}, d/n={dlab}: {ytitle}',
            hover_data=['algorithm_label', 'element_index', metric]
        )
        fig.update_layout(template='plotly_white', dragmode='zoom', hovermode='x unified', legend=dict(itemclick='toggle', itemdoubleclick='toggleothers'))
        fig.update_xaxes(fixedrange=False, rangeslider=dict(visible=True))
        fig.update_yaxes(fixedrange=False)
        fig.show(config=PLOT_CONFIG)


In [9]:
# Heatmap finale (t = n): MRE per algoritmo e d/n
last = s[s['element_index'] == s['sample_size']].copy()
last_agg = (
    last.groupby(['algorithm_label', 'd_over_n_label'], as_index=False)['mean_relative_error']
        .mean()
)

fig = px.density_heatmap(
    last_agg,
    x='d_over_n_label', y='algorithm_label', z='mean_relative_error',
    color_continuous_scale='Magma',
    title='Streaming endpoint (t=n): MRE media per algoritmo e d/n'
)
fig.update_layout(template='plotly_white', dragmode='zoom')
fig.update_xaxes(fixedrange=False)
fig.update_yaxes(fixedrange=False)
fig.show(config=PLOT_CONFIG)


In [10]:
# Tabella sintetica streaming (endpoint t=n)
summary_stream = (
    last.groupby('algorithm_label', as_index=False)
        .agg(
            MRE_mean=('mean_relative_error', 'mean'),
            MRE_median=('mean_relative_error', 'median'),
            RMSE_mean=('rmse', 'mean'),
            BiasAbs_mean=('difference', 'mean'),
            RSE_obs_mean=('rse_observed', 'mean')
        )
        .sort_values('MRE_mean')
)
summary_stream


Unnamed: 0,algorithm_label,MRE_mean,MRE_median,RMSE_mean,BiasAbs_mean,RSE_obs_mean
1,HyperLogLog++ [k=16],0.001101,6.3e-05,2184.711287,1510.6555,0.000767
0,"HyperLogLog [k=16,L=32]",0.001992,0.002113,2202.617897,1521.117833,0.001639
3,Probabilistic Counting [L=16],0.641996,0.635035,717426.275075,714904.084833,0.587998
2,"LogLog [k=16,L=32]",1360.796638,3.47662,19131.171378,18376.790333,0.045222
