
# Capitolo 5 - Analisi Streaming

Questo notebook analizza **solo** i file `results_streaming.csv` presenti in `results/**`.
I grafici vengono salvati in `thesis/figures/results/` e le tabelle riassuntive in `notes/`; entrambi sono destinati al Capitolo 5 della tesi.


In [1]:

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Project root: the parent of the current working directory, made absolute.
ROOT = Path('..').resolve()

# Output folders for figures and summary tables; created up front so that
# later cells can write into them directly.
FIG_DIR = ROOT.joinpath('thesis', 'figures', 'results')
NOTE_DIR = ROOT.joinpath('notes')
FIG_DIR.mkdir(parents=True, exist_ok=True)
NOTE_DIR.mkdir(parents=True, exist_ok=True)

# Shared matplotlib settings: default style, 150 dpi on screen, 300 dpi on disk,
# and the font sizes used by every figure in this notebook.
plt.style.use('default')
plt.rcParams.update({
    'figure.dpi': 150,
    'savefig.dpi': 300,
    'font.size': 10,
    'axes.titlesize': 11,
    'axes.labelsize': 10,
    'legend.fontsize': 8,
})


In [2]:

# Collect every streaming result file below results/, in sorted path order.
stream_files = sorted(ROOT.glob('results/**/results_streaming.csv'))
if len(stream_files) == 0:
    raise FileNotFoundError('Nessun results_streaming.csv trovato in results/**')

# Stack all files into a single frame with a fresh 0..n-1 index.
df = pd.concat(list(map(pd.read_csv, stream_files)), ignore_index=True)
print('File caricati:', len(stream_files))
print('Righe totali:', len(df))
display(df.head())


File caricati: 72
Righe totali: 288000


Unnamed: 0,algorithm,params,mode,runs,sample_size,number_of_elements_processed,f0,seed,f0_mean_t,f0_heat_mean_t,variance,stddev,rse_theoretical,rse_observed,bias,absolute_bias,relative_bias,mean_relative_error,rmse,mae
0,HyperLogLog,"k=10,L=32",streaming,50,10000000,1,100000,21041998,1.0,1.0,0.0,0.0,0.0325,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,HyperLogLog,"k=10,L=32",streaming,50,10000000,102,100000,21041998,101.94,101.46,4.539184,2.130536,0.0325,0.0209,-0.48,0.48,-0.004709,0.018053,2.097618,1.84
2,HyperLogLog,"k=10,L=32",streaming,50,10000000,203,100000,21041998,202.82,202.66,17.902449,4.231129,0.0325,0.020861,-0.16,0.16,-0.000789,0.016569,4.127953,3.36
3,HyperLogLog,"k=10,L=32",streaming,50,10000000,304,100000,21041998,303.58,302.7,31.846939,5.643309,0.0325,0.018589,-0.88,0.88,-0.002899,0.01449,5.663921,4.4
4,HyperLogLog,"k=10,L=32",streaming,50,10000000,405,100000,21041998,404.32,403.36,78.194286,8.842753,0.0325,0.021871,-0.96,0.96,-0.002374,0.016919,8.669487,6.84


In [3]:

# Minimal quality checks
# Columns that the rest of the notebook reads; fail early if any is absent.
required_cols = {
    'algorithm',
    'params',
    'mode',
    'runs',
    'sample_size',
    'number_of_elements_processed',
    'f0',
    'seed',
    'f0_mean_t',
    'f0_heat_mean_t',
    'variance',
    'stddev',
    'bias',
    'absolute_bias',
    'relative_bias',
    'mean_relative_error',
    'rmse',
    'mae',
}
missing = required_cols.difference(df.columns)
assert len(missing) == 0, f'Colonne mancanti: {missing}'
assert df['mode'].eq('streaming').all(), 'Sono presenti righe non-streaming nel dataset caricato'
assert df['number_of_elements_processed'].ge(1).all(), 'Checkpoint invalidi'

# Number of distinct checkpoints recorded for each (algorithm, params, sample_size, f0, seed) group.
groups = (
    df.groupby(['algorithm', 'params', 'sample_size', 'f0', 'seed'])['number_of_elements_processed']
      .nunique()
)
print('Checkpoint per gruppo - min/max:', groups.min(), groups.max())
print('Sample size disponibili:', sorted(pd.unique(df['sample_size']).tolist()))
print('f0 disponibili:', sorted(pd.unique(df['f0']).tolist()))
print('seed disponibili:', sorted(pd.unique(df['seed']).tolist()))


Checkpoint per gruppo - min/max: 200 200
Sample size disponibili: [10000000]
f0 disponibili: [100000, 1000000, 5000000, 10000000]
seed disponibili: [42, 137357, 10032018, 21041998, 29042026]


## 1) Conteggio stimato vs conteggio reale (configurazione di riferimento)

In [4]:

# Reference parameter string of each algorithm, used by the per-seed figures.
REF = dict([
    ('HyperLogLog++', 'k=16'),
    ('HyperLogLog', 'k=16,L=32'),
    ('LogLog', 'k=16,L=32'),
    ('Probabilistic Counting', 'L=16'),
])
SEED = 21041998

# Rows of the reference configurations for SEED, stacked in REF order.
ref_df = pd.concat(
    [
        df.loc[df['algorithm'].eq(algo) & df['params'].eq(param) & df['seed'].eq(SEED)]
        for algo, param in REF.items()
    ],
    ignore_index=True,
)

f0_values = sorted(pd.unique(ref_df['f0']).tolist())

# Plot order follows the insertion order of REF; one fixed colour per algorithm.
alg_order = list(REF)
colors = dict(zip(alg_order, ('#1f77b4', '#ff7f0e', '#2ca02c', '#d62728')))

def make_estimate_plot(loglog=False, out_name='stream_estimate_vs_truth_linear_seed21041998_ref.png'):
    """Plot the mean estimate of each reference algorithm against the true count.

    Draws a 2x2 grid with one panel per value in ``f0_values``. Each panel shows
    ``f0_heat_mean_t`` for every algorithm in ``alg_order`` plus a dashed black
    line with ``f0_mean_t`` (taken from the HyperLogLog++ rows). The figure is
    saved into ``FIG_DIR``, closed, and its path is printed.

    Reads the notebook-level names ``ref_df``, ``f0_values``, ``alg_order``,
    ``colors`` and ``FIG_DIR``.

    Parameters
    ----------
    loglog : bool, default False
        If True, both axes of every panel use a logarithmic scale and the
        figure title gets a ' - scala log-log' suffix.
    out_name : str
        File name, relative to ``FIG_DIR``, the figure is written to.

    Returns
    -------
    None
    """
    fig, axes = plt.subplots(2, 2, figsize=(12, 8), sharex=False, sharey=False)
    axes = axes.ravel()

    # zip() pairs panels with f0 values; with four panels, extra f0 values are ignored.
    for ax, f0 in zip(axes, f0_values):
        sub = ref_df[ref_df['f0'] == f0].sort_values('number_of_elements_processed')
        for algo in alg_order:
            s = sub[sub['algorithm'] == algo]
            ax.plot(s['number_of_elements_processed'], s['f0_heat_mean_t'], label=algo, color=colors[algo], linewidth=1.6)

        # The true-count curve is read from the HyperLogLog++ rows of this panel.
        truth = sub[sub['algorithm'] == 'HyperLogLog++'][['number_of_elements_processed','f0_mean_t']].drop_duplicates()
        ax.plot(truth['number_of_elements_processed'], truth['f0_mean_t'], color='black', linestyle='--', linewidth=1.8, label='F0 reale')

        if loglog:
            ax.set_xscale('log')
            ax.set_yscale('log')
        # Thousands separator rendered as '.' instead of ','.
        ax.set_title(f'f0 finale = {f0:,}'.replace(',', '.'))
        ax.set_xlabel('numero di campioni elaborati')
        # Raw string: '\h' is not a valid escape sequence in a normal string
        # literal and triggers a warning; the rendered label is unchanged.
        ax.set_ylabel(r'stima media $\hat{F}_0(t)$')
        ax.grid(alpha=0.25)

    # A single shared legend below the panels, built from the first panel's entries.
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc='lower center', ncol=5, frameon=False, bbox_to_anchor=(0.5, -0.01))
    # NOTE: the seed in the title (and in the default file name) is hard-coded;
    # keep it in sync with SEED, which is what ref_df was filtered on.
    title = 'Stima media vs reale (seed=21041998, parametri di riferimento)'
    if loglog:
        title += ' - scala log-log'
    fig.suptitle(title, y=0.98)
    fig.tight_layout(rect=[0, 0.05, 1, 0.95])
    out = FIG_DIR / out_name
    fig.savefig(out, bbox_inches='tight')
    plt.close(fig)
    print('Salvato:', out)

# Render the reference figure twice: first with linear axes, then with log-log axes.
make_estimate_plot(out_name='stream_estimate_vs_truth_linear_seed21041998_ref.png', loglog=False)
make_estimate_plot(out_name='stream_estimate_vs_truth_loglog_seed21041998_ref.png', loglog=True)


  ax.set_ylabel('stima media $\hat{F}_0(t)$')


Salvato: /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/stream_estimate_vs_truth_linear_seed21041998_ref.png


Salvato: /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/stream_estimate_vs_truth_loglog_seed21041998_ref.png


## 2) Varianza della stima nel tempo (stessi parametri di riferimento)

In [5]:

# One log-log panel per final f0: variance of each reference algorithm across checkpoints.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8), sharex=False, sharey=False)
axes = axes.flatten()

for ax, f0 in zip(axes, f0_values):
    panel = ref_df.loc[ref_df['f0'] == f0].sort_values('number_of_elements_processed')
    for algo in alg_order:
        curve = panel.loc[panel['algorithm'] == algo]
        ax.plot(
            curve['number_of_elements_processed'],
            curve['variance'],
            label=algo,
            color=colors[algo],
            linewidth=1.6,
        )
    ax.set(
        xscale='log',
        yscale='log',
        title=f'f0 finale = {f0:,}'.replace(',', '.'),  # '.' as thousands separator
        xlabel='numero di campioni elaborati',
        ylabel='varianza',
    )
    ax.grid(alpha=0.25)

# A single shared legend below the panels, built from the first panel's entries.
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc='lower center', ncol=4, frameon=False, bbox_to_anchor=(0.5, -0.01))
fig.suptitle('Varianza della stima nel tempo (seed=21041998, scala log-log)', y=0.98)
fig.tight_layout(rect=[0, 0.05, 1, 0.95])

out = FIG_DIR.joinpath('stream_variance_loglog_seed21041998_ref.png')
fig.savefig(out, bbox_inches='tight')
plt.close(fig)
print('Salvato:', out)


Salvato: /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/stream_variance_loglog_seed21041998_ref.png


## 3) Errore relativo medio finale per algoritmo e f0

In [6]:

# Rows whose checkpoint equals sample_size, i.e. the whole sample has been processed.
final_df = df.loc[df['number_of_elements_processed'].eq(df['sample_size'])].copy()

# Final rows restricted to the reference parameters of each algorithm (all seeds), in REF order.
ref_final = pd.concat(
    [
        final_df.loc[final_df['algorithm'].eq(algo) & final_df['params'].eq(param)]
        for algo, param in REF.items()
    ],
    ignore_index=True,
)

# Mean and median of the final-checkpoint MRE per (algorithm, f0), pooling all rows
# of ref_final that share the pair.
agg = (ref_final
       .groupby(['algorithm','f0'], as_index=False)
       .agg(mre_mean=('mean_relative_error','mean'),
            mre_median=('mean_relative_error','median')))

f0s = sorted(agg['f0'].unique().tolist())
x = np.arange(len(f0s))
width = 0.2

fig, ax = plt.subplots(figsize=(10, 5))
n_algos = len(alg_order)
for i, algo in enumerate(alg_order):
    # Align the bars on the f0 label rather than on row position: an algorithm
    # with a missing f0 gets a NaN (no bar) in that slot instead of a shape
    # mismatch or bars drawn under the wrong tick.
    sub = agg[agg['algorithm'] == algo].set_index('f0').reindex(f0s)
    # Centre the group of bars on each tick for any number of algorithms
    # (equals (i - 1.5) * width when there are four).
    offset = (i - (n_algos - 1) / 2) * width
    ax.bar(x + offset, sub['mre_mean'].to_numpy(), width=width, label=algo, color=colors[algo])

ax.set_xticks(x)
# Thousands separator rendered as '.' instead of ','.
ax.set_xticklabels([f'{v:,}'.replace(',', '.') for v in f0s])
ax.set_xlabel('f0 finale')
ax.set_ylabel('MRE medio al checkpoint finale')
ax.set_yscale('log')
ax.grid(axis='y', alpha=0.3)
ax.legend(ncol=2, frameon=False)
ax.set_title('Errore relativo medio finale per algoritmo (media su seed)')

out = FIG_DIR / 'stream_final_mre_by_algorithm_and_f0_ref.png'
fig.tight_layout()
fig.savefig(out, bbox_inches='tight')
plt.close(fig)
print('Salvato:', out)

# Per (algorithm, params, f0) means of the final-checkpoint metrics, saved as a CSV table.
summary = ref_final.groupby(['algorithm', 'params', 'f0'], as_index=False).agg(
    f0_hat_final_mean=pd.NamedAgg(column='f0_heat_mean_t', aggfunc='mean'),
    variance_final_mean=pd.NamedAgg(column='variance', aggfunc='mean'),
    mre_final_mean=pd.NamedAgg(column='mean_relative_error', aggfunc='mean'),
    rmse_final_mean=pd.NamedAgg(column='rmse', aggfunc='mean'),
)
summary_csv = NOTE_DIR / 'ch5_streaming_reference_summary.csv'
summary.to_csv(summary_csv, index=False)
display(summary)
print('Salvata tabella:', summary_csv)


Salvato: /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/stream_final_mre_by_algorithm_and_f0_ref.png


Unnamed: 0,algorithm,params,f0,f0_hat_final_mean,variance_final_mean,mre_final_mean,rmse_final_mean
0,HyperLogLog,"k=16,L=32",100000,100008.844,127585.8,0.002836,358.2382
1,HyperLogLog,"k=16,L=32",1000000,999213.888,15118310.0,0.003195,3938.423
2,HyperLogLog,"k=16,L=32",5000000,4994349.692,278970100.0,0.0028,17416.88
3,HyperLogLog,"k=16,L=32",10000000,9978866.0,0.0,0.002113,21134.0
4,HyperLogLog++,k=16,100000,100055.996,140057.2,0.003512,380.9683
5,HyperLogLog++,k=16,1000000,999213.896,15118360.0,0.003195,3938.426
6,HyperLogLog++,k=16,5000000,4994351.388,278970100.0,0.0028,17416.33
7,HyperLogLog++,k=16,10000000,9978880.0,0.0,0.002112,21120.0
8,LogLog,"k=16,L=32",100000,103409.536,158581.3,0.034095,3432.303
9,LogLog,"k=16,L=32",1000000,999638.488,22938980.0,0.003878,4784.446


Salvata tabella: /Users/daniele/CLionProjects/satp-cpp/notes/ch5_streaming_reference_summary.csv


## 4) Sensibilità ai parametri (checkpoint finale, media su seed e f0)

In [7]:

def param_value(row):
    """Return the integer value of the first ``key=value`` pair in ``row['params']``.

    For the four known algorithms the parameter string is a comma-separated
    list of ``key=value`` pairs (e.g. ``'k=16'``, ``'k=16,L=32'``, ``'L=16'``);
    the value of the first pair is returned as an ``int``. Every known
    algorithm is parsed the same way, so a string with additional pairs no
    longer breaks the single-parameter algorithms.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'algorithm'`` and ``'params'`` (a DataFrame
        row or a plain dict).

    Returns
    -------
    int or float
        The parsed value, or ``np.nan`` when the algorithm is not one of the
        known ones.

    Raises
    ------
    ValueError, IndexError
        If the first pair is not of the form ``key=<integer>``.
    """
    algo = row['algorithm']
    p = row['params']
    if algo not in ('HyperLogLog++', 'HyperLogLog', 'LogLog', 'Probabilistic Counting'):
        return np.nan
    # Only the first pair carries the value used in the sensitivity plot.
    first_pair = p.split(',')[0]
    return int(first_pair.split('=')[1])

# Attach the numeric parameter to every final-checkpoint row, then average the
# MRE per (algorithm, parameter value) over all remaining rows.
sens = final_df.assign(param_value=final_df.apply(param_value, axis=1))
sens_agg = (
    sens.groupby(['algorithm', 'param_value'], as_index=False)['mean_relative_error']
        .mean()
        .rename(columns={'mean_relative_error': 'mre_mean'})
)

# One panel per algorithm, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(11, 8))
ax_map = dict(zip(
    ('HyperLogLog++', 'HyperLogLog', 'LogLog', 'Probabilistic Counting'),
    axes.flat,
))

for algo, ax in ax_map.items():
    curve = sens_agg.loc[sens_agg['algorithm'] == algo].sort_values('param_value')
    ax.plot(
        curve['param_value'],
        curve['mre_mean'],
        marker='o',
        linewidth=1.8,
        color=colors.get(algo, '#333333'),
    )
    ax.set(yscale='log', title=algo, xlabel='parametro', ylabel='MRE medio finale')
    ax.grid(alpha=0.3)

fig.suptitle('Sensibilità ai parametri (streaming, checkpoint finale)')
fig.tight_layout(rect=[0, 0, 1, 0.97])
out = FIG_DIR.joinpath('stream_parameter_sensitivity_final_mre.png')
fig.savefig(out, bbox_inches='tight')
plt.close(fig)
print('Salvato:', out)

sens_csv = NOTE_DIR / 'ch5_streaming_param_sensitivity.csv'
sens_agg.to_csv(sens_csv, index=False)
print('Salvata tabella:', sens_csv)


Salvato: /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/stream_parameter_sensitivity_final_mre.png
Salvata tabella: /Users/daniele/CLionProjects/satp-cpp/notes/ch5_streaming_param_sensitivity.csv


## 5) Differenza HLL++ vs HLL nel tempo (k=16)

In [8]:

# Mean estimates of HyperLogLog++ and HyperLogLog from the reference rows, paired
# on (f0, checkpoint) so that their absolute difference can be plotted over time.
hpp = ref_df[ref_df['algorithm'] == 'HyperLogLog++'][['f0','number_of_elements_processed','f0_heat_mean_t']]
hll = ref_df[ref_df['algorithm'] == 'HyperLogLog'][['f0','number_of_elements_processed','f0_heat_mean_t']]
cmp = hpp.merge(hll, on=['f0','number_of_elements_processed'], suffixes=('_hpp','_hll'))
cmp['abs_diff_hat'] = (cmp['f0_heat_mean_t_hpp'] - cmp['f0_heat_mean_t_hll']).abs()

fig, axes = plt.subplots(2, 2, figsize=(12, 8), sharex=False, sharey=False)
axes = axes.ravel()

# .tolist() yields plain Python ints, consistent with f0_values in the other cells.
for ax, f0 in zip(axes, sorted(cmp['f0'].unique().tolist())):
    sub = cmp[cmp['f0'] == f0].sort_values('number_of_elements_processed')
    ax.plot(sub['number_of_elements_processed'], sub['abs_diff_hat'], color='#9467bd', linewidth=1.8)
    ax.set_xscale('log')
    ax.set_yscale('log')
    # Thousands separator rendered as '.' instead of ','.
    ax.set_title(f'f0 finale = {f0:,}'.replace(',', '.'))
    ax.set_xlabel('numero di campioni elaborati')
    # Raw string: '\h' is not a valid escape sequence in a normal string literal
    # and triggers a warning; the rendered label is unchanged.
    ax.set_ylabel(r'$|\hat{F}_{0,HLL++} - \hat{F}_{0,HLL}|$')
    ax.grid(alpha=0.3)

fig.suptitle('Differenza assoluta tra HLL++ e HLL (seed=21041998, k=16)')
fig.tight_layout(rect=[0, 0, 1, 0.97])
out = FIG_DIR / 'stream_hll_vs_hllpp_absdiff_seed21041998.png'
fig.savefig(out, bbox_inches='tight')
plt.close(fig)
print('Salvato:', out)


  ax.set_ylabel('$|\hat{F}_{0,HLL++} - \hat{F}_{0,HLL}|$')


Salvato: /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/stream_hll_vs_hllpp_absdiff_seed21041998.png


## 6) Export riassunto per il capitolo 5

In [9]:

# One-row overview of the loaded dataset: file and row counts plus the distinct
# algorithms, sample sizes, f0 values and seeds that appear in it.
stream_overview = dict(
    stream_files=len(stream_files),
    rows=int(len(df)),
    algorithms=sorted(pd.unique(df['algorithm']).tolist()),
    sample_size_values=sorted(pd.unique(df['sample_size']).tolist()),
    f0_values=sorted(pd.unique(df['f0']).tolist()),
    seeds=sorted(pd.unique(df['seed']).tolist()),
)
overview_df = pd.DataFrame([stream_overview])
overview_csv = NOTE_DIR / 'ch5_streaming_overview.csv'
overview_df.to_csv(overview_csv, index=False)
display(overview_df)
print('Salvata tabella:', overview_csv)


Unnamed: 0,stream_files,rows,algorithms,sample_size_values,f0_values,seeds
0,72,288000,"[HyperLogLog, HyperLogLog++, LogLog, Probabili...",[10000000],"[100000, 1000000, 5000000, 10000000]","[42, 137357, 10032018, 21041998, 29042026]"


Salvata tabella: /Users/daniele/CLionProjects/satp-cpp/notes/ch5_streaming_overview.csv
