
# Capitolo 5 - Sensibilità al seed per setting

Obiettivo: per ogni algoritmo e per ogni setting di parametri, mantenere fissi i parametri e
valutare quanto cambiano le metriche al variare del seed.

Dataset usati: `results/**/results_streaming.csv` (solo endpoint `t = sample_size`).


In [1]:

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

ROOT = Path('..').resolve()
FIG_DIR = ROOT / 'thesis' / 'figures' / 'results'
NOTE_DIR = ROOT / 'notes'
FIG_DIR.mkdir(parents=True, exist_ok=True)
NOTE_DIR.mkdir(parents=True, exist_ok=True)

plt.style.use('default')
plt.rcParams['figure.dpi'] = 150
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['font.size'] = 10


In [2]:

files = sorted(ROOT.glob('results/**/results_streaming.csv'))
if not files:
    raise FileNotFoundError('Nessun results_streaming.csv trovato')

df = pd.concat([pd.read_csv(p) for p in files], ignore_index=True)
endpoint = df[df['number_of_elements_processed'] == df['sample_size']].copy()

print('files:', len(files))
print('rows all:', len(df), 'rows endpoint:', len(endpoint))
print('algorithms:', sorted(endpoint['algorithm'].unique().tolist()))
print('seeds:', sorted(endpoint['seed'].unique().tolist()))
print('sample_size:', sorted(endpoint['sample_size'].unique().tolist()))
print('f0:', sorted(endpoint['f0'].unique().tolist()))


files: 72
rows all: 288000 rows endpoint: 1440
algorithms: ['HyperLogLog', 'HyperLogLog++', 'LogLog', 'Probabilistic Counting']
seeds: [42, 137357, 10032018, 21041998, 29042026]
sample_size: [10000000]
f0: [100000, 1000000, 5000000, 10000000]


## 1) Definizione indice di sensibilità al seed

In [3]:

# Per ciascun (algorithm, params), consideriamo i valori endpoint su tutti i seed e f0.
# Indice principale: coefficiente di variazione (CV = std / |mean|), per metrica.

metric_cols = ['mean_relative_error', 'rmse', 'variance', 'bias']

rows = []
for (algo, params), g in endpoint.groupby(['algorithm', 'params']):
    row = {'algorithm': algo, 'params': params, 'n_rows': len(g), 'n_seeds': g['seed'].nunique(), 'n_f0': g['f0'].nunique()}
    for m in metric_cols:
        vals = g[m].astype(float)
        mean = vals.mean()
        std = vals.std(ddof=1) if len(vals) > 1 else 0.0
        cv = (std / abs(mean)) if abs(mean) > 0 else np.nan
        row[f'{m}_mean'] = mean
        row[f'{m}_std'] = std
        row[f'{m}_cv'] = cv
    rows.append(row)

seed_sens = pd.DataFrame(rows)
seed_sens.sort_values(['algorithm', 'mean_relative_error_cv'], inplace=True)
display(seed_sens.head(20))

out_csv = NOTE_DIR / 'ch5_seed_sensitivity_by_setting.csv'
seed_sens.to_csv(out_csv, index=False)
print('saved:', out_csv)


Unnamed: 0,algorithm,params,n_rows,n_seeds,n_f0,mean_relative_error_mean,mean_relative_error_std,mean_relative_error_cv,rmse_mean,rmse_std,rmse_cv,variance_mean,variance_std,variance_cv,bias_mean,bias_std,bias_cv
10,HyperLogLog,"k=7,L=32",20,5,4,0.072119,0.005701,0.079052,312574.904696,289014.71722,0.924625,27469820000.0,45588380000.0,1.65958,-260093.598,294666.617354,1.132925
12,HyperLogLog,"k=9,L=32",20,5,4,0.034639,0.004466,0.128939,143826.400245,132827.537544,0.923527,8802107000.0,14889220000.0,1.691552,-101584.708,139553.848777,1.373768
1,HyperLogLog,"k=11,L=32",20,5,4,0.017235,0.002308,0.1339,71383.305132,66509.101897,0.931718,2413735000.0,4108931000.0,1.702312,-44886.861,71947.153542,1.602856
9,HyperLogLog,"k=6,L=32",20,5,4,0.09304,0.014219,0.152827,401169.790994,393768.668019,0.981551,58788060000.0,96747870000.0,1.645706,-267903.008,433897.8206,1.619608
6,HyperLogLog,"k=16,L=32",20,5,4,0.002736,0.00048,0.175477,10711.884799,9044.364242,0.84433,73553990.0,126307500.0,1.717208,-6890.394,8759.628288,1.271281
7,HyperLogLog,"k=4,L=32",20,5,4,0.197727,0.040611,0.20539,858089.414232,903201.801454,1.052573,161490100000.0,255835800000.0,1.58422,-664239.974,979703.091597,1.474923
2,HyperLogLog,"k=12,L=32",20,5,4,0.015439,0.003625,0.234805,79681.933019,84178.033367,1.056426,1243179000.0,2087865000.0,1.679457,-68658.788,86745.301244,1.263426
8,HyperLogLog,"k=5,L=32",20,5,4,0.151253,0.037834,0.250139,706875.688679,802914.758483,1.135864,89801580000.0,143815700000.0,1.601483,-553389.565,869280.515551,1.570829
3,HyperLogLog,"k=13,L=32",20,5,4,0.007362,0.002162,0.293715,25539.162871,20773.924115,0.813414,666161500.0,1136591000.0,1.706179,11324.62,17202.864852,1.519068
4,HyperLogLog,"k=14,L=32",20,5,4,0.005149,0.001641,0.318705,17510.321524,14231.244851,0.812735,323418000.0,551190500.0,1.704267,-5181.859,12782.057903,2.466693


saved: /Users/daniele/CLionProjects/satp-cpp/notes/ch5_seed_sensitivity_by_setting.csv


## 2) Vista per algoritmo: top setting più stabili (MRE CV minimo)

In [4]:

top_rows = []
for algo, g in seed_sens.groupby('algorithm'):
    g = g.sort_values('mean_relative_error_cv')
    top_rows.append(g.head(5))
top_df = pd.concat(top_rows, ignore_index=True)
display(top_df[['algorithm','params','mean_relative_error_cv','rmse_cv','variance_cv','bias_cv']])

out_top = NOTE_DIR / 'ch5_seed_sensitivity_top5_by_algo.csv'
top_df.to_csv(out_top, index=False)
print('saved:', out_top)


Unnamed: 0,algorithm,params,mean_relative_error_cv,rmse_cv,variance_cv,bias_cv
0,HyperLogLog,"k=7,L=32",0.079052,0.924625,1.65958,1.132925
1,HyperLogLog,"k=9,L=32",0.128939,0.923527,1.691552,1.373768
2,HyperLogLog,"k=11,L=32",0.1339,0.931718,1.702312,1.602856
3,HyperLogLog,"k=6,L=32",0.152827,0.981551,1.645706,1.619608
4,HyperLogLog,"k=16,L=32",0.175477,0.84433,1.717208,1.271281
5,HyperLogLog++,k=7,0.079061,0.924585,1.659583,1.132891
6,HyperLogLog++,k=9,0.128955,0.923488,1.691553,1.373754
7,HyperLogLog++,k=11,0.133913,0.931677,1.702313,1.602867
8,HyperLogLog++,k=18,0.136402,0.905062,1.704546,1.480464
9,HyperLogLog++,k=6,0.15281,0.981477,1.645708,1.61964


saved: /Users/daniele/CLionProjects/satp-cpp/notes/ch5_seed_sensitivity_top5_by_algo.csv


## 3) Grafico: sensibilità al seed per setting (MRE CV)

In [5]:

algos = sorted(seed_sens['algorithm'].unique().tolist())
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.ravel()

for ax, algo in zip(axes, algos):
    sub = seed_sens[seed_sens['algorithm'] == algo].sort_values('mean_relative_error_cv')
    x = np.arange(len(sub))
    ax.bar(x, sub['mean_relative_error_cv'], color='#1f77b4')
    ax.set_title(algo)
    ax.set_xlabel('setting ordinati per stabilità')
    ax.set_ylabel('CV(MRE)')
    ax.set_yscale('log')
    ax.grid(axis='y', alpha=0.3)

fig.suptitle('Sensibilità al seed per setting (metrica principale: CV del MRE endpoint)')
fig.tight_layout(rect=[0,0,1,0.96])
out_fig = FIG_DIR / 'seed_sensitivity_mre_cv_by_setting.png'
fig.savefig(out_fig, bbox_inches='tight')
plt.close(fig)
print('saved:', out_fig)


saved: /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/seed_sensitivity_mre_cv_by_setting.png


## 4) Boxplot per seed: MRE endpoint in best-setting di ciascun algoritmo

In [6]:

# Best setting per algoritmo = MRE medio minimo endpoint
best_map = {}
for algo, g in endpoint.groupby('algorithm'):
    ag = g.groupby('params', as_index=False).agg(mre_mean=('mean_relative_error', 'mean')).sort_values('mre_mean')
    best_map[algo] = ag.iloc[0]['params']

print('best settings:', best_map)

best_ep = pd.concat([
    endpoint[(endpoint['algorithm'] == a) & (endpoint['params'] == p)]
    for a, p in best_map.items()
], ignore_index=True)

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.ravel()
for ax, algo in zip(axes, sorted(best_map.keys())):
    sub = best_ep[best_ep['algorithm'] == algo]
    seeds = sorted(sub['seed'].unique().tolist())
    data = [sub[sub['seed'] == s]['mean_relative_error'].values for s in seeds]
    ax.boxplot(data, labels=[str(s) for s in seeds], showfliers=False)
    ax.set_title(f"{algo} ({best_map[algo]})")
    ax.set_xlabel('seed')
    ax.set_ylabel('MRE endpoint')
    ax.set_yscale('log')
    ax.grid(axis='y', alpha=0.3)

fig.suptitle('Distribuzione del MRE endpoint per seed nel best-setting')
fig.tight_layout(rect=[0,0,1,0.96])
out_fig2 = FIG_DIR / 'seed_sensitivity_mre_boxplot_best_setting.png'
fig.savefig(out_fig2, bbox_inches='tight')
plt.close(fig)
print('saved:', out_fig2)

best_summary = (best_ep.groupby(['algorithm','params','seed'], as_index=False)
                .agg(mre_mean=('mean_relative_error','mean'),
                     rmse_mean=('rmse','mean'),
                     variance_mean=('variance','mean'),
                     bias_mean=('bias','mean')))
out_best = NOTE_DIR / 'ch5_seed_sensitivity_best_setting_by_seed.csv'
best_summary.to_csv(out_best, index=False)
print('saved:', out_best)
display(best_summary.head(20))


best settings: {'HyperLogLog': 'k=16,L=32', 'HyperLogLog++': 'k=18', 'LogLog': 'k=15,L=32', 'Probabilistic Counting': 'L=23'}


  ax.boxplot(data, labels=[str(s) for s in seeds], showfliers=False)
  ax.boxplot(data, labels=[str(s) for s in seeds], showfliers=False)
  ax.boxplot(data, labels=[str(s) for s in seeds], showfliers=False)
  ax.boxplot(data, labels=[str(s) for s in seeds], showfliers=False)


saved: /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/seed_sensitivity_mre_boxplot_best_setting.png
saved: /Users/daniele/CLionProjects/satp-cpp/notes/ch5_seed_sensitivity_best_setting_by_seed.csv


Unnamed: 0,algorithm,params,seed,mre_mean,rmse_mean,variance_mean,bias_mean
0,HyperLogLog,"k=16,L=32",42,0.002763,10373.09,58944960.0,-6937.755
1,HyperLogLog,"k=16,L=32",137357,0.002888,11425.3,100544000.0,-6701.135
2,HyperLogLog,"k=16,L=32",10032018,0.00271,11116.68,80922090.0,-7663.69
3,HyperLogLog,"k=16,L=32",21041998,0.002718,10215.33,57096120.0,-6723.03
4,HyperLogLog,"k=16,L=32",29042026,0.002601,10429.02,70262770.0,-6426.36
5,HyperLogLog++,k=18,42,0.001305,5743.452,15946520.0,3571.41
6,HyperLogLog++,k=18,137357,0.001467,6200.447,21605730.0,4177.355
7,HyperLogLog++,k=18,10032018,0.001406,5956.706,18137110.0,3091.385
8,HyperLogLog++,k=18,21041998,0.001292,5557.86,12561610.0,3802.75
9,HyperLogLog++,k=18,29042026,0.001379,5928.001,14477190.0,4506.24


## 5) Tabella riassuntiva pronta per capitolo 5

In [7]:

report_rows = []
for algo, sub in seed_sens.groupby('algorithm'):
    # pick best stable setting by MRE CV
    r = sub.sort_values('mean_relative_error_cv').iloc[0]
    report_rows.append({
        'algorithm': algo,
        'most_stable_params_by_mre_cv': r['params'],
        'mre_cv': r['mean_relative_error_cv'],
        'rmse_cv': r['rmse_cv'],
        'variance_cv': r['variance_cv'],
        'bias_cv': r['bias_cv']
    })

report = pd.DataFrame(report_rows).sort_values('algorithm')
display(report)
out_report = NOTE_DIR / 'ch5_seed_sensitivity_report_table.csv'
report.to_csv(out_report, index=False)
print('saved:', out_report)


Unnamed: 0,algorithm,most_stable_params_by_mre_cv,mre_cv,rmse_cv,variance_cv,bias_cv
0,HyperLogLog,"k=7,L=32",0.079052,0.924625,1.65958,1.132925
1,HyperLogLog++,k=7,0.079061,0.924585,1.659583,1.132891
2,LogLog,"k=8,L=32",0.116134,0.918555,1.6578,1.224613
3,Probabilistic Counting,L=1,9e-06,0.997114,,0.997114


saved: /Users/daniele/CLionProjects/satp-cpp/notes/ch5_seed_sensitivity_report_table.csv
