# Capitolo 5 - Best-config analysis (streaming + merge)

Confronto tra algoritmi usando, per ciascuno, la configurazione con il minor MRE medio all'endpoint tra quelle osservate nei CSV.

In [1]:

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Output locations are resolved relative to the parent of the working directory.
ROOT = Path('..').resolve()
FIG_DIR = ROOT.joinpath('thesis', 'figures', 'results')
NOTE_DIR = ROOT.joinpath('notes')
for _out_dir in (FIG_DIR, NOTE_DIR):
    # Create the directory (and any missing parents); no error if it already exists.
    _out_dir.mkdir(parents=True, exist_ok=True)

# Plot defaults shared by every figure in this notebook.
plt.style.use('default')
plt.rcParams.update({
    'figure.dpi': 150,
    'savefig.dpi': 300,
    'font.size': 10,
})


In [2]:

def _read_results(pattern):
    """Collect every CSV under ROOT matching `pattern` and stack them into one frame.

    Returns the sorted list of matched paths and the concatenated DataFrame.
    Raises FileNotFoundError when nothing matches, instead of letting
    `pd.concat` fail on an empty list with an unrelated-looking message.
    """
    paths = sorted(ROOT.glob(pattern))
    if not paths:
        raise FileNotFoundError(f'no files matching {pattern!r} under {ROOT}')
    frame = pd.concat([pd.read_csv(p) for p in paths], ignore_index=True)
    return paths, frame


stream_files, sdf = _read_results('results/**/results_streaming.csv')
merge_files, mdf = _read_results('results/**/results_merge.csv')
# Endpoint rows only: those where the processed-element count equals the sample size.
final = sdf[sdf['number_of_elements_processed'] == sdf['sample_size']].copy()
print('stream files:', len(stream_files), 'rows:', len(sdf))
print('merge files:', len(merge_files), 'rows:', len(mdf))


stream files: 72 rows: 288000
merge files: 72 rows: 36000


In [3]:

def param_value(row):
    """Extract the leading integer parameter from a row's ``params`` string.

    Reads ``row['params']`` and ``row['algorithm']``. For 'HyperLogLog++' and
    'Probabilistic Counting' the value after the first '=' is returned; for
    'HyperLogLog' and 'LogLog' the value of the first comma-separated entry.
    Any other algorithm yields NaN.
    """
    spec = row['params']
    algorithm = row['algorithm']
    if algorithm in ('HyperLogLog++', 'Probabilistic Counting'):
        # Single "name=value" entry.
        return int(spec.split('=')[1])
    if algorithm in ('HyperLogLog', 'LogLog'):
        # Several comma-separated entries; only the first one is used.
        leading_entry = spec.split(',')[0]
        return int(leading_entry.split('=')[1])
    return np.nan

# Score every (algorithm, params) pair by the mean of its endpoint MRE,
# ordered so that the lowest score comes first within each algorithm.
scored = final.assign(param_value=final.apply(param_value, axis=1))
best = (
    scored
    .groupby(['algorithm', 'params', 'param_value'], as_index=False)
    .agg(mre_mean=('mean_relative_error', 'mean'))
    .sort_values(['algorithm', 'mre_mean'])
)
# After the sort, the first row kept per algorithm is its lowest-MRE configuration.
best_params = (
    best
    .drop_duplicates('algorithm')
    .set_index('algorithm')['params']
    .to_dict()
)
print('Best params:')
for algo_name, params_str in best_params.items():
    print(' -', algo_name, ':', params_str)

# The data-derived winner is used when present; the literal below is only a
# fallback for an algorithm that does not appear in the loaded CSVs.
FALLBACK_PARAMS = {
    'HyperLogLog++': 'k=18',
    'HyperLogLog': 'k=16,L=32',
    'LogLog': 'k=15,L=32',
    'Probabilistic Counting': 'L=23',
}
BEST = {
    algo: best_params.get(algo, fallback)
    for algo, fallback in FALLBACK_PARAMS.items()
}
print('BEST used =', BEST)


Best params:
 - HyperLogLog : k=16,L=32
 - HyperLogLog++ : k=18
 - LogLog : k=15,L=32
 - Probabilistic Counting : L=23
BEST used = {'HyperLogLog++': 'k=18', 'HyperLogLog': 'k=16,L=32', 'LogLog': 'k=15,L=32', 'Probabilistic Counting': 'L=23'}


In [4]:

# Seed whose rows are used for the per-sample trajectory plots.
SEED = 21041998
# Plotting order of the algorithms, and one fixed colour for each.
ORDER = ['HyperLogLog++', 'HyperLogLog', 'LogLog', 'Probabilistic Counting']
COLORS = dict(zip(ORDER, ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']))

# Restrict to the chosen seed first, then keep each algorithm's selected configuration.
seed_rows = sdf[sdf['seed'] == SEED]
best_seed = pd.concat(
    [
        seed_rows[(seed_rows['algorithm'] == algo) & (seed_rows['params'] == param)]
        for algo, param in BEST.items()
    ],
    ignore_index=True,
)

# Distinct f0 values present in the selection, ascending; one plot panel per value.
f0_values = sorted(best_seed['f0'].unique().tolist())
f0_values


[100000, 1000000, 5000000, 10000000]

## 1) Stima vs reale (best-config, seed fissato)

In [5]:

def plot_estimate_vs_truth(loglog=False, fname='best_stream_estimate_vs_truth_linear.png'):
    """Plot each algorithm's estimate against the true F0 as samples are processed.

    Draws one panel per value in ``f0_values`` on a 2x2 grid, using the rows of
    ``best_seed``. The figure is saved to ``FIG_DIR / fname`` and then closed.

    Parameters
    ----------
    loglog : bool
        When true, both axes of every panel use a logarithmic scale and the
        figure title gets a ' (log-log)' suffix.
    fname : str
        File name of the saved figure inside ``FIG_DIR``.
    """
    fig, axes = plt.subplots(2,2, figsize=(12,8))
    axes = axes.ravel()
    for ax, f0 in zip(axes, f0_values):
        sub = best_seed[best_seed['f0'] == f0].sort_values('number_of_elements_processed')
        for algo in ORDER:
            s = sub[sub['algorithm'] == algo]
            ax.plot(s['number_of_elements_processed'], s['f0_heat_mean_t'], color=COLORS[algo], linewidth=1.6, label=algo)
        # The reference curve is repeated on every algorithm's rows; keep one copy per x value.
        truth = sub[['number_of_elements_processed','f0_mean_t']].drop_duplicates().sort_values('number_of_elements_processed')
        ax.plot(truth['number_of_elements_processed'], truth['f0_mean_t'], color='black', linestyle='--', linewidth=1.8, label='F0 reale')
        if loglog:
            ax.set_xscale('log'); ax.set_yscale('log')
        ax.set_title(f'f0 finale = {f0:,}'.replace(',', '.'))
        ax.set_xlabel('numero di campioni elaborati')
        # Raw string: '\h' is not a valid escape sequence and triggers a SyntaxWarning otherwise.
        ax.set_ylabel(r'stima media $\hat{F}_0(t)$')
        ax.grid(alpha=0.25)
    # Every panel plots the same series, so the first panel's handles serve the shared legend.
    h, l = axes[0].get_legend_handles_labels()
    fig.legend(h,l, loc='lower center', ncol=5, frameon=False, bbox_to_anchor=(0.5,-0.01))
    ttl = 'Best-config: stima media vs reale'
    if loglog: ttl += ' (log-log)'
    fig.suptitle(ttl, y=0.98)
    # Leave room at the bottom for the legend and at the top for the title.
    fig.tight_layout(rect=[0,0.05,1,0.95])
    out = FIG_DIR / fname
    fig.savefig(out, bbox_inches='tight')
    plt.close(fig)
    print('saved', out)

# The seed suffix comes from SEED, so the file name always matches the plotted trajectories.
plot_estimate_vs_truth(False, f'best_stream_estimate_vs_truth_linear_seed{SEED}.png')
plot_estimate_vs_truth(True, f'best_stream_estimate_vs_truth_loglog_seed{SEED}.png')


  ax.set_ylabel('stima media $\hat{F}_0(t)$')


saved /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/best_stream_estimate_vs_truth_linear_seed21041998.png


saved /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/best_stream_estimate_vs_truth_loglog_seed21041998.png


## 2) Varianza nel tempo (best-config, seed fissato)

In [6]:

# One log-log panel per final f0: the 'variance' column of each algorithm versus samples processed.
fig, axes = plt.subplots(2,2, figsize=(12,8))
axes = axes.ravel()
for ax, f0 in zip(axes, f0_values):
    sub = best_seed[best_seed['f0'] == f0].sort_values('number_of_elements_processed')
    for algo in ORDER:
        s = sub[sub['algorithm'] == algo]
        ax.plot(s['number_of_elements_processed'], s['variance'], color=COLORS[algo], linewidth=1.6, label=algo)
    ax.set_xscale('log'); ax.set_yscale('log')
    ax.set_title(f'f0 finale = {f0:,}'.replace(',', '.'))
    ax.set_xlabel('numero di campioni elaborati')
    ax.set_ylabel('varianza')
    ax.grid(alpha=0.25)
# Every panel plots the same algorithms, so the first panel's handles serve the shared legend.
h,l = axes[0].get_legend_handles_labels()
fig.legend(h,l, loc='lower center', ncol=4, frameon=False, bbox_to_anchor=(0.5,-0.01))
fig.suptitle('Best-config: varianza nel tempo (log-log)', y=0.98)
# Leave room at the bottom for the legend and at the top for the title.
fig.tight_layout(rect=[0,0.05,1,0.95])
# The seed suffix comes from SEED, so the file name always matches the plotted data.
out = FIG_DIR / f'best_stream_variance_loglog_seed{SEED}.png'
fig.savefig(out, bbox_inches='tight')
plt.close(fig)
print('saved', out)


saved /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/best_stream_variance_loglog_seed21041998.png


## 3) Endpoint: MRE per algoritmo e f0 (media sui seed)

In [7]:

# Endpoint rows of each algorithm's selected configuration (all seeds).
best_final = pd.concat([
    final[(final['algorithm'] == algo) & (final['params'] == param)]
    for algo, param in BEST.items()
], ignore_index=True)

# Per (algorithm, f0) means of the endpoint metrics.
agg = (best_final.groupby(['algorithm','f0'], as_index=False)
       .agg(mre_mean=('mean_relative_error','mean'),
            rmse_mean=('rmse','mean'),
            variance_mean=('variance','mean'),
            f0_hat_mean=('f0_heat_mean_t','mean')))

xvals = sorted(agg['f0'].unique().tolist())
x = np.arange(len(xvals))
width = 0.2
# Centre the group of bars on each tick whatever the number of algorithms.
center = (len(ORDER) - 1) / 2
fig, ax = plt.subplots(figsize=(10,5))
for i, algo in enumerate(ORDER):
    # Align heights to xvals by f0 value rather than by position: a missing
    # (algorithm, f0) pair then leaves a gap instead of raising a shape mismatch.
    heights = (agg.loc[agg['algorithm'] == algo]
                  .set_index('f0')['mre_mean']
                  .reindex(xvals)
                  .to_numpy())
    ax.bar(x + (i - center)*width, heights, width=width, color=COLORS[algo], label=algo)
ax.set_xticks(x)
ax.set_xticklabels([f'{v:,}'.replace(',', '.') for v in xvals])
ax.set_xlabel('f0 finale')
ax.set_ylabel('MRE medio endpoint')
ax.set_yscale('log')
ax.grid(axis='y', alpha=0.3)
ax.legend(ncol=2, frameon=False)
ax.set_title('Best-config: MRE endpoint per algoritmo')
out = FIG_DIR / 'best_stream_final_mre_by_algorithm_and_f0.png'
fig.tight_layout()
fig.savefig(out, bbox_inches='tight')
plt.close(fig)
print('saved', out)

display(agg.sort_values(['f0','algorithm']))
agg.to_csv(NOTE_DIR / 'ch5_best_streaming_summary.csv', index=False)
print('saved', NOTE_DIR / 'ch5_best_streaming_summary.csv')


saved /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/best_stream_final_mre_by_algorithm_and_f0.png


Unnamed: 0,algorithm,f0,mre_mean,rmse_mean,variance_mean,f0_hat_mean
0,HyperLogLog,100000,0.002836,358.2382,127585.8,100008.8
4,HyperLogLog++,100000,0.001237,153.6605,22550.6,99990.82
8,LogLog,100000,0.005206,649.4314,407725.5,100124.1
12,Probabilistic Counting,100000,0.579277,79492.78,6139769000.0,117344.1
1,HyperLogLog,1000000,0.003195,3938.423,15118310.0,999213.9
5,HyperLogLog++,1000000,0.001629,2026.87,3272587.0,1000904.0
9,LogLog,1000000,0.00539,6789.407,46105630.0,999245.0
13,Probabilistic Counting,1000000,0.911951,1321698.0,1418662000000.0,1646385.0
2,HyperLogLog,5000000,0.0028,17416.88,278970100.0,4994350.0
6,HyperLogLog++,5000000,0.001287,8072.642,62887390.0,5001168.0


saved /Users/daniele/CLionProjects/satp-cpp/notes/ch5_best_streaming_summary.csv


## 4) Grafico aggiuntivo: calibrazione endpoint ($\hat{F}_0$ vs $F_0$)

In [8]:

# Mean endpoint estimate per (algorithm, f0), plotted against f0 itself.
cal = (best_final.groupby(['algorithm','f0'], as_index=False)
       .agg(f0_hat_mean=('f0_heat_mean_t','mean')))
fig, ax = plt.subplots(figsize=(7,6))
for algo in ORDER:
    s = cal[cal['algorithm']==algo].sort_values('f0')
    ax.plot(s['f0'], s['f0_hat_mean'], marker='o', linewidth=1.8, color=COLORS[algo], label=algo)
# The y = x reference line spans the full range of both plotted quantities.
mn = min(cal['f0'].min(), cal['f0_hat_mean'].min())
mx = max(cal['f0'].max(), cal['f0_hat_mean'].max())
ax.plot([mn,mx],[mn,mx], 'k--', linewidth=1.5, label='ideale y=x')
ax.set_xscale('log'); ax.set_yscale('log')
ax.set_xlabel('F0 reale endpoint')
# Raw string: '\h' is not a valid escape sequence and triggers a SyntaxWarning otherwise.
ax.set_ylabel(r'stima media endpoint $\hat{F}_0$')
ax.set_title('Best-config: calibrazione endpoint')
ax.grid(alpha=0.3)
ax.legend(frameon=False)
out = FIG_DIR / 'best_stream_endpoint_calibration_loglog.png'
fig.tight_layout(); fig.savefig(out, bbox_inches='tight'); plt.close(fig)
print('saved', out)


saved /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/best_stream_endpoint_calibration_loglog.png


  ax.set_ylabel('stima media endpoint $\hat{F}_0$')


## 5) Grafico aggiuntivo: tempo di convergenza (soglia MRE <= 5%)

In [9]:

# All streaming rows (every seed, every sample count) of each algorithm's selected configuration.
best_stream = pd.concat(
    [
        sdf.loc[sdf['algorithm'].eq(algo) & sdf['params'].eq(param)]
        for algo, param in BEST.items()
    ],
    ignore_index=True,
)

def first_cross(g, thr=0.05):
    """Return the smallest sample count at which the MRE is at or below `thr`.

    `g` is a frame with 'number_of_elements_processed' and
    'mean_relative_error' columns. Returns NaN when no row satisfies the
    threshold.
    """
    ordered = g.sort_values('number_of_elements_processed')
    hits = ordered.loc[ordered['mean_relative_error'] <= thr, 'number_of_elements_processed']
    return np.nan if hits.empty else hits.iloc[0]

# For each (algorithm, seed, f0) run: first sample count with MRE <= 5%, NaN if never reached.
conv = (
    best_stream
    .groupby(['algorithm', 'seed', 'f0'])
    .apply(lambda run: first_cross(run, 0.05), include_groups=False)
    .reset_index(name='t_conv_5pct')
)
# Average over seeds.
# NOTE(review): the pandas mean skips NaN, so runs that never cross the threshold
# are left out of the average rather than penalised — confirm this is intended.
conv_agg = (
    conv
    .groupby(['algorithm', 'f0'], as_index=False)
    .agg(t_conv_mean=('t_conv_5pct', 'mean'))
)

fig, ax = plt.subplots(figsize=(10, 5))
for algo in ORDER:
    curve = conv_agg.loc[conv_agg['algorithm'] == algo].sort_values('f0')
    ax.plot(
        curve['f0'],
        curve['t_conv_mean'],
        marker='o',
        linewidth=1.8,
        color=COLORS[algo],
        label=algo,
    )
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('f0 finale')
ax.set_ylabel('numero campioni al primo MRE <= 5%')
ax.set_title('Best-config: tempo medio di convergenza (soglia 5%)')
ax.grid(alpha=0.3)
ax.legend(frameon=False)
out = FIG_DIR / 'best_stream_convergence_t5pct_loglog.png'
fig.tight_layout()
fig.savefig(out, bbox_inches='tight')
plt.close(fig)
print('saved', out)

conv_csv = NOTE_DIR / 'ch5_best_convergence_t5pct.csv'
conv_agg.to_csv(conv_csv, index=False)
print('saved', conv_csv)
display(conv_agg)


saved /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/best_stream_convergence_t5pct_loglog.png
saved /Users/daniele/CLionProjects/satp-cpp/notes/ch5_best_convergence_t5pct.csv


Unnamed: 0,algorithm,f0,t_conv_mean
0,HyperLogLog,100000,1.0
1,HyperLogLog,1000000,1.0
2,HyperLogLog,5000000,1.0
3,HyperLogLog,10000000,1.0
4,HyperLogLog++,100000,1.0
5,HyperLogLog++,1000000,1.0
6,HyperLogLog++,5000000,1.0
7,HyperLogLog++,10000000,1.0
8,LogLog,100000,60209.0
9,LogLog,1000000,47640.0


## 6) Merge su best-config

In [10]:

# Merge-experiment rows of each algorithm's selected configuration.
best_merge = pd.concat(
    [
        mdf.loc[mdf['algorithm'].eq(algo) & mdf['params'].eq(param)]
        for algo, param in BEST.items()
    ],
    ignore_index=True,
)

# Per-algorithm row count plus mean and max of the absolute and relative merge-vs-serial deltas.
abs_col = 'delta_merge_serial_abs'
rel_col = 'delta_merge_serial_rel'
sum_merge = best_merge.groupby('algorithm', as_index=False).agg(
    rows=(abs_col, 'size'),
    mean_abs_delta=(abs_col, 'mean'),
    max_abs_delta=(abs_col, 'max'),
    mean_rel_delta=(rel_col, 'mean'),
    max_rel_delta=(rel_col, 'max'),
)

display(sum_merge)
sum_merge.to_csv(NOTE_DIR / 'ch5_best_merge_summary.csv', index=False)

# Box plot of the absolute merge-vs-serial delta, one box per algorithm in ORDER (outliers hidden).
fig, ax = plt.subplots(figsize=(8,4))
data=[best_merge.loc[best_merge['algorithm']==a,'delta_merge_serial_abs'].values for a in ORDER]
# boxplot's `labels=` keyword is deprecated since Matplotlib 3.9 (renamed `tick_labels=`).
# Labelling the ticks on the axes works on every version; boxes sit at positions 1..N by default.
ax.boxplot(data, showfliers=False)
ax.set_xticks(np.arange(1, len(ORDER) + 1))
ax.set_xticklabels(ORDER, rotation=15)
ax.set_ylabel('delta_merge_serial_abs')
ax.set_title('Best-config: merge vs seriale (delta assoluto)')
ax.grid(axis='y', alpha=0.3)
out = FIG_DIR / 'best_merge_delta_abs_by_algorithm.png'
fig.tight_layout(); fig.savefig(out, bbox_inches='tight'); plt.close(fig)
print('saved', out)

# Count, per algorithm, the rows whose absolute / relative delta is strictly positive.
# NOTE(review): "non-zero" is tested as > 0, so a negative relative delta would not be counted — confirm the column is non-negative.
flagged = best_merge.assign(
    non_zero_abs=best_merge['delta_merge_serial_abs'].gt(0),
    non_zero_rel=best_merge['delta_merge_serial_rel'].gt(0),
)
nz = flagged.groupby('algorithm', as_index=False).agg(
    non_zero_abs_count=('non_zero_abs', 'sum'),
    non_zero_rel_count=('non_zero_rel', 'sum'),
    total=('non_zero_abs', 'size'),
)

fig, ax = plt.subplots(figsize=(8, 4))
ix = np.arange(len(nz))
w = 0.35
# Two side-by-side bars per algorithm: absolute count on the left, relative on the right.
ax.bar(ix - w/2, nz['non_zero_abs_count'], width=w, label='delta abs > 0')
ax.bar(ix + w/2, nz['non_zero_rel_count'], width=w, label='delta rel > 0')
ax.set_xticks(ix)
ax.set_xticklabels(nz['algorithm'], rotation=15)
ax.set_ylabel('numero coppie')
ax.set_title('Best-config: casi non nulli merge vs seriale')
ax.legend(frameon=False)
ax.grid(axis='y', alpha=0.3)
out = FIG_DIR / 'best_merge_nonzero_delta_counts.png'
fig.tight_layout()
fig.savefig(out, bbox_inches='tight')
plt.close(fig)
print('saved', out)
nz.to_csv(NOTE_DIR / 'ch5_best_merge_nonzero_counts.csv', index=False)


Unnamed: 0,algorithm,rows,mean_abs_delta,max_abs_delta,mean_rel_delta,max_rel_delta
0,HyperLogLog,500,0.0,0,0.0,0
1,HyperLogLog++,500,0.0,0,0.0,0
2,LogLog,500,0.0,0,0.0,0
3,Probabilistic Counting,500,0.0,0,0.0,0


saved /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/best_merge_delta_abs_by_algorithm.png


saved /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/best_merge_nonzero_delta_counts.png


  ax.boxplot(data, labels=ORDER, showfliers=False)


In [11]:

# Record which configuration was used per algorithm, and the trajectory seed, in a one-row CSV.
META_COLUMNS = {
    'best_hllpp': 'HyperLogLog++',
    'best_hll': 'HyperLogLog',
    'best_loglog': 'LogLog',
    'best_pc': 'Probabilistic Counting',
}
meta_record = {column: BEST[algo] for column, algo in META_COLUMNS.items()}
meta_record['seed_for_trajectories'] = SEED
meta = pd.DataFrame([meta_record])
meta.to_csv(NOTE_DIR / 'ch5_best_config_meta.csv', index=False)
meta


Unnamed: 0,best_hllpp,best_hll,best_loglog,best_pc,seed_for_trajectories
0,k=18,"k=16,L=32","k=15,L=32",L=23,21041998
