
# Capitolo 5 - Analisi Merge

Questo notebook analizza **solo** i file `results_merge.csv` presenti in `results/**`.
Obiettivo: verificare empiricamente la coerenza tra `merge` e processamento seriale.


In [1]:

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

ROOT = Path('..').resolve()
FIG_DIR = ROOT / 'thesis' / 'figures' / 'results'
NOTE_DIR = ROOT / 'notes'
FIG_DIR.mkdir(parents=True, exist_ok=True)
NOTE_DIR.mkdir(parents=True, exist_ok=True)

plt.style.use('default')
plt.rcParams['figure.dpi'] = 150
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['font.size'] = 10


In [2]:

merge_files = sorted(ROOT.glob('results/**/results_merge.csv'))
if not merge_files:
    raise FileNotFoundError('Nessun results_merge.csv trovato in results/**')

mdf = pd.concat([pd.read_csv(p) for p in merge_files], ignore_index=True)
print(f'File caricati: {len(merge_files)}')
print(f'Righe totali: {len(mdf)}')
display(mdf.head())


File caricati: 72
Righe totali: 36000


Unnamed: 0,algorithm,params,mode,pairs,sample_size,pair_index,seed,estimate_merge,estimate_serial,delta_merge_serial_abs,delta_merge_serial_rel
0,HyperLogLog,"k=10,L=32",merge,25,10000000,0,21041998,198942,198942,0,0
1,HyperLogLog,"k=10,L=32",merge,25,10000000,1,21041998,197785,197785,0,0
2,HyperLogLog,"k=10,L=32",merge,25,10000000,2,21041998,197216,197216,0,0
3,HyperLogLog,"k=10,L=32",merge,25,10000000,3,21041998,189231,189231,0,0
4,HyperLogLog,"k=10,L=32",merge,25,10000000,4,21041998,196391,196391,0,0


In [3]:

required_cols = {
    'algorithm','params','mode','pairs','sample_size','pair_index','seed',
    'estimate_merge','estimate_serial','delta_merge_serial_abs','delta_merge_serial_rel'
}
missing = required_cols - set(mdf.columns)
assert not missing, f'Colonne mancanti: {missing}'
assert (mdf['mode'] == 'merge').all(), 'Sono presenti righe non-merge nel dataset caricato'
assert (mdf['delta_merge_serial_abs'] >= 0).all()
assert (mdf['delta_merge_serial_rel'] >= 0).all()

print('Sample size disponibili:', sorted(mdf['sample_size'].unique().tolist()))
print('Seed disponibili:', sorted(mdf['seed'].unique().tolist()))
print('Algoritmi:', sorted(mdf['algorithm'].unique().tolist()))


Sample size disponibili: [10000000]
Seed disponibili: [42, 137357, 10032018, 21041998, 29042026]
Algoritmi: ['HyperLogLog', 'HyperLogLog++', 'LogLog', 'Probabilistic Counting']


## 1) Statistiche sintetiche del delta merge-seriale

In [4]:

summary_algo = (mdf.groupby('algorithm', as_index=False)
                  .agg(rows=('delta_merge_serial_abs','size'),
                       mean_abs_delta=('delta_merge_serial_abs','mean'),
                       max_abs_delta=('delta_merge_serial_abs','max'),
                       mean_rel_delta=('delta_merge_serial_rel','mean'),
                       max_rel_delta=('delta_merge_serial_rel','max')))
display(summary_algo)

summary_param = (mdf.groupby(['algorithm','params'], as_index=False)
                   .agg(rows=('delta_merge_serial_abs','size'),
                        max_abs_delta=('delta_merge_serial_abs','max'),
                        max_rel_delta=('delta_merge_serial_rel','max')))
display(summary_param.head(20))

summary_algo.to_csv(NOTE_DIR / 'ch5_merge_summary_by_algorithm.csv', index=False)
summary_param.to_csv(NOTE_DIR / 'ch5_merge_summary_by_param.csv', index=False)
print('Salvate tabelle merge in notes/')


Unnamed: 0,algorithm,rows,mean_abs_delta,max_abs_delta,mean_rel_delta,max_rel_delta
0,HyperLogLog,6500,0.0,0,0.0,0
1,HyperLogLog++,7500,0.0,0,0.0,0
2,LogLog,6500,0.0,0,0.0,0
3,Probabilistic Counting,15500,0.0,0,0.0,0


Unnamed: 0,algorithm,params,rows,max_abs_delta,max_rel_delta
0,HyperLogLog,"k=10,L=32",500,0,0
1,HyperLogLog,"k=11,L=32",500,0,0
2,HyperLogLog,"k=12,L=32",500,0,0
3,HyperLogLog,"k=13,L=32",500,0,0
4,HyperLogLog,"k=14,L=32",500,0,0
5,HyperLogLog,"k=15,L=32",500,0,0
6,HyperLogLog,"k=16,L=32",500,0,0
7,HyperLogLog,"k=4,L=32",500,0,0
8,HyperLogLog,"k=5,L=32",500,0,0
9,HyperLogLog,"k=6,L=32",500,0,0


Salvate tabelle merge in notes/


## 2) Distribuzione del delta assoluto per algoritmo

In [5]:

algs = sorted(mdf['algorithm'].unique().tolist())
data = [mdf.loc[mdf['algorithm'] == a, 'delta_merge_serial_abs'].values for a in algs]

fig, ax = plt.subplots(figsize=(8, 4))
ax.boxplot(data, labels=algs, showfliers=False)
ax.set_ylabel('delta_merge_serial_abs')
ax.set_title('Distribuzione del delta assoluto (merge vs seriale)')
ax.grid(axis='y', alpha=0.3)
plt.xticks(rotation=15)

out = FIG_DIR / 'merge_delta_abs_by_algorithm.png'
fig.tight_layout()
fig.savefig(out, bbox_inches='tight')
plt.close(fig)
print('Salvato:', out)


Salvato: /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/merge_delta_abs_by_algorithm.png


  ax.boxplot(data, labels=algs, showfliers=False)


## 3) Conteggio dei casi con delta non nullo

In [6]:

nz = (mdf.assign(non_zero_abs = mdf['delta_merge_serial_abs'] > 0,
                 non_zero_rel = mdf['delta_merge_serial_rel'] > 0)
        .groupby('algorithm', as_index=False)
        .agg(non_zero_abs_count=('non_zero_abs','sum'),
             non_zero_rel_count=('non_zero_rel','sum'),
             total=('non_zero_abs','size')))
display(nz)

fig, ax = plt.subplots(figsize=(8, 4))
x = np.arange(len(nz))
width = 0.35
ax.bar(x - width/2, nz['non_zero_abs_count'], width=width, label='delta abs > 0')
ax.bar(x + width/2, nz['non_zero_rel_count'], width=width, label='delta rel > 0')
ax.set_xticks(x)
ax.set_xticklabels(nz['algorithm'], rotation=15)
ax.set_ylabel('numero di coppie')
ax.set_title('Casi non nulli nel confronto merge vs seriale')
ax.legend(frameon=False)
ax.grid(axis='y', alpha=0.3)

out = FIG_DIR / 'merge_nonzero_delta_counts.png'
fig.tight_layout()
fig.savefig(out, bbox_inches='tight')
plt.close(fig)
print('Salvato:', out)

nz.to_csv(NOTE_DIR / 'ch5_merge_nonzero_counts.csv', index=False)


Unnamed: 0,algorithm,non_zero_abs_count,non_zero_rel_count,total
0,HyperLogLog,0,0,6500
1,HyperLogLog++,0,0,7500
2,LogLog,0,0,6500
3,Probabilistic Counting,0,0,15500


Salvato: /Users/daniele/CLionProjects/satp-cpp/thesis/figures/results/merge_nonzero_delta_counts.png


## 4) Export overview

In [7]:

merge_overview = {
    'merge_files': len(merge_files),
    'rows': int(len(mdf)),
    'algorithms': sorted(mdf['algorithm'].unique().tolist()),
    'sample_size_values': sorted(mdf['sample_size'].unique().tolist()),
    'seeds': sorted(mdf['seed'].unique().tolist()),
    'global_max_abs_delta': float(mdf['delta_merge_serial_abs'].max()),
    'global_max_rel_delta': float(mdf['delta_merge_serial_rel'].max()),
}
ov = pd.DataFrame([merge_overview])
display(ov)
ov.to_csv(NOTE_DIR / 'ch5_merge_overview.csv', index=False)
print('Salvata tabella:', NOTE_DIR / 'ch5_merge_overview.csv')


Unnamed: 0,merge_files,rows,algorithms,sample_size_values,seeds,global_max_abs_delta,global_max_rel_delta
0,72,36000,"[HyperLogLog, HyperLogLog++, LogLog, Probabili...",[10000000],"[42, 137357, 10032018, 21041998, 29042026]",0.0,0.0


Salvata tabella: /Users/daniele/CLionProjects/satp-cpp/notes/ch5_merge_overview.csv
