
# Capitolo 5 - Analisi Streaming

Questo notebook analizza **solo** i file `results_streaming.csv` presenti in `results/**`.
I grafici vengono salvati in `thesis/figures/results/` e sono destinati al Capitolo 5 della tesi.


In [1]:

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Project root: the parent of the current working directory, made absolute.
ROOT = Path('..').resolve()
FIG_DIR = ROOT.joinpath('thesis', 'figures', 'results')
NOTE_DIR = ROOT.joinpath('notes')

# Create both output directories up front; already-existing ones are left alone.
for output_dir in (FIG_DIR, NOTE_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

# Start from matplotlib's default style, then apply the figure settings in one go.
plt.style.use('default')
plt.rcParams.update({
    'figure.dpi': 150,
    'savefig.dpi': 300,
    'font.size': 10,
    'axes.titlesize': 11,
    'axes.labelsize': 10,
    'legend.fontsize': 8,
})


In [2]:

# Collect every streaming result file below results/, in sorted path order.
stream_files = list(ROOT.glob('results/**/results_streaming.csv'))
stream_files.sort()
if len(stream_files) == 0:
    raise FileNotFoundError('Nessun results_streaming.csv trovato in results/**')

# Read each CSV lazily and stack them into one frame with a fresh 0..n-1 index.
df = pd.concat(
    (pd.read_csv(csv_path) for csv_path in stream_files),
    ignore_index=True,
)

print(f'File caricati: {len(stream_files)}')
print(f'Righe totali: {len(df)}')
display(df.head())


File caricati: 72
Righe totali: 288000


Unnamed: 0,algorithm,params,mode,runs,sample_size,number_of_elements_processed,f0,seed,f0_mean_t,f0_heat_mean_t,variance,stddev,rse_theoretical,rse_observed,bias,absolute_bias,relative_bias,mean_relative_error,rmse,mae
0,HyperLogLog,"k=10,L=32",streaming,50,10000000,1,100000,21041998,1.0,1.0,0.0,0.0,0.0325,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,HyperLogLog,"k=10,L=32",streaming,50,10000000,102,100000,21041998,101.94,101.46,4.539184,2.130536,0.0325,0.0209,-0.48,0.48,-0.004709,0.018053,2.097618,1.84
2,HyperLogLog,"k=10,L=32",streaming,50,10000000,203,100000,21041998,202.82,202.66,17.902449,4.231129,0.0325,0.020861,-0.16,0.16,-0.000789,0.016569,4.127953,3.36
3,HyperLogLog,"k=10,L=32",streaming,50,10000000,304,100000,21041998,303.58,302.7,31.846939,5.643309,0.0325,0.018589,-0.88,0.88,-0.002899,0.01449,5.663921,4.4
4,HyperLogLog,"k=10,L=32",streaming,50,10000000,405,100000,21041998,404.32,403.36,78.194286,8.842753,0.0325,0.021871,-0.96,0.96,-0.002374,0.016919,8.669487,6.84


In [3]:

# Minimal quality checks on the loaded streaming results.

# Columns that must be present in the loaded frame.
required_cols = set([
    'algorithm', 'params', 'mode', 'runs', 'sample_size',
    'number_of_elements_processed',
    'f0', 'seed', 'f0_mean_t', 'f0_heat_mean_t', 'variance', 'stddev',
    'bias', 'absolute_bias', 'relative_bias', 'mean_relative_error',
    'rmse', 'mae',
])
missing = required_cols.difference(df.columns)
assert not missing, f'Colonne mancanti: {missing}'

# Every row must be a streaming row with a checkpoint of at least 1.
assert df['mode'].eq('streaming').all(), 'Sono presenti righe non-streaming nel dataset caricato'
assert df['number_of_elements_processed'].ge(1).all(), 'Checkpoint invalidi'

# Number of distinct checkpoints for each (algorithm, params, sample_size, f0, seed) group.
group_keys = ['algorithm', 'params', 'sample_size', 'f0', 'seed']
groups = df.groupby(group_keys)['number_of_elements_processed'].nunique()
print('Checkpoint per gruppo - min/max:', groups.min(), groups.max())

# Report the distinct values found for each experiment dimension.
for heading, column in (
    ('Sample size disponibili:', 'sample_size'),
    ('f0 disponibili:', 'f0'),
    ('seed disponibili:', 'seed'),
):
    print(heading, sorted(df[column].unique().tolist()))


Checkpoint per gruppo - min/max: 200 200
Sample size disponibili: [10000000]
f0 disponibili: [100000, 1000000, 5000000, 10000000]
seed disponibili: [42, 137357, 10032018, 21041998, 29042026]


## 1) Conteggio stimato vs conteggio reale (configurazione di riferimento)

## 2) Varianza della stima nel tempo (stessi parametri di riferimento)

## 3) Errore relativo medio finale per algoritmo e f0

## 4) Sensibilità ai parametri (checkpoint finale, media su seed e f0)

## 5) Differenza HLL++ vs HLL nel tempo (k=16)

## 6) Export riassunto per il capitolo 5

In [9]:

# Build a one-row summary of the loaded streaming dataset and save it as CSV.

# Overview key -> dataframe column whose sorted distinct values are reported.
unique_value_columns = {
    'algorithms': 'algorithm',
    'sample_size_values': 'sample_size',
    'f0_values': 'f0',
    'seeds': 'seed',
}

stream_overview = {
    'stream_files': len(stream_files),
    'rows': int(len(df)),
}
stream_overview.update(
    (key, sorted(df[column].unique().tolist()))
    for key, column in unique_value_columns.items()
)

overview_df = pd.DataFrame([stream_overview])
overview_path = NOTE_DIR / 'ch5_streaming_overview.csv'
overview_df.to_csv(overview_path, index=False)
display(overview_df)
print('Salvata tabella:', overview_path)


Unnamed: 0,stream_files,rows,algorithms,sample_size_values,f0_values,seeds
0,72,288000,"[HyperLogLog, HyperLogLog++, LogLog, Probabili...",[10000000],"[100000, 1000000, 5000000, 10000000]","[42, 137357, 10032018, 21041998, 29042026]"


Salvata tabella: /Users/daniele/CLionProjects/satp-cpp/notes/ch5_streaming_overview.csv
