# Analisi Streaming per `seed=21041998` e `n=10000000`

Questo notebook genera **un grafico per ogni valore di `d`** (numero reale di elementi distinti) usando i CSV di streaming prodotti dal framework.

Per ogni grafico:
- asse X: checkpoint di stream (`element_index`)
- asse Y: `F0` stimato medio (`f0_hat_mean`, media sulle 50 run)
- curve: tutti gli algoritmi (con parametri)
- baseline: curva `F0` reale (`f0_mean`)


In [None]:
import pandas as pd
from pathlib import Path

try:
    import plotly.graph_objects as go
except ImportError as exc:
    raise ImportError(
        'Plotly non installato. Esegui: pip install plotly'
    ) from exc


In [None]:
# Locate the results folder automatically, whether the notebook is run from
# the project root or from notebooks/. The first existing candidate wins.
candidates = [Path('results'), Path('../results')]
results_root = next(filter(Path.exists, candidates), None)
if results_root is None:
    raise FileNotFoundError("Cartella results non trovata. Esegui il notebook dalla root del progetto o da notebooks/.")

# Streaming CSVs are looked up exactly two directory levels below the root.
streaming_files = sorted(results_root.glob('*/*/results_streaming.csv'))
if not streaming_files:
    raise FileNotFoundError(f"Nessun results_streaming.csv trovato in {results_root}")

# Read every CSV and tag its rows with the path of the file they came from,
# then stack everything into a single frame with a fresh 0..n-1 index.
frames = [
    pd.read_csv(csv_path).assign(source_file=str(csv_path))
    for csv_path in streaming_files
]
data = pd.concat(frames, ignore_index=True)

# Columns that must be numeric for filtering and plotting. Only the ones that
# are actually present are converted; unparseable values become NaN.
numeric_cols = [
    'runs', 'sample_size', 'element_index', 'distinct_count', 'seed',
    'f0_mean', 'f0_hat_mean', 'mean', 'variance', 'stddev', 'bias',
    'difference', 'bias_relative', 'mean_relative_error', 'rmse', 'mae'
]
for col in numeric_cols:
    if col not in data.columns:
        continue
    data[col] = pd.to_numeric(data[col], errors='coerce')

print(f"Rows totali caricati: {len(data):,}")
print(f"File letti: {len(streaming_files)}")


In [None]:
# Experiment selection: the seed and the stream length (sample_size) to analyse.
SEED = 21041998
N = 10_000_000

# Keep only the streaming-mode rows of the selected experiment. `.copy()`
# detaches the selection from `data` so later edits do not touch the source.
sel = data[
    (data['mode'] == 'streaming') &
    (data['seed'] == SEED) &
    (data['sample_size'] == N)
].copy()

if sel.empty:
    raise ValueError(f"Nessun dato trovato per seed={SEED}, sample_size={N}")

# Number of distinct checkpoints available per (d, algorithm, params).
# dropna=False keeps groups whose key is NaN (e.g. an empty `params` field in
# the CSV); with the pandas default those rows would silently vanish from the
# summary.
summary = (
    sel.groupby(['distinct_count', 'algorithm', 'params'], dropna=False)['element_index']
    .nunique()
    .reset_index(name='n_checkpoints')
    .sort_values(['distinct_count', 'algorithm'])
)

print(f"Righe filtrate: {len(sel):,}")
display(summary)


In [None]:
# One interactive chart per d (distinct_count): mean estimated F0 per
# checkpoint for every (algorithm, params) pair, plus the true F0 curve.
# This cell is self-contained: if `SEED`, `N`, `data` or `sel` are missing
# from the notebook namespace, it rebuilds them.

import pandas as pd
from pathlib import Path
import plotly.graph_objects as go

if 'SEED' not in globals():
    SEED = 21041998
if 'N' not in globals():
    N = 10_000_000

if 'data' not in globals():
    # Locate the results folder from either the project root or notebooks/.
    candidates = [Path('results'), Path('../results')]
    results_root = next((p for p in candidates if p.exists()), None)
    if results_root is None:
        raise FileNotFoundError("Cartella results non trovata. Esegui il notebook dalla root del progetto o da notebooks/.")

    streaming_files = sorted(results_root.glob('*/*/results_streaming.csv'))
    if not streaming_files:
        raise FileNotFoundError(f"Nessun results_streaming.csv trovato in {results_root}")

    # Load every CSV, remembering which file each row came from.
    frames = []
    for f in streaming_files:
        df = pd.read_csv(f)
        df['source_file'] = str(f)
        frames.append(df)
    data = pd.concat(frames, ignore_index=True)

    # Coerce the known numeric columns; unparseable values become NaN.
    numeric_cols = [
        'runs', 'sample_size', 'element_index', 'distinct_count', 'seed',
        'f0_mean', 'f0_hat_mean', 'mean', 'variance', 'stddev', 'bias',
        'difference', 'bias_relative', 'mean_relative_error', 'rmse', 'mae'
    ]
    for c in numeric_cols:
        if c in data.columns:
            data[c] = pd.to_numeric(data[c], errors='coerce')

# Reuse an existing non-empty selection; otherwise filter `data` again.
if 'sel' not in globals() or sel is None or len(sel) == 0:
    sel = data[
        (data['mode'] == 'streaming') &
        (data['seed'] == SEED) &
        (data['sample_size'] == N)
    ].copy()

if sel.empty:
    raise ValueError(f"Nessun dato trovato per seed={SEED}, sample_size={N}")

# Rows with a missing distinct_count cannot be assigned to any chart.
d_values = sorted(sel['distinct_count'].dropna().astype(int).unique())

for d in d_values:
    ddf = sel[sel['distinct_count'] == d].copy()

    # True F0(t) curve: averaged across algorithms (the values should coincide).
    truth = (
        ddf.groupby('element_index', as_index=False)['f0_mean']
        .mean()
        .sort_values('element_index')
    )

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=truth['element_index'],
        y=truth['f0_mean'],
        mode='lines',
        name='F0 reale (media su 50 run)',
        line=dict(color='black', dash='dash', width=3),
        hovertemplate='element_index=%{x}<br>F0_reale=%{y}<extra></extra>'
    ))

    # dropna=False keeps algorithms whose `params` is NaN (e.g. an empty CSV
    # field); with the pandas default their curve would silently be omitted.
    for (alg, params), g in ddf.groupby(['algorithm', 'params'], dropna=False):
        # Collapse possible duplicate rows per checkpoint into a single point.
        gg = (
            g.groupby('element_index', as_index=False)['f0_hat_mean']
            .mean()
            .sort_values('element_index')
        )
        # Avoid a "(nan)" suffix in the legend for parameter-less algorithms.
        label = f"{alg} ({params})" if pd.notna(params) else str(alg)
        fig.add_trace(go.Scatter(
            x=gg['element_index'],
            y=gg['f0_hat_mean'],
            mode='lines',
            name=label,
            hovertemplate='element_index=%{x}<br>F0_hat_mean=%{y}<extra></extra>'
        ))

    fig.update_layout(
        title=f"Streaming n={N:,}, d={d:,}, seed={SEED}",
        xaxis_title='Indice elemento nello stream (checkpoint)',
        yaxis_title='F0 stimato medio (aggregato su 50 run)',
        template='plotly_white',
        hovermode='x unified',
        legend=dict(itemclick='toggle', itemdoubleclick='toggleothers'),
        xaxis=dict(rangeslider=dict(visible=True)),
    )

    fig.show()


Note:
- Con il framework attuale, in modalità streaming vengono salvati solo alcuni checkpoint (al massimo 200), non ogni singolo elemento.
- Il valore `f0_hat_mean` è già la media aggregata sulle run (`p=50`).


## Grafici Varianza (stesso filtro)

Questa sezione genera **un grafico per ogni `d`** con:
- asse X: `element_index` (checkpoint streaming)
- asse Y: `variance`
- curve: tutti gli algoritmi (con parametri)

Nota: qui **non** viene tracciata la curva di `F0` reale.


In [None]:
# One interactive chart per d: variance of the estimate (y=variance) per
# checkpoint for every (algorithm, params) pair, without the true F0 curve.
# Like the F0 chart cell, it rebuilds `SEED`, `N`, `data` and `sel` when they
# are missing from the notebook namespace.

import pandas as pd
from pathlib import Path
import plotly.graph_objects as go

if 'SEED' not in globals():
    SEED = 21041998
if 'N' not in globals():
    N = 10_000_000

if 'data' not in globals():
    # Locate the results folder from either the project root or notebooks/.
    candidates = [Path('results'), Path('../results')]
    results_root = next((p for p in candidates if p.exists()), None)
    if results_root is None:
        raise FileNotFoundError("Cartella results non trovata. Esegui il notebook dalla root del progetto o da notebooks/.")

    streaming_files = sorted(results_root.glob('*/*/results_streaming.csv'))
    if not streaming_files:
        raise FileNotFoundError(f"Nessun results_streaming.csv trovato in {results_root}")

    # Load every CSV, remembering which file each row came from.
    frames = []
    for f in streaming_files:
        df = pd.read_csv(f)
        df['source_file'] = str(f)
        frames.append(df)
    data = pd.concat(frames, ignore_index=True)

    # Coerce the known numeric columns; unparseable values become NaN.
    numeric_cols = [
        'runs', 'sample_size', 'element_index', 'distinct_count', 'seed',
        'f0_mean', 'f0_hat_mean', 'mean', 'variance', 'stddev', 'bias',
        'difference', 'bias_relative', 'mean_relative_error', 'rmse', 'mae'
    ]
    for c in numeric_cols:
        if c in data.columns:
            data[c] = pd.to_numeric(data[c], errors='coerce')

# Reuse an existing non-empty selection; otherwise filter `data` again.
if 'sel' not in globals() or sel is None or len(sel) == 0:
    sel = data[
        (data['mode'] == 'streaming') &
        (data['seed'] == SEED) &
        (data['sample_size'] == N)
    ].copy()

if sel.empty:
    raise ValueError(f"Nessun dato trovato per seed={SEED}, sample_size={N}")

# Rows with a missing distinct_count cannot be assigned to any chart.
d_values = sorted(sel['distinct_count'].dropna().astype(int).unique())

for d in d_values:
    ddf = sel[sel['distinct_count'] == d].copy()

    fig = go.Figure()

    # dropna=False keeps algorithms whose `params` is NaN (e.g. an empty CSV
    # field); with the pandas default their curve would silently be omitted.
    for (alg, params), g in ddf.groupby(['algorithm', 'params'], dropna=False):
        # Collapse possible duplicate rows per checkpoint into a single point.
        gg = (
            g.groupby('element_index', as_index=False)['variance']
            .mean()
            .sort_values('element_index')
        )
        # Avoid a "(nan)" suffix in the legend for parameter-less algorithms.
        label = f"{alg} ({params})" if pd.notna(params) else str(alg)
        fig.add_trace(go.Scatter(
            x=gg['element_index'],
            y=gg['variance'],
            mode='lines',
            name=label,
            hovertemplate='element_index=%{x}<br>variance=%{y}<extra></extra>'
        ))

    fig.update_layout(
        title=f"Streaming n={N:,}, d={d:,}, seed={SEED} — Varianza",
        xaxis_title='Indice elemento nello stream (checkpoint)',
        yaxis_title='Varianza della stima',
        template='plotly_white',
        hovermode='x unified',
        legend=dict(itemclick='toggle', itemdoubleclick='toggleothers'),
        xaxis=dict(rangeslider=dict(visible=True)),
    )

    fig.show()
