
# Esplorazione Statistica dei Dataset SATP

Questo notebook analizza **come sono formati i dataset binari** (`SATPDBN2`) prodotti dal generatore:

1. inventory globale (`n`, `d`, `p`, `seed`, size, ratio di compressione);
2. variabilità strutturale per partizione;
3. analisi statistica su dataset selezionato:
   - frequenze,
   - curva reale `F0(t)` vs curva attesa uniforme,
   - posizioni di prima occorrenza,
   - overlap (Jaccard) tra partizioni.

I grafici sono interattivi (zoom/pan, inclusa la scala dell'asse `y`) e hanno una legenda leggibile.


In [1]:

from pathlib import Path
import sys
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Prefer IPython's rich display; when IPython is not installed fall back to
# plain printing so the notebook code can still run as a script.
try:
    from IPython.display import display
except ImportError:
    def display(*objs):
        """Plain-text stand-in for ``IPython.display.display``.

        Accepts any number of positional objects, like the IPython function,
        and prints each of them on its own line.
        """
        for obj in objs:
            print(obj)

# Options passed as ``config=`` to every ``fig.show`` call in this notebook.
PLOT_CONFIG = dict(
    scrollZoom=True,
    displaylogo=False,
    responsive=True,
)

# Layout settings shared by all figures; each cell adds its own size and title
# on top of these via ``fig.update_layout(**BASE_LAYOUT, ...)``.
BASE_LAYOUT = {
    'template': 'plotly_white',
    'font': {'size': 13, 'family': 'Arial', 'color': '#111827'},
    'legend': {
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': 1.02,
        'xanchor': 'left',
        'x': 0.0,
        'bgcolor': 'rgba(255,255,255,0.9)',
        'bordercolor': '#D1D5DB',
        'borderwidth': 1,
        'itemclick': 'toggleothers',
    },
    'margin': {'l': 70, 'r': 40, 't': 90, 'b': 70},
}


def find_repo_root() -> Path:
    """Locate the repository root starting from the current working directory.

    Walks from the resolved working directory up through every ancestor and
    returns the first directory that contains ``scripts``, ``datasets`` and
    ``notebooks`` entries, so the lookup also succeeds when the notebook is
    launched from a deeply nested folder.

    Returns:
        The nearest matching directory as an absolute ``Path``.

    Raises:
        RuntimeError: If no directory between the working directory and the
            filesystem root contains all three entries.
    """
    required = ('scripts', 'datasets', 'notebooks')
    start = Path.cwd().resolve()
    # Nearest directory first: whenever the working directory, its parent or
    # its grandparent matches, the result is the same as before.
    for candidate in (start, *start.parents):
        if all((candidate / name).exists() for name in required):
            return candidate
    raise RuntimeError('Repository root non trovata')

# Resolve the repository layout once and derive the folders used below.
ROOT = find_repo_root()
SCRIPTS = ROOT / 'scripts'
DATASETS = ROOT / 'datasets'

# Put the scripts folder at the front of sys.path so the import below resolves;
# the membership check avoids adding a duplicate entry when the cell is re-run.
if str(SCRIPTS) not in sys.path:
    sys.path.insert(0, str(SCRIPTS))

# This import has to follow the sys.path change above, hence the E402 waiver.
from dataset_stats import (  # noqa: E402
    build_inventory,
    analyze_dataset,
    partition_frequency_histogram,
    partition_first_occurrence_positions,
)

# Echo the resolved locations so the reader can see which checkout is in use.
print('ROOT:', ROOT)
print('DATASETS:', DATASETS)


ROOT: /Users/daniele/CLionProjects/satp-cpp
DATASETS: /Users/daniele/CLionProjects/satp-cpp/datasets


In [2]:

# Build the dataset inventory and stop early when the folder yields nothing.
inventory = build_inventory(DATASETS)
if inventory.empty:
    raise ValueError(f'Nessun dataset trovato in {DATASETS}')

# Fixed row order (n, d, seed, p) with a fresh 0..N-1 index.
inventory = inventory.sort_values(by=['n', 'd', 'seed', 'p']).reset_index(drop=True)
print('Numero dataset:', len(inventory))
display(inventory.head(15))

# One summary row per n: file count, range of d, number of distinct seeds,
# mean file size (bytes divided by 1024**3) and mean overall compression ratio.
overview = inventory.groupby('n', as_index=False).agg(
    files=pd.NamedAgg(column='file_name', aggfunc='count'),
    d_min=pd.NamedAgg(column='d', aggfunc='min'),
    d_max=pd.NamedAgg(column='d', aggfunc='max'),
    seed_count=pd.NamedAgg(column='seed', aggfunc='nunique'),
    size_gb_mean=pd.NamedAgg(
        column='file_size_bytes',
        aggfunc=lambda sizes: sizes.mean() / 2**30,
    ),
    compression_mean=pd.NamedAgg(column='overall_compression_ratio', aggfunc='mean'),
)

display(overview)


Numero dataset: 120


Unnamed: 0,path,file_name,n,d,p,seed,d_over_n,file_size_bytes,raw_values_bytes,raw_truth_bytes,...,values_compression_ratio,truth_compression_ratio,values_part_size_mean,values_part_size_std,values_part_size_min,values_part_size_max,truth_part_size_mean,truth_part_size_std,truth_part_size_min,truth_part_size_max
0,/Users/daniele/CLionProjects/satp-cpp/datasets...,dataset_n_100_d_1_p_50_s_42.bin,100,1,50,42,0.01,4441,20000.0,650.0,...,0.04235,0.846154,16.94,0.42,14.0,17.0,11.0,0.0,11.0,11.0
1,/Users/daniele/CLionProjects/satp-cpp/datasets...,dataset_n_100_d_1_p_50_s_137357.bin,100,1,50,137357,0.01,4444,20000.0,650.0,...,0.0425,0.846154,17.0,0.0,17.0,17.0,11.0,0.0,11.0,11.0
2,/Users/daniele/CLionProjects/satp-cpp/datasets...,dataset_n_100_d_1_p_50_s_10032018.bin,100,1,50,10032018,0.01,4444,20000.0,650.0,...,0.0425,0.846154,17.0,0.0,17.0,17.0,11.0,0.0,11.0,11.0
3,/Users/daniele/CLionProjects/satp-cpp/datasets...,dataset_n_100_d_1_p_50_s_21041998.bin,100,1,50,21041998,0.01,4444,20000.0,650.0,...,0.0425,0.846154,17.0,0.0,17.0,17.0,11.0,0.0,11.0,11.0
4,/Users/daniele/CLionProjects/satp-cpp/datasets...,dataset_n_100_d_1_p_50_s_29042026.bin,100,1,50,29042026,0.01,4444,20000.0,650.0,...,0.0425,0.846154,17.0,0.0,17.0,17.0,11.0,0.0,11.0,11.0
5,/Users/daniele/CLionProjects/satp-cpp/datasets...,dataset_n_100_d_10_p_50_s_42.bin,100,10,50,42,0.1,9609,20000.0,650.0,...,0.2936,1.066154,117.44,2.458943,111.0,122.0,13.86,1.113732,12.0,17.0
6,/Users/daniele/CLionProjects/satp-cpp/datasets...,dataset_n_100_d_10_p_50_s_137357.bin,100,10,50,137357,0.1,9582,20000.0,650.0,...,0.292,1.073846,116.8,2.660827,112.0,124.0,13.96,1.165504,12.0,17.0
7,/Users/daniele/CLionProjects/satp-cpp/datasets...,dataset_n_100_d_10_p_50_s_10032018.bin,100,10,50,10032018,0.1,9603,20000.0,650.0,...,0.29265,1.086154,117.06,2.36144,111.0,122.0,14.12,1.193985,12.0,17.0
8,/Users/daniele/CLionProjects/satp-cpp/datasets...,dataset_n_100_d_10_p_50_s_21041998.bin,100,10,50,21041998,0.1,9598,20000.0,650.0,...,0.2926,1.08,117.04,2.705254,110.0,122.0,14.04,1.248359,12.0,18.0
9,/Users/daniele/CLionProjects/satp-cpp/datasets...,dataset_n_100_d_10_p_50_s_29042026.bin,100,10,50,29042026,0.1,9590,20000.0,650.0,...,0.29235,1.075385,116.94,2.203724,113.0,121.0,13.98,1.14,13.0,17.0


Unnamed: 0,n,files,d_min,d_max,seed_count,size_gb_mean,compression_mean
0,100,20,1,100,5,9e-06,0.319186
1,1000,20,10,1000,5,7.9e-05,0.395515
2,10000,20,100,10000,5,0.000968,0.502714
3,100000,20,1000,100000,5,0.011666,0.607177
4,1000000,20,10000,1000000,5,0.143026,0.744579
5,10000000,20,100000,10000000,5,1.59267,0.829146


In [3]:

# File size against n on log-log axes; colour encodes d/n, marker symbol the seed.
fig = (
    px.scatter(
        data_frame=inventory,
        x='n',
        y='file_size_bytes',
        color='d_over_n',
        symbol='seed',
        hover_data=['file_name', 'd', 'p', 'overall_compression_ratio'],
        title='Inventory: dimensione file vs n (color = d/n)',
    )
    .update_layout(**BASE_LAYOUT, width=1200, height=650)
    .update_xaxes(type='log', title='n (elementi per partizione)', fixedrange=False)
    .update_yaxes(type='log', title='dimensione file (byte)', fixedrange=False)
)
fig.show(config=PLOT_CONFIG)


In [4]:

# Overall compression ratio against d/n, one coloured trace per value of n.
# NOTE(review): rows that share the same n and d/n (e.g. different seeds) end
# up on the same trace at the same x — confirm this is the intended rendering.
fig = (
    px.line(
        data_frame=inventory.sort_values(by=['n', 'd_over_n']),
        x='d_over_n',
        y='overall_compression_ratio',
        color='n',
        markers=True,
        hover_data=['d', 'file_name'],
        title='Compression ratio totale vs d/n (una curva per n)',
    )
    .update_layout(**BASE_LAYOUT, width=1200, height=650)
    .update_xaxes(title='d/n', fixedrange=False)
    .update_yaxes(title='compressed / raw', fixedrange=False)
)
fig.show(config=PLOT_CONFIG)


In [5]:

# Target dataset selection: prefer n=1e6, d=1e5, p=50, seed=21041998.
preferred = inventory[
    inventory['n'].eq(1_000_000)
    & inventory['d'].eq(100_000)
    & inventory['p'].eq(50)
    & inventory['seed'].eq(21041998)
]

if not preferred.empty:
    row = preferred.iloc[0]
else:
    # Fallback: the largest (n, d) among the datasets with seed=21041998 ...
    fallback = inventory[inventory['seed'].eq(21041998)].sort_values(['n', 'd'], ascending=False)
    if not fallback.empty:
        row = fallback.iloc[0]
    else:
        # ... and, if that seed is absent too, the largest (n, d) overall.
        row = inventory.sort_values(['n', 'd'], ascending=False).iloc[0]

DATASET_PATH = Path(row['path'])
print('Dataset selezionato:', DATASET_PATH)

# One-row summary of the selected dataset (file size divided by 1024**3).
meta = pd.DataFrame([
    {
        'n': int(row['n']),
        'd': int(row['d']),
        'p': int(row['p']),
        'seed': int(row['seed']),
        'd/n': float(row['d_over_n']),
        'size_GB': float(row['file_size_bytes']) / 2**30,
        'compression_ratio': float(row['overall_compression_ratio']),
    }
])
display(meta)

# Detailed analysis limited to 3 partitions and 200 checkpoints, overlap included.
deep = analyze_dataset(DATASET_PATH, max_partitions=3, checkpoints=200, compute_overlap=True)

# Pull out the individual result tables used by the following cells.
partition_sizes, partition_stats, f0_curves, overlap = (
    deep[key]
    for key in ('partition_sizes', 'partition_stats', 'f0_curves', 'overlap_jaccard')
)

display(partition_sizes)
display(partition_stats)


Dataset selezionato: /Users/daniele/CLionProjects/satp-cpp/datasets/uniform_distribution/shuffled/random/dataset_n_1000000_d_100000_p_50_s_21041998.bin


Unnamed: 0,n,d,p,seed,d/n,size_GB,compression_ratio
0,1000000,100000,50,21041998,0.1,0.147588,0.76833


Unnamed: 0,partition,n,d_declared,values_bytes,truth_bytes,raw_values_bytes,raw_truth_bytes,values_ratio,truth_ratio
0,0,1000000,100000,3133205,36137,4000000,125000,0.783301,0.289096
1,1,1000000,100000,3132869,36113,4000000,125000,0.783217,0.288904
2,2,1000000,100000,3132934,36318,4000000,125000,0.783234,0.290544


Unnamed: 0,unique_observed,freq_min,freq_max,freq_mean,freq_std,freq_cv,entropy_bits,gini_impurity,top1_freq,top5_freq_sum,equal_adjacent_ratio,lag1_corr,partition,truth_ones,truth_ones_ratio,d_declared,truth_consistent
0,100000,1,27,10.0,2.99993,0.299993,16.543947,0.999989,27,129,1.6e-05,4.1e-05,0,100000,0.1,100000,True
1,100000,1,26,10.0,3.007567,0.300757,16.543653,0.999989,26,126,1.1e-05,0.000351,1,100000,0.1,100000,True
2,100000,1,25,10.0,2.990863,0.299086,16.54427,0.999989,25,121,8e-06,-3.4e-05,2,100000,0.1,100000,True


In [6]:

# Long format: one row per (partition, size metric), with the size also in MiB.
size_long = partition_sizes.melt(
    id_vars=['partition'],
    value_vars=['values_bytes', 'truth_bytes', 'raw_values_bytes', 'raw_truth_bytes'],
    var_name='metric',
    value_name='bytes',
).assign(MiB=lambda long_df: long_df['bytes'] / 2**20)

# Grouped bars: compressed and raw sizes side by side for each partition.
fig = (
    px.bar(
        data_frame=size_long,
        x='partition',
        y='MiB',
        color='metric',
        barmode='group',
        title='Dimensioni per partizione: compresso vs raw (MiB)',
    )
    .update_layout(**BASE_LAYOUT, width=1200, height=650)
    .update_xaxes(title='Partition index', fixedrange=False)
    .update_yaxes(title='MiB', fixedrange=False)
)
fig.show(config=PLOT_CONFIG)

# Compressed/raw ratios computed on a copy, leaving partition_sizes untouched.
ratio_df = partition_sizes.assign(
    values_ratio=partition_sizes['values_bytes'] / partition_sizes['raw_values_bytes'],
    truth_ratio=partition_sizes['truth_bytes'] / partition_sizes['raw_truth_bytes'],
)
ratio_long = ratio_df.melt(
    id_vars=['partition'],
    value_vars=['values_ratio', 'truth_ratio'],
    var_name='ratio_type',
    value_name='ratio',
)
fig = (
    px.line(
        data_frame=ratio_long,
        x='partition',
        y='ratio',
        color='ratio_type',
        markers=True,
        title='Compression ratio per partizione',
    )
    .update_layout(**BASE_LAYOUT, width=1100, height=520)
    .update_xaxes(title='Partition index', fixedrange=False)
    .update_yaxes(title='compressed/raw', fixedrange=False)
)
fig.show(config=PLOT_CONFIG)


In [7]:

# F0(t) per partition: the observed curve (solid) and the uniform expectation
# (dotted, hidden until it is enabled from the legend).
fig = go.Figure()
for part_idx in sorted(f0_curves['partition'].unique()):
    # Sort by t so each trace is drawn left to right.
    sdf = f0_curves[f0_curves['partition'] == part_idx].sort_values('t')
    fig.add_trace(go.Scatter(
        x=sdf['t'],
        y=sdf['f0_true'],
        mode='lines',
        name=f'F0 reale p{part_idx}',
        line=dict(width=2.2),
    ))
    fig.add_trace(go.Scatter(
        x=sdf['t'],
        y=sdf['f0_expected_uniform'],
        mode='lines',
        name=f'F0 atteso uniforme p{part_idx}',
        line=dict(width=1.8, dash='dot'),
        visible='legendonly',
    ))

fig.update_layout(**BASE_LAYOUT, width=1250, height=700, title='Curva F0(t): reale vs attesa uniforme')
fig.update_xaxes(title='t (posizione nello stream)', fixedrange=False, rangeslider=dict(visible=True, thickness=0.07))
fig.update_yaxes(title='F0(t)', fixedrange=False)
fig.show(config=PLOT_CONFIG)

# Quantitative deviation from the expected curve: per-partition RMSE and
# largest absolute difference between f0_true and f0_expected_uniform.
f0_rmse = (
    f0_curves.assign(err=lambda df: df['f0_true'] - df['f0_expected_uniform'])
    .groupby('partition', as_index=False)
    .agg(rmse=('err', lambda s: float(np.sqrt(np.mean(np.square(s))))),
         max_abs_delta=('err', lambda s: float(np.max(np.abs(s)))))
)

display(f0_rmse)

# Deviation plot. The frame is sorted by (partition, t) first, matching the
# per-partition sort used for the figure above, so each line is drawn in
# increasing t regardless of the row order of f0_curves.
fig = px.line(
    f0_curves.sort_values(['partition', 't']),
    x='t',
    y='f0_delta',
    color='partition',
    title='Scostamento F0(t): reale - atteso uniforme',
)
fig.update_layout(**BASE_LAYOUT, width=1200, height=600)
fig.update_xaxes(title='t', fixedrange=False, rangeslider=dict(visible=True, thickness=0.07))
fig.update_yaxes(title='Delta F0', fixedrange=False)
fig.show(config=PLOT_CONFIG)


Unnamed: 0,partition,rmse,max_abs_delta
0,0,190.031185,444.601127
1,1,171.71178,379.370202
2,2,102.756926,255.212011


In [8]:

# ID frequency histogram (partition 0): how many keys occur once, twice, ...
freq_hist = partition_frequency_histogram(DATASET_PATH, partition=0)
if freq_hist.empty:
    print('Nessun dato frequenze disponibile')
else:
    # Log-scaled y axis for the per-frequency key counts.
    fig = (
        px.bar(
            data_frame=freq_hist,
            x='frequency',
            y='id_count',
            title='Partizione 0: distribuzione delle frequenze degli ID',
        )
        .update_layout(**BASE_LAYOUT, width=1200, height=650)
        .update_xaxes(title='Frequenza di una chiave', fixedrange=False)
        .update_yaxes(title='Numero di chiavi con quella frequenza', type='log', fixedrange=False)
    )
    fig.show(config=PLOT_CONFIG)


In [9]:

# Distribution of first-occurrence positions (truth bits) for the first 3 partitions.
frames = [
    partition_first_occurrence_positions(DATASET_PATH, partition=part)
    for part in range(min(3, int(row['p'])))
]

# Stack the per-partition frames; with none available use an empty frame
# that still carries the expected columns.
if frames:
    first_all = pd.concat(frames, ignore_index=True)
else:
    first_all = pd.DataFrame(columns=['partition', 'position'])

if not first_all.empty:
    fig = (
        px.histogram(
            data_frame=first_all,
            x='position',
            color='partition',
            nbins=50,
            barmode='overlay',
            opacity=0.6,
            title='Posizioni di prima occorrenza (nuovi distinti)',
        )
        .update_layout(**BASE_LAYOUT, width=1200, height=650)
        .update_xaxes(title='Posizione t', fixedrange=False)
        .update_yaxes(title='Conteggio', fixedrange=False)
    )
    fig.show(config=PLOT_CONFIG)


In [10]:

# Pairwise Jaccard overlap between partitions: heatmap plus off-diagonal summary.
if overlap.empty:
    print('Overlap non disponibile')
else:
    # Square table (partition_i rows, partition_j columns) with sorted labels.
    pivot = (
        overlap.pivot(index='partition_i', columns='partition_j', values='jaccard')
        .sort_index(axis=0)
        .sort_index(axis=1)
    )
    fig = go.Figure(
        data=go.Heatmap(
            z=pivot.values,
            x=[f'p{int(col)}' for col in pivot.columns],
            y=[f'p{int(idx)}' for idx in pivot.index],
            colorscale='Viridis',
            zmin=0.0,
            zmax=1.0,
            colorbar=dict(title='Jaccard'),
            hovertemplate='i=%{y}<br>j=%{x}<br>Jaccard=%{z:.4f}<extra></extra>',
        )
    )
    fig.update_layout(**BASE_LAYOUT, width=900, height=750, title='Overlap tra partizioni (Jaccard sui distinti)')
    fig.update_xaxes(title='Partition j', fixedrange=False)
    fig.update_yaxes(title='Partition i', fixedrange=False)
    fig.show(config=PLOT_CONFIG)

    # Summary over the pairs with i != j; all zeros when there is no such pair.
    off_diag = overlap[overlap['partition_i'].ne(overlap['partition_j'])]
    if off_diag.empty:
        summary = pd.DataFrame([{
            'jaccard_offdiag_mean': 0.0,
            'jaccard_offdiag_min': 0.0,
            'jaccard_offdiag_max': 0.0,
        }])
    else:
        summary = pd.DataFrame([{
            'jaccard_offdiag_mean': float(off_diag['jaccard'].mean()),
            'jaccard_offdiag_min': float(off_diag['jaccard'].min()),
            'jaccard_offdiag_max': float(off_diag['jaccard'].max()),
        }])
    display(summary)


Unnamed: 0,jaccard_offdiag_mean,jaccard_offdiag_min,jaccard_offdiag_max
0,0.052877,0.052604,0.053291



## Interpretazione rapida

- **Compression ratio** (compresso / raw): un valore più basso implica maggiore comprimibilità del dataset.
- **Varianza per partizione**: se è bassa, indica un dataset strutturalmente uniforme tra le run.
- **Curva `F0(t)`**: se cresce in modo regolare e vicino all'atteso uniforme, il generatore sta producendo stream coerenti con il modello casuale.
- **Jaccard tra partizioni**: misura quanto i distinti si sovrappongono tra run.
- **Istogramma frequenze**: evidenzia se il carico è uniforme (atteso qui) o sbilanciato.
