In [1]:

import os
import json
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, classification_report, average_precision_score

from xgboost import XGBClassifier
import joblib


In [3]:

# Feature groups and how each group is preprocessed:
# - CONTADORES: heavy-tailed counters, passed through log1p.
# - CONTINUAS: continuous signals, optionally z-scored.
CONTADORES = ['bytes_retrans', 'bytes_acked', 'packets_lost']
CONTINUAS = ['cwnd', 'rtt', 'throughput']

# Full list of model inputs, in the column order used for training.
SELECTED_FEATURES = [
    'cwnd',
    'rtt',
    'bytes_retrans',
    'bytes_acked',
    'throughput',
    'packets_lost',
]

# Train/test split settings.
TEST_SIZE = 0.20
RANDOM_STATE = 42

# Keyword arguments forwarded verbatim to XGBClassifier.
xgb_params = {
    'tree_method': 'hist',
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'logloss',
    'n_jobs': -1,
    'random_state': RANDOM_STATE,
}



## Cargar datos

Reemplazá la celda siguiente por tu propio *loader* si usás el de `scripts/data_loader.py`.
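La celda de ejemplo recorre `../data_processed`, lee todos los CSV que encuentra y los concatena en `df_all`, agregando una columna `variant` con el nombre de la carpeta que contiene cada archivo.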


In [None]:

import os
import pandas as pd

base_dir = '../data_processed'
all_data = []

# Read every CSV found under base_dir. Sub-directories and file names are
# visited in sorted order so the row order of the concatenated frame (and thus
# the seeded train/test split computed from it) does not depend on the order in
# which the filesystem happens to list entries.
for root, dirs, files in os.walk(base_dir):
    dirs.sort()  # in-place sort steers the order in which os.walk descends
    for fname in sorted(files):
        if fname.endswith('.csv'):
            frame = pd.read_csv(os.path.join(root, fname))
            # Tag each row with the name of the folder the file was found in.
            frame['variant'] = os.path.basename(root)
            all_data.append(frame)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the directory is missing or has no CSVs.
if not all_data:
    raise FileNotFoundError(
        f"No se encontraron archivos .csv en {os.path.abspath(base_dir)!r}"
    )

# Concatenate all per-file frames into a single dataset.
df_all = pd.concat(all_data, ignore_index=True)

# Later cells read `df`. Bind it explicitly to the full dataset; previously
# `df` was only the leaked loop variable, i.e. the last CSV that was read.
df = df_all



## Etiquetado y selección de columnas

Si ya tenés una columna de etiqueta (`congestion_event`), el código la utilizará.
Si no existe, por defecto construye una etiqueta simple basada en una caída de `cwnd` (ejemplo para BBR): vale 1 cuando `cwnd` baja más de 10 unidades en la muestra siguiente y 0 en caso contrario.

In [None]:

# Label: reuse an existing `congestion_event` column, or derive a simple one
# from cwnd: 1 when the *next* sample's cwnd is lower than the current one by
# more than 10 (in whatever unit cwnd is recorded), else 0.
CWND_DROP_THRESHOLD = -10

if 'congestion_event' in df.columns:
    label = df['congestion_event']
elif 'cwnd' in df.columns:
    if 'variant' in df.columns:
        # Compute the next-sample delta inside each variant so the difference
        # never compares the last row of one variant with the first row of the
        # next one when `df` holds several concatenated traces.
        # NOTE(review): if one variant folder contains several CSVs, deltas
        # still span those file boundaries — confirm whether that matters.
        next_delta = df.groupby('variant', sort=False)['cwnd'].transform(
            lambda s: s.diff().shift(-1)
        )
    else:
        next_delta = df['cwnd'].diff().shift(-1)
    # The last row of each group has no successor (NaN delta) and is labelled 0.
    label = (next_delta < CWND_DROP_THRESHOLD).astype(int)
else:
    label = None

# Check before building the frame so the failure points at the real cause.
assert label is not None, "No se encontró la etiqueta 'congestion_event'. Añadila o ajustá esta celda."

# Keep only the selected features that are present, plus the label. Building a
# new frame (instead of adding a column to `df` in place) leaves the loaded
# data untouched.
feature_cols = [c for c in SELECTED_FEATURES if c in df.columns]
needed = feature_cols + ['congestion_event']

df = df[feature_cols].assign(congestion_event=label).dropna()
print("Usando columnas:", needed)
print("Shape después de filtrar:", df.shape)


In [None]:

# Feature matrix and target vector.
# Only the selected features actually present in `df` are used, so a dataset
# lacking one of them does not fail with KeyError here; the same tolerant
# column selection is applied when `df` is filtered and when the preprocessing
# column lists are built.
feature_cols = [c for c in SELECTED_FEATURES if c in df.columns]
missing_cols = [c for c in SELECTED_FEATURES if c not in df.columns]
if missing_cols:
    print("Aviso: features ausentes en los datos, se omiten:", missing_cols)
assert feature_cols, "Ninguna de las SELECTED_FEATURES está presente en los datos."

X = df[feature_cols].copy()
y = df['congestion_event'].astype(int)

# Stratified, seeded hold-out split so class proportions match in both parts.
# NOTE(review): rows are split individually; if consecutive rows are samples of
# the same flow, neighbouring samples end up on both sides of the split —
# consider a per-trace or time-based split. Confirm against the data layout.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

print("Train:", X_train.shape, "Test:", X_test.shape, "Positives in test:", int(y_test.sum()))


In [None]:

# Restrict each configured column group to the columns that really exist in
# the training matrix.
train_columns = set(X_train.columns)
CONTADORES_IN = [name for name in CONTADORES if name in train_columns]
CONTINUAS_IN = [name for name in CONTINUAS if name in train_columns]

# Variant A: log1p on the counter columns, every other column passed through.
pre_no_scale = ColumnTransformer(
    [('log_counts', FunctionTransformer(np.log1p, feature_names_out='one-to-one'), CONTADORES_IN)],
    remainder='passthrough',
)

# Variant B: same log1p on counters, plus standardisation of the continuous
# columns; anything else is passed through.
pre_zscore = ColumnTransformer(
    [
        ('log_counts', FunctionTransformer(np.log1p, feature_names_out='one-to-one'), CONTADORES_IN),
        ('std_cont', StandardScaler(), CONTINUAS_IN),
    ],
    remainder='passthrough',
)

# One independent XGBoost classifier per preprocessing variant, both built
# from the same hyper-parameters.
clf_no_scale = Pipeline(steps=[('pre', pre_no_scale), ('xgb', XGBClassifier(**xgb_params))])
clf_zscore = Pipeline(steps=[('pre', pre_zscore), ('xgb', XGBClassifier(**xgb_params))])


In [None]:

def _best_f1_threshold(y_true, prob):
    """Find the decision threshold that maximises binary F1 for `prob >= t`.

    Candidate thresholds are every unique value in `prob` plus 0.0, 0.5 and
    1.0. When several candidates reach the same F1 the smallest one wins.
    F1 is defined as 0 when there are neither positives nor predicted
    positives.

    All candidates are scored in one vectorised pass (sort + cumulative sum,
    O(n log n)) instead of one full metric computation per candidate.

    Returns:
        (best_threshold, best_f1, predictions_at_best_threshold)
    """
    prob = np.asarray(prob)
    is_pos = np.asarray(y_true) == 1
    thresholds = np.unique(np.concatenate(([0.0, 0.5, 1.0], prob)))

    # Sort scores once; pos_below[i] = positives among the i lowest scores.
    order = np.argsort(prob, kind='stable')
    sorted_prob = prob[order]
    pos_below = np.concatenate(([0], np.cumsum(is_pos[order])))
    n_pos = pos_below[-1]

    # For each threshold t, samples with prob >= t start at index first_kept.
    first_kept = np.searchsorted(sorted_prob, thresholds, side='left')
    n_pred_pos = prob.size - first_kept
    true_pos = n_pos - pos_below[first_kept]

    # F1 = 2·TP / (2·TP + FP + FN) = 2·TP / (predicted positives + positives).
    denom = n_pred_pos + n_pos
    f1_all = np.zeros(thresholds.size, dtype=float)
    np.divide(2.0 * true_pos, denom, out=f1_all, where=denom > 0)

    # argmax returns the first maximum, i.e. the smallest best threshold.
    best_idx = int(np.argmax(f1_all))
    best_t = thresholds[best_idx]
    return best_t, f1_all[best_idx], (prob >= best_t).astype(int)


def evaluate(clf, X_tr, y_tr, X_te, y_te, name="model"):
    """Fit `clf`, tune its decision threshold for F1 and print a report.

    Args:
        clf: estimator exposing `fit` and `predict_proba`; it is fitted in
            place on (X_tr, y_tr).
        X_tr, y_tr: training features and labels.
        X_te, y_te: evaluation features and labels.
        name: tag used in the printed summary line.

    Returns:
        dict with the fitted estimator (`clf`), the positive-class scores on
        X_te (`prob`), the selected threshold (`best_t`), the F1 reached at
        that threshold (`best_f1`) and the average precision (`ap`).

    Note:
        The threshold is selected on (X_te, y_te), the same data used to
        report F1, so `best_f1` is an optimistic estimate.
    """
    clf.fit(X_tr, y_tr)
    prob = clf.predict_proba(X_te)[:, 1]
    best_t, best_f1, best_pred = _best_f1_threshold(y_te, prob)
    ap = average_precision_score(y_te, prob)
    print(f"[{name}] Best F1: {best_f1:.4f} @ threshold={best_t:.3f} | AP={ap:.4f}")
    print(classification_report(y_te, best_pred, digits=2))
    return {'clf': clf, 'prob': prob, 'best_t': float(best_t), 'best_f1': float(best_f1), 'ap': float(ap)}

# Train and score both preprocessing variants on the same split.
res_no = evaluate(clf_no_scale, X_train, y_train, X_test, y_test, name='log1p-only')
res_z = evaluate(clf_zscore, X_train, y_train, X_test, y_test, name='zscore+log1p')

# Keep the variant with the higher tuned F1; a tie goes to the z-scored one.
if res_z['best_f1'] >= res_no['best_f1']:
    best, best_name = res_z, 'zscore+log1p'
else:
    best, best_name = res_no, 'log1p-only'
print("✅ Selected pipeline:", best_name)

# Expose the winning fitted pipeline and its decision threshold.
best_t = best['best_t']
best_clf = best['clf']


In [None]:

# Persist the selected pipeline and, alongside it, the decision threshold and
# the name of the preprocessing variant it corresponds to.
pipeline_label = 'zscore+log1p' if best_clf is clf_zscore else 'log1p-only'
threshold_meta = {'threshold': best_t, 'pipeline': pipeline_label}

joblib.dump(best_clf, 'xgb_pipeline_bbr.joblib')
with open('xgb_threshold.json', 'w') as fh:
    json.dump(threshold_meta, fh, indent=2)
print("💾 Guardado: xgb_pipeline_bbr.joblib, xgb_threshold.json")



## Notas
- **Solo se usan**: `cwnd`, `rtt`, `bytes_retrans`, `bytes_acked`, `throughput`, `packets_lost`.
- `log1p` mitiga colas pesadas/outliers en contadores; `z-score` aporta comparabilidad y estabilidad numérica en las continuas.
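- El pipeline mantiene **reproducibilidad** y evita *data leakage* en el preprocesamiento (transformaciones ajustadas solo en train). Ojo: el umbral de decisión se elige sobre el mismo conjunto de test con el que se reporta el F1, por lo que ese F1 es una estimación optimista.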
- Se sigue la receta de *feature engineering → entrenamiento* destacada en *ML for Networking*.
