In [5]:
# =========================
# Setup y Configuración
# =========================
import os
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

import joblib
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sqlalchemy import create_engine

# --- Main parameters ---
TARGET_COL = "packets"
# "next_step" forecasts the value at the flow's next record;
# "final_total" predicts the flow's final size.
PREDICTION_MODE = "next_step"
# Only used by "final_total": how many leading rows of each flow are kept.
PREFIX_LEN = 3
N_SPLITS = 5
RANDOM_STATE = 42

# Flow identity (adjust if a different identifier is used).
FLOW_KEYS: List[str] = [
    "src_ip",
    "dst_ip",
    "src_port",
    "dst_port",
    "protocol",
]

# Binary columns (TCP flags).
FLAG_COLS = ["fin", "syn", "rst", "psh", "ack", "urg"]

# Technical note: using Random Forest-style ensembles for regression on networking
# problems is widely documented in the ML-for-networking literature
# (see "ml for networking.pdf").


In [6]:
# =========================
# Carga de Datos
# =========================
# Reemplaza esta celda con tu origen real:
#   - SQL: df = pd.read_sql("SELECT * FROM flow_metrics_logs ORDER BY ts ASC", engine)
#   - CSV: df = pd.read_csv("data/flow_metrics_logs.csv")

@dataclass
class DBConfig:
    """Database connection settings sourced from environment variables.

    Each default is resolved when an instance is created (not when the class
    is defined), so environment variables that are set or loaded after this
    cell has run are still picked up by a later ``DBConfig()`` call.
    Any field can also be passed explicitly to override the environment.

    Fields without a fallback are ``None`` when their variable is unset.
    """
    # DB_HOST
    host: Optional[str] = field(default_factory=lambda: os.getenv("DB_HOST"))
    # DB_PORT (kept as the raw string from the environment)
    port: Optional[str] = field(default_factory=lambda: os.getenv("DB_PORT"))
    # DB_NAME
    db: Optional[str] = field(default_factory=lambda: os.getenv("DB_NAME"))
    # DB_USER
    user: Optional[str] = field(default_factory=lambda: os.getenv("DB_USER"))
    # DB_PASSWORD
    pwd: Optional[str] = field(default_factory=lambda: os.getenv("DB_PASSWORD"))
    # DATA_TABLE, falling back to "flow_metrics_logs"
    table: str = field(default_factory=lambda: os.getenv("DATA_TABLE", "flow_metrics_logs"))

def make_engine(cfg: "DBConfig"):
    """Create a SQLAlchemy engine for PostgreSQL (psycopg2 driver) from ``cfg``.

    The user name and password are percent-encoded before being placed in the
    URL, so characters such as ``@``, ``/`` or ``:`` in the credentials do not
    corrupt URL parsing.

    Args:
        cfg: Object exposing ``user``, ``pwd``, ``host``, ``port`` and ``db``.

    Returns:
        Whatever ``create_engine`` returns for the assembled URL.

    Raises:
        ValueError: If any of the required settings is ``None``; otherwise the
            literal text "None" would end up inside the connection URL.
    """
    from urllib.parse import quote  # stdlib; imported locally to keep the cell self-contained

    required = {
        "user": cfg.user,
        "pwd": cfg.pwd,
        "host": cfg.host,
        "port": cfg.port,
        "db": cfg.db,
    }
    missing = [name for name, value in required.items() if value is None]
    if missing:
        raise ValueError(
            "Missing database settings: " + ", ".join(missing)
            + ". Check the DB_* environment variables."
        )

    # safe="" encodes every reserved character, including "/" which quote() keeps by default.
    user = quote(str(cfg.user), safe="")
    pwd = quote(str(cfg.pwd), safe="")
    url = f"postgresql+psycopg2://{user}:{pwd}@{cfg.host}:{cfg.port}/{cfg.db}"
    return create_engine(url)

# Load variables from a local .env file, if one exists. By default this does not
# override variables that are already set in the process environment.
load_dotenv()

cfg = DBConfig()
engine = make_engine(cfg)

# cfg.table is interpolated as an SQL identifier (identifiers cannot be bound
# as query parameters), so it must come from trusted configuration only.
# Note the comma between loss_est_pkts and dup_acks_est: without it SQL treats
# the second name as an alias of the first and one column is silently lost.
Q = f"""
SELECT
  ts, pid, src_ip, dst_ip, src_port, dst_port, protocol,
  flow_let, last_timestamp_ns, delta_ns, packets, bytes,
  fin, syn, rst, psh, ack, urg, throughput,
  loss_est_pkts, dup_acks_est
FROM {cfg.table}
ORDER BY ts ASC
"""

df = pd.read_sql(Q, engine)
df.head()




In [None]:
# =========================
# Type and Value Cleanup
# =========================
numeric_like = [
    "delta_ns",
    "bytes",
    "throughput",
    "loss_est_pkts",
    "dup_acks_est",
    "last_timestamp_ns",
    TARGET_COL,
]

# Coerce numeric-looking columns; values that cannot be parsed become NaN.
for name in numeric_like:
    if name not in df.columns:
        continue
    df[name] = pd.to_numeric(df[name], errors="coerce")

# Flags: missing values count as 0, then cast to int.
for name in FLAG_COLS:
    if name not in df.columns:
        continue
    df[name] = df[name].fillna(0).astype(int)

# Rows without a timestamp are discarded.
df = df.dropna(subset=["ts"])
df.head()




In [None]:
# =========================
# Funciones: Objetivo y Utilidades
# =========================
def build_next_step_target(_df: pd.DataFrame, group_keys: List[str], target: str) -> pd.DataFrame:
    """Attach a next-record target and a one-record lag, computed per flow.

    Rows are used in the order they appear in ``_df``; this function does not
    sort, so each flow's rows are assumed to already be in the intended order.

    Args:
        _df: Input frame; it is not modified.
        group_keys: Columns identifying a flow.
        target: Column whose next value becomes the target.

    Returns:
        A copy of ``_df`` with ``y`` (the flow's next ``target`` value) and
        ``lag_<target>`` (the flow's previous ``target`` value), without the
        last row of each flow, which has no next value.
    """
    flows = _df.groupby(group_keys, sort=False)
    per_flow_target = flows[target]

    result = _df.copy()
    result["y"] = per_flow_target.shift(-1)
    result["lag_" + target] = per_flow_target.shift(1)

    # A row is the last of its flow when its 0-based position equals size - 1.
    position = flows.cumcount(ascending=True)
    final_position = per_flow_target.transform("size") - 1
    is_last = position == final_position

    return result[~is_last].copy()

def build_final_total_target(_df: pd.DataFrame, group_keys: List[str], target: str, prefix_len: int) -> pd.DataFrame:
    """Attach each flow's final ``target`` value and keep only its first rows.

    Args:
        _df: Input frame; it is not modified.
        group_keys: Columns identifying a flow.
        target: Column whose per-flow "last" value becomes the target.
        prefix_len: Number of leading rows to keep for each flow.

    Returns:
        A copy of ``_df`` with ``y`` (the groupby ``"last"`` of ``target`` for
        the row's flow) and ``rank_in_flow`` (1-based position within the
        flow), restricted to rows with ``rank_in_flow <= prefix_len``.
    """
    flows = _df.groupby(group_keys, sort=False)

    result = _df.copy()
    result["y"] = flows[target].transform("last")
    result["rank_in_flow"] = flows.cumcount() + 1

    in_prefix = result["rank_in_flow"] <= prefix_len
    return result.loc[in_prefix].copy()

def safe_rate(numer: pd.Series, denom_ns: pd.Series) -> pd.Series:
    """Divide ``numer`` by a duration given in nanoseconds, expressed in seconds.

    Zero durations are turned into NaN before dividing, so those positions
    yield NaN instead of an infinite rate.
    """
    seconds = denom_ns.astype("float64") / 1e9
    # Blank out zero-length intervals.
    seconds = seconds.mask(seconds == 0)
    return numer / seconds


In [None]:
# =========================
# Working Dataset Construction
# =========================
if PREDICTION_MODE not in ("next_step", "final_total"):
    raise ValueError("PREDICTION_MODE debe ser 'next_step' o 'final_total'.")

if PREDICTION_MODE == "next_step":
    print(f'Total de muestras: {len(df)}')
    work_df = build_next_step_target(df, FLOW_KEYS, TARGET_COL)
    print(f"Total muestras después de build_next_step_target: {len(work_df)}")
    print(work_df.head())
else:
    work_df = build_final_total_target(df, FLOW_KEYS, TARGET_COL, PREFIX_LEN)

# Per-flow derived features: first differences (0 on a flow's first row) ...
g = work_df.groupby(FLOW_KEYS, sort=False)
for base in ("packets", "bytes", "last_timestamp_ns"):
    if base not in work_df.columns:
        continue
    work_df["d_" + base] = g[base].diff().fillna(0)

# ... and rates over delta_ns (0 where the rate is undefined).
for rate_name, diff_name in (("pps", "d_packets"), ("bps", "d_bytes")):
    if {diff_name, "delta_ns"}.issubset(work_df.columns):
        work_df[rate_name] = safe_rate(work_df[diff_name], work_df["delta_ns"]).fillna(0)

# Rows without a target cannot be used for training.
work_df = work_df.dropna(subset=["y"]).reset_index(drop=True)
work_df.head()






In [None]:
# =========================
# Feature Selection
# =========================
# Categorical columns (one-hot encoded later).
cat_features = [name for name in ["protocol"] if name in work_df.columns]

num_features_base = [
    "delta_ns", "bytes", "throughput", "loss_est_pkts", "dup_acks_est",
    "d_packets", "d_bytes", "pps", "bps", "src_port", "dst_port", "flow_let",
]
num_features = [name for name in num_features_base if name in work_df.columns]

# The target's lag is a feature only in next_step mode.
if PREDICTION_MODE == "next_step":
    lag_col = "lag_" + TARGET_COL
    if lag_col in work_df.columns:
        num_features.append(lag_col)

bin_features = [name for name in FLAG_COLS if name in work_df.columns]

# Never feed the model TARGET_COL itself, the target, helper columns,
# the flow identity keys or the timestamp.
drop_cols = {TARGET_COL, "y", "rank_in_flow"}
basic_cols = set(FLOW_KEYS + ["ts"])

# Keep a candidate only if it is not excluded and has at least one non-null value.
X_cols = [
    name
    for name in (num_features + bin_features + cat_features)
    if name not in drop_cols
    and name not in basic_cols
    and work_df[name].notna().any()
]

X_cols




In [None]:
# =========================
# Pipeline (Preprocessing + Model)
# =========================
# "protocol" is the only column routed to the one-hot encoder; everything else is scaled.
categorical_cols = [name for name in X_cols if name in ["protocol"]]
numeric_cols = [name for name in X_cols if name not in ["protocol"]]

pre = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=False), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), categorical_cols),
    ],
    remainder="drop",
)

model = RandomForestRegressor(
    n_estimators=300, max_depth=None, random_state=RANDOM_STATE, n_jobs=-1
)

pipe = Pipeline(steps=[("pre", pre), ("model", model)])

pipe




In [None]:
# =========================
# Cross-Validation Grouped by Flow
# =========================
y = work_df["y"].astype("float64").values
# One integer id per flow, so all rows of a flow land in the same fold.
groups = work_df.groupby(FLOW_KEYS, sort=False).ngroup().values

gkf = GroupKFold(n_splits=N_SPLITS)
fold_metrics: List[Dict] = []
y_true_all, y_pred_all = [], []

for fold, (tr, te) in enumerate(gkf.split(work_df[X_cols], y, groups=groups), 1):
    X_tr = work_df.iloc[tr][X_cols]
    X_te = work_df.iloc[te][X_cols]
    y_tr = y[tr]
    y_te = y[te]

    pipe.fit(X_tr, y_tr)
    pred = pipe.predict(X_te)

    mae  = mean_absolute_error(y_te, pred)
    # RMSE as sqrt(MSE): the `squared=False` argument was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so it is avoided here.
    rmse = np.sqrt(mean_squared_error(y_te, pred))
    r2   = r2_score(y_te, pred)

    fold_metrics.append({"fold": fold, "MAE": mae, "RMSE": rmse, "R2": r2})
    y_true_all.append(y_te)
    y_pred_all.append(pred)

# Global metrics over the concatenated out-of-fold predictions.
y_true_all = np.concatenate(y_true_all) if len(y_true_all) else np.array([])
y_pred_all = np.concatenate(y_pred_all) if len(y_pred_all) else np.array([])

print("=== Resultados CV (GroupKFold por flujo) ===")
for m in fold_metrics:
    print(f"[FOLD {m['fold']}] MAE={m['MAE']:.4f}  RMSE={m['RMSE']:.4f}  R2={m['R2']:.4f}")

if y_true_all.size:
    mae  = mean_absolute_error(y_true_all, y_pred_all)
    rmse = np.sqrt(mean_squared_error(y_true_all, y_pred_all))
    r2   = r2_score(y_true_all, y_pred_all)
    print(f"[GLOBAL] MAE={mae:.4f}  RMSE={rmse:.4f}  R2={r2:.4f}")
else:
    print("No se acumularon predicciones; revisa que haya suficientes datos por flujo.")




In [None]:
# =========================
# Final Training and Export
# =========================
# Refit the pipeline on every available row before persisting it.
pipe.fit(work_df[X_cols], work_df["y"])

# Bundle the fitted pipeline with the settings needed to rebuild its inputs.
artifact = {
    "pipeline": pipe,
    "x_cols": X_cols,
    "flow_keys": FLOW_KEYS,
    "target": TARGET_COL,
    "mode": PREDICTION_MODE,
    "prefix_len": PREFIX_LEN,
}

joblib.dump(
    artifact,
    f"regressor_flow_{TARGET_COL}.joblib",
)
print(f"Modelo guardado en regressor_flow_{TARGET_COL}.joblib")




In [None]:
# =========================
# Función para reentrenar con otro objetivo
# =========================
def train_flow_regressor(
    df_in: pd.DataFrame,
    target_col: str = "packets",
    mode: str = "next_step",
    prefix_len: int = 3,
    n_splits: int = 5,
    random_state: int = 42
) -> Dict:
    """Build features, cross-validate by flow and fit a Random Forest regressor.

    Flows are identified by the module-level ``FLOW_KEYS``; flag columns come
    from the module-level ``FLAG_COLS``.

    Args:
        df_in: Raw flow records. Must contain ``ts`` and the ``FLOW_KEYS``
            columns. It is not modified.
        target_col: Column to predict.
        mode: ``"next_step"`` (target is the flow's next value of
            ``target_col``) or ``"final_total"`` (target is the flow's last
            value, using only the first ``prefix_len`` rows of each flow).
        prefix_len: Rows kept per flow in ``"final_total"`` mode.
        n_splits: Number of ``GroupKFold`` folds.
        random_state: Seed passed to the Random Forest.

    Returns:
        Dict with the fitted ``pipeline`` (refit on all rows), ``x_cols``,
        ``flow_keys``, ``target``, ``mode``, ``prefix_len``, per-fold
        ``cv_metrics`` and ``global_metrics`` (MAE, RMSE, R2 over all
        out-of-fold predictions).

    Raises:
        ValueError: If ``mode`` is not one of the two supported values.
    """
    # 1) Drop rows without a timestamp (they cannot be ordered within a flow),
    #    then sort chronologically per flow and coerce dtypes.
    df2 = (
        df_in.dropna(subset=["ts"])
        .sort_values(FLOW_KEYS + ["ts"])
        .reset_index(drop=True)
    )
    for c in ["delta_ns","bytes","throughput","loss_est_pkts","dup_acks_est","last_timestamp_ns",target_col]:
        if c in df2.columns:
            df2[c] = pd.to_numeric(df2[c], errors="coerce")
    for c in FLAG_COLS:
        if c in df2.columns:
            df2[c] = df2[c].fillna(0).astype(int)

    # 2) Target
    if mode == "next_step":
        wk = build_next_step_target(df2, FLOW_KEYS, target_col)
    elif mode == "final_total":
        wk = build_final_total_target(df2, FLOW_KEYS, target_col, prefix_len)
    else:
        raise ValueError("mode debe ser 'next_step' o 'final_total'.")

    # 3) Per-flow derived features: first differences and rates over delta_ns.
    g = wk.groupby(FLOW_KEYS, sort=False)
    for col in ["packets","bytes","last_timestamp_ns"]:
        if col in wk.columns:
            wk["d_"+col] = g[col].diff().fillna(0)
    if {"d_packets","delta_ns"}.issubset(wk.columns):
        wk["pps"] = safe_rate(wk["d_packets"], wk["delta_ns"]).fillna(0)
    if {"d_bytes","delta_ns"}.issubset(wk.columns):
        wk["bps"] = safe_rate(wk["d_bytes"], wk["delta_ns"]).fillna(0)
    wk = wk.dropna(subset=["y"]).reset_index(drop=True)

    cat_features = [c for c in ["protocol"] if c in wk.columns]
    num_features = [
        c for c in [
            "delta_ns","bytes","throughput","loss_est_pkts","dup_acks_est",
            "d_packets","d_bytes","pps","bps","src_port","dst_port","flow_let",
            # The target's lag only exists (and is only used) in next_step mode.
            ("lag_"+target_col if mode == "next_step" else None)
        ] if c and c in wk.columns
    ]
    bin_features = [c for c in FLAG_COLS if c in wk.columns]

    # Exclude the target itself, helper columns, flow keys, the timestamp,
    # and any column that is entirely null.
    drop_cols = set([target_col, "y", "rank_in_flow"])
    basic_cols = set(FLOW_KEYS + ["ts"])
    X_cols_local = [c for c in (num_features + bin_features + cat_features)
                    if c not in drop_cols and c not in basic_cols and wk[c].notna().any()]

    numeric_cols = [c for c in X_cols_local if c not in ["protocol"]]
    categorical_cols = [c for c in X_cols_local if c in ["protocol"]]

    pre = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(with_mean=False), numeric_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), categorical_cols),
        ],
        remainder="drop"
    )

    model = RandomForestRegressor(
        n_estimators=300, max_depth=None, random_state=random_state, n_jobs=-1
    )
    pipe = Pipeline([("pre", pre), ("model", model)])

    # 4) Cross-validation with whole flows kept inside a single fold.
    y_local = wk["y"].astype("float64").values
    groups = wk.groupby(FLOW_KEYS, sort=False).ngroup().values
    gkf = GroupKFold(n_splits=n_splits)

    fold_metrics, y_true_all, y_pred_all = [], [], []
    for fold, (tr, te) in enumerate(gkf.split(wk[X_cols_local], y_local, groups=groups), 1):
        pipe.fit(wk.iloc[tr][X_cols_local], y_local[tr])
        pred = pipe.predict(wk.iloc[te][X_cols_local])
        mae  = mean_absolute_error(y_local[te], pred)
        # RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4
        # and removed in 1.6, so it is avoided here.
        rmse = np.sqrt(mean_squared_error(y_local[te], pred))
        r2   = r2_score(y_local[te], pred)
        fold_metrics.append({"fold": fold, "MAE": mae, "RMSE": rmse, "R2": r2})
        y_true_all.append(y_local[te])
        y_pred_all.append(pred)

    if y_true_all:
        y_true_all = np.concatenate(y_true_all)
        y_pred_all = np.concatenate(y_pred_all)
        mae  = mean_absolute_error(y_true_all, y_pred_all)
        rmse = np.sqrt(mean_squared_error(y_true_all, y_pred_all))
        r2   = r2_score(y_true_all, y_pred_all)
    else:
        mae = rmse = r2 = np.nan

    # 5) Final fit on every row.
    pipe.fit(wk[X_cols_local], wk["y"])
    artifact = {
        "pipeline": pipe,
        "x_cols": X_cols_local,
        "flow_keys": FLOW_KEYS,
        "target": target_col,
        "mode": mode,
        "prefix_len": prefix_len,
        "cv_metrics": fold_metrics,
        "global_metrics": {"MAE": mae, "RMSE": rmse, "R2": r2},
    }
    return artifact




In [None]:
# =========================
# Ejemplo: entrenar para "bytes"
# =========================
# artifact_bytes = train_flow_regressor(df, target_col="bytes", mode="next_step", prefix_len=3)
# joblib.dump(artifact_bytes, "regressor_flow_bytes.joblib")
# artifact_bytes["global_metrics"]
