In [None]:
import pandas as pd
import numpy as np
import psycopg2
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import joblib
import os
from dotenv import load_dotenv
load_dotenv()


CONN = dict(
    host=os.getenv("DB_HOST"),
    port=os.getenv("DB_PORT"),
    dbname=os.getenv("DB_NAME"),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
)
# ---------- 2.1 Carga ----------
QUERY = """
SELECT ts_1s AS ts, src_ip, dst_ip, src_port, dst_port, protocol,
       throughput_bps_t, pps_t, y_bps_next_1s
FROM training_bps_h1s
WHERE ts_1s IS NOT NULL
ORDER BY ts_1s;
"""
with psycopg2.connect(**CONN) as con:
    df = pd.read_sql(QUERY, con)

# clave de flujo
df["flow_id"] = (df["src_ip"].astype(str) + ":" + df["src_port"].astype(str) + ">" +
                 df["dst_ip"].astype(str) + ":" + df["dst_port"].astype(str) + "/" +
                 df["protocol"].astype(str))


# ---------- 2.2 Lags y ventanas por flujo ----------
def add_lags_rolls(g):
    g = g.sort_values("ts")
    for k in [1,2,3,5]:
        g[f"throughput_lag{k}"] = g["throughput_bps_t"].shift(k)
        g[f"pps_lag{k}"]        = g["pps_t"].shift(k)
    # rolling sobre pasado (window=5s)
    g["thr_ma_5"]  = g["throughput_bps_t"].rolling(window=5, min_periods=1).mean()
    g["thr_std_5"] = g["throughput_bps_t"].rolling(window=5, min_periods=2).std()
    g["pps_ma_5"]  = g["pps_t"].rolling(window=5, min_periods=1).mean()
    # pendiente (slope) simple sobre 5s
    g["thr_slope_5"] = g["throughput_bps_t"].diff(5) / 5.0
    return g

df = df.groupby("flow_id", group_keys=False).apply(add_lags_rolls)

# quitar filas con NaN por lags al inicio
df = df.dropna()

FEATURES = [
    "throughput_bps_t","pps_t",
    "throughput_lag1","throughput_lag2","throughput_lag3","throughput_lag5",
    "pps_lag1","pps_lag2","pps_lag3","pps_lag5",
    "thr_ma_5","thr_std_5","pps_ma_5","thr_slope_5"
]
TARGET = "y_bps_next_1s"
X = df[FEATURES].values
y = df[TARGET].values


OperationalError: connection to server at "localhost" (::1), port 15432 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 15432 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?


In [3]:
# split temporal: último 20% como validación
cut = int(len(df) * 0.8)
X_tr, y_tr = X[:cut], y[:cut]
X_va, y_va = X[cut:], y[cut:]

model = XGBRegressor(
    objective="reg:squarederror",
    tree_method="hist",
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X_tr, y_tr,
          eval_set=[(X_va, y_va)],
          verbose=False)

pred_va = model.predict(X_va)

mae  = mean_absolute_error(y_va, pred_va)

# Compatibilidad con cualquier versión de sklearn
try:
    rmse = mean_squared_error(y_va, pred_va, squared=False)
except TypeError:
    rmse = np.sqrt(mean_squared_error(y_va, pred_va))

print(f"VAL 1s → MAE={mae:,.0f} bps  |  RMSE={rmse:,.0f} bps")


joblib.dump(model, "xgb_bps_h1s.joblib")

VAL 1s → MAE=1,222,011,563 bps  |  RMSE=2,984,995,950 bps


['xgb_bps_h1s.joblib']

In [4]:
# tomemos el último timestamp de un flujo cualquiera (o uno en particular)
flow = df["flow_id"].iloc[-1]
last_row = df[df["flow_id"]==flow].sort_values("ts").iloc[-1]

x_now = last_row[FEATURES].values.reshape(1, -1)
yhat_1s = model.predict(x_now)[0]
print(f"Predicción throughput (siguiente 1s) para {flow}: {yhat_1s:,.0f} bps")


Predicción throughput (siguiente 1s) para 10.0.0.1:5201>10.0.0.2:58628/6: 114 bps
