In [10]:
import os
from urllib.parse import quote_plus
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

HOST = os.getenv("DB_HOST")
PORT = os.getenv("DB_PORT")
DB   = os.getenv("DB_NAME")
USER = os.getenv("DB_USER")
PASS = quote_plus(os.getenv("DB_PASSWORD") or "")  # por si hay caracteres especiales

engine = create_engine(f"postgresql+psycopg2://{USER}:{PASS}@{HOST}:{PORT}/{DB}")
Q = """
SELECT ts, src_ip, dst_ip, src_port, dst_port, protocol,
       throughput_bps_t, pps_t, y_bps_next_1s
FROM training_bps_h1s_v3
WHERE y_bps_next_1s IS NOT NULL
ORDER BY ts;
"""
df = pd.read_sql(Q, engine)

# ID de flujo direccional (string-safe incluso si src_ip/dst_ip vienen como inet/IPv4Address)
df["flow_id"] = (df["src_ip"].astype(str)+":"+df["src_port"].astype(str)+">"+
                 df["dst_ip"].astype(str)+":"+df["dst_port"].astype(str)+"/"+
                 df["protocol"].astype(str))

# sanity check rápido
print(df.shape, df.head(3))


(15442, 10)                          ts    src_ip    dst_ip  src_port  dst_port  protocol  \
0 2025-09-05 20:58:12+00:00  10.0.0.1  10.0.0.2      5201     55886         6   
1 2025-09-05 20:58:12+00:00  10.0.0.1  10.0.0.2      5201     55884         6   
2 2025-09-05 20:58:13+00:00  10.0.0.1  10.0.0.2      5201     55886         6   

   throughput_bps_t     pps_t  y_bps_next_1s                         flow_id  
0           9420.70  876302.0         138.82  10.0.0.1:5201>10.0.0.2:55886/6  
1              0.00       3.0           0.00  10.0.0.1:5201>10.0.0.2:55884/6  
2            138.82  100020.0          36.72  10.0.0.1:5201>10.0.0.2:55886/6  


In [11]:
def add_lags_rolls(g):
    g = g.sort_values("ts")
    for k in [1,2,3,5]:
        g[f"thr_lag{k}"] = g["throughput_bps_t"].shift(k)
        g[f"pps_lag{k}"] = g["pps_t"].shift(k)
    g["thr_ma_5"]   = g["throughput_bps_t"].rolling(5, min_periods=1).mean()
    g["thr_std_5"]  = g["throughput_bps_t"].rolling(5, min_periods=2).std()
    g["pps_ma_5"]   = g["pps_t"].rolling(5, min_periods=1).mean()
    g["thr_slope5"] = g["throughput_bps_t"].diff(5) / 5.0
    return g

df = df.groupby("flow_id", group_keys=False).apply(add_lags_rolls).dropna()

FEATURES = [
  "throughput_bps_t","pps_t",
  "thr_lag1","thr_lag2","thr_lag3","thr_lag5",
  "pps_lag1","pps_lag2","pps_lag3","pps_lag5",
  "thr_ma_5","thr_std_5","pps_ma_5","thr_slope5"
]
TARGET = "y_bps_next_1s"
X, y = df[FEATURES].values, df[TARGET].values


  df = df.groupby("flow_id", group_keys=False).apply(add_lags_rolls).dropna()


In [12]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

cut = int(len(df)*0.8)
X_tr, y_tr = X[:cut], y[:cut]
X_va, y_va = X[cut:],  y[cut:]

model = XGBRegressor(
    objective="reg:squarederror", tree_method="hist",
    n_estimators=700, learning_rate=0.05, max_depth=6,
    subsample=0.8, colsample_bytree=0.8, random_state=42
)
model.fit(X_tr, y_tr, eval_set=[(X_va,y_va)], verbose=False)

pred = model.predict(X_va)

mae  = mean_absolute_error(y_va, pred)
mse  = mean_squared_error(y_va, pred)     # sin 'squared'
rmse = np.sqrt(mse)

mask = y_va > 0
rpe = float(np.mean(np.abs((pred[mask]-y_va[mask]) / y_va[mask]))) if mask.any() else np.nan

print(f"VAL h=1s  MAE={mae:,.0f} bps | RMSE={rmse:,.0f} bps | RPE(y>0)={100*rpe:.2f}%")


VAL h=1s  MAE=109 bps | RMSE=517 bps | RPE(y>0)=14709.43%


In [13]:
# tomá la última fila de un flujo activo para que tenga contexto útil
last_active = df[df["throughput_bps_t"]>0].sort_values("ts").iloc[-1]
yhat_next = model.predict(last_active[FEATURES].values.reshape(1,-1))[0]
print(f"Predicción next 1s ({last_active['flow_id']} @ {last_active['ts']}): {yhat_next/1e6:.2f} Mbps")


Predicción next 1s (10.0.0.1:5201>10.0.0.2:55096/6 @ 2025-09-05 23:05:36+00:00): 0.01 Mbps


In [None]:
#VALIDACION

row = last_active  # la que usaste para predecir
y_true = row["y_bps_next_1s"]
y_pred = yhat_next
abs_err_bps = abs(y_true - y_pred)
rel_err = abs_err_bps / y_true if y_true > 0 else np.nan

print(f"GT next 1s: {y_true/1e6:.2f} Mbps  |  Pred: {y_pred/1e6:.2f} Mbps")
print(f"AbsErr: {abs_err_bps/1e6:.2f} Mbps  |  RelErr: {100*rel_err:.2f}%")
