# Setup

In [54]:

# Imports  
import requests  
import mimetypes  
import math  
from typing import List, Optional  

import pandas as pd  
import numpy as np  
import seaborn as sns  
import matplotlib.pyplot as plt  
import joblib  

from xgboost import XGBClassifier, XGBRegressor  
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV  
from sklearn.metrics import (precision_recall_curve, mean_absolute_error, mean_squared_error,  
                             r2_score, classification_report, confusion_matrix, accuracy_score,  
                             roc_curve, auc, f1_score)  
from sklearn.base import BaseEstimator, TransformerMixin  
from sklearn.compose import ColumnTransformer  
from sklearn.preprocessing import OneHotEncoder, StandardScaler  
from sklearn.exceptions import NotFittedError  
from sklearn.utils.validation import check_is_fitted  

from scipy.spatial import cKDTree  

from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE  


In [55]:
def notificar_telegram(
    extra_msg: Optional[str] = None,
    media_files: Optional[List[str]] = None
):
    """
    Envía un mensaje por Telegram y opcionalmente adjunta archivos.

    Parámetros:
    - extra_msg: Texto adicional. Si se proporciona, se coloca
      dos renglones después del mensaje base.
    - media_files: Lista de rutas de fichero a enviar
      (imágenes, CSV, PDF, etc.).
    """
    BOT_TOKEN = "7812752828:AAFiX6Fi8I-LCspDwwced4Vg9W3VgAxLwMM"
    CHAT_ID   = "7914812338"
    BASE_MSG  = "✅ Código completado correctamente."

    # Construir el texto final
    if extra_msg:
        text = f"{BASE_MSG}\n\n{extra_msg}"
    else:
        text = BASE_MSG

    # 1) Enviar el texto
    url_text = f"https://api.telegram.org/bot{BOT_TOKEN}/sendMessage"
    resp = requests.post(url_text, data={"chat_id": CHAT_ID, "text": text})
    if resp.status_code != 200:
        print("❌ Error al enviar mensaje de texto:", resp.text)
    else:
        print("✅ Mensaje de texto enviado.")

    # 2) Enviar archivos uno a uno si los hay
    if media_files:
        for filepath in media_files:
            mime_type, _ = mimetypes.guess_type(filepath)
            # Elegir endpoint según tipo
            if mime_type and mime_type.startswith("image/"):
                url_media = f"https://api.telegram.org/bot{BOT_TOKEN}/sendPhoto"
                key = "photo"
            else:
                # Para tablas (CSV), documentos (PDF, XLSX...) o cualquier otro
                url_media = f"https://api.telegram.org/bot{BOT_TOKEN}/sendDocument"
                key = "document"

            with open(filepath, "rb") as f:
                files = { key: f }
                data  = { "chat_id": CHAT_ID }
                resp = requests.post(url_media, data=data, files=files)
                if resp.status_code != 200:
                    print(f"❌ Error al enviar {filepath}:", resp.text)
                else:
                    print(f"✅ Archivo enviado: {filepath}")


In [56]:
# %%  
# Bloque 2: Funciones de normalización y conversión  
def convert_service(service: str) -> str:
    aereo_service = [
        "1 kg","2 kg","3 kg","4 kg","5 kg","6 kg","7 kg","8 kg","9 kg",
        "10 kg","11 kg","12 kg","13 kg","14 kg","15 kg","16 kg","17 kg",
        "19 kg","20 kg","21 kg","22 kg","24 kg","25 kg","26 kg","27 kg",
        "FEDEX_EXPRESS_SAVER","FEDEX_EXPRESS_SAVER_Z1","FEDEX_EXPRESS_SAVER_Z2",
        "FEDEX_EXPRESS_SAVER_Z3","FEDEX_EXPRESS_SAVER_Z4","FEDEX_EXPRESS_SAVER_Z5",
        "FEDEX_EXPRESS_SAVER_Z6","FEDEX_EXPRESS_SAVER_Z7","FEDEX_EXPRESS_SAVER_Z8",
        "UPS_STANDAR","UPS_SAVER","Standard","standard",
        "STANDARD_ECOMMERCE_Z1","STANDARD_ECOMMERCE_Z2","STANDARD_ECOMMERCE_Z3",
        "STANDARD_ECOMMERCE_Z4","STANDARD_ECOMMERCE_Z5","STANDARD_ECOMMERCE_Z6",
        "STANDARD_ECOMMERCE_Z7","STANDARD_ECOMMERCE_Z8","STANDARD_OVERNIGHT",
        "STANDARD_OVERNIGHT_Z4","STANDARD_OVERNIGHT_Z6",
        "STANDARD_SPECIAL_Z1","STANDARD_SPECIAL_Z2","STANDARD_SPECIAL_Z3",
        "STANDARD_SPECIAL_Z4","STANDARD_SPECIAL_Z5","STANDARD_SPECIAL_Z6",
        "STANDARD_SPECIAL_Z7","STANDARD_Z1","STANDARD_Z2","STANDARD_Z3",
        "STANDARD_Z4","STANDARD_Z5","EXPRESS DOMESTIC","ECONOMY SELECT DOMESTIC",
        "EXPRESS_SPECIAL_Z1","EXPRESS_SPECIAL_Z2","EXPRESS_SPECIAL_Z3",
        "EXPRESS_SPECIAL_Z4","EXPRESS_SPECIAL_Z5","EXPRESS_SPECIAL_Z6",
        "EXPRESS_SPECIAL_Z7","EXPRESS_ECOMMERCE_Z1","EXPRESS_ECOMMERCE_Z2",
        "EXPRESS_ECOMMERCE_Z3","EXPRESS_ECOMMERCE_Z4","EXPRESS_ECOMMERCE_Z5",
        "EXPRESS_ECOMMERCE_Z6","EXPRESS_ECOMMERCE_Z7","EXPRESS_ECOMMERCE_Z8",
        "Terrestre","Dia Sig.","nextday","economico","Metropoli",
        "ground","saver","pickup","SENDEX"
    ]
    return "aereo" if service in aereo_service else "terrestre"

def convert_carrier(carrier: str) -> str:
    carrier_convert = {
        "Afimex": "Afimex",
        "buho": "Buho",
        "DHL": "DHL",
        "Estafeta": "Estafeta",
        "FDXM": "Fedex",
        "FEDEX MEXICO": "Fedex",
        "FEDEX": "Fedex",
        "fedex": "Fedex",
        "JT Express": "JT Express",
        "JTEX": "JT Express",
        "Paquetexpress": "Paquetexpress",
        "PAQUETEXPRESS": "Paquetexpress",
        "SENDEX": "Sendex",
        "UPS": "UPS"
    }
    return carrier_convert.get(carrier, carrier)

def convert_estado(cp: int) -> str:
    estados = [
        {'name':'Ciudad de México',   'min':1000,  'max':16900},
        {'name':'Aguascalientes',     'min':20000, 'max':20997},
        {'name':'Baja California',    'min':21000, 'max':22997},
        {'name':'Baja California Sur','min':23000, 'max':23997},
        {'name':'Campeche',           'min':24000, 'max':24940},
        {'name':'Coahuila',           'min':25000, 'max':27999},
        {'name':'Colima',             'min':28000, 'max':28989},
        {'name':'Chiapas',            'min':29000, 'max':30997},
        {'name':'Chihuahua',          'min':31000, 'max':33997},
        {'name':'Durango',            'min':34000, 'max':35987},
        {'name':'Guanajuato',         'min':36000, 'max':38997},
        {'name':'Guerrero',           'min':39000, 'max':41998},
        {'name':'Hidalgo',            'min':42000, 'max':43998},
        {'name':'Jalisco',            'min':44100, 'max':49996},
        {'name':'México',             'min':50000, 'max':57950},
        {'name':'Michoacán',          'min':58000, 'max':61998},
        {'name':'Morelos',            'min':62000, 'max':62996},
        {'name':'Nayarit',            'min':63000, 'max':63996},
        {'name':'Nuevo León',         'min':64000, 'max':67996},
        {'name':'Oaxaca',             'min':68000, 'max':71998},
        {'name':'Puebla',             'min':72000, 'max':75997},
        {'name':'Querétaro',          'min':76000, 'max':76998},
        {'name':'Quintana Roo',       'min':77000, 'max':77997},
        {'name':'San Luis Potosí',    'min':78000, 'max':79998},
        {'name':'Sinaloa',            'min':80000, 'max':82996},
        {'name':'Sonora',             'min':83000, 'max':85994},
        {'name':'Tabasco',            'min':86000, 'max':86998},
        {'name':'Tamaulipas',         'min':87000, 'max':89970},
        {'name':'Tlaxcala',           'min':90000, 'max':90990},
        {'name':'Veracruz',           'min':91000, 'max':96998},
        {'name':'Yucatán',            'min':97000, 'max':97990},
        {'name':'Zacatecas',          'min':98000, 'max':99998}
    ]
    for e in estados:
        if e['min'] <= cp <= e['max']:
            return e['name']
    closest = min(estados, key=lambda e: abs(cp - ((e['min'] + e['max'])/2)))
    return closest['name']


In [57]:
# Bloque 4: Carga y limpieza inicial
def get_db():
    url = "https://wapi.wing.buhologistics.com/getDatasetInfo"
    headers = {'api-key': '0DJB9c_xpbQbprsg7iZLaUR'}
    return requests.get(url, headers=headers).json()["data"]

df = pd.DataFrame(get_db()).dropna()
df.drop(columns=["client","coordinates_origin_pc","coordinates_dest_pc"], errors="ignore", inplace=True)
df['service_mode'] = df['service'].apply(convert_service)
df['carrier']      = df['carrier'].apply(convert_carrier)
df['origin_state'] = df['origin_pc'].astype(int).apply(convert_estado)
df['dest_state']   = df['dest_pc'].astype(int).apply(convert_estado)

# días hábiles
start = pd.to_datetime(df['start_date']).values.astype('datetime64[D]')
end   = pd.to_datetime(df['delivery_date']).values.astype('datetime64[D]')
df['delivery_time_bd'] = np.busday_count(start, end)

# filtros
df = df[(df['delivery_time_bd']>=0) & (df['delivery_time_bd']<=10) & (df['rate']<=20)]


In [58]:
# Bloque 5: Geolocalización
df_coord     = pd.read_excel("coordenadas_mx.xlsx")
postal_codes = df_coord['codigo_postal'].values.reshape(-1,1)
coords       = df_coord[['latitud','longitud']].values
tree         = cKDTree(postal_codes)

def lookup_coord(cp:int):
    m = df_coord[df_coord['codigo_postal']==cp]
    if not m.empty:
        return m.iloc[0][['latitud','longitud']].tolist()
    _, idx = tree.query([[cp]])
    return coords[idx[0]].tolist()

def haversine_vec(lat1, lon1, lat2, lon2):
    R = 6371.0
    φ1,φ2 = np.radians(lat1), np.radians(lat2)
    Δφ    = np.radians(lat2-lat1)
    Δλ    = np.radians(lon2-lon1)
    a     = np.sin(Δφ/2)**2 + np.cos(φ1)*np.cos(φ2)*np.sin(Δλ/2)**2
    return R * 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))

In [59]:
# Bloque 6: Transformer & Preprocessor
class RawToFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        rows = []
        for _, r in X.iterrows():
            cp_o, cp_d = int(r["origin_pc"]), int(r["dest_pc"])
            fecha      = pd.to_datetime(r["start_date"])
            lo, ld     = lookup_coord(cp_o), lookup_coord(cp_d)
            dist       = round(haversine_vec(lo[0], lo[1], ld[0], ld[1]), 2)
            d = {
                "rate":          float(r["rate"]),
                "distance":      dist,
                "origin_state":  r["origin_state"],
                "dest_state":    r["dest_state"],
                "carrier_service": f"{r['carrier']}_{r['service_mode']}",
                "day_week":      fecha.weekday(),
                "month":         fecha.month
            }
            d["day_sin"]    = np.sin(2*np.pi * d["day_week"]/7)
            d["day_cos"]    = np.cos(2*np.pi * d["day_week"]/7)
            d["month_sin"]  = np.sin(2*np.pi * d["month"]/12)
            d["month_cos"]  = np.cos(2*np.pi * d["month"]/12)
            d["is_weekend"] = int(d["day_week"]>=5)
            d["rate_x_dist"]= d["rate"] * d["distance"]
            rows.append(d)
        df_f = pd.DataFrame(rows)
        return df_f.drop(columns=["day_week","month"])

numeric_feats     = ["rate","distance","day_sin","day_cos","month_sin","month_cos","is_weekend","rate_x_dist"]
categorical_feats = ["origin_state","dest_state","carrier_service"]

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_feats),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_feats),
], remainder="drop")


In [None]:
# Bloque 7: Train/Test split
test_n = 500
test_samples = [df[df["incidence"]==c].sample(test_n, random_state=42) for c in [0,1]]
test_df = pd.concat(test_samples).sample(frac=1, random_state=42)
train_df = df.drop(test_df.index)


## Incidencia

In [None]:
# %%  
# Bloque completo: GridSearch + SMOTE + Evaluación con thresholds configurables

from sklearn.metrics import precision_recall_curve, classification_report, confusion_matrix

# — Parámetros de configuración al inicio —
param_grid    = {
    "n_estimators":  [100, 200, 300, 500],
    "max_depth":     [3, 6, 10],
    "learning_rate": [0.001, 0.01, 0.1],
}
cv            = 3
default_thr   = 0.3       # umbral por defecto
beta          = 2.0       # β para Fβ-score
precision_min = 0.8       # precisión mínima deseada para el “Caso Precision” (5c)
threshold_min = 0.25       # umbral mínimo para el “Caso MaxRec” (5d)

# 1) Split train/validation
train_sub_df, val_df = train_test_split(
    train_df,
    test_size=0.20,
    random_state=42,
    stratify=train_df["incidence"]
)

# 2) Preprocesado UNA sola vez
X_raw_train = RawToFeatures().fit_transform(train_sub_df)
X_pre_train = preprocessor.fit_transform(X_raw_train)
y_train     = train_sub_df["incidence"]

X_raw_val = RawToFeatures().transform(val_df)
X_pre_val = preprocessor.transform(X_raw_val)
y_val     = val_df["incidence"]

# 3) SMOTE una sola vez
smote      = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_pre_train, y_train)

# 4) GridSearch sobre XGBClassifier con scale_pos_weight ajustado
n_cand = np.prod([len(v) for v in param_grid.values()])
print(f"Fitting {cv} folds for each of {n_cand} candidates, totalling {cv * n_cand} fits")

neg, pos = len(y_res) - y_res.sum(), y_res.sum()
clf = XGBClassifier(
    eval_metric="logloss",
    scale_pos_weight=(neg/pos) * 1.5,
    random_state=42,
    n_jobs=-1
)
grid = GridSearchCV(clf, param_grid, scoring="f1", cv=cv, verbose=1)
grid.fit(X_res, y_res)

best_clf = grid.best_estimator_
print("▶ Mejores parámetros incidencia:", grid.best_params_)

# 5) Predicciones en VALIDATION
probs_val = best_clf.predict_proba(X_pre_val)[:, 1]

# — 5a) Umbral por defecto —
y_def = (probs_val >= default_thr).astype(int)

# — 5b) Umbral óptimo Fβ (β = 2.0) —
prec, rec, thr = precision_recall_curve(y_val, probs_val)
f2_scores      = (1 + beta**2) * (prec * rec) / (beta**2 * prec + rec + 1e-9)
idx_f2         = np.argmax(f2_scores)
thr_f2         = thr[idx_f2]
print(f"▶ Umbral F{beta}-opt: {thr_f2:.4f} (F{beta} = {f2_scores[idx_f2]:.4f})")

# — 5c) Caso Precision: umbral que garantiza al menos precision_min y maximiza recall —
cands_prec = [(p, r, t) for p, r, t in zip(prec, rec, thr) if p >= precision_min]
if cands_prec:
    p_sel, r_sel, thr_prec = max(cands_prec, key=lambda x: x[1])  # max recall
    print(f"▶ Umbral precisión ≥ {precision_min:.2f}: {thr_prec:.4f} (prec = {p_sel:.4f}, recall = {r_sel:.4f})")
else:
    # fallback al umbral que más se acerca a precision_min
    idx_fb    = np.argmin([abs(p - precision_min) for p in prec])
    thr_prec  = thr[idx_fb]
    r_sel     = rec[idx_fb]
    p_sel     = prec[idx_fb]
    print(f"⚠️ No hay umbral con precisión ≥ {precision_min:.2f};")
    print(f"   mejor aproximación: {thr_prec:.4f} (prec = {p_sel:.4f}, recall = {r_sel:.4f})")

# — 5d) Caso MaxRec puro (threshold ≥ threshold_min) —
cands_rec = [(r, t) for r, t in zip(rec, thr) if t >= threshold_min]
if cands_rec:
    r_maxrec, thr_maxrec = max(cands_rec, key=lambda x: x[0])
    print(f"▶ Máximo recall con threshold ≥ {threshold_min}: {r_maxrec:.4f} (umbral = {thr_maxrec:.4f})")
else:
    thr_maxrec = threshold_min
    idx_fb     = np.argmin(abs(thr - threshold_min))
    r_maxrec   = rec[idx_fb]
    print(f"⚠️ No hay umbral ≥ {threshold_min}; usando {thr_maxrec:.4f} (recall = {r_maxrec:.4f})")

# 6) Definir wrappers para cada modelo (pipeline + umbral)
class PipelineWithThreshold:
    def __init__(self, pipeline, threshold):
        self.pipeline  = pipeline
        self.threshold = threshold
    def predict(self, X):
        probs = self.pipeline.predict_proba(X)[:,1]
        return (probs >= self.threshold).astype(int)

full_pipeline = Pipeline([
    ("raw",  RawToFeatures()),
    ("prep", preprocessor),
    ("clf",  best_clf)
])

default_model = PipelineWithThreshold(full_pipeline, default_thr)
f2_model      = PipelineWithThreshold(full_pipeline, thr_f2)
prec_model    = PipelineWithThreshold(full_pipeline, thr_prec)
maxrec_model  = PipelineWithThreshold(full_pipeline, thr_maxrec)

# Guardar wrappers
joblib.dump(default_model, "xgb_pipeline_default_thr.pkl")
joblib.dump(f2_model,      "xgb_pipeline_f2_thr.pkl")
joblib.dump(prec_model,    "xgb_pipeline_prec_thr.pkl")
joblib.dump(maxrec_model,  "xgb_pipeline_maxrec_thr.pkl")

print(f"Guardados wrappers con thresholds:\n"
      f"  defecto={default_thr}, F{beta}={thr_f2:.4f}, "
      f"prec({precision_min})={thr_prec:.4f}, maxrec={thr_maxrec:.4f}")

# 7) Impresión de métricas sobre test_df

y_true = test_df["incidence"]

for name, model in [
    ("Default", default_model),
    (f"F{beta}-opt",  f2_model),
    (f"Prec≥{precision_min}", prec_model),
    ("MaxRec",        maxrec_model)
]:
    y_pred = model.predict(test_df)
    print(f"\n=== Evaluación: {name} ===")
    print("Classification Report:")
    print(classification_report(y_true, y_pred, digits=4))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Fitting 3 folds for each of 36 candidates, totalling 108 fits
▶ Mejores parámetros incidencia: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 500}
▶ Umbral F2.0-opt: 0.1517 (F2.0 = 0.5638)
▶ Umbral precisión ≥ 0.80: 0.9166 (prec = 0.8006, recall = 0.2605)
▶ Máximo recall con threshold ≥ 0.25: 0.6622 (umbral = 0.2501)
Guardados wrappers con thresholds:
  defecto=0.3, F2.0=0.1517, prec(0.8)=0.9166, maxrec=0.2501

=== Evaluación: Default ===
Classification Report:
              precision    recall  f1-score   support

         0.0     0.7154    0.9300    0.8087       100
         1.0     0.9000    0.6300    0.7412       100

    accuracy                         0.7800       200
   macro avg     0.8077    0.7800    0.7749       200
weighted avg     0.8077    0.7800    0.7749       200

Confusion Matrix:
[[93  7]
 [37 63]]

=== Evaluación: F2.0-opt ===
Classification Report:
              precision    recall  f1-score   

In [68]:
notificar_telegram("Incidencia listo")

✅ Mensaje de texto enviado.


In [69]:
# Bloque 9: Evaluación Incidencia con 3 Thresholds
inc_pipeline  = joblib.load("xgb_pipeline.pkl")

# 1) Predecir probabilidades de incidencia
probs  = inc_pipeline.predict_proba(test_df)[:, 1]
y_true = test_df["incidence"]

# 2) Threshold por defecto
default_thr = 0.3

# 3) Threshold óptimo para F1
prec, rec, thr = precision_recall_curve(y_true, probs)
f1_scores = 2 * prec * rec / (prec + rec + 1e-9)
best_idx  = np.argmax(f1_scores)
f1_thr    = thr[best_idx]

# 4) Threshold que maximiza recall puro
rec_idx = np.argmax(rec)
rec_thr = thr[rec_idx]

# 5) Diccionario de thresholds
thresholds = {
    f"Default ({default_thr})"   : default_thr,
    f"F1 max ({f1_thr:.4f})"     : f1_thr,
    f"Max Recall ({rec_thr:.4f})": rec_thr
}

# 6) Imprimir métricas para cada caso
for name, t in thresholds.items():
    y_pred = (probs >= t).astype(int)
    print(f"\n=== Evaluación: {name} ===")
    print(f"Umbral = {t:.4f}\n")
    print("Classification Report:")
    print(classification_report(y_true, y_pred, digits=4))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))



=== Evaluación: Default (0.3) ===
Umbral = 0.3000

Classification Report:
              precision    recall  f1-score   support

         0.0     0.8727    0.9600    0.9143       100
         1.0     0.9556    0.8600    0.9053       100

    accuracy                         0.9100       200
   macro avg     0.9141    0.9100    0.9098       200
weighted avg     0.9141    0.9100    0.9098       200

Confusion Matrix:
[[96  4]
 [14 86]]

=== Evaluación: F1 max (0.1732) ===
Umbral = 0.1732

Classification Report:
              precision    recall  f1-score   support

         0.0     0.9394    0.9300    0.9347       100
         1.0     0.9307    0.9400    0.9353       100

    accuracy                         0.9350       200
   macro avg     0.9350    0.9350    0.9350       200
weighted avg     0.9350    0.9350    0.9350       200

Confusion Matrix:
[[93  7]
 [ 6 94]]

=== Evaluación: Max Recall (0.0000) ===
Umbral = 0.0000

Classification Report:
              precision    recall  f1-s

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Tiempo de entrega

In [None]:
# 4) Precomputar features para regresión sobre todo train_df
X_raw_reg = RawToFeatures().fit_transform(train_df)
X_pre_reg = preprocessor.fit_transform(X_raw_reg)

# 5) Target transformado (sqrt)
y_time = np.sqrt(train_df["delivery_time_bd"])


# — Bloque X: GridSearch+RandomizedSearch para tiempo de entrega con TODOS los datos —

# 1) Ya deberías tener:
#    X_pre_reg  (matriz de features: rate, distance, day_sin/cos, month_sin/cos, is_weekend, rate_x_dist,
#                plus todas las dummies de origin_state, dest_state, carrier_service)
#    y_time     (√ de delivery_time_bd)

# 2) Define el regresor y el espacio de búsqueda
reg = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1
)
param_dist = {
    "n_estimators":     [100, 200, 300, 500],
    "max_depth":        [None, 5, 10, 15],
    "learning_rate":    [0.001, 0.01, 0.1, 0.2, 0.5],
    "subsample":        [0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma":            [0, 0.1, 0.5, 1],
    "min_child_weight": [1, 3, 5],
    "reg_alpha":        [0, 0.1, 1],
    "reg_lambda":       [1, 5, 10],
}

# 3) RandomizedSearchCV SOBRE X_pre_reg Y y_time
rand = RandomizedSearchCV(
    reg,
    param_dist,
    n_iter=50,
    scoring="neg_mean_absolute_error",
    cv=5,
    verbose=1,
    random_state=42
)
rand.fit(X_pre_reg, y_time)   # <— aquí usas las variables que ya existían

best_reg = rand.best_estimator_
print("▶ Mejores parámetros Tiempo de Entrega:", rand.best_params_)

# 4) Guardar pipeline completo
full_time_pipeline = Pipeline([
    ("raw",  RawToFeatures()),
    ("prep", preprocessor),
    ("reg",  best_reg)
])
joblib.dump(full_time_pipeline, "xgbreg_pipeline.pkl")
print("✅ Guardado xgbreg_pipeline.pkl")


# 5) Evaluación rápida sobre test_df:
X_raw_test = RawToFeatures().transform(test_df)
X_pre_test = preprocessor.transform(X_raw_test)
y_test     = test_df["delivery_time_bd"]

pred_sqrt  = best_reg.predict(X_pre_test)
pred_days  = np.square(pred_sqrt)

print("\n=== Evaluación TEST Tiempo de Entrega ===")
print("MAE:",  mean_absolute_error(y_test, pred_days))
mse  = mean_squared_error(y_test, pred_days)
rmse = np.sqrt(mse)
print("RMSE:", rmse)
print("R²:",   r2_score(y_test, pred_days))



Fitting 5 folds for each of 50 candidates, totalling 250 fits
▶ Mejores parámetros Tiempo de Entrega: {'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0.1, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.6}
✅ Guardado xgbreg_pipeline.pkl

=== Evaluación TEST Tiempo de Entrega ===
MAE: 0.7795506119728088
RMSE: 1.3308660607244196
R²: 0.22786325216293335


In [71]:
notificar_telegram("Entrenamiento completado")

✅ Mensaje de texto enviado.
