In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from itertools import product
from xgboost import XGBClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, f1_score, recall_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import torch
from transformers import AutoTokenizer, AutoModel

from tqdm.auto import tqdm
import mlflow, os
import math

# --- MLflow tracking on Databricks -------------------------------------------
os.environ["DATABRICKS_HOST"] = "https://dbc-e0c2984f-335b.cloud.databricks.com/"
# Never hardcode the access token in the notebook. Export DATABRICKS_TOKEN in
# the environment before starting the kernel; `setdefault` keeps a token that
# is already present instead of overwriting it with an empty string.
os.environ.setdefault("DATABRICKS_TOKEN", "")
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Desarrollo_de_Soluciones/XGBoost_BGE")

# --- Embedding model and compute device --------------------------------------
MODEL_NAME = "BAAI/bge-base-en-v1.5"
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

  from .autonotebook import tqdm as notebook_tqdm
2025/10/13 00:46:53 INFO mlflow.tracking.fluent: Experiment with name '/Desarrollo_de_Soluciones/XGBoost_BGE' does not exist. Creating a new experiment.


Using device: NVIDIA GeForce RTX 3090


In [2]:
# Load the cleaned train / test / validation splits from CSV.

data_training, data_test, data_validation = map(
    pd.read_csv,
    (
        "../data/cleaned_train_dataset.csv",
        "../data/cleaned_test_dataset.csv",
        "../data/cleaned_val_dataset.csv",
    ),
)

In [3]:
# Text vectorizer: tokenizer plus transformer encoder identified by MODEL_NAME.

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
# Switch to evaluation mode (the encoder is only used for inference here).
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [4]:
def mean_pooling(last_hidden_state, attention_mask):
    """Average the hidden states along dim 1, counting only unmasked positions.

    Args:
        last_hidden_state: tensor of per-token hidden states; dim 1 is the
            axis that gets reduced.
        attention_mask: mask tensor with one fewer trailing dimension than
            ``last_hidden_state``; it is broadcast over the last axis.

    Returns:
        Tensor equal to the mask-weighted sum over dim 1 divided by the mask
        total. The divisor is clamped to at least 1e-9, so a fully masked row
        yields zeros instead of NaN.
    """
    weights = attention_mask[..., None].expand_as(last_hidden_state).float()
    weighted_total = torch.sum(last_hidden_state * weights, dim=1)
    # Guard against division by zero for rows with no unmasked position.
    denominator = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / denominator

@torch.no_grad()
def encode_texts(texts, batch_size=16, max_length=256):
    """Embed a sequence of texts with the module-level tokenizer and encoder.

    Texts are processed in consecutive slices of ``batch_size``; each slice is
    tokenized (padded, truncated to ``max_length``), run through ``model`` on
    ``device``, mean-pooled with ``mean_pooling`` and moved back to the CPU.

    Args:
        texts: sliceable sequence of items; each item is converted with ``str``.
        batch_size: number of texts per forward pass.
        max_length: truncation length passed to the tokenizer.

    Returns:
        NumPy array built by concatenating the pooled batches along axis 0
        (one row per input text).

    NOTE(review): the BGE model card appears to recommend CLS-token pooling
    followed by L2 normalization; confirm that mean pooling is intentional.
    """
    pooled_batches = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Encoding", leave=False):
        chunk = [str(item) for item in texts[start:start + batch_size]]
        encoded = tokenizer(
            chunk,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        encoded = encoded.to(device)

        hidden_states = model(**encoded).last_hidden_state
        sentence_vecs = mean_pooling(hidden_states, encoded["attention_mask"])
        # Move each batch to the CPU right away so results do not pile up on the device.
        pooled_batches.append(sentence_vecs.cpu())
    return torch.cat(pooled_batches, dim=0).numpy()

In [5]:
# Build embedding matrices and integer label vectors for every split.

# Encoded in the order train -> validation -> test.
X_train, X_val, X_test = (
    encode_texts(frame["text"].astype(str).tolist(), batch_size=16, max_length=256)
    for frame in (data_training, data_validation, data_test)
)

y_train, y_test, y_val = (
    frame["label"].astype(int).values
    for frame in (data_training, data_test, data_validation)
)

print(f"Embeddings: train={X_train.shape}, val={X_val.shape}, test={X_test.shape}")

                                                         

Embeddings: train=(816, 768), val=(247, 768), test=(258, 768)




In [6]:
# Hyper-parameter grid explored by the search (key order defines the
# iteration order of the cartesian product).
xgb_grid = dict(
    learning_rate=[0.1, 0.05],
    n_estimators=[400, 600, 800],
    max_depth=[4, 6, 8],
)

# Class-imbalance weight: negatives / positives in the training labels.
# The max(..., 1) guard avoids dividing by zero when there are no positives.
pos = y_train.sum()
neg = len(y_train) - pos
spw = float(neg / max(pos, 1))

In [7]:
def grid_items(grid):
    """Yield one ``{name: value}`` dict per combination in the parameter grid.

    Args:
        grid: mapping from parameter name to an iterable of candidate values.

    Yields:
        Dicts covering the cartesian product of the candidate values, in the
        order produced by ``itertools.product`` (last key varies fastest).
    """
    names = tuple(grid)
    candidate_lists = (grid[name] for name in names)
    for combo in product(*candidate_lists):
        yield dict(zip(names, combo))

In [8]:
# Grid search: train one XGBoost classifier per hyper-parameter combination
# and log its parameters and validation metrics as a separate MLflow run.
total = math.prod(len(v) for v in xgb_grid.values())

# The context manager closes the progress bar even if a run raises.
with tqdm(total=total, desc="Runs", leave=True) as pbar:
    for xgb_params in grid_items(xgb_grid):
        run_name = f"bge_xgb_md{xgb_params['max_depth']}_ne{xgb_params['n_estimators']}_lr{xgb_params['learning_rate']}"
        pbar.set_description(run_name)

        with mlflow.start_run(run_name=run_name):
            mlflow.set_tag("encoder", MODEL_NAME)
            mlflow.set_tag("featurizer", "bge_embedding")
            mlflow.set_tag("clf", "XGBClassifier")

            mlflow.log_param("embed_dim", int(X_train.shape[1]))
            mlflow.log_param("n_train", int(X_train.shape[0]))
            mlflow.log_param("n_val", int(X_val.shape[0]))
            for k, v in xgb_params.items():
                mlflow.log_param(f"xgb_{k}", v)
            mlflow.log_param("xgb_scale_pos_weight", spw)

            # Named `clf` on purpose: the global `model` is the transformer
            # encoder used by `encode_texts`, and rebinding it here would break
            # any later re-run of the embedding cell.
            clf = XGBClassifier(
                objective="binary:logistic",
                eval_metric=["aucpr", "auc"],
                # CPU histogram algorithm. NOTE(review): for GPU training,
                # recent XGBoost releases appear to use device="cuda" rather
                # than tree_method="gpu_hist" - confirm for the pinned version.
                tree_method="hist",
                n_jobs=-1,
                scale_pos_weight=spw,
                random_state=42,
                **xgb_params
            )

            # Plain training without early stopping. NOTE(review): if early
            # stopping is added, recent XGBoost releases appear to expect
            # `early_stopping_rounds` in the constructor rather than in fit().
            clf.fit(
                X_train, y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                verbose=False,
            )

            # Positive-class probabilities on the validation split; hard labels
            # use a fixed 0.5 threshold.
            val_scores = clf.predict_proba(X_val)[:, 1]
            val_pred = (val_scores >= 0.5).astype(int)

            metrics = {
                "pr_auc":   float(average_precision_score(y_val, val_scores)),
                "roc_auc":  float(roc_auc_score(y_val, val_scores)),
                "f1":       float(f1_score(y_val, val_pred)),
                "recall":   float(recall_score(y_val, val_pred)),
                "accuracy": float(accuracy_score(y_val, val_pred)),
            }
            mlflow.log_metrics(metrics)

        pbar.update(1)

bge_xgb_md6_ne400_lr0.1:   6%|▌         | 1/18 [00:04<01:16,  4.53s/it]

🏃 View run bge_xgb_md4_ne400_lr0.1 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/7c88ee44710b4cb1a1c9e85c5e677c27
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md8_ne400_lr0.1:  11%|█         | 2/18 [00:09<01:16,  4.81s/it]

🏃 View run bge_xgb_md6_ne400_lr0.1 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/2d77c71cb04247018cc39353f7ba2512
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md4_ne600_lr0.1:  17%|█▋        | 3/18 [00:14<01:13,  4.91s/it]

🏃 View run bge_xgb_md8_ne400_lr0.1 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/f394cf56fae447a3b73bbaee080464b6
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md6_ne600_lr0.1:  22%|██▏       | 4/18 [00:19<01:10,  5.04s/it]

🏃 View run bge_xgb_md4_ne600_lr0.1 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/db9b3edbfddd494594f39b6c08ff4ca6
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md8_ne600_lr0.1:  28%|██▊       | 5/18 [00:25<01:09,  5.32s/it]

🏃 View run bge_xgb_md6_ne600_lr0.1 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/16ab3e6adb4e4846850391fb08a3db0c
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md4_ne800_lr0.1:  33%|███▎      | 6/18 [00:31<01:06,  5.54s/it]

🏃 View run bge_xgb_md8_ne600_lr0.1 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/d463ad08e089468185db014ea87c5954
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md6_ne800_lr0.1:  39%|███▉      | 7/18 [00:37<01:00,  5.50s/it]

🏃 View run bge_xgb_md4_ne800_lr0.1 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/8d17bc788e6b41c68f71874e4dc5996a
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md8_ne800_lr0.1:  44%|████▍     | 8/18 [00:43<00:57,  5.73s/it]

🏃 View run bge_xgb_md6_ne800_lr0.1 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/ceefb394d4934f3689b42aa94b261fd5
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md4_ne400_lr0.05:  50%|█████     | 9/18 [00:49<00:53,  5.91s/it]

🏃 View run bge_xgb_md8_ne800_lr0.1 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/a6d571b3df7b4f7784707816989de236
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md6_ne400_lr0.05:  56%|█████▌    | 10/18 [00:53<00:43,  5.43s/it]

🏃 View run bge_xgb_md4_ne400_lr0.05 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/8dbc5bb9858d4fc69ff49f5e1ab6f5b8
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md8_ne400_lr0.05:  61%|██████    | 11/18 [00:59<00:39,  5.63s/it]

🏃 View run bge_xgb_md6_ne400_lr0.05 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/6296369850754d43873c14f311525e8c
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md4_ne600_lr0.05:  67%|██████▋   | 12/18 [01:06<00:35,  5.96s/it]

🏃 View run bge_xgb_md8_ne400_lr0.05 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/05e3633e83784613b87c9356ab881d5c
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md6_ne600_lr0.05:  72%|███████▏  | 13/18 [01:12<00:29,  5.84s/it]

🏃 View run bge_xgb_md4_ne600_lr0.05 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/73bf38a955c346838e30c23ff00baa18
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md8_ne600_lr0.05:  78%|███████▊  | 14/18 [01:19<00:24,  6.18s/it]

🏃 View run bge_xgb_md6_ne600_lr0.05 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/68c1e4c9488d4e41ae8cc277da07cee0
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md4_ne800_lr0.05:  83%|████████▎ | 15/18 [01:26<00:19,  6.54s/it]

🏃 View run bge_xgb_md8_ne600_lr0.05 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/f0d1553d5a044fc2a373945f1812361d
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md6_ne800_lr0.05:  89%|████████▉ | 16/18 [01:32<00:12,  6.44s/it]

🏃 View run bge_xgb_md4_ne800_lr0.05 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/3f46a4aa735b448681437ffc52fbef5b
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md8_ne800_lr0.05:  94%|█████████▍| 17/18 [01:40<00:06,  6.75s/it]

🏃 View run bge_xgb_md6_ne800_lr0.05 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/c69d3e1f7d2144958a5fea1d01f05a63
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339


bge_xgb_md8_ne800_lr0.05: 100%|██████████| 18/18 [01:48<00:00,  6.03s/it]

🏃 View run bge_xgb_md8_ne800_lr0.05 at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339/runs/f61a6f1512cd4effb6defa72355d28c2
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/975753697605339





In [9]:
# Final confirmation that every cell above ran to completion.
print("Experimento Concluido con Exito!")

Experimento Concluido con Exito!
