In [None]:
import pandas as pd
import numpy as np
import math
from itertools import product
from sklearn.svm import SVC
from sklearn.metrics import average_precision_score, roc_auc_score, f1_score, recall_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import torch
from transformers import AutoTokenizer, AutoModel

from tqdm.auto import tqdm
import mlflow, os


# Databricks-hosted MLflow tracking configuration.
# setdefault lets credentials that are already exported in the environment take
# precedence, so running this cell never blanks out a working DATABRICKS_TOKEN and
# the secret itself does not have to be pasted into the notebook source.
# NOTE(review): the run links in the saved cell outputs point at a different
# workspace host than the default below - confirm which workspace is intended.
os.environ.setdefault("DATABRICKS_HOST", "https://dbc-e0c2984f-335b.cloud.databricks.com/")
os.environ.setdefault("DATABRICKS_TOKEN", "")
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Desarrollo_de_Soluciones/SVM_bge")

# Hugging Face checkpoint used to embed the texts.
MODEL_NAME = "BAAI/bge-base-en-v1.5"
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

  from .autonotebook import tqdm as notebook_tqdm
2025/10/13 22:28:11 INFO mlflow.tracking.fluent: Experiment with name '/Desarrollo_de_Soluciones/SVM_bge' does not exist. Creating a new experiment.


Using device: NVIDIA GeForce RTX 3090


In [2]:
# Load the cleaned train / test / validation splits

SPLIT_PATHS = (
    "../data/cleaned_train_dataset.csv",
    "../data/cleaned_test_dataset.csv",
    "../data/cleaned_val_dataset.csv",
)
# Files are read in the order listed above and unpacked into one frame per split.
data_training, data_test, data_validation = map(pd.read_csv, SPLIT_PATHS)

In [3]:
# Vectorizer: pretrained tokenizer plus encoder, moved to `device` and switched to eval mode

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
# Kept as the trailing expression so the cell still displays the module summary.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [4]:
def mean_pooling(last_hidden_state, attention_mask):
    """Return the mask-weighted mean of ``last_hidden_state`` along dim 1.

    The mask gets a trailing axis and is expanded to the shape of
    ``last_hidden_state``, so positions with mask value 0 contribute nothing
    to the sum or to the divisor.
    Assumes ``last_hidden_state`` is (batch, tokens, hidden) and
    ``attention_mask`` is (batch, tokens) - TODO confirm against callers.

    Returns:
        Tensor with dim 1 reduced away (one pooled vector per row).
    """
    weights = attention_mask.unsqueeze(-1).expand_as(last_hidden_state).float()
    weighted_total = torch.sum(last_hidden_state * weights, dim=1)
    # Floor the divisor at 1e-9 so a row whose mask is all zeros never divides by 0.
    divisor = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / divisor

@torch.no_grad()
def encode_texts(texts, batch_size=16, max_length=256):
    """Embed ``texts`` with the module-level ``tokenizer`` and ``model``.

    Texts are processed in slices of ``batch_size``; every slice is tokenized
    (padded, truncated to ``max_length``), run through the model on ``device``
    and reduced to one vector per text with ``mean_pooling``.

    Args:
        texts: Sliceable sequence of inputs; each item is converted with ``str``.
        batch_size: Number of texts sent through the model per step.
        max_length: Token limit handed to the tokenizer (longer inputs are truncated).

    Returns:
        NumPy array with one row per input text. An empty ``texts`` yields an
        array of shape ``(0, model.config.hidden_size)``.
    """
    if len(texts) == 0:
        # torch.cat rejects an empty list, so return an empty matrix with the
        # encoder's hidden width instead of raising.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding", leave=False):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            [str(item) for item in batch],
            padding=True, truncation=True, max_length=max_length, return_tensors="pt"
        ).to(device)

        outputs = model(**enc)
        pooled = mean_pooling(outputs.last_hidden_state, enc["attention_mask"])
        # Move each batch off the device right away so only CPU tensors accumulate.
        embs.append(pooled.cpu())
    return torch.cat(embs, dim=0).numpy()

In [5]:
# Build the embedding matrices and label vectors for each split

def _split_texts(frame):
    """Return the ``text`` column of ``frame`` as a list of Python strings."""
    return frame["text"].astype(str).tolist()

X_train = encode_texts(_split_texts(data_training), batch_size=16, max_length=256)
X_val = encode_texts(_split_texts(data_validation), batch_size=16, max_length=256)
X_test = encode_texts(_split_texts(data_test), batch_size=16, max_length=256)

# Integer label vectors, extracted in train / test / validation order.
y_train, y_test, y_val = (
    frame["label"].astype(int).values
    for frame in (data_training, data_test, data_validation)
)

print(f"Embeddings: train={X_train.shape}, val={X_val.shape}, test={X_test.shape}")

                                                         

Embeddings: train=(816, 768), val=(247, 768), test=(258, 768)




In [6]:
# Search space for the SVC sweep: kernel name -> {hyper-parameter: candidate values}.
# Every combination inside one kernel's sub-grid becomes one run.
svm_grid = dict(
    linear=dict(C=[0.1, 1.0, 10.0]),
    rbf=dict(C=[1.0, 10.0], gamma=["scale", 1e-3]),
    poly=dict(C=[1.0, 10.0], degree=[2, 3], gamma=["scale"], coef0=[0.0, 1.0]),
)

In [7]:
def grid_items(grid):
    """Yield one ``{name: value}`` dict per combination of the lists in ``grid``.

    Combinations follow ``itertools.product`` order over the grid's keys in
    their iteration order; an empty grid yields a single empty dict.
    """
    names = tuple(grid.keys())
    candidate_lists = (grid[name] for name in names)
    for combo in product(*candidate_lists):
        yield {name: value for name, value in zip(names, combo)}

In [8]:
from sklearn.svm import SVC

def grid_items_svm(svm_grid):
    """Yield ``(kernel, params)`` for every combination in each kernel's sub-grid.

    ``svm_grid`` maps a kernel name to a ``{parameter: candidate values}`` dict.
    Kernels are visited in mapping order, and combinations within a kernel
    follow ``itertools.product`` order over that sub-grid's keys.
    """
    for kernel, options in svm_grid.items():
        names = list(options)
        for combo in product(*(options[name] for name in names)):
            yield kernel, dict(zip(names, combo))

def count_total_runs(svm_grid):
    """Return how many parameter combinations ``svm_grid`` describes in total.

    For each kernel the count is the product of the lengths of its candidate
    lists (1 for an empty sub-grid); the per-kernel counts are then summed.
    """
    return sum(
        math.prod(len(candidates) for candidates in subgrid.values())
        for subgrid in svm_grid.values()
    )

# Sweep every (kernel, hyper-parameter) combination in `svm_grid`: fit a
# StandardScaler + SVC pipeline on the training embeddings, score it on the
# validation split and log parameters and metrics to one MLflow run each.

# SVC(probability=True) calibrates its probabilities with an internal shuffled
# cross-validation, so the seed is pinned to make the logged metrics repeatable.
SEED = 42

total = count_total_runs(svm_grid)

# Context-managed bar: it is closed even if a fit or an MLflow call raises mid-sweep.
with tqdm(total=total, desc="Runs", leave=True) as pbar:
    for kernel, sv_params in grid_items_svm(svm_grid):
        # Sorted keys give a stable, readable run name for each combination.
        run_suffix = "_".join([f"{k}{sv_params[k]}" for k in sorted(sv_params.keys())])
        run_name = f"bge_SVC_{kernel}__{run_suffix}"
        pbar.set_description(run_name)

        with mlflow.start_run(run_name=run_name):
            mlflow.set_tag("encoder", MODEL_NAME)
            mlflow.set_tag("featurizer", "contextual_mean_pool")
            mlflow.set_tag("clf", "SVC")
            mlflow.log_param("kernel", kernel)
            mlflow.log_param("embed_dim", int(X_train.shape[1]))
            mlflow.log_param("n_train", int(X_train.shape[0]))
            mlflow.log_param("n_val", int(X_val.shape[0]))
            mlflow.log_param("random_state", SEED)
            for k, v in sv_params.items():
                mlflow.log_param(f"SVM_{k}", v)

            clf = SVC(
                kernel=kernel,
                class_weight="balanced",
                probability=True,          # required for predict_proba
                max_iter=5000,
                random_state=SEED,
                **sv_params
            )

            pipe = Pipeline([
                ("scaler", StandardScaler(with_mean=True, with_std=True)),
                ("clf", clf),
            ])

            pipe.fit(X_train, y_train)

            # Column 1 of predict_proba is used as the positive-class score
            # (assumes binary 0/1 labels - TODO confirm against the dataset).
            val_scores = pipe.predict_proba(X_val)[:, 1]
            val_pred = (val_scores >= 0.5).astype(int)
            metrics = {
                "pr_auc":   float(average_precision_score(y_val, val_scores)),
                "roc_auc":  float(roc_auc_score(y_val, val_scores)),
                "f1":       float(f1_score(y_val, val_pred)),
                "recall":   float(recall_score(y_val, val_pred)),
                "accuracy": float(accuracy_score(y_val, val_pred)),
            }
            mlflow.log_metrics(metrics)

        pbar.update(1)


bge_SVC_linear__C0.1:   0%|          | 0/15 [00:00<?, ?it/s]

🏃 View run bge_SVC_linear__C0.1 at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/848f73b5f9734fa5a48c41304fac61d5
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632


bge_SVC_linear__C10.0:  13%|█▎        | 2/15 [00:05<00:33,  2.59s/it]

🏃 View run bge_SVC_linear__C1.0 at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/70b50298423b4f77b8520f1164527f5f
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632
🏃 View run bge_SVC_linear__C10.0 at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/f3a01aedacb54d15a328ebb84851eeeb
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632


bge_SVC_rbf__C1.0_gamma0.001:  27%|██▋       | 4/15 [00:10<00:28,  2.62s/it]

🏃 View run bge_SVC_rbf__C1.0_gammascale at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/02c255dab66544cb89a4ba370b67a938
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632


bge_SVC_rbf__C10.0_gammascale:  33%|███▎      | 5/15 [00:13<00:26,  2.66s/it]

🏃 View run bge_SVC_rbf__C1.0_gamma0.001 at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/756758d1be8449f284cbef609f0817e5
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632


bge_SVC_rbf__C10.0_gammascale:  40%|████      | 6/15 [00:16<00:24,  2.71s/it]

🏃 View run bge_SVC_rbf__C10.0_gammascale at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/adde5bbcea724a3b9f0cea86b2c52f10
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632


bge_SVC_poly__C1.0_coef00.0_degree2_gammascale:  47%|████▋     | 7/15 [00:18<00:21,  2.72s/it]

🏃 View run bge_SVC_rbf__C10.0_gamma0.001 at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/bbf53e2cca8542d19f373a87da1b5803
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632
🏃 View run bge_SVC_poly__C1.0_coef00.0_degree2_gammascale at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/96e31867099741edbc85d64acecb6a87
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632


bge_SVC_poly__C1.0_coef01.0_degree2_gammascale:  60%|██████    | 9/15 [00:24<00:17,  2.88s/it]

🏃 View run bge_SVC_poly__C1.0_coef01.0_degree2_gammascale at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/7dd63b835dad467fbb162c624deea42e
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632


bge_SVC_poly__C1.0_coef01.0_degree3_gammascale:  67%|██████▋   | 10/15 [00:27<00:14,  2.93s/it]

🏃 View run bge_SVC_poly__C1.0_coef00.0_degree3_gammascale at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/9d82e757f778473cb153571aa8986246
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632


bge_SVC_poly__C10.0_coef00.0_degree2_gammascale:  73%|███████▎  | 11/15 [00:30<00:11,  2.95s/it]

🏃 View run bge_SVC_poly__C1.0_coef01.0_degree3_gammascale at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/424b4299d9d44b96b99e06d320ad8cca
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632


bge_SVC_poly__C10.0_coef01.0_degree2_gammascale:  80%|████████  | 12/15 [00:33<00:08,  2.97s/it]

🏃 View run bge_SVC_poly__C10.0_coef00.0_degree2_gammascale at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/e97667662ab3452980421d4f6fe4688d
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632


bge_SVC_poly__C10.0_coef00.0_degree3_gammascale:  87%|████████▋ | 13/15 [00:36<00:05,  2.99s/it]

🏃 View run bge_SVC_poly__C10.0_coef01.0_degree2_gammascale at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/62768e810733412ab0dc09321d8778cf
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632


bge_SVC_poly__C10.0_coef01.0_degree3_gammascale:  93%|█████████▎| 14/15 [00:39<00:03,  3.02s/it]

🏃 View run bge_SVC_poly__C10.0_coef00.0_degree3_gammascale at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/bd2f19e279cf4369a11c1335079e6d8b
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632


bge_SVC_poly__C10.0_coef01.0_degree3_gammascale: 100%|██████████| 15/15 [00:43<00:00,  2.87s/it]

🏃 View run bge_SVC_poly__C10.0_coef01.0_degree3_gammascale at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632/runs/57ee2ed163a4418eb480d1b7e27f5306
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565632





In [9]:
# Completion banner, printed unconditionally: it only shows that execution reached this cell.
print("Experimento Concluido con Exito!")

Experimento Concluido con Exito!


In [10]:
from IPython.display import Markdown

def export_code_cells():
    """Return the session's non-blank input cells as one Markdown ``python`` fence.

    Reads the ``In`` history list from the running IPython shell's user
    namespace, drops entries that are empty after stripping whitespace, and
    joins the rest with a blank line between cells.
    """
    from IPython import get_ipython
    history = get_ipython().user_ns['In']
    code = '\n\n'.join(source for source in history if source.strip())
    return Markdown(f'```python\n{code}\n```')

#export_code_cells()