In [None]:
import math
import os
from getpass import getpass
from itertools import product

import mlflow
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import average_precision_score, roc_auc_score, f1_score, recall_score, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel

# --- MLflow / Databricks tracking configuration ---------------------------
# Values already exported in the environment win: the host is only a fallback,
# and the access token is never stored in the notebook source.
os.environ.setdefault("DATABRICKS_HOST", "https://dbc-e0c2984f-335b.cloud.databricks.com/")
if not os.environ.get("DATABRICKS_TOKEN"):
    # Covers both "unset" and "empty string"; getpass does not echo the input,
    # so the secret does not end up in the saved cell output.
    os.environ["DATABRICKS_TOKEN"] = getpass("Databricks access token: ")
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Desarrollo_de_Soluciones/NaiveBayes_bge")

# --- Encoder / device configuration ---------------------------------------
MODEL_NAME = "BAAI/bge-base-en-v1.5"
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

  from .autonotebook import tqdm as notebook_tqdm
2025/10/13 22:30:51 INFO mlflow.tracking.fluent: Experiment with name '/Desarrollo_de_Soluciones/NaiveBayes_bge' does not exist. Creating a new experiment.


Using device: NVIDIA GeForce RTX 3090


In [2]:
# Load the cleaned train / validation / test splits from CSV.
data_training, data_validation, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/cleaned_train_dataset.csv",
        "../data/cleaned_val_dataset.csv",
        "../data/cleaned_test_dataset.csv",
    )
)

In [3]:
# Vectorizer: load the pretrained tokenizer and encoder named by MODEL_NAME,
# move the encoder to the selected device, and switch it to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [4]:
def mean_pooling(last_hidden_state, attention_mask):
    """Average token vectors over dim 1, counting only positions where the mask is non-zero.

    The mask gets a trailing axis, is expanded to the shape of
    ``last_hidden_state`` and cast to float, so masked-out positions contribute
    zero to the sum. The divisor is clamped to at least 1e-9 so a row whose
    mask is all zeros yields zeros instead of dividing by zero.
    """
    weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    weighted_total = torch.sum(last_hidden_state * weights, dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / weight_total

@torch.no_grad()
def encode_texts(texts, batch_size=16, max_length=256):
    """Embed texts with the module-level ``tokenizer`` / ``model`` and mean pooling.

    Runs with gradient tracking disabled (``torch.no_grad``).

    Args:
        texts: Sequence of inputs; every item is converted with ``str`` before
            tokenization.
        batch_size: Number of texts tokenized and forwarded per step.
        max_length: ``max_length`` passed to the tokenizer together with
            ``truncation=True``.

    Returns:
        A NumPy array with one row per input text (the pooled encoder output).
        For empty input an empty float32 array of shape ``(0, hidden_size)`` is
        returned, where ``hidden_size`` is read from ``model.config`` (0 if the
        config does not expose it).
    """
    if len(texts) == 0:
        # torch.cat raises on an empty list of tensors, so short-circuit with a
        # well-formed empty matrix instead of failing at the end of the function.
        hidden_size = getattr(model.config, "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    embs = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Encoding", leave=False):
        batch = [str(item) for item in texts[start:start + batch_size]]
        # Pad to the longest sequence in the batch and move the tensors to the
        # same device as the model.
        enc = tokenizer(
            batch,
            padding=True, truncation=True, max_length=max_length, return_tensors="pt"
        ).to(device)

        outputs = model(**enc)
        pooled = mean_pooling(outputs.last_hidden_state, enc["attention_mask"])
        # Bring each batch back to the CPU before concatenating.
        embs.append(pooled.cpu())
    return torch.cat(embs, dim=0).numpy()

In [5]:
# Build embedding matrices and integer label vectors for each split.
X_train, X_val, X_test = (
    encode_texts(frame["text"].astype(str).tolist(), batch_size=16, max_length=256)
    for frame in (data_training, data_validation, data_test)
)

y_train, y_val, y_test = (
    frame["label"].astype(int).values
    for frame in (data_training, data_validation, data_test)
)

print(f"Embeddings: train={X_train.shape}, val={X_val.shape}, test={X_test.shape}")

                                                         

Embeddings: train=(816, 768), val=(247, 768), test=(258, 768)




In [6]:
# Hyper-parameter grid for GaussianNB: one list of candidate values per parameter.
nb_grid = dict(
    var_smoothing=[
        1e-9,
        1e-8,
        1e-7,
        1e-6,
        1e-5,
    ],
)


In [7]:
def grid_items(grid):
    """Yield one ``{name: value}`` dict per combination in the Cartesian product of ``grid``.

    ``grid`` maps each parameter name to an iterable of candidate values;
    combinations are produced in ``itertools.product`` order.
    """
    names = list(grid.keys())
    candidates = [grid[name] for name in names]
    for combo in product(*candidates):
        yield dict(zip(names, combo))

In [8]:
# One MLflow run per hyper-parameter combination in nb_grid.
total = math.prod(len(v) for v in nb_grid.values())

# The context manager closes the progress bar even if one of the runs raises.
with tqdm(total=total, desc="Runs", leave=True) as pbar:
    for nb_params in grid_items(nb_grid):
        run_name = f"bge_gaussiannb_vs{nb_params['var_smoothing']}"
        pbar.set_description(run_name)

        with mlflow.start_run(run_name=run_name):
            # Experiment tags and parameters.
            mlflow.set_tag("encoder", MODEL_NAME)
            mlflow.set_tag("featurizer", "contextual_mean_pool")
            mlflow.set_tag("clf", "GaussianNB")

            mlflow.log_param("embed_dim", int(X_train.shape[1]))
            mlflow.log_param("n_train", int(X_train.shape[0]))
            mlflow.log_param("n_val", int(X_val.shape[0]))
            for k, v in nb_params.items():
                mlflow.log_param(f"nb_{k}", v)

            # Pipeline: standardization followed by GaussianNB.
            pipe = Pipeline([
                ("scaler", StandardScaler(with_mean=True, with_std=True)),
                ("clf", GaussianNB(var_smoothing=nb_params["var_smoothing"]))
            ])

            pipe.fit(X_train, y_train)

            # Validation metrics. The second predict_proba column is used as the
            # positive-class score (assumes binary 0/1 labels; confirm against the
            # data), and hard predictions use a 0.5 threshold.
            val_scores = pipe.predict_proba(X_val)[:, 1]
            val_pred = (val_scores >= 0.5).astype(int)

            metrics = {
                "pr_auc":   float(average_precision_score(y_val, val_scores)),
                "roc_auc":  float(roc_auc_score(y_val, val_scores)),
                "f1":       float(f1_score(y_val, val_pred)),
                "recall":   float(recall_score(y_val, val_pred)),
                "accuracy": float(accuracy_score(y_val, val_pred)),
            }
            mlflow.log_metrics(metrics)

        pbar.update(1)


bge_gaussiannb_vs1e-08:  20%|██        | 1/5 [00:02<00:10,  2.70s/it]

🏃 View run bge_gaussiannb_vs1e-09 at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565633/runs/cdb6f52e7cba4ee790164bcad1e76337
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565633


bge_gaussiannb_vs1e-07:  40%|████      | 2/5 [00:05<00:07,  2.50s/it]

🏃 View run bge_gaussiannb_vs1e-08 at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565633/runs/d49467bc0b6b4cb28a82114bbb0635cb
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565633
🏃 View run bge_gaussiannb_vs1e-07 at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565633/runs/3237ce3a556d4246bf62d9f98c2125fe
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565633


bge_gaussiannb_vs1e-05:  80%|████████  | 4/5 [00:09<00:02,  2.32s/it]

🏃 View run bge_gaussiannb_vs1e-06 at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565633/runs/8f817149587947e78f506c5e356beb46
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565633


bge_gaussiannb_vs1e-05: 100%|██████████| 5/5 [00:11<00:00,  2.35s/it]

🏃 View run bge_gaussiannb_vs1e-05 at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565633/runs/f4027c8d4e60434f9471c3532123c254
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565633





In [9]:
# Completion marker: prints a fixed message (Spanish for "Experiment completed
# successfully"). It performs no check of its own.
print("Experimento Concluido con Exito!")

Experimento Concluido con Exito!


In [10]:
from IPython.display import Markdown

def export_code_cells():
    """Return the session's non-blank inputs as one Markdown ``python`` code fence.

    Reads the ``In`` list from the IPython user namespace, drops entries that
    are empty or whitespace-only, and joins the rest with a blank line between.
    """
    from IPython import get_ipython

    history = get_ipython().user_ns['In']
    code = '\n\n'.join(entry for entry in history if entry.strip())
    return Markdown(f'```python\n{code}\n```')

#export_code_cells()