In [None]:
import pandas as pd
import numpy as np
import math
from itertools import product

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import average_precision_score, roc_auc_score, f1_score, recall_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import torch
from transformers import AutoTokenizer, AutoModel

from tqdm.auto import tqdm
import mlflow, os

# MLflow tracking on Databricks. Credentials are read from the environment and are
# never written here: assigning an empty DATABRICKS_TOKEN would overwrite a valid
# token that was exported before the kernel started.
# setdefault keeps a host that is already configured (so it stays paired with the
# environment's token) and only falls back to the project workspace otherwise.
os.environ.setdefault("DATABRICKS_HOST", "https://dbc-69cf0a80-d1d0.cloud.databricks.com/")
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("PubMedBert_NaiveBayes")

# Hugging Face checkpoint used as the text encoder.
MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

RestException: RESOURCE_DOES_NOT_EXIST: Could not find experiment with ID None.

In [None]:
# Load the cleaned train / test / validation splits (read in that order).
data_training, data_test, data_validation = map(
    pd.read_csv,
    (
        "../data/cleaned_train_dataset.csv",
        "../data/cleaned_test_dataset.csv",
        "../data/cleaned_val_dataset.csv",
    ),
)

In [None]:
# Vectorizer: tokenizer and pretrained encoder for MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the weights first, then move the module to the selected device.
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)

# Switch to evaluation mode; as the cell's last expression this also displays the model.
model.eval()

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
def mean_pooling(last_hidden_state, attention_mask):
    """Average ``last_hidden_state`` over dimension 1, counting only masked-in positions.

    Args:
        last_hidden_state: Tensor of per-token vectors (presumably
            (batch, tokens, hidden) -- confirm against the encoder output).
        attention_mask: Tensor with one fewer trailing dimension than
            ``last_hidden_state``; non-zero entries mark positions to include.

    Returns:
        Tensor holding the masked mean along dimension 1. Rows whose mask is
        entirely zero yield zeros because the divisor is clamped to 1e-9.
    """
    # Broadcast the mask across the feature dimension so it can weight every value.
    weights = attention_mask.unsqueeze(-1).expand_as(last_hidden_state).float()
    weighted_total = torch.sum(last_hidden_state * weights, dim=1)
    # Clamp so a fully masked row never divides by zero.
    valid_count = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / valid_count

@torch.no_grad()
def encode_texts(texts, batch_size=16, max_length=256):
    """Embed texts with the module-level ``tokenizer`` and ``model`` via masked mean pooling.

    Args:
        texts: Sized, sliceable collection of inputs; every item is passed through ``str``.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Truncation length handed to the tokenizer.

    Returns:
        NumPy array with one row per input text. Empty input returns an array of
        shape ``(0, model.config.hidden_size)`` instead of raising.
    """
    if len(texts) == 0:
        # torch.cat([]) raises on an empty list, so give empty input an explicit
        # zero-row result whose width matches the encoder's hidden size.
        return torch.empty((0, model.config.hidden_size)).numpy()

    embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding", leave=False):
        batch = texts[i:i + batch_size]
        enc = tokenizer(
            list(map(str, batch)),
            padding=True, truncation=True, max_length=max_length, return_tensors="pt"
        ).to(device)

        outputs = model(**enc)
        pooled = mean_pooling(outputs.last_hidden_state, enc["attention_mask"])
        # Move each batch off the device right away so only one batch stays there.
        embs.append(pooled.cpu())
    return torch.cat(embs, dim=0).numpy()

In [None]:
# Build embedding matrices from the text column of each split
# (encoded in the order train, validation, test).
X_train, X_val, X_test = (
    encode_texts(frame["text"].astype(str).tolist(), batch_size=16, max_length=256)
    for frame in (data_training, data_validation, data_test)
)

# Integer label vectors taken from each split's "label" column.
y_train, y_test, y_val = (
    frame["label"].astype(int).values
    for frame in (data_training, data_test, data_validation)
)

print(f"Embeddings: train={X_train.shape}, val={X_val.shape}, test={X_test.shape}")

Encoding:   0%|          | 0/51 [00:00<?, ?it/s]

Encoding:   0%|          | 0/16 [00:00<?, ?it/s]

Encoding:   0%|          | 0/17 [00:00<?, ?it/s]

Embeddings: train=(816, 768), val=(247, 768), test=(258, 768)


In [None]:
# Hyperparameter grid for Naive Bayes: each key maps to the list of values to try.
nb_grid = dict(
    var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6, 1e-5],  # smoothing parameter for GaussianNB
)

In [None]:
def grid_items(grid):
    """Yield one ``{name: value}`` dict per point in the Cartesian product of ``grid``.

    Args:
        grid: Mapping from parameter name to an iterable of candidate values.

    Yields:
        Dicts holding one value per parameter name, in the order ``itertools.product``
        produces the combinations (the last key varies fastest).
    """
    names = tuple(grid)
    candidate_lists = (grid[name] for name in names)
    for combo in product(*candidate_lists):
        yield {name: value for name, value in zip(names, combo)}

In [None]:
# One MLflow run per hyperparameter combination in nb_grid.
nb_total = math.prod(len(v) for v in nb_grid.values())

# The context manager closes the progress bar even when a fit or an MLflow call
# raises, so an interrupted sweep does not leave a dangling bar behind.
with tqdm(total=nb_total, desc="Runs", leave=True) as pbar:
    for nb_params in grid_items(nb_grid):
        run_name = f"pubmedbert_nb_vs{nb_params['var_smoothing']}"
        pbar.set_description(run_name)

        with mlflow.start_run(run_name=run_name):
            # Run metadata: encoder, featurization, classifier and data sizes.
            mlflow.set_tag("encoder", MODEL_NAME)
            mlflow.set_tag("featurizer", "contextual_mean_pool")
            mlflow.set_tag("model_name", "GaussianNB")
            mlflow.log_param("embed_dim", int(X_train.shape[1]))
            mlflow.log_param("n_train", int(X_train.shape[0]))
            mlflow.log_param("n_val", int(X_val.shape[0]))
            for k, v in nb_params.items():
                mlflow.log_param(f"nb_{k}", v)

            # Scaling + Naive Bayes in a single pipeline, so the scaler is fitted on
            # the training split only (author's note: scaling helps with dense embeddings).
            pipe = Pipeline([
                ("scaler", StandardScaler(with_mean=True, with_std=True)),
                ("clf", GaussianNB(**nb_params))
            ])
            pipe.fit(X_train, y_train)

            # Validation metrics. Column 1 of predict_proba is used as the score --
            # presumably the positive class, assuming labels are 0/1 (confirm).
            val_scores = pipe.predict_proba(X_val)[:, 1]
            val_pred = (val_scores >= 0.5).astype(int)
            metrics = {
                "pr_auc":   float(average_precision_score(y_val, val_scores)),
                "roc_auc":  float(roc_auc_score(y_val, val_scores)),
                "f1":       float(f1_score(y_val, val_pred)),
                "recall":   float(recall_score(y_val, val_pred)),
                "accuracy": float(accuracy_score(y_val, val_pred)),
            }
            mlflow.log_metrics(metrics)

        pbar.update(1)

pubmedbert_logreg_C0.1_saga:  17%|█▋        | 1/6 [00:01<00:08,  1.80s/it]     

🏃 View run pubmedbert_logreg_C0.1_liblinear at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/298073712096804/runs/79deae6708d74d0f9f7e7399b3afd07a
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/298073712096804


pubmedbert_logreg_C1.0_liblinear:  33%|███▎      | 2/6 [00:03<00:08,  2.02s/it]

🏃 View run pubmedbert_logreg_C0.1_saga at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/298073712096804/runs/631badbbbc8a4da79b89dfe270a2639c
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/298073712096804


pubmedbert_logreg_C1.0_saga:  50%|█████     | 3/6 [00:05<00:05,  1.71s/it]     

🏃 View run pubmedbert_logreg_C1.0_liblinear at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/298073712096804/runs/307e15a520264679856a1cd66af0f515
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/298073712096804


pubmedbert_logreg_C5.0_liblinear:  67%|██████▋   | 4/6 [00:07<00:04,  2.05s/it]

🏃 View run pubmedbert_logreg_C1.0_saga at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/298073712096804/runs/b9dc9a663023408c895d23c691c85301
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/298073712096804


pubmedbert_logreg_C5.0_saga:  83%|████████▎ | 5/6 [00:09<00:01,  1.76s/it]     

🏃 View run pubmedbert_logreg_C5.0_liblinear at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/298073712096804/runs/e821876f04164efb9e446f9d572633df
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/298073712096804


pubmedbert_logreg_C5.0_saga: 100%|██████████| 6/6 [00:12<00:00,  2.08s/it]

🏃 View run pubmedbert_logreg_C5.0_saga at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/298073712096804/runs/547fcfd7814646959522858f096d9d8b
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/298073712096804





In [None]:
# Final marker cell: prints a completion message unconditionally (it does not itself
# verify that the runs above succeeded).
print("Experimento Concluido con Exito!")

Experimento Concluido con Exito!


In [None]:
from IPython.display import Markdown

def export_code_cells():
    """Return the session's non-blank input cells as a single Markdown ``python`` fence.

    Reads the ``In`` list from the running IPython shell's user namespace, drops
    entries that are empty or whitespace-only, and joins the rest with blank lines.
    """
    from IPython import get_ipython

    kept = []
    for source in get_ipython().user_ns['In']:
        # Skip empty or whitespace-only entries.
        if source.strip():
            kept.append(source)
    code = '\n\n'.join(kept)
    return Markdown(f'```python\n{code}\n```')

#export_code_cells()