In [1]:
import pandas as pd, numpy as np, math, os
from itertools import product
from tqdm.auto import tqdm

import torch
from transformers import AutoTokenizer, AutoModel

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import average_precision_score, roc_auc_score, f1_score, recall_score, accuracy_score
import mlflow

# --- MLflow / Databricks configuration ---
# The workspace URL is not a secret, so it stays as a default that an already
# exported DATABRICKS_HOST can override.
os.environ.setdefault("DATABRICKS_HOST", "https://dbc-2d843358-2bd3.cloud.databricks.com/")

# SECURITY: the access token must never be stored in the notebook. It is read
# from the environment of the process that launches the kernel. Any token that
# was previously committed in this cell has been exposed and should be revoked.
if not os.environ.get("DATABRICKS_TOKEN"):
    raise RuntimeError(
        "DATABRICKS_TOKEN is not set. Export it in the environment that starts "
        "the kernel instead of hardcoding it in the notebook."
    )

mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Desarrollo_de_Soluciones/Coachrane_Dataset_Contextuales")

# --- Compute device ---
# Use the first CUDA device when one is visible, otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: NVIDIA GeForce RTX 3090


In [2]:
# Datos

# All CSVs are read from one directory; every file is accessed through its
# "text" and "label" columns below.
DATA_DIR = "../data"
COCHRANE_LABEL_MAP = {"plain": 0, "technical": 1}

data_training = pd.read_csv(f"{DATA_DIR}/cleaned_train_dataset.csv")
data_test = pd.read_csv(f"{DATA_DIR}/cleaned_test_dataset.csv")
data_validation = pd.read_csv(f"{DATA_DIR}/cleaned_val_dataset.csv")
data_cochrane = pd.read_csv(f"{DATA_DIR}/cochrane_sample_large.csv")

# The Cochrane file stores labels as text. Normalise whitespace/case, map to
# 0/1, and validate BEFORE the integer cast: an unknown label maps to NaN, and
# casting NaN to int8 would otherwise fail without saying which value was bad.
cochrane_label_codes = data_cochrane["label"].str.strip().str.lower().map(COCHRANE_LABEL_MAP)
unmapped_labels = data_cochrane.loc[cochrane_label_codes.isna(), "label"].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"Unexpected Cochrane labels (expected one of {sorted(COCHRANE_LABEL_MAP)}): {unmapped_labels}"
    )
data_cochrane["label"] = cochrane_label_codes.astype("int8")
assert set(data_cochrane["label"].unique()) <= {0, 1}

# Targets as plain integer arrays.
y_train    = data_training["label"].astype(int).values
y_test     = data_test["label"].astype(int).values
y_val      = data_validation["label"].astype(int).values
y_cochrane = data_cochrane["label"].astype(int).values

# Raw texts as Python lists of str (the form the tokenizer is called with).
# NOTE(review): astype(str) turns a missing text into the literal string "nan";
# confirm the cleaned CSVs contain no empty text rows.
texts_train = data_training["text"].astype(str).tolist()
texts_val   = data_validation["text"].astype(str).tolist()
texts_test  = data_test["text"].astype(str).tolist()
texts_coc   = data_cochrane["text"].astype(str).tolist()

In [3]:
# Vectorizadores

def mean_pooling(last_hidden_state, attention_mask):
    """Average token embeddings over the sequence axis, skipping masked tokens.

    Args:
        last_hidden_state: token embeddings; dim 1 is the axis that is reduced
            (expected layout: batch, sequence, hidden).
        attention_mask: per-token weights with one fewer trailing axis than
            ``last_hidden_state`` (1 keeps a token, 0 drops it).

    Returns:
        The mask-weighted sum over dim 1 divided by the mask total for each
        row. The divisor is floored at 1e-9 so a fully masked row yields zeros
        instead of a division by zero.
    """
    # Broadcast the mask across the hidden axis and make it float32.
    weights = attention_mask[..., None].expand_as(last_hidden_state).to(torch.float32)
    token_total = torch.sum(last_hidden_state * weights, dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return token_total / weight_total

@torch.no_grad()
def encode_texts(texts, tokenizer, model, batch_size=16, max_length=256):
    """Embed a list of texts with a transformer encoder and masked mean pooling.

    Args:
        texts: sequence of strings; it is sliced into consecutive batches.
            Must be non-empty, since the per-batch results are concatenated
            at the end.
        tokenizer: callable invoked with ``padding=True, truncation=True,
            max_length=max_length, return_tensors="pt"``; its result must
            support ``.to(device)`` and contain an ``"attention_mask"`` entry.
        model: encoder called as ``model(**encoded_batch)`` whose output
            exposes ``last_hidden_state``.
        batch_size: number of texts tokenized and encoded per forward pass.
        max_length: truncation length forwarded to the tokenizer.

    Returns:
        ``np.ndarray`` of dtype float32 with one row per input text, in input
        order.
    """
    # Send the inputs to wherever the model's weights actually are, instead of
    # trusting the notebook-level `device` to match. The global is only a
    # fallback for objects that expose no parameters.
    try:
        target_device = next(model.parameters()).device
    except (StopIteration, AttributeError):
        target_device = device

    embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding", leave=False):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(target_device)
        out = model(**enc)
        pooled = mean_pooling(out.last_hidden_state, enc["attention_mask"])
        # Move each batch to host memory right away so accelerator memory
        # holds only one batch of activations at a time.
        embs.append(pooled.float().cpu())
    X = torch.cat(embs, dim=0).numpy().astype(np.float32)
    return X

# Encoders to benchmark. Each entry holds a short display name ("name"), the
# identifier handed to `from_pretrained` ("hf_id"), and the truncation length
# used when tokenizing ("max_length").
ENCODERS = [
    dict(
        name="PubMedBERT",
        hf_id="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
        max_length=512,
    ),
    dict(
        name="BGE-base",
        hf_id="BAAI/bge-base-en-v1.5",
        max_length=512,
    ),
]

In [4]:
# Construcción de los Clasificadores

def build_models(seed=42):
    """Create a fresh, unfitted set of classifiers to benchmark on the embeddings.

    Args:
        seed: ``random_state`` forwarded to the estimators that have a
            stochastic component (liblinear data shuffling, the SVC
            probability calibration, XGBoost) so repeated runs are comparable.

    Returns:
        List of ``(name, estimator)`` tuples in the order LogisticRegression,
        SVC, GaussianNB, XGBClassifier. The first three are wrapped in a
        pipeline that standardises the features first; XGBClassifier is used
        on the raw features.
    """
    def _standardised(clf):
        # Shared preprocessing: zero-mean / unit-variance features, then `clf`.
        return Pipeline([
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("clf", clf),
        ])

    return [
        # `n_jobs` is intentionally not passed: it has no effect with the
        # liblinear solver and only triggers a scikit-learn warning.
        ("LogisticRegression",
         _standardised(LogisticRegression(C=1.0, solver="liblinear", class_weight="balanced",
                                          max_iter=2000, random_state=seed))),
        # probability=True fits an internal cross-validated calibration whose
        # data shuffling is controlled by `random_state`.
        ("SVC",
         _standardised(SVC(kernel="linear", C=1.0, probability=True, class_weight="balanced",
                           max_iter=5000, random_state=seed))),
        ("GaussianNB",
         _standardised(GaussianNB(var_smoothing=1e-8))),
        ("XGBClassifier",
         XGBClassifier(objective="binary:logistic", learning_rate=0.1, n_estimators=400,
                       max_depth=4, tree_method="hist", n_jobs=-1, eval_metric=["aucpr", "auc"],
                       random_state=seed)),
    ]

def get_scores_and_preds(model, X):
    """Return continuous scores and hard 0/1 predictions for ``X``.

    If ``model`` exposes ``predict_proba``, the score is column 1 of its
    output and the prediction is 1 where the score is at least 0.5.
    Otherwise the score is ``decision_function(X)`` and the prediction is 1
    where the score is at least 0.

    Returns:
        Tuple ``(scores, predictions)``; predictions are the thresholded
        scores cast to ``int``.
    """
    if not hasattr(model, "predict_proba"):
        # Margin-based model: the decision boundary sits at zero.
        margins = model.decision_function(X)
        return margins, (margins >= 0).astype(int)

    positive_prob = model.predict_proba(X)[:, 1]
    return positive_prob, (positive_prob >= 0.5).astype(int)

In [5]:
# Experimentos

def _classification_metrics(y_true, scores, y_pred, prefix):
    """Compute the logged metrics for one evaluation split.

    Args:
        y_true: ground-truth binary labels.
        scores: continuous scores, used for the ranking metrics (PR-AUC, ROC-AUC).
        y_pred: hard 0/1 predictions, used for F1, recall and accuracy.
        prefix: split tag prepended to every metric name (e.g. "val", "coc").

    Returns:
        Dict mapping ``"<prefix>_<metric>"`` to a Python float.
    """
    return {
        f"{prefix}_pr_auc": float(average_precision_score(y_true, scores)),
        f"{prefix}_roc_auc": float(roc_auc_score(y_true, scores)),
        f"{prefix}_f1": float(f1_score(y_true, y_pred)),
        f"{prefix}_recall": float(recall_score(y_true, y_pred)),
        f"{prefix}_accuracy": float(accuracy_score(y_true, y_pred)),
    }


for enc_cfg in ENCODERS:
    MODEL_NAME = enc_cfg["hf_id"]
    enc_label  = enc_cfg["name"]
    max_len    = enc_cfg["max_length"]
    print(f"\n=== Encoder: {enc_label} ({MODEL_NAME}) ===")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

    # Embed every split once per encoder; all classifiers reuse these matrices.
    # NOTE: X_test is computed (and reported in the shape summary) but is not
    # evaluated anywhere in this loop.
    X_train = encode_texts(texts_train, tokenizer, model, batch_size=16, max_length=max_len)
    X_val   = encode_texts(texts_val,   tokenizer, model, batch_size=16, max_length=max_len)
    X_test  = encode_texts(texts_test,  tokenizer, model, batch_size=16, max_length=max_len)
    X_coc   = encode_texts(texts_coc,   tokenizer, model, batch_size=16, max_length=max_len)

    # The encoder is no longer needed once the embeddings exist. Move it off
    # the accelerator so it does not sit in GPU memory while the classifiers
    # train, or while the next encoder is being loaded.
    model.to("cpu")
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"Embeddings ({enc_label}): train={X_train.shape}, val={X_val.shape}, test={X_test.shape}, coc={X_coc.shape}")

    modelos = build_models()
    pbar = tqdm(modelos, desc=f"Model eval ({enc_label})", leave=True)

    for model_name, estimator in pbar:
        run_name = f"{enc_label}__{model_name}"
        pbar.set_description(run_name)

        # One MLflow run per (encoder, classifier) pair.
        with mlflow.start_run(run_name=run_name):
            mlflow.set_tag("encoder_family", enc_label)
            mlflow.set_tag("encoder_hf_id", MODEL_NAME)
            mlflow.log_param("embed_dim", int(X_train.shape[1]))
            mlflow.log_param("n_train", int(X_train.shape[0]))
            mlflow.log_param("n_val", int(X_val.shape[0]))
            # Also record the external-set size and the truncation length so a
            # run can be interpreted without opening the notebook.
            mlflow.log_param("n_cochrane", int(X_coc.shape[0]))
            mlflow.log_param("max_length", int(max_len))

            # Train
            estimator.fit(X_train, y_train)

            # --- Internal validation (val) ---
            s_val, yhat_val = get_scores_and_preds(estimator, X_val)
            metrics_val = _classification_metrics(y_val, s_val, yhat_val, prefix="val")
            mlflow.log_metrics(metrics_val)

            # --- Evaluation on the Cochrane sample ---
            s_coc, yhat_coc = get_scores_and_preds(estimator, X_coc)
            metrics_coc = _classification_metrics(y_cochrane, s_coc, yhat_coc, prefix="coc")
            mlflow.log_metrics(metrics_coc)



=== Encoder: PubMedBERT (microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract) ===


                                                         

Embeddings (PubMedBERT): train=(816, 768), val=(247, 768), test=(258, 768), coc=(1007, 768)


PubMedBERT__SVC:  25%|██▌       | 1/4 [00:02<00:07,  2.57s/it]               

🏃 View run PubMedBERT__LogisticRegression at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090/runs/dc55e4cc4b1f41fcb06a3e59e77a198b
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090
🏃 View run PubMedBERT__SVC at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090/runs/502e02d2a32d4dc592881acd9cd81c3c
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090


PubMedBERT__XGBClassifier:  75%|███████▌  | 3/4 [00:07<00:02,  2.43s/it]

🏃 View run PubMedBERT__GaussianNB at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090/runs/9c1bdc0507734721aa5cd90f695ae1c4
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090
🏃 View run PubMedBERT__XGBClassifier at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090/runs/090545751b294229a9a9128f4aa4f109
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090


PubMedBERT__XGBClassifier: 100%|██████████| 4/4 [00:10<00:00,  2.74s/it]



=== Encoder: BGE-base (BAAI/bge-base-en-v1.5) ===


                                                         

Embeddings (BGE-base): train=(816, 768), val=(247, 768), test=(258, 768), coc=(1007, 768)


BGE-base__SVC:  25%|██▌       | 1/4 [00:02<00:07,  2.42s/it]               

🏃 View run BGE-base__LogisticRegression at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090/runs/a7b033bd83e44ab1a6d7e5cbda03a975
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090


BGE-base__GaussianNB:  50%|█████     | 2/4 [00:04<00:04,  2.42s/it]

🏃 View run BGE-base__SVC at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090/runs/f8d3a7f0640b4ab49fcc5bac224209b6
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090


BGE-base__GaussianNB:  75%|███████▌  | 3/4 [00:07<00:02,  2.38s/it]

🏃 View run BGE-base__GaussianNB at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090/runs/e7a0997de8ad4d4a88dd1f0d99ad61da
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090


BGE-base__XGBClassifier: 100%|██████████| 4/4 [00:11<00:00,  2.93s/it]

🏃 View run BGE-base__XGBClassifier at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090/runs/0399be1ef4cd4d42a9e22af9a68b7933
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813090





In [6]:
# Final status message for the notebook run.
print("Experimento Concluido con Exito!")

Experimento Concluido con Exito!


In [7]:
from IPython.display import Markdown

def export_code_cells():
    """Render the session's non-blank input cells as one Markdown code fence.

    Reads the ``In`` history list from the IPython user namespace, drops
    entries that are empty or whitespace-only, joins the rest with a blank
    line between them, and wraps the result in a python code fence.

    Returns:
        An ``IPython.display.Markdown`` object containing the fenced code.
    """
    from IPython import get_ipython

    history = get_ipython().user_ns['In']
    non_blank = filter(lambda source: source.strip(), history)
    joined = '\n\n'.join(non_blank)
    return Markdown(f'```python\n{joined}\n```')

#export_code_cells()