In [1]:
import pandas as pd, numpy as np, math, os
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, roc_auc_score, f1_score, recall_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from tqdm.auto import tqdm

import torch
from transformers import AutoTokenizer, AutoModel
import mlflow

# --- MLflow / Databricks tracking configuration ---
# SECURITY: credentials must never live in the notebook source. The access
# token is read from the process environment; any token that was ever
# committed alongside this notebook must be considered leaked and revoked.
# The workspace URL is not a secret, so it is kept as an overridable default.
os.environ.setdefault("DATABRICKS_HOST", "https://dbc-2d843358-2bd3.cloud.databricks.com/")
if not os.environ.get("DATABRICKS_TOKEN"):
    raise RuntimeError(
        "DATABRICKS_TOKEN is not set. Export it in the environment "
        "before starting the kernel; do not paste it into the notebook."
    )
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Desarrollo_de_Soluciones/Combined_Datasets_Contextuales")

# Run the encoders on the GPU when one is visible to torch, otherwise on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: NVIDIA GeForce RTX 3090


In [2]:
# Data

# Raw inputs: the three pre-cleaned splits plus the Cochrane sample.
data_training = pd.read_csv("../data/cleaned_train_dataset.csv")
data_test = pd.read_csv("../data/cleaned_test_dataset.csv")
data_validation = pd.read_csv("../data/cleaned_val_dataset.csv")
data_cochrane = pd.read_csv("../data/cochrane_sample_large.csv")

# Cochrane labels are strings; normalise whitespace/case and encode them
# as plain -> 0, technical -> 1.
cochrane_label_codes = {"plain": 0, "technical": 1}
cochrane_labels_normalised = data_cochrane["label"].str.strip().str.lower()
data_cochrane["label"] = cochrane_labels_normalised.map(cochrane_label_codes).astype("int8")

# Pool every source into a single (text, label) table, dropping incomplete rows.
frames = [
    source[["text", "label"]]
    for source in (data_training, data_validation, data_test, data_cochrane)
]
data_all = pd.concat(frames, ignore_index=True)
data_all = data_all.dropna(subset=["text", "label"])
data_all["label"] = data_all["label"].astype(int)

# Fresh stratified 80/20 split of the pooled data (fixed seed).
train_df, val_df = train_test_split(
    data_all,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=data_all["label"],
)

# Plain Python / NumPy views consumed by the encoders and classifiers.
texts_train, y_train = train_df["text"].astype(str).tolist(), train_df["label"].to_numpy()
texts_val, y_val = val_df["text"].astype(str).tolist(), val_df["label"].to_numpy()

In [3]:
# Vectorizadores

def mean_pooling(last_hidden_state, attention_mask):
    """Average token vectors over dim 1, counting only unmasked positions.

    ``attention_mask`` gets a trailing axis and is expanded to the size of
    ``last_hidden_state``, so masked positions (0) contribute nothing to the
    sum. Assumes the usual (batch, tokens, hidden) / (batch, tokens) layout
    - confirm against the caller.

    Returns the masked sum divided by the number of unmasked positions; the
    divisor is clamped to 1e-9 so a fully masked row yields zeros rather
    than NaN.
    """
    weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    token_total = torch.sum(last_hidden_state * weights, dim=1)
    token_count = weights.sum(dim=1).clamp(min=1e-9)
    return token_total / token_count

@torch.no_grad()
def encode_texts(texts, tokenizer, model, batch_size=16, max_length=256):
    """Embed ``texts`` with ``model`` using mask-aware mean pooling.

    Args:
        texts: Sized, sliceable collection of strings.
        tokenizer: Callable tokenizer; invoked with padding, truncation and
            ``return_tensors="pt"``, and its result is moved to the
            notebook-level ``device``.
        model: Encoder whose output exposes ``last_hidden_state``.
        batch_size: Number of texts encoded per forward pass.
        max_length: Truncation length forwarded to the tokenizer.

    Returns:
        A float32 NumPy array with one row per input text. For empty input
        an array of shape ``(0, hidden_size)`` is returned, where
        ``hidden_size`` is read from ``model.config.hidden_size`` when that
        attribute exists and is 0 otherwise.
    """
    # torch.cat() rejects an empty list, so empty input is handled up front
    # instead of failing after the (empty) loop.
    if len(texts) == 0:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding", leave=False):
        # Materialise the slice as a list so the tokenizer always receives a
        # plain sequence of strings regardless of the container type.
        batch = list(texts[i:i + batch_size])
        enc = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)
        out = model(**enc)
        pooled = mean_pooling(out.last_hidden_state, enc["attention_mask"])
        # Move each batch to CPU right away so results do not pile up on the
        # accelerator while the loop runs.
        embs.append(pooled.float().cpu())
    return torch.cat(embs, dim=0).numpy().astype(np.float32)

# --- Encoders to compare (each one evaluated separately) ---
# Every entry carries a short display name, the Hugging Face model id and
# the tokenizer truncation length used for that encoder.
ENCODERS = [
    dict(
        name="PubMedBERT",
        hf_id="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
        max_length=512,
    ),
    dict(
        name="BGE-base",
        hf_id="BAAI/bge-base-en-v1.5",
        max_length=512,
    ),
]

In [4]:
# Modelos

def build_models(random_state=42):
    """Create fresh, unfitted classifiers to train on top of the embeddings.

    Args:
        random_state: Seed forwarded to every estimator that accepts one
            (LogisticRegression, SVC, XGBClassifier) so repeated runs are
            comparable. GaussianNB takes no seed.

    Returns:
        A list of ``(name, estimator)`` pairs. The scikit-learn classifiers
        are wrapped in a Pipeline that standardises the features first;
        XGBClassifier is used directly.
    """
    return [
        ("LogisticRegression", Pipeline([
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            # n_jobs is intentionally not passed: it has no effect with the
            # liblinear solver and only makes scikit-learn emit a warning.
            ("clf", LogisticRegression(C=1.0, solver="liblinear",
                                       class_weight="balanced", max_iter=2000,
                                       random_state=random_state))
        ])),
        ("SVC", Pipeline([
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            # probability=True fits an internal cross-validated calibration,
            # which is randomised; the seed makes it repeatable.
            ("clf", SVC(kernel="linear", C=1.0, probability=True,
                        class_weight="balanced", max_iter=5000,
                        random_state=random_state))
        ])),
        ("GaussianNB", Pipeline([
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("clf", GaussianNB(var_smoothing=1e-8))
        ])),
        ("XGBClassifier", XGBClassifier(
            objective="binary:logistic", learning_rate=0.1, n_estimators=400,
            max_depth=4, tree_method="hist", n_jobs=-1, eval_metric=["aucpr","auc"],
            random_state=random_state
        )),
    ]

def get_scores_and_preds(model, X):
    """Return ``(scores, hard_predictions)`` for a fitted binary classifier.

    When the model exposes ``predict_proba``, the score is column 1 of its
    output and predictions use a 0.5 cut-off. Otherwise the score is the
    ``decision_function`` output and predictions use a cut-off of 0.
    Predictions are integer arrays of 0/1.
    """
    if not hasattr(model, "predict_proba"):
        # Margin-based models: the sign of the decision value picks the class.
        margins = model.decision_function(X)
        return margins, (margins >= 0).astype(int)

    positive_proba = model.predict_proba(X)[:, 1]
    return positive_proba, (positive_proba >= 0.5).astype(int)

In [5]:
# One pass per encoder: embed both splits once, then train and log every
# classifier on those embeddings as its own MLflow run.
for enc in ENCODERS:
    enc_name, MODEL_NAME, MAX_LEN = enc["name"], enc["hf_id"], enc["max_length"]

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    enc_model = AutoModel.from_pretrained(MODEL_NAME)
    enc_model = enc_model.to(device).eval()

    # Train split is encoded first, then validation.
    X_train, X_val = (
        encode_texts(split_texts, tokenizer, enc_model, batch_size=16, max_length=MAX_LEN)
        for split_texts in (texts_train, texts_val)
    )

    print(f"\nEmbeddings {enc_name}: train={X_train.shape}, val={X_val.shape}")

    modelos = build_models()
    pbar = tqdm(modelos, desc=f"Combined eval ({enc_name})", leave=True)

    for model_name, est in pbar:
        run_name = f"{enc_name}__{model_name}"
        pbar.set_description(run_name)

        with mlflow.start_run(run_name=run_name):
            # Run metadata: which dataset and encoder produced the features.
            run_tags = {
                "dataset": "combined_old+cochrane",
                "encoder_name": enc_name,
                "encoder_hf_id": MODEL_NAME,
            }
            for tag_key, tag_value in run_tags.items():
                mlflow.set_tag(tag_key, tag_value)

            run_params = {
                "embed_dim": X_train.shape[1],
                "n_train": X_train.shape[0],
                "n_val": X_val.shape[0],
                "max_length": MAX_LEN,
            }
            for param_key, param_value in run_params.items():
                mlflow.log_param(param_key, int(param_value))

            est.fit(X_train, y_train)

            s_val, yhat_val = get_scores_and_preds(est, X_val)

            # Ranking metrics consume the continuous scores; the remaining
            # metrics consume the thresholded predictions.
            val_metrics = {}
            for metric_key, scorer in (
                ("val_pr_auc", average_precision_score),
                ("val_roc_auc", roc_auc_score),
            ):
                val_metrics[metric_key] = float(scorer(y_val, s_val))
            for metric_key, scorer in (
                ("val_f1", f1_score),
                ("val_recall", recall_score),
                ("val_accuracy", accuracy_score),
            ):
                val_metrics[metric_key] = float(scorer(y_val, yhat_val))
            mlflow.log_metrics(val_metrics)

                                                           


Embeddings PubMedBERT: train=(1862, 768), val=(466, 768)




🏃 View run PubMedBERT__LogisticRegression at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091/runs/b6dc8fe512934734bf2709af58753fb1
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091


PubMedBERT__GaussianNB:  50%|█████     | 2/4 [00:05<00:05,  2.60s/it]        

🏃 View run PubMedBERT__SVC at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091/runs/7e67ab294307464d8127e8c06158d70e
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091


PubMedBERT__XGBClassifier:  75%|███████▌  | 3/4 [00:07<00:02,  2.39s/it]

🏃 View run PubMedBERT__GaussianNB at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091/runs/6611dd6d558d4886adf75cacc34235ea
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091
🏃 View run PubMedBERT__XGBClassifier at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091/runs/a3ea6b3f688340a9b70947d6a399690b
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091


PubMedBERT__XGBClassifier: 100%|██████████| 4/4 [00:13<00:00,  3.33s/it]
                                                           


Embeddings BGE-base: train=(1862, 768), val=(466, 768)


BGE-base__SVC:  25%|██▌       | 1/4 [00:02<00:07,  2.56s/it]               

🏃 View run BGE-base__LogisticRegression at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091/runs/35c9afab66f04979b28894ed310903f3
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091


BGE-base__GaussianNB:  50%|█████     | 2/4 [00:04<00:04,  2.46s/it]

🏃 View run BGE-base__SVC at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091/runs/4a6c1989dfc94c0db3755fc02383ef68
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091


BGE-base__XGBClassifier:  75%|███████▌  | 3/4 [00:07<00:02,  2.37s/it]

🏃 View run BGE-base__GaussianNB at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091/runs/e6b9ad6644bc4ba7b5744e442e97fc79
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091


BGE-base__XGBClassifier: 100%|██████████| 4/4 [00:11<00:00,  2.86s/it]

🏃 View run BGE-base__XGBClassifier at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091/runs/411ab43e67424c55822f5b578163737a
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/3326969877813091





In [6]:
# Completion marker: when the notebook is run top to bottom, this line is
# only reached after the experiment loop above has finished without raising.
print("Experimento Concluido con Exito!")

Experimento Concluido con Exito!


In [7]:
from IPython.display import Markdown

def export_code_cells():
    """Render this session's executed cell inputs as one Markdown code block.

    Reads the ``In`` history list from the active IPython shell's user
    namespace, skips entries that are empty or whitespace-only, joins the
    rest with a blank line between them, and wraps the result in a
    ```python fenced block returned as an ``IPython.display.Markdown``.
    """
    from IPython import get_ipython

    input_history = get_ipython().user_ns['In']
    non_blank_inputs = (entry for entry in input_history if entry.strip())
    code = '\n\n'.join(non_blank_inputs)
    return Markdown(f'```python\n{code}\n```')

#export_code_cells()