In [1]:
import pandas as pd
import numpy as np
import math
from itertools import product

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score, f1_score, recall_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import torch
from transformers import AutoTokenizer, AutoModel

from tqdm.auto import tqdm
import mlflow, os

# Databricks credentials are read from the environment instead of being written
# into the notebook, where they would be committed and shared with every export.
# NOTE(review): the access token that was previously hardcoded in this cell must be
# treated as leaked — revoke it in the Databricks workspace and issue a new one.
from getpass import getpass

# Keep a host already configured in the environment; otherwise use the project workspace.
os.environ.setdefault("DATABRICKS_HOST", "https://dbc-e0c2984f-335b.cloud.databricks.com/")
if not os.environ.get("DATABRICKS_TOKEN"):
    # Interactive fallback: the value lives only in this kernel's environment.
    os.environ["DATABRICKS_TOKEN"] = getpass("DATABRICKS_TOKEN: ")

# Send MLflow runs to the Databricks-hosted tracking server and select the experiment.
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Desarrollo_de_Soluciones/LogReg_bge")

# Encoder checkpoint used for the sentence embeddings, and the device it runs on.
MODEL_NAME = "BAAI/bge-base-en-v1.5"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")


  from .autonotebook import tqdm as notebook_tqdm


Using device: NVIDIA GeForce RTX 3090


In [2]:
# Load the cleaned train / test / validation splits (read in that order).
data_training, data_test, data_validation = map(
    pd.read_csv,
    (
        "../data/cleaned_train_dataset.csv",
        "../data/cleaned_test_dataset.csv",
        "../data/cleaned_val_dataset.csv",
    ),
)

In [3]:
# Vectorizer: tokenizer and encoder weights for MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
# Last expression of the cell: puts the module in evaluation mode; the returned
# module is what the notebook displays below.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [4]:
def mean_pooling(last_hidden_state, attention_mask):
    """Average the hidden states along dim 1, weighting each position by its mask value.

    Args:
        last_hidden_state: Tensor whose dim 1 is reduced; the mask is expanded to
            this tensor's full size.
        attention_mask: Tensor with one fewer trailing dimension than
            ``last_hidden_state``; it is cast to float and used as a weight.

    Returns:
        The masked sum over dim 1 divided by the mask sum over dim 1. The divisor
        is clamped to at least 1e-9, so a fully masked row yields zeros rather
        than a division by zero.
    """
    weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    weighted_total = torch.sum(last_hidden_state * weights, dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / weight_total

@torch.no_grad()
def encode_texts(texts, batch_size=16, max_length=256):
    """Embed texts with the module-level ``tokenizer`` and ``model`` via masked mean pooling.

    Args:
        texts: Sized, sliceable collection of texts; every item is passed through ``str``.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Token limit handed to the tokenizer; longer inputs are truncated.

    Returns:
        A 2-D NumPy array with one row per input text. An empty ``texts`` returns
        an array with zero rows instead of raising.
    """
    if len(texts) == 0:
        # torch.cat() rejects an empty list, so the empty result is built directly.
        # NOTE(review): assumes model.config exposes `hidden_size` (the loaded encoder
        # is BERT-style) and float32 weights; falls back to width 0 — confirm if the
        # encoder is swapped.
        width = getattr(model.config, "hidden_size", 0)
        return np.empty((0, width), dtype=np.float32)

    embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding", leave=False):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            list(map(str, batch)),
            padding=True, truncation=True, max_length=max_length, return_tensors="pt"
        ).to(device)

        outputs = model(**enc)
        pooled = mean_pooling(outputs.last_hidden_state, enc["attention_mask"])
        # Move each batch to the CPU right away so only one batch stays on the device.
        embs.append(pooled.cpu())
    return torch.cat(embs, dim=0).numpy()

In [5]:
# Build the embedding matrices from the text column (encoded in train, val, test order).
X_train, X_val, X_test = (
    encode_texts(frame["text"].astype(str).tolist(), batch_size=16, max_length=256)
    for frame in (data_training, data_validation, data_test)
)

# Integer label vectors for each split.
y_train, y_test, y_val = (
    frame["label"].astype(int).values
    for frame in (data_training, data_test, data_validation)
)

print(f"Embeddings: train={X_train.shape}, val={X_val.shape}, test={X_test.shape}")

                                                         

Embeddings: train=(816, 768), val=(247, 768), test=(258, 768)




In [6]:
# Hyperparameter grid for Logistic Regression: every C value is paired with every solver.
logreg_grid = dict(
    C=[0.1, 1.0, 5.0],
    solver=["liblinear", "saga"],
)

In [7]:
def grid_items(grid):
    """Yield one ``{name: value}`` dict per combination in the Cartesian product of ``grid``.

    Args:
        grid: Mapping from parameter name to an iterable of candidate values.

    Yields:
        Dicts keyed in the mapping's iteration order; the last key varies fastest.
    """
    names = list(grid.keys())
    candidates = [grid[name] for name in names]
    for combo in product(*candidates):
        yield dict(zip(names, combo))

In [8]:
# Fixed seed: the liblinear and saga solvers shuffle the data, so without it two
# executions of the same configuration can log different metrics.
RANDOM_STATE = 42

total = math.prod(len(v) for v in logreg_grid.values())

# The context manager closes the progress bar even if one of the runs raises.
with tqdm(total=total, desc="Runs", leave=True) as pbar:
    for lr_params in grid_items(logreg_grid):
        run_name = f"bge_logreg_C{lr_params['C']}_{lr_params['solver']}"
        pbar.set_description(run_name)

        # One MLflow run per hyperparameter combination.
        with mlflow.start_run(run_name=run_name):
            mlflow.set_tag("encoder", MODEL_NAME)
            mlflow.set_tag("featurizer", "contextual_mean_pool")
            mlflow.log_params({
                "embed_dim": int(X_train.shape[1]),
                "n_train": int(X_train.shape[0]),
                "n_val": int(X_val.shape[0]),
                "logreg_random_state": RANDOM_STATE,
                **{f"logreg_{k}": v for k, v in lr_params.items()},
            })

            # Scaling + LogReg in one pipeline, so the scaler is fitted on the
            # training split only (scaling helps LogReg/SVM on dense features).
            # n_jobs is omitted on purpose: it only parallelizes over classes, so it
            # is a no-op for this binary task, and scikit-learn warns when it is
            # combined with the liblinear solver.
            pipe = Pipeline([
                ("scaler", StandardScaler(with_mean=True, with_std=True)),
                ("clf", LogisticRegression(
                    class_weight="balanced",
                    max_iter=2000,
                    random_state=RANDOM_STATE,
                    **lr_params
                ))
            ])
            pipe.fit(X_train, y_train)

            # Validation metrics: ranking metrics use the positive-class probability,
            # threshold metrics use a 0.5 cut-off on it.
            val_scores = pipe.predict_proba(X_val)[:, 1]
            val_pred = (val_scores >= 0.5).astype(int)
            metrics = {
                "pr_auc":   float(average_precision_score(y_val, val_scores)),
                "roc_auc":  float(roc_auc_score(y_val, val_scores)),
                "f1":       float(f1_score(y_val, val_pred)),
                "recall":   float(recall_score(y_val, val_pred)),
                "accuracy": float(accuracy_score(y_val, val_pred)),
            }
            mlflow.log_metrics(metrics)

        pbar.update(1)

bge_logreg_C0.1_saga:  17%|█▋        | 1/6 [00:01<00:07,  1.46s/it]     

🏃 View run bge_logreg_C0.1_liblinear at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/703118421924059/runs/be041719fb93400fb04c15d253373d82
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/703118421924059


bge_logreg_C1.0_liblinear:  33%|███▎      | 2/6 [00:03<00:06,  1.60s/it]

🏃 View run bge_logreg_C0.1_saga at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/703118421924059/runs/8cc3477c61fd4df79ea65d339d7efee0
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/703118421924059


bge_logreg_C1.0_saga:  50%|█████     | 3/6 [00:04<00:04,  1.48s/it]     

🏃 View run bge_logreg_C1.0_liblinear at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/703118421924059/runs/c60d599fe7be4ae5a10a0b68a564d50e
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/703118421924059


bge_logreg_C5.0_liblinear:  67%|██████▋   | 4/6 [00:06<00:03,  1.75s/it]

🏃 View run bge_logreg_C1.0_saga at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/703118421924059/runs/943faa45073d47eea78fae4bf8830f01
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/703118421924059


bge_logreg_C5.0_saga:  83%|████████▎ | 5/6 [00:08<00:01,  1.61s/it]     

🏃 View run bge_logreg_C5.0_liblinear at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/703118421924059/runs/bca516ec495e47838d3f7fd343d4781a
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/703118421924059


bge_logreg_C5.0_saga: 100%|██████████| 6/6 [00:10<00:00,  1.81s/it]

🏃 View run bge_logreg_C5.0_saga at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/703118421924059/runs/4d518d7fe3244f3ab9cafdeb3d2a6b9c
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/703118421924059





In [9]:
# Completion banner for the notebook (user-facing message, intentionally left as written).
print("Experimento Concluido con Exito!")

Experimento Concluido con Exito!


In [10]:
from IPython.display import Markdown

def export_code_cells():
    """Return the session's executed inputs as a single Markdown Python code fence.

    Reads the ``In`` list from the running IPython shell's user namespace, drops
    entries that are empty or whitespace-only, and joins the rest with blank lines.

    Returns:
        An ``IPython.display.Markdown`` object wrapping the joined source.
    """
    from IPython import get_ipython

    history = get_ipython().user_ns['In']
    non_blank = (source for source in history if source.strip())
    code = '\n\n'.join(non_blank)
    return Markdown(f'```python\n{code}\n```')

#export_code_cells()