In [1]:
from google.colab import drive
# Debug switch for this notebook (not referenced by any of the visible cells).
debug = False
# Mount Google Drive so the project files under /content/drive become reachable.
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import os
# Switch into the modeling directory: the relative paths used by later cells
# (e.g. "../files/processed/...") are resolved against this working directory.
# NOTE(review): presumably this is also what makes the local `loc_mpnet` package
# importable in the next cell — verify before reordering anything here.
os.chdir("/content/drive/MyDrive/thesis/2_MODELING")
from dotenv import load_dotenv

# Load environment variables from the ".env.prod" file one level above the
# working directory. Kept as the last expression so its return value is shown.
load_dotenv("../.env.prod")

True

In [3]:
from loc_mpnet.loader import TripletWithFourierDataset, make_collate_fn
from loc_mpnet.Model import LocHead, HybridEmbedder

In [4]:
import pandas as pd
# Load the training triplets, keep only ids / texts / Fourier features, and
# expose the vacancy columns under the generic "anchor_*" names used downstream.
# NOTE(review): the anchor text and features come from the `neg_vacant_*`
# columns — confirm that this is the intended anchor side of the triplet.
train = (
    pd.read_parquet("../files/processed/paired_datasets/train.parquet")
    .loc[
        :,
        [
            "vacant_id",
            "pos_candidate_id",
            "neg_candidate_id",
            "pos_candidate_full_text",
            "neg_candidate_full_text",
            "neg_vacant_full_text",
            "neg_vacant_fourier_feature",
            "pos_candidate_fourier_features",
            "neg_candidate_fourier_features",
        ],
    ]
    .rename(
        columns={
            "neg_vacant_full_text": "anchor_full_text",
            "neg_vacant_fourier_feature": "anchor_fourier_feature",
        }
    )
)

In [5]:
# Load the validation triplets with the same column selection and the same
# "anchor_*" renaming that is applied to the training split.
val = (
    pd.read_parquet("../files/processed/paired_datasets/val.parquet")
    .loc[
        :,
        [
            "vacant_id",
            "pos_candidate_id",
            "neg_candidate_id",
            "pos_candidate_full_text",
            "neg_candidate_full_text",
            "neg_vacant_full_text",
            "neg_vacant_fourier_feature",
            "pos_candidate_fourier_features",
            "neg_candidate_fourier_features",
        ],
    ]
    .rename(
        columns={
            "neg_vacant_full_text": "anchor_full_text",
            "neg_vacant_fourier_feature": "anchor_fourier_feature",
        }
    )
)

In [6]:
# Work on a 20% random subsample of each split. The seed is fixed so that a
# fresh "Restart & Run All" trains and evaluates on the same rows every time
# (without it, the before/after metrics are not comparable across runs).
# NOTE: this cell overwrites `train` / `val`; re-running it on its own
# subsamples the already-subsampled frames again.
SAMPLE_SEED = 42
train = train.sample(frac=0.2, random_state=SAMPLE_SEED)
val = val.sample(frac=0.2, random_state=SAMPLE_SEED)

In [7]:
import ast

def to_1d_float_col(col):
    """Convert every cell of a Series into a flat list of Python floats.

    Args:
        col: pandas Series whose cells are a sequence of numbers, a string
            holding a Python literal of such a sequence, or a missing value.

    Returns:
        A Series (same index) where each cell is a ``list[float]``. Missing
        cells — ``None`` or a float NaN — become an empty list.

    Raises:
        ValueError / SyntaxError: propagated from ``ast.literal_eval`` when a
            string cell is not a valid Python literal.
        TypeError: when a cell is a non-missing scalar that cannot be iterated.
    """
    def fix(x):
        if x is None:
            return []
        # A missing object cell can also be stored as float NaN (NaN != NaN);
        # iterating it would raise "'float' object is not iterable".
        if isinstance(x, float) and x != x:
            return []
        if isinstance(x, str):
            # Cells serialised as text, e.g. "[0.1, 0.2]"; literal_eval only
            # parses literals, it does not execute code.
            x = ast.literal_eval(x)
        return [float(t) for t in x]
    return col.apply(fix)

# Normalise the three Fourier-feature columns of both splits to lists of floats
# (train first, then val; anchor, positive, negative within each split).
fourier_columns = (
    "anchor_fourier_feature",
    "pos_candidate_fourier_features",
    "neg_candidate_fourier_features",
)
for split_df in (train, val):
    for column in fourier_columns:
        split_df[column] = to_1d_float_col(split_df[column])



In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

In [None]:

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# Checkpoint of the text encoder (original author's note: this is the eighth epoch).
model_path = "/content/drive/MyDrive/thesis/2_MODELING/base-best_3/checkpoint-76104"
# Load the base SentenceTransformer model (fine-tuned or original) onto the device.
base_model = SentenceTransformer(model_path, device=device)


Using device: cuda


In [10]:
import numpy as np
# Read the Fourier feature size off the first training anchor; its last
# dimension is passed to the model as `fourier_dim`.
example_fourier = np.array(
    train["anchor_fourier_feature"].iloc[0],
    dtype=np.float32,
)
fourier_dim = example_fourier.shape[-1]
print("Fourier dim:", fourier_dim)



Fourier dim: 8


In [11]:
# Build the hybrid embedder around the loaded text encoder, then move it to the
# selected device.
# NOTE(review): `proj_dim` / `loc_out_dim` are presumably the sizes of the
# projection and location-head outputs — confirm against loc_mpnet.Model.
hybrid_model = HybridEmbedder(
    base_model=base_model, fourier_dim=fourier_dim, proj_dim=256, loc_out_dim=32
)
hybrid_model = hybrid_model.to(device)

In [12]:
# Freeze the text encoder: none of its parameters receive gradients, so only
# the remaining parameters of the hybrid model are trainable.
for frozen_param in hybrid_model.base_model.parameters():
    frozen_param.requires_grad_(False)

In [13]:
from torch.utils.data import DataLoader


In [14]:
# Collate function built from the base model, plus one dataset per split.
collate_fn = make_collate_fn(base_model)
train_dataset = TripletWithFourierDataset(train)
val_dataset = TripletWithFourierDataset(val)

# Careful with the batch size: the ranking loss used in training scores every
# anchor against every positive of the same batch, so it also sets the number
# of in-batch candidates. Incomplete final batches are dropped for training.
train_loader = DataLoader(
    dataset=train_dataset,
    collate_fn=collate_fn,
    batch_size=512,
    shuffle=True,
    drop_last=True,
)

# Validation keeps the original row order and every row.
val_loader = DataLoader(
    dataset=val_dataset,
    collate_fn=collate_fn,
    batch_size=512,
    shuffle=False,
)

In [15]:
def encode_hybrid(hybrid_model, base_model, texts, fouriers, device="cuda", batch_size=256):
    """Encode texts together with their Fourier features in mini-batches.

    Args:
        hybrid_model: module called as ``hybrid_model(features, fourier_batch)``.
        base_model: object whose ``tokenize(list_of_texts)`` returns a dict of tensors.
        texts: sequence of strings.
        fouriers: sequence / array with one Fourier feature row per text.
        device: device the model and the batch tensors are moved to.
        batch_size: number of rows encoded per forward pass.

    Returns:
        CPU tensor with the per-batch outputs concatenated along dim 0, in
        input order.

    Raises:
        ValueError: if ``texts`` and ``fouriers`` differ in length (previously
            surplus Fourier rows were silently ignored).

    Side effects:
        Moves ``hybrid_model`` to ``device`` and leaves it in eval mode.
        An empty ``texts`` produces no batches, so the final ``torch.cat``
        is called on an empty list.
    """
    if len(texts) != len(fouriers):
        raise ValueError(
            f"texts and fouriers must have the same length, "
            f"got {len(texts)} and {len(fouriers)}"
        )

    hybrid_model.to(device)
    hybrid_model.eval()
    all_embs = []

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            stop = start + batch_size
            batch_texts = texts[start:stop]
            batch_four = torch.tensor(
                np.asarray(fouriers[start:stop], dtype="float32"),
                dtype=torch.float32,
                device=device,
            )

            features = base_model.tokenize(batch_texts)
            features = {k: v.to(device) for k, v in features.items()}

            # [B, D]; described as already normalized by the original author.
            embs = hybrid_model(features, batch_four)
            # Collect on the CPU so device memory is not held across batches.
            all_embs.append(embs.cpu())

    return torch.cat(all_embs, dim=0)
def triplet_eval_hybrid(hybrid_model, base_model, df, device="cuda", batch_size=256):
    """Triplet accuracy of the hybrid embeddings under three distances.

    Encodes the anchor, positive and negative side of every row of ``df`` with
    ``encode_hybrid`` and reports the fraction of rows whose anchor is strictly
    closer to the positive than to the negative.

    Args:
        hybrid_model, base_model: forwarded to ``encode_hybrid``.
        df: frame with the ``anchor_*``, ``pos_candidate_*`` and
            ``neg_candidate_*`` text and Fourier columns read below.
        device: device used for encoding.
        batch_size: encoding batch size.

    Returns:
        dict with ``cosine_accuracy``, ``euclidean_accuracy``,
        ``manhattan_accuracy`` and their maximum as ``max_accuracy``.
    """
    text_columns = {
        "anchor": "anchor_full_text",
        "positive": "pos_candidate_full_text",
        "negative": "neg_candidate_full_text",
    }
    fourier_columns = {
        "anchor": "anchor_fourier_feature",
        "positive": "pos_candidate_fourier_features",
        "negative": "neg_candidate_fourier_features",
    }

    # Texts as plain lists; per-row Fourier lists stacked into one 2-D array.
    texts = {role: df[column].tolist() for role, column in text_columns.items()}
    fouriers = {
        role: np.stack(df[column].to_numpy())
        for role, column in fourier_columns.items()
    }

    with torch.no_grad():
        encoded = {
            role: encode_hybrid(
                hybrid_model,
                base_model,
                texts[role],
                fouriers[role],
                device=device,
                batch_size=batch_size,
            )
            for role in text_columns
        }

        # Re-normalise so all three distances are computed on unit vectors.
        anchor, positive, negative = (
            F.normalize(encoded[role], p=2, dim=-1)
            for role in ("anchor", "positive", "negative")
        )

        def cosine_distance(left, right):
            return 1 - F.cosine_similarity(left, right)

        def euclidean_distance(left, right):
            return torch.norm(left - right, p=2, dim=-1)

        def manhattan_distance(left, right):
            return torch.norm(left - right, p=1, dim=-1)

        n_triplets = anchor.size(0)

        def accuracy(distance):
            # Share of triplets where the positive is strictly closer.
            closer = distance(anchor, positive) < distance(anchor, negative)
            return closer.sum().item() / n_triplets

        acc_cos = accuracy(cosine_distance)
        acc_euc = accuracy(euclidean_distance)
        acc_manh = accuracy(manhattan_distance)
        acc_max = max(acc_cos, acc_euc, acc_manh)

    return {
        "cosine_accuracy": acc_cos,
        "euclidean_accuracy": acc_euc,
        "manhattan_accuracy": acc_manh,
        "max_accuracy": acc_max,
    }

In [None]:
# Baseline: triplet accuracy on the sub-sampled validation frame before the
# training loop runs. `device` is passed explicitly so the evaluation follows
# the device selected earlier instead of the function's hard-coded "cuda"
# default (which fails on a CPU-only runtime).
metrics_before = triplet_eval_hybrid(hybrid_model, base_model, val, device=device, batch_size=1024)
# Original author's note: this is not run on the full val set but on test,
# which is why the ACC changes.
# NOTE(review): the call above evaluates `val`, not a test split — confirm
# which data the note refers to.
metrics_before

In [17]:
import gc
# Run a garbage-collection pass, then ask PyTorch to release the cached CUDA
# memory it is no longer using, to free room before the training cell.
gc.collect()
torch.cuda.empty_cache()

In [None]:
from torch.optim import AdamW
import gc

# Optimise only the parameters that still require gradients; the frozen text
# encoder is excluded by the `requires_grad` check.
trainable_params = [
    param for param in hybrid_model.parameters() if param.requires_grad
]
optimizer = AdamW(trainable_params, lr=2e-5, weight_decay=0.01)

def move_features_to_device(features, device):
    """Return a new dict with every value of ``features`` moved to ``device``.

    Args:
        features: mapping of feature name to an object with a ``.to(device)`` method.
        device: target device, forwarded unchanged to ``.to``.

    Returns:
        A new dict with the same keys in the same order; the input mapping
        itself is not modified.
    """
    moved = {}
    for name, tensor in features.items():
        moved[name] = tensor.to(device)
    return moved

def multiple_negatives_ranking_loss(anchor_emb, pos_emb, temperature: float = 1.0):
    """In-batch ranking loss: each anchor must pick its own positive.

    Args:
        anchor_emb, pos_emb: ``[B, D]`` tensors (expected L2-normalized by the
            original docstring); row ``i`` of each forms a matching pair.
        temperature: divisor applied to the similarity scores.

    Returns:
        Scalar cross-entropy over the ``[B, B]`` anchor-positive similarity
        matrix, with the diagonal entries as the correct classes.
    """
    n_pairs = anchor_emb.size(0)
    # Row i holds the scores of anchor i against every positive in the batch.
    logits = (anchor_emb @ pos_emb.T) / temperature
    targets = torch.arange(n_pairs, device=anchor_emb.device)
    return F.cross_entropy(logits, targets)
# Triplet margin loss on the p=2 (Euclidean) distance with a margin of 0.2.
triplet_criterion = nn.TripletMarginLoss(margin=0.2, p=2)

# Weight of the triplet term; it only appears in the commented-out part of the
# loss in the training loop.
lambda_triplet = 1.0

# NOTE(review): `num_epochs` is assigned again further down in this cell before
# the loop starts, so this value is never used.
num_epochs = 1

from tqdm.auto import tqdm

num_epochs = 2

# Train the non-frozen parameters of the hybrid model with the in-batch ranking
# loss. The triplet loss is still computed every step, but only for logging.
for epoch in range(1, num_epochs + 1):
    hybrid_model.train()
    # NOTE(review): .train() also puts the frozen text encoder into training
    # mode; if it contains dropout, the heads are trained on noisy encoder
    # outputs. Consider `hybrid_model.base_model.eval()` here — confirm intent.
    total_loss = 0.0
    total_mnr = 0.0
    total_triplet = 0.0
    total_samples = 0

    for (
        anchor_features,
        pos_features,
        neg_features,
        a_four,
        p_four,
        n_four,
    ) in tqdm(train_loader, desc=f"Epoch {epoch}", leave=True):

        anchor_features = move_features_to_device(anchor_features, device)
        pos_features    = move_features_to_device(pos_features, device)
        neg_features    = move_features_to_device(neg_features, device)

        a_four = a_four.to(device)
        p_four = p_four.to(device)
        n_four = n_four.to(device)

        anchor_emb = hybrid_model(anchor_features, a_four)  # [B, D]
        pos_emb    = hybrid_model(pos_features, p_four)     # [B, D]
        neg_emb    = hybrid_model(neg_features, n_four)     # [B, D]

        # NOTE(review): with temperature=1.0 the scores of normalized embeddings
        # stay within [-1, 1], which keeps the softmax nearly flat — consider a
        # smaller temperature.
        loss_mnr     = multiple_negatives_ranking_loss(anchor_emb, pos_emb, temperature=1.0)
        loss_triplet = triplet_criterion(anchor_emb, pos_emb, neg_emb)
        # Triplet term disabled (original note: quick test, not functional), so
        # the optimised loss — and the printed L_total — equals the MNR loss.
        loss         = loss_mnr  # + lambda_triplet * loss_triplet

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the running sums by batch size so the epoch values are
        # per-sample averages.
        batch_size = anchor_emb.size(0)
        total_loss    += loss.item()         * batch_size
        total_mnr     += loss_mnr.item()     * batch_size
        total_triplet += loss_triplet.item() * batch_size
        total_samples += batch_size

    # With drop_last=True a dataset smaller than one batch yields no batches at
    # all; fail with a clear message instead of a bare ZeroDivisionError.
    if total_samples == 0:
        raise RuntimeError(
            "train_loader yielded no batches; cannot compute epoch averages "
            "(check dataset size against batch_size / drop_last)"
        )

    avg_loss    = total_loss    / total_samples
    avg_mnr     = total_mnr     / total_samples
    avg_triplet = total_triplet / total_samples

    print(
        f"Epoch {epoch} | "
        f"L_total={avg_loss:.4f} | "
        f"L_MNR={avg_mnr:.4f} | "
        f"L_triplet={avg_triplet:.4f}"
    )

    # Release Python garbage and cached CUDA memory between epochs.
    gc.collect()
    torch.cuda.empty_cache()




Epoch 1:   0%|          | 0/106 [00:00<?, ?it/s]

Epoch 1 | L_total=6.1085 | L_MNR=5.9596 | L_triplet=0.1489


Epoch 2:   0%|          | 0/106 [00:00<?, ?it/s]

Epoch 2 | L_total=5.9168 | L_MNR=5.7689 | L_triplet=0.1479


In [19]:

# Re-evaluate on the same validation frame after training. `device` is passed
# explicitly so the evaluation follows the device selected earlier instead of
# the function's hard-coded "cuda" default.
metrics_after = triplet_eval_hybrid(hybrid_model, base_model, val, device=device, batch_size=1024)


In [None]:
# Display the validation triplet accuracies measured after training.
metrics_after

In [None]:
# Display the pre-training accuracies again for comparison with `metrics_after`.
metrics_before

In [None]:
import gc
# Final cleanup: run a garbage-collection pass and release PyTorch's unused
# cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()