In [None]:


import os
import ast
import gc
from typing import Dict, List

import numpy as np
import pandas as pd
from dotenv import load_dotenv

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Gemini client
from google import genai
from google.genai import types as genai_types

# Load environment variables (GEMINI_KEY is read below) from the production env file.
load_dotenv("../.env.prod")

# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount("/content/drive")


# Prefer the GPU when one is visible to PyTorch.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Fix: hand the client the key read from the environment. The previous code passed
# the literal string "API_KEY", so the value of GEMINI_KEY was never used.
API_KEY = os.getenv("GEMINI_KEY")
client = genai.Client(api_key=API_KEY)

# Embedding model and the vector size requested from it (see encode_gemini).
GEMINI_EMBEDDING_MODEL = "gemini-embedding-001"
GEMINI_OUTPUT_DIM = 768



# NOTE(review): BATCH_GEMINI is not referenced in the visible cells; encode_gemini
# defaults to GEMINI_BATCH_SIZE instead - confirm whether it can be dropped.
BATCH_GEMINI = 64
BATCH_TRAIN = 512      # DataLoader batch size for training and evaluation
LR = 2e-4              # AdamW learning rate used by train_head
N_EPOCHS = 2           # default number of training epochs
LOC_OUT_DIM = 32       # output width of the location MLP (LocHead)
PROJ_DIM = 256         # width of the final projected embedding (GeminiHead)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cpu


In [None]:


def normalize_text(x: str) -> str:
    """Coerce ``x`` to a string and clip it to ``MAX_CHARS_PER_TEXT`` characters.

    ``None`` becomes the empty string; any other non-string value goes through
    ``str()``. ``MAX_CHARS_PER_TEXT`` is looked up at call time.
    """
    if x is None:
        text = ""
    elif isinstance(x, str):
        text = x
    else:
        text = str(x)
    return text[:MAX_CHARS_PER_TEXT]

def to_1d_float_col(col: pd.Series) -> pd.Series:
    """Turn every cell of ``col`` into a plain list of Python floats.

    ``None`` cells become ``[]``; string cells are parsed with
    ``ast.literal_eval`` first; anything else is iterated as-is.
    """
    def _as_float_list(cell):
        if cell is None:
            return []
        values = ast.literal_eval(cell) if isinstance(cell, str) else cell
        return list(map(float, values))

    return col.apply(_as_float_list)


base_dir = "/content/drive/MyDrive/thesis/2_MODELING"

# Cap on rows per split: the row count is limited because of the Gemini API cost.
N_SAMPLE_ROWS = 10000
# Fixed seed so the sampled rows (and every metric derived from them) are reproducible.
SAMPLE_SEED = 42

# Columns needed downstream: ids, the three texts and the three Fourier feature vectors.
keep_cols = [
    "vacant_id",
    "pos_candidate_id",
    "neg_candidate_id",
    "pos_candidate_full_text",
    "neg_candidate_full_text",
    "neg_vacant_full_text",
    "neg_vacant_fourier_feature",
    "pos_candidate_fourier_features",
    "neg_candidate_fourier_features",
]

train = pd.read_parquet("../files/processed/paired_datasets/train.parquet")[keep_cols]
val   = pd.read_parquet("../files/processed/paired_datasets/val.parquet")[keep_cols]

# One seeded draw per split. The previous code sampled 10000 rows twice (the second
# draw only reshuffled), without a seed, and raised ValueError for splits smaller
# than 10000 rows; min() keeps small splits usable.
train = train.sample(n=min(N_SAMPLE_ROWS, len(train)), random_state=SAMPLE_SEED)
val   = val.sample(n=min(N_SAMPLE_ROWS, len(val)), random_state=SAMPLE_SEED)

# The vacancy columns act as the anchor side of each triplet; give both splits
# the same anchor_* names.
anchor_renames = {
    "neg_vacant_full_text": "anchor_full_text",
    "neg_vacant_fourier_feature": "anchor_fourier_feature",
}
train = train.rename(columns=anchor_renames)
val = val.rename(columns=anchor_renames)

# Fix: TRAIN_FRAC / VAL_FRAC were read here without being assigned in any visible
# cell, so a fresh "Restart & Run All" stopped with NameError. The recorded output
# (2000 rows kept out of 10000) corresponds to a fraction of 0.2.
# NOTE(review): if these are meant to be configured elsewhere, move them to the config cell.
TRAIN_FRAC = 0.2
VAL_FRAC = 0.2

if TRAIN_FRAC < 1.0:
    train = train.sample(frac=TRAIN_FRAC, random_state=42)
if VAL_FRAC < 1.0:
    val = val.sample(frac=VAL_FRAC, random_state=42)

# Coerce every Fourier feature column of both splits to lists of floats
# (train first, then val; anchor, positive, negative within each split).
fourier_cols = (
    "anchor_fourier_feature",
    "pos_candidate_fourier_features",
    "neg_candidate_fourier_features",
)
for split_df in (train, val):
    for fourier_col in fourier_cols:
        split_df[fourier_col] = to_1d_float_col(split_df[fourier_col])

# Read the Fourier feature width off the first training anchor.
example_fourier = np.array(train["anchor_fourier_feature"].iloc[0], dtype="float32")
fourier_dim = example_fourier.shape[-1]
print("Fourier dim:", fourier_dim)
print("Train rows:", len(train), " Val rows:", len(val))


import time
from typing import Dict, List

# Gemini request budget.
MAX_CHARS_PER_TEXT = 2_000   # normalize_text clips every text to this many characters
MAX_TOTAL_TEXTS = None       # e.g. 5000, or None for unlimited (not read by the visible code)
MAX_REQUESTS_PER_MIN = 100   # <-- set this to your Gemini tier limit; falsy disables throttling
GEMINI_BATCH_SIZE = 8        # texts per embed call; smaller batches = fewer tokens per call


def encode_gemini(texts: List[str],
                  batch_size: int = GEMINI_BATCH_SIZE) -> np.ndarray:
    """
    Embed ``texts`` with the Gemini embedding model, in batches, with client-side throttling.

    Each batch of ``batch_size`` texts is sent through ``client.models.embed_content``
    using ``GEMINI_EMBEDDING_MODEL`` and a config that requests the
    ``SEMANTIC_SIMILARITY`` task type and ``GEMINI_OUTPUT_DIM`` output dimensions.

    When ``MAX_REQUESTS_PER_MIN`` is truthy, calls are spaced roughly
    ``60 / MAX_REQUESTS_PER_MIN`` seconds apart and at most ``MAX_REQUESTS_PER_MIN``
    calls are issued per 60-second window (the function sleeps otherwise).

    Args:
        texts: strings to embed, in order.
        batch_size: number of texts sent per API call.

    Returns:
        float32 array with one row per returned embedding, in input order; for an
        empty ``texts`` a ``(0, GEMINI_OUTPUT_DIM)`` array of zeros.
        (Presumably the API returns one embedding of length GEMINI_OUTPUT_DIM per
        input text - confirm against the client documentation.)

    Exceptions raised by the client are not caught or retried here.
    """
    if not texts:
        return np.zeros((0, GEMINI_OUTPUT_DIM), dtype="float32")

    all_vecs = []

    cfg = genai_types.EmbedContentConfig(
        task_type="SEMANTIC_SIMILARITY", # we pick the semantic-similarity task type, as explained in the accompanying text
        output_dimensionality=GEMINI_OUTPUT_DIM,
    )

    # Throttling state: start of the current 60 s window and calls issued within it.
    window_start = time.time()
    calls_in_window = 0
    min_secs_per_call = 60.0 / MAX_REQUESTS_PER_MIN if MAX_REQUESTS_PER_MIN else 0.0

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        if MAX_REQUESTS_PER_MIN:
            # `now` is sampled once per iteration and reused by all three checks below.
            now = time.time()
            # The window has fully elapsed: start a new one.
            if now - window_start >= 60.0:
                window_start = now
                calls_in_window = 0

            # Quota for this window is used up: wait for the window to end, then reset.
            if calls_in_window >= MAX_REQUESTS_PER_MIN:
                sleep_for = 60.0 - (now - window_start)
                if sleep_for > 0:
                    print(f"[Gemini] Rate limit reached, sleeping {sleep_for:.1f}s...")
                    time.sleep(sleep_for)
                window_start = time.time()
                calls_in_window = 0

            # Pacing: the k-th call of a window should not start before
            # k * min_secs_per_call seconds after the window began.
            if calls_in_window > 0 and min_secs_per_call > 0:
                elapsed = now - window_start
                expected_time = calls_in_window * min_secs_per_call
                if elapsed < expected_time:
                    sleep_for = expected_time - elapsed
                    time.sleep(sleep_for)

        result = client.models.embed_content(
            model=GEMINI_EMBEDDING_MODEL,
            contents=batch_texts,
            config=cfg,
        )
        calls_in_window += 1

        # Stack this batch's embedding vectors into one 2-D float32 array.
        batch_vecs = [np.asarray(e.values, dtype="float32") for e in result.embeddings]
        batch_vecs = np.stack(batch_vecs, axis=0)
        all_vecs.append(batch_vecs)

    arr = np.concatenate(all_vecs, axis=0)
    return arr


def build_text2emb_mapping(dfs: List[pd.DataFrame]) -> Dict[str, np.ndarray]:
    """Embed every anchor/positive/negative text of ``dfs`` once and index it by original text.

    Texts are stringified (``None`` -> ``""``), truncated with ``normalize_text``
    and de-duplicated after truncation, so originals that share a truncated form
    share one embedding. The returned dict is keyed by the stringified original text.
    """
    text_columns = (
        "anchor_full_text",
        "pos_candidate_full_text",
        "neg_candidate_full_text",
    )

    # Gather all texts first (per frame: anchors, then positives, then negatives).
    collected: List[str] = []
    for frame in dfs:
        for column in text_columns:
            collected += frame[column].tolist()

    # original text -> truncated text
    orig_to_trunc: Dict[str, str] = {}
    for raw in collected:
        key = "" if raw is None else str(raw)
        orig_to_trunc[key] = normalize_text(key)

    # Unique truncated texts, first-seen order.
    truncated_texts: List[str] = []
    already_listed = set()
    for clipped in orig_to_trunc.values():
        if clipped not in already_listed:
            already_listed.add(clipped)
            truncated_texts.append(clipped)
    print("Unique truncated texts to embed with Gemini:", len(truncated_texts))

    vectors = encode_gemini(truncated_texts)
    trunc2emb = dict(zip(truncated_texts, vectors))

    text2emb: Dict[str, np.ndarray] = {
        original: trunc2emb[clipped] for original, clipped in orig_to_trunc.items()
    }

    print("Total original texts mapped:", len(text2emb))
    return text2emb



# Embed every distinct text of both splits up front; build_text2emb_mapping calls
# encode_gemini, so this step goes through the Gemini API.
print("\nPrecomputing Gemini embeddings for train+val texts...")
text2emb = build_text2emb_mapping([train, val])
print("Done.\n")




10000
Fourier dim: 8
Train rows: 2000  Val rows: 2000

Precomputing Gemini embeddings for train+val texts...
Unique truncated texts to embed with Gemini: 10021
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
Total original texts mapped: 10029
Done.



In [None]:

class GeminiFourierTripletDataset(Dataset): # M5 previo
    def __init__(self, df: pd.DataFrame, text2emb: Dict[str, np.ndarray]):
        self.df = df.reset_index(drop=True)
        self.text2emb = text2emb

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx: int):
        row = self.df.iloc[idx]

        a_text = self.text2emb[row["anchor_full_text"]]
        p_text = self.text2emb[row["pos_candidate_full_text"]]
        n_text = self.text2emb[row["neg_candidate_full_text"]]

        a_four = np.asarray(row["anchor_fourier_feature"], dtype="float32")
        p_four = np.asarray(row["pos_candidate_fourier_features"], dtype="float32")
        n_four = np.asarray(row["neg_candidate_fourier_features"], dtype="float32")

        return (
            torch.from_numpy(a_text),
            torch.from_numpy(p_text),
            torch.from_numpy(n_text),
            torch.from_numpy(a_four),
            torch.from_numpy(p_four),
            torch.from_numpy(n_four),
        )


# Both splits share the same text -> embedding cache.
train_dataset = GeminiFourierTripletDataset(train, text2emb)
val_dataset = GeminiFourierTripletDataset(val, text2emb)

# Only the training loader shuffles; neither loader drops the last partial batch.
train_loader = DataLoader(train_dataset, batch_size=BATCH_TRAIN, shuffle=True, drop_last=False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_TRAIN, shuffle=False, drop_last=False)




In [None]:

class LocHead(nn.Module):
    """
    Two-layer MLP mapping location Fourier features to ``loc_out_dim`` values.

    Layout: Linear(fourier_dim, 128) -> ReLU -> Linear(128, loc_out_dim) -> ReLU.
    Because of the trailing ReLU the output is non-negative.
    If a LocHead implementation already exists in loc_mpnet.Model, this class can
    be swapped for it to keep both models identical.
    """
    def __init__(self, fourier_dim: int, loc_out_dim: int = LOC_OUT_DIM):
        super().__init__()
        hidden_dim = 128
        layers = [
            nn.Linear(fourier_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, loc_out_dim),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the Fourier features through the MLP."""
        encoded = self.net(x)
        return encoded


class GeminiHead(nn.Module):
    """Projection head on top of precomputed Gemini text embeddings.

    The text embedding is L2-normalized, optionally concatenated with the output
    of a ``LocHead`` applied to the Fourier features, passed through
    Linear -> ReLU -> Linear, and L2-normalized again.
    """

    def __init__(
        self,
        text_dim: int,
        fourier_dim: int,
        proj_dim: int = PROJ_DIM,
        loc_out_dim: int = LOC_OUT_DIM,
        use_location: bool = True,
    ):
        super().__init__()
        self.use_location = use_location

        # The location branch only exists when requested; it widens the projection input.
        self.loc_head = LocHead(fourier_dim, loc_out_dim) if use_location else None
        in_dim = text_dim + (loc_out_dim if use_location else 0)

        projection_layers = [
            nn.Linear(in_dim, proj_dim),
            nn.ReLU(),
            nn.Linear(proj_dim, proj_dim),
        ]
        self.proj = nn.Sequential(*projection_layers)

    def forward(self, text_emb: torch.Tensor, fourier: torch.Tensor | None) -> torch.Tensor:
        """Return the unit-norm projected embedding; ``fourier`` is ignored without location."""
        # Unit-normalize the Gemini embedding before anything else touches it.
        features = F.normalize(text_emb, p=2, dim=-1)

        if self.use_location:
            features = torch.cat([features, self.loc_head(fourier)], dim=-1)

        return F.normalize(self.proj(features), p=2, dim=-1)


# Two heads, trained separately; they differ only in whether the location branch exists.
shared_head_kwargs = dict(
    text_dim=GEMINI_OUTPUT_DIM,
    fourier_dim=fourier_dim,
    proj_dim=PROJ_DIM,
    loc_out_dim=LOC_OUT_DIM,
)

head_text_only = GeminiHead(use_location=False, **shared_head_kwargs).to(device)
head_text_loc = GeminiHead(use_location=True, **shared_head_kwargs).to(device)



In [None]:


def multiple_negatives_ranking_loss(
    anchor_emb: torch.Tensor,
    pos_emb: torch.Tensor,
    temperature: float = 1.0,
) -> torch.Tensor: # re-implemented here because the S-BERT loss cannot be used directly
    """In-batch softmax loss: row i of ``anchor_emb`` should score highest against row i of ``pos_emb``.

    Scores are the pairwise dot products divided by ``temperature``; every other
    row of ``pos_emb`` in the batch acts as a negative for a given anchor.
    """
    logits = (anchor_emb @ pos_emb.T) / temperature
    # The matching positive sits on the diagonal.
    targets = torch.arange(anchor_emb.size(0), device=anchor_emb.device)
    return F.cross_entropy(logits, targets)


# Triplet margin loss (margin 0.2, L2 distance); train_head evaluates it every step
# and reports its running average.
triplet_criterion = nn.TripletMarginLoss(margin=0.2, p=2)
# Weight meant for the triplet term; no active code in the visible cells reads it.
lambda_triplet = 1.0



In [None]:

def train_head(
    model: nn.Module,
    dataloader: DataLoader,
    n_epochs: int = N_EPOCHS,
    lr: float = LR,
    use_location: bool = True,
    name: str = "head",
    temperature: float = 1.0,
):
    """Train a projection head with the multiple-negatives ranking (MNR) loss.

    Only the MNR loss is optimised; the triplet loss is computed purely for
    monitoring and printed next to it after every epoch.

    Args:
        model: head called as ``model(text_emb, fourier)``.
        dataloader: yields 6-tuples ``(a_text, p_text, n_text, a_four, p_four, n_four)``.
        n_epochs: number of passes over ``dataloader``.
        lr: AdamW learning rate (weight decay is fixed at 0.01).
        use_location: when False the Fourier tensors are replaced by ``None``.
        name: tag used in the progress lines.
        temperature: divisor applied to the MNR logits. The default keeps the
            previous behaviour.
            NOTE(review): GeminiHead L2-normalizes its output, so at 1.0 the logits
            stay within [-1, 1] and the loss can barely move away from
            ln(batch_size); sentence-transformers' MultipleNegativesRankingLoss
            uses scale=20 (temperature 0.05) - consider passing that here.

    Raises:
        ValueError: if an epoch sees no samples (previously a ZeroDivisionError).
    """
    model.to(device)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=0.01,
    )

    for epoch in range(1, n_epochs + 1):
        model.train()
        total_loss = 0.0
        total_mnr = 0.0
        total_triplet = 0.0
        total_samples = 0

        for a_text, p_text, n_text, a_four, p_four, n_four in dataloader:
            a_text = a_text.to(device)
            p_text = p_text.to(device)
            n_text = n_text.to(device)

            if use_location:
                a_four = a_four.to(device)
                p_four = p_four.to(device)
                n_four = n_four.to(device)
            else:
                a_four = p_four = n_four = None

            anchor_emb = model(a_text, a_four)
            pos_emb    = model(p_text, p_four)

            loss_mnr = multiple_negatives_ranking_loss(anchor_emb, pos_emb, temperature=temperature)
            # The objective is MNR only: the triplet term was dropped so this model
            # uses the same loss as the other models.
            loss = loss_mnr

            # The triplet loss is only reported, never back-propagated, so the
            # negative branch is evaluated without building an autograd graph.
            # It is computed before optimizer.step() so it reflects the same
            # weights as the anchor/positive embeddings.
            with torch.no_grad():
                neg_emb = model(n_text, n_four)
                loss_triplet = triplet_criterion(anchor_emb.detach(), pos_emb.detach(), neg_emb)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight per-batch means by batch size so the epoch figures are per-sample averages.
            bs = anchor_emb.size(0)
            total_loss    += loss.item()         * bs
            total_mnr     += loss_mnr.item()     * bs
            total_triplet += loss_triplet.item() * bs
            total_samples += bs

        if total_samples == 0:
            raise ValueError(f"[{name}] dataloader yielded no samples; nothing to train on")

        avg_loss    = total_loss    / total_samples
        avg_mnr     = total_mnr     / total_samples
        avg_triplet = total_triplet / total_samples

        print(
            f"[{name}] Epoch {epoch} | "
            f"L_total={avg_loss:.4f} | "
            f"L_MNR={avg_mnr:.4f} | "
            f"L_triplet={avg_triplet:.4f}"
        )

        # Release Python garbage and cached CUDA blocks between epochs.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


# Train the text-only head first, then the head that also consumes location features.
print("\nTraining head: Gemini + Projection (NO location)...")
train_head(head_text_only, train_loader, n_epochs=N_EPOCHS, lr=LR, use_location=False, name="Gemini+Proj(no-loc)")

print("\nTraining head: Gemini + LocHead + Projection (WITH location)...")
train_head(head_text_loc, train_loader, n_epochs=N_EPOCHS, lr=LR, use_location=True, name="Gemini+Loc+Proj")



def eval_raw_gemini(df: pd.DataFrame) -> dict:

    anchors = df["anchor_full_text"].tolist()
    positives = df["pos_candidate_full_text"].tolist()
    negatives = df["neg_candidate_full_text"].tolist()

    with torch.no_grad():
        a_emb = torch.from_numpy(encode_gemini(anchors)).to(device)
        p_emb = torch.from_numpy(encode_gemini(positives)).to(device)
        n_emb = torch.from_numpy(encode_gemini(negatives)).to(device)

        a_emb = F.normalize(a_emb, p=2, dim=-1)
        p_emb = F.normalize(p_emb, p=2, dim=-1)
        n_emb = F.normalize(n_emb, p=2, dim=-1)

        pos_cos = 1 - F.cosine_similarity(a_emb, p_emb)
        neg_cos = 1 - F.cosine_similarity(a_emb, n_emb)

        pos_euc = torch.norm(a_emb - p_emb, p=2, dim=-1)
        neg_euc = torch.norm(a_emb - n_emb, p=2, dim=-1)

        pos_manh = torch.norm(a_emb - p_emb, p=1, dim=-1)
        neg_manh = torch.norm(a_emb - n_emb, p=1, dim=-1)

        total = a_emb.size(0)
        acc_cos = (pos_cos < neg_cos).sum().item() / total
        acc_euc = (pos_euc < neg_euc).sum().item() / total
        acc_manh = (pos_manh < neg_manh).sum().item() / total
        acc_max = max(acc_cos, acc_euc, acc_manh)

    return {
        "cosine_accuracy": acc_cos,
        "euclidean_accuracy": acc_euc,
        "manhattan_accuracy": acc_manh,
        "max_accuracy": acc_max,
    }


def eval_head(
    model: nn.Module,
    dataset: GeminiFourierTripletDataset,
    batch_size: int = BATCH_TRAIN,
    use_location: bool = True,
) -> dict:
    """Triplet accuracy of a trained head over ``dataset``.

    The model is switched to eval mode and moved to ``device``. A triplet is
    correct when the anchor embedding is strictly closer to the positive than to
    the negative; accuracy is reported for cosine, Euclidean and Manhattan
    distance, plus the best of the three as ``max_accuracy``.
    """
    metric_names = ("cosine", "euclidean", "manhattan")

    def pair_distances(u: torch.Tensor, v: torch.Tensor) -> dict:
        # Row-wise distances between two batches of embeddings.
        diff = u - v
        return {
            "cosine": 1 - F.cosine_similarity(u, v),
            "euclidean": torch.norm(diff, p=2, dim=-1),
            "manhattan": torch.norm(diff, p=1, dim=-1),
        }

    eval_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=False)

    model.eval()
    model.to(device)

    # Per-metric lists of per-batch distance tensors (kept on the CPU).
    pos_chunks = {metric: [] for metric in metric_names}
    neg_chunks = {metric: [] for metric in metric_names}

    with torch.no_grad():
        for batch in eval_loader:
            a_text, p_text, n_text, a_four, p_four, n_four = batch
            a_text, p_text, n_text = a_text.to(device), p_text.to(device), n_text.to(device)

            if use_location:
                a_four, p_four, n_four = a_four.to(device), p_four.to(device), n_four.to(device)
            else:
                a_four = p_four = n_four = None

            a_emb = model(a_text, a_four)
            p_emb = model(p_text, p_four)
            n_emb = model(n_text, n_four)

            to_pos = pair_distances(a_emb, p_emb)
            to_neg = pair_distances(a_emb, n_emb)
            for metric in metric_names:
                pos_chunks[metric].append(to_pos[metric].cpu())
                neg_chunks[metric].append(to_neg[metric].cpu())

    pos_all = {metric: torch.cat(pos_chunks[metric]) for metric in metric_names}
    neg_all = {metric: torch.cat(neg_chunks[metric]) for metric in metric_names}

    total = pos_all["cosine"].size(0)
    accuracy = {
        metric: (pos_all[metric] < neg_all[metric]).sum().item() / total
        for metric in metric_names
    }

    return {
        "cosine_accuracy": accuracy["cosine"],
        "euclidean_accuracy": accuracy["euclidean"],
        "manhattan_accuracy": accuracy["manhattan"],
        "max_accuracy": max(accuracy["cosine"], accuracy["euclidean"], accuracy["manhattan"]),
    }





Training head: Gemini + Projection (NO location)...
[Gemini+Proj(no-loc)] Epoch 1 | L_total=6.4108 | L_MNR=6.2140 | L_triplet=0.1967
[Gemini+Proj(no-loc)] Epoch 2 | L_total=6.4051 | L_MNR=6.2132 | L_triplet=0.1919

Training head: Gemini + LocHead + Projection (WITH location)...
[Gemini+Loc+Proj] Epoch 1 | L_total=6.4115 | L_MNR=6.2139 | L_triplet=0.1976
[Gemini+Loc+Proj] Epoch 2 | L_total=6.4065 | L_MNR=6.2129 | L_triplet=0.1936


In [None]:
# Compare three systems on the validation triplets: raw Gemini embeddings,
# the text-only head, and the head that also uses location features.
print("\n=== Evaluating on val ===")
metrics_raw = eval_raw_gemini(val)
print("Raw Gemini (no head, no loc):", metrics_raw)

metrics_text_only = eval_head(head_text_only, val_dataset, batch_size=BATCH_TRAIN, use_location=False)
print("Gemini + Projection (NO loc):", metrics_text_only)

metrics_text_loc = eval_head(head_text_loc, val_dataset, batch_size=BATCH_TRAIN, use_location=True)
print("Gemini + LocHead + Projection (WITH loc):", metrics_text_loc)

print("\nDone.")



=== Evaluating on val ===
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
[Gemini] Rate limit reached, sleeping 0.5s...
Raw Gemini (no head, no loc): {'cosine_accuracy': 0.584, 'euclidean_accuracy': 0.584, 'manhattan_accuracy': 0.5815, 'max_accuracy': 0.584}
Gemini + Projection (NO loc): {'cosine_accuracy': 0.6005, 'euclidean_accuracy': 0.6005, 'manhattan_accuracy': 0.6045, 'max_accuracy': 0.6045}
Gemini + LocHead + Projection (WITH loc): {'cosine_accuracy': 0.5885, 'euclidean_accuracy': 0.5885, 'manhattan_accuracy': 0.6015, 'max_accuracy': 0.6015}

Done.
