# Flujo de entrenamiento (con precálculo / cache de embeddings por chunk)

Este notebook implementa un flujo de entrenamiento en **2 etapas** (pasos 1 y 2), seguido de la inferencia (paso 3):

1) **Precalcular embeddings**: `texto licitación -> chunking -> ModeloB (GPT-OSS congelado) -> embeddings por chunk` y guardar en disco (`cache_dir/xxx.pt`)

2) **Entrenar ModeloC** (cross-chunk + MLP) leyendo solo embeddings cacheados (rápido)

3) **Inferencia**: `texto -> ModeloB -> embeddings -> ModeloC -> y_hat`

In [11]:
# Requisitos
!pip install -U transformers accelerate pandas

import os
from dataclasses import dataclass
from typing import List, Tuple, Optional

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

from transformers import AutoTokenizer, AutoModel
torch.cuda.is_available()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

True

## Configuración

In [2]:
@dataclass
class CFG:
    """Single place for every tunable used by the notebook.

    The fields are read by the cells below: the chunking values are passed to
    ModelB_ChunkEmbedder, the cross-chunk values to ModelC_CrossChunkRegressor,
    the training values to the DataLoader / train_modelC, and cache_dir to the
    embedding cache writer and reader.
    """

    # Hugging Face model id loaded as the frozen backbone.
    model_id: str = "openai/gpt-oss-20b"
    # Device / dtype are chosen once, when the class body is evaluated.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    dtype: torch.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

    # Chunking: window length and step between window starts, both in tokens.
    # With the defaults, consecutive windows overlap by max_len - stride = 2048 tokens.
    max_len: int = 4096
    stride: int = 2048

    # Cross-chunk encoder (lightweight)
    d_model: int = 512
    n_heads: int = 8
    ffn_dim: int = 2048
    dropout: float = 0.1

    # Training
    batch_size: int = 8
    lr: float = 2e-4
    epochs: int = 10

    # Cache: directory holding one .pt file per document
    cache_dir: str = "./cache_chunk_embs"

# Instantiate with defaults; the bare name on the last line displays the repr.
cfg = CFG()
cfg

CFG(model_id='openai/gpt-oss-20b', device='cuda', dtype=torch.bfloat16, max_len=4096, stride=2048, d_model=512, n_heads=8, ffn_dim=2048, dropout=0.1, batch_size=8, lr=0.0002, epochs=10, cache_dir='./cache_chunk_embs')

## ModeloB – GPT-OSS congelado (embeddings por chunk)

In [3]:
class ModelB_ChunkEmbedder(nn.Module):
    """Frozen backbone wrapper that turns one document into one embedding per chunk.

    The text is tokenized without truncation and split into windows of at most
    ``max_len`` tokens whose starts are ``stride`` tokens apart. Each window is
    run through the backbone on its own and represented by the hidden state of
    its last real (non-padding) token.

    Args:
        gpt_model: Backbone module. It is put in eval mode and all of its
            parameters get ``requires_grad=False``. Its output must expose
            ``last_hidden_state``.
        tokenizer: Tokenizer callable returning ``input_ids`` and
            ``attention_mask`` tensors. If it has no pad token, the EOS token
            is reused as pad token (this mutates the tokenizer).
        max_len: Window length in tokens; must be >= 1.
        stride: Distance in tokens between window starts; must be >= 1.
        device: Device the token tensors are moved to before the forward pass.

    Raises:
        ValueError: If ``max_len`` or ``stride`` is not positive.
    """

    def __init__(self, gpt_model, tokenizer, max_len=4096, stride=2048, device="cuda"):
        super().__init__()
        # Fail at construction time instead of deep inside forward()
        # (a zero stride would otherwise only surface as a range() error).
        if max_len < 1 or stride < 1:
            raise ValueError(f"max_len and stride must be >= 1 (got max_len={max_len}, stride={stride})")

        self.gpt = gpt_model.eval()
        for p in self.gpt.parameters():
            p.requires_grad_(False)

        self.tokenizer = tokenizer
        self.max_len = max_len
        self.stride = stride
        self.device = device

        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.pad_id = self.tokenizer.pad_token_id

    @torch.no_grad()
    def forward(self, text: str) -> torch.Tensor:
        """Embed ``text`` chunk by chunk.

        Args:
            text: Raw document text.

        Returns:
            Tensor of shape ``[N, d]`` with one row per chunk, on the device
            the backbone produced it on.

        Raises:
            ValueError: If the text tokenizes to zero tokens, so there is
                nothing to embed.
        """
        enc = self.tokenizer(text, return_tensors="pt", truncation=False)
        input_ids = enc["input_ids"][0].to(self.device)
        attn_mask = enc["attention_mask"][0].to(self.device)
        L = int(attn_mask.sum().item())

        # Previously an empty document reached torch.stack([]) and failed with
        # an unrelated-looking error; report the actual cause instead.
        if L == 0:
            raise ValueError("text produced no tokens; cannot compute chunk embeddings")

        on_cuda = input_ids.is_cuda
        chunk_embs = []

        for start in range(0, L, self.stride):
            end = min(start + self.max_len, L)
            ids = input_ids[start:end]
            am = attn_mask[start:end]

            # Right-pad the last (short) window up to max_len; padded positions
            # are masked out and never selected as the chunk representation.
            pad_len = self.max_len - ids.numel()
            if pad_len > 0:
                ids = torch.cat([ids, torch.full((pad_len,), self.pad_id, device=self.device, dtype=ids.dtype)])
                am = torch.cat([am, torch.zeros((pad_len,), device=self.device, dtype=am.dtype)])

            # Process ONE chunk at a time (no batching) to bound peak memory.
            out = self.gpt(input_ids=ids.unsqueeze(0), attention_mask=am.unsqueeze(0), return_dict=True)
            h = out.last_hidden_state  # [1,T,d]

            # Index of the last real token inside this window.
            last_idx = int(am.sum().item()) - 1
            chunk_embs.append(h[0, last_idx].clone())  # [d]

            # Drop the full hidden-state tensor before the next window.
            del out, h
            if on_cuda:
                torch.cuda.empty_cache()

            # The window already reached the end of the document; further
            # starts would only re-embed a suffix of it.
            if end == L:
                break

        return torch.stack(chunk_embs)  # [N,d]

## ModeloC – Cross-chunk (no causal) + MLP regressor (liviano)

In [4]:
class ModelC_CrossChunkRegressor(nn.Module):
    """Lightweight regressor over a document's chunk embeddings.

    Chunk embeddings are projected to ``d_model``, a learned CLS token is
    prepended, one non-causal Transformer encoder layer mixes the sequence,
    and an MLP maps the CLS output to a single scalar per document.

    Args:
        d_in: Size of the incoming chunk embeddings.
        d_model: Width used inside the encoder and the head.
        n_heads: Number of attention heads.
        ffn_dim: Hidden size of the encoder feed-forward block.
        dropout: Dropout probability inside the encoder layer.
    """

    def __init__(self, d_in: int, d_model: int = 512, n_heads: int = 8, ffn_dim: int = 2048, dropout: float = 0.1):
        super().__init__()

        # Bring backbone-sized embeddings down to the working width.
        self.proj = nn.Linear(d_in, d_model)

        # Single pre-norm encoder layer operating on [batch, seq, feature].
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dim_feedforward=ffn_dim,
                dropout=dropout,
                batch_first=True,
                norm_first=True,
                activation="gelu",
            ),
            num_layers=1,
        )

        # Learned summary token, initialised with small Gaussian noise.
        self.cls = nn.Parameter(torch.zeros(1, 1, d_model))
        nn.init.normal_(self.cls, std=0.02)

        # Two-layer MLP producing one value per document.
        self.head = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, 1),
        )

    def forward(self, chunk_embs: torch.Tensor, valid_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Predict one scalar per document.

        Args:
            chunk_embs: Tensor of shape ``[B, N, d_in]``.
            valid_mask: Optional ``[B, N]`` mask, True where a chunk is real
                and False where it is padding. When omitted, every chunk is
                attended to.

        Returns:
            Tensor of shape ``[B, 1]``.
        """
        batch, _, _ = chunk_embs.shape

        # Sequence fed to the encoder: [CLS, chunk_1, ..., chunk_N].
        tokens = torch.cat([self.cls.expand(batch, 1, -1), self.proj(chunk_embs)], dim=1)

        key_padding = None
        if valid_mask is not None:
            # The CLS slot is always valid; the encoder expects True = ignore,
            # so the combined validity mask is inverted.
            cls_slot = torch.ones((batch, 1), device=tokens.device, dtype=torch.bool)
            key_padding = ~torch.cat([cls_slot, valid_mask], dim=1)

        encoded = self.encoder(tokens, src_key_padding_mask=key_padding)
        return self.head(encoded[:, 0])

## Cargar datos (placeholder)

In [7]:
# Replace this with your real loader (CSV/DB/paths/etc.)
from pydoc import text  # NOTE(review): looks like an accidental auto-import; nothing visible here uses it
import pandas as pd

df = pd.read_csv("public-road-works-analysis/data/dataset.csv")

ids = list(df["Id llamado"])
texts = []
# Only the first 50 documents are read. Each file is opened in a context
# manager so its handle is closed right after reading, and the loop variable
# no longer shadows the builtin `id` in the notebook namespace.
for tender_id in ids[:50]:
    with open(f"public-road-works-analysis/data/pbcs_extracted/{tender_id}.txt") as fh:
        texts.append(fh.read())
# NOTE: targets covers every row of the CSV, not just the 50 loaded texts;
# row i of targets corresponds to ids[i].
targets = torch.tensor(list(df["Cantidad de oferentes"]))
len(texts), targets.shape

(50, torch.Size([244]))

In [15]:
# Number of distinct tender IDs. In the recorded run this is one less than the
# number of target rows, i.e. one ID appears twice; cache files are named after
# the ID, so repeated IDs map to the same .pt file.
len(set(ids))

243

## Inicializar GPT-OSS congelado + ModeloC

In [8]:
# Load the tokenizer and the backbone in the configured dtype, then move the backbone to the configured device.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id)
base = AutoModel.from_pretrained(cfg.model_id, dtype=cfg.dtype).to(cfg.device)

# ModelB wraps the backbone (it freezes it and puts it in eval mode) and produces one embedding per chunk.
modelB = ModelB_ChunkEmbedder(base, tokenizer, max_len=cfg.max_len, stride=cfg.stride, device=cfg.device)

# ModelC's input width is the backbone's hidden size.
d_in = base.config.hidden_size
modelC = ModelC_CrossChunkRegressor(d_in=d_in, d_model=cfg.d_model, n_heads=cfg.n_heads, ffn_dim=cfg.ffn_dim, dropout=cfg.dropout).to(cfg.device)

# Display the hidden size as the cell output.
d_in

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/27.9M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

MXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0, we will default to dequantizing the model to bf16


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.17G [00:00<?, ?B/s]

model-00000-of-00002.safetensors:   0%|          | 0.00/4.79G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.80G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



2880

## (1) Precálculo / cache de embeddings

In [None]:
@torch.no_grad()
def precache_embeddings(texts: List[str], targets: torch.Tensor, tender_ids: List, modelB: ModelB_ChunkEmbedder, cache_dir: str):
    """
    Compute chunk embeddings for each text and store them on disk, one file per tender.

    Each file is ``<cache_dir>/<tender_id>.pt`` and holds a dict with keys
    ``"embs"`` (CPU tensor ``[N, d]``), ``"y"`` (float target) and ``"tender_id"``.
    Tenders whose file already exists are skipped, so the function can be re-run
    to resume. Because of that skip, a repeated ID in ``tender_ids`` is only
    cached once (first occurrence wins).

    Files are written to a temporary name and then renamed into place, so an
    interrupted run never leaves a truncated ``.pt`` that later runs would
    mistake for a finished cache entry.

    Args:
        texts: Tender texts.
        targets: Tensor with the number of bidders; entry i belongs to texts[i].
            May be longer than ``texts`` (extra entries are ignored).
        tender_ids: Tender IDs, same order and length as ``texts``.
        modelB: Model that maps a text to its chunk embeddings.
        cache_dir: Directory where the .pt files are stored (created if missing).

    Raises:
        ValueError: If ``tender_ids`` and ``texts`` differ in length, or if
            ``targets`` has fewer entries than ``texts``.
    """
    # zip() would silently drop the unmatched tail; surface the mismatch instead.
    if len(tender_ids) != len(texts):
        raise ValueError(f"tender_ids ({len(tender_ids)}) and texts ({len(texts)}) must have the same length")
    if len(targets) < len(texts):
        raise ValueError(f"targets ({len(targets)}) has fewer entries than texts ({len(texts)})")

    os.makedirs(cache_dir, exist_ok=True)

    for tender_id, txt, y in tqdm(zip(tender_ids, texts, targets), total=len(texts), desc="Cacheando embeddings"):
        path = os.path.join(cache_dir, f"{tender_id}.pt")
        if os.path.exists(path):
            continue

        embs = modelB(txt).cpu()  # [N,d]

        # The temporary name does not end in ".pt", so readers that list
        # "*.pt" files never pick up a half-written entry.
        tmp_path = f"{path}.tmp"
        try:
            torch.save({"embs": embs, "y": float(y.item()), "tender_id": tender_id}, tmp_path)
            os.replace(tmp_path, path)
        finally:
            # Only present if saving or renaming failed part-way.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    print("cache listo:", cache_dir)

# Pass the tender IDs as the third argument
precache_embeddings(texts, targets, ids[:50], modelB, cfg.cache_dir)

Cacheando embeddings:   0%|          | 0/50 [00:00<?, ?it/s]

cache listo: ./cache_chunk_embs


## (2) Dataset cacheado + collate (padding por N chunks)

In [8]:
class CachedChunkEmbDataset(Dataset):
    """Dataset over the ``.pt`` files found directly inside a cache directory.

    Every file is expected to hold a dict with an ``"embs"`` tensor and a
    numeric ``"y"`` target. Files are served in sorted path order.
    """

    def __init__(self, cache_dir: str):
        pt_paths = (
            os.path.join(cache_dir, entry)
            for entry in os.listdir(cache_dir)
            if entry.endswith(".pt")
        )
        self.files = sorted(pt_paths)

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx: int):
        """Return ``(embeddings as float32, target as float32 scalar tensor)`` for file ``idx``."""
        # NOTE(review): torch.load unpickles the file; only point this dataset
        # at cache directories you produced yourself.
        record = torch.load(self.files[idx], map_location="cpu")
        features = record["embs"].float()
        target = torch.tensor(record["y"], dtype=torch.float32)
        return features, target

def collate_pad_chunks(batch: List[Tuple[torch.Tensor, torch.Tensor]]):
    """Collate variable-length chunk sequences into zero-padded batch tensors.

    Args:
        batch: List of ``(embs, y)`` pairs, where ``embs`` is ``[N_i, d]`` and
            ``y`` is a single-value target tensor.

    Returns:
        Tuple ``(embs, valid, y)``:
            embs: float32 ``[B, Nmax, d]``, zero-padded after each sample's chunks.
            valid: bool ``[B, Nmax]``, True on real chunks and False on padding.
            y: ``[B, 1]`` stacked targets.
    """
    chunk_sets, labels = zip(*batch)
    batch_size = len(chunk_sets)
    feat_dim = chunk_sets[0].shape[1]
    counts = [chunks.shape[0] for chunks in chunk_sets]
    longest = max(counts)

    # Copy every sample into the leading rows of its zero-initialised slot.
    embs = torch.zeros((batch_size, longest, feat_dim), dtype=torch.float32)
    for row, (chunks, count) in enumerate(zip(chunk_sets, counts)):
        embs[row, :count] = chunks

    # Position j of sample i is valid exactly when j < counts[i].
    positions = torch.arange(longest).unsqueeze(0)
    valid = positions < torch.tensor(counts).unsqueeze(1)

    y = torch.stack(labels).view(batch_size, 1)
    return embs, valid, y

# Build the dataset from whatever .pt files are currently in the cache directory and
# batch it with the padding collate function. Shuffling is unseeded here, so batch
# order differs between runs.
ds = CachedChunkEmbDataset(cfg.cache_dir)
dl = DataLoader(ds, batch_size=cfg.batch_size, shuffle=True, collate_fn=collate_pad_chunks)

# Sanity check: number of cached documents and the shape of one padded batch [B, Nmax, d].
len(ds), next(iter(dl))[0].shape

  d = torch.load(self.files[idx], map_location="cpu")


(4, torch.Size([4, 62, 2880]))

## (3) Entrenamiento de ModeloC

In [9]:
def train_modelC(modelC: nn.Module, dl: DataLoader, cfg: CFG):
    """Train ``modelC`` in place on batches of cached chunk embeddings.

    Uses AdamW (weight decay 0.01, learning rate ``cfg.lr``) with a Smooth L1
    loss for ``cfg.epochs`` epochs and prints the mean per-batch loss after
    each epoch. The model is left in training mode; nothing is returned.

    Args:
        modelC: Model called as ``modelC(embs, valid)``.
        dl: Loader yielding ``(embs, valid, y)`` batches.
        cfg: Configuration providing ``lr``, ``epochs`` and ``device``.
    """
    modelC.train()
    optimizer = torch.optim.AdamW(modelC.parameters(), lr=cfg.lr, weight_decay=0.01)
    criterion = nn.SmoothL1Loss()

    for epoch in range(1, cfg.epochs + 1):
        batch_losses = []
        for batch in dl:
            # Move the whole batch to the training device.
            embs, valid, y = (tensor.to(cfg.device) for tensor in batch)

            loss = criterion(modelC(embs, valid), y)

            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

            batch_losses.append(float(loss.item()))

        # Mean over batches (not over samples).
        mean_loss = sum(batch_losses) / len(dl)
        print(f"epoch {epoch:02d} | loss={mean_loss:.4f}")

# Train ModelC on the cached embeddings; the mean per-batch loss is printed after every epoch.
train_modelC(modelC, dl, cfg)

  d = torch.load(self.files[idx], map_location="cpu")


epoch 01 | loss=1.3138
epoch 02 | loss=0.4863
epoch 03 | loss=0.3313
epoch 04 | loss=0.4157
epoch 05 | loss=0.2966
epoch 06 | loss=0.3600
epoch 07 | loss=0.3172
epoch 08 | loss=0.2919
epoch 09 | loss=0.2631
epoch 10 | loss=0.1600


## Guardar pesos de ModeloC

In [10]:
# Persist only ModelC's weights (state_dict); the frozen backbone is not saved here.
os.makedirs("./checkpoints", exist_ok=True)
ckpt_path = "./checkpoints/modelC_crosschunk.pt"
torch.save(modelC.state_dict(), ckpt_path)
# Display the checkpoint path as the cell output.
ckpt_path

'./checkpoints/modelC_crosschunk.pt'

## (4) Inferencia final (ModeloA = B + C)

In [13]:
class ModelA_Full(nn.Module):
    """End-to-end predictor: chunk embedder (B) followed by the cross-chunk regressor (C)."""

    def __init__(self, modelB: ModelB_ChunkEmbedder, modelC: nn.Module):
        super().__init__()
        self.B, self.C = modelB, modelC

    @torch.no_grad()
    def predict_one(self, text: str) -> torch.Tensor:
        """Predict the target for a single document.

        Args:
            text: Raw document text.

        Returns:
            Tensor of shape ``[1]`` holding the prediction.

        Note:
            Switches ``self.C`` to eval mode and leaves it there.
        """
        # [N,d] from B -> float32 -> add the batch dimension: [1,N,d]
        features = self.B(text).float().unsqueeze(0)
        # A single document has no padding, so every chunk is valid.
        all_valid = torch.ones((1, features.size(1)), device=features.device, dtype=torch.bool)
        self.C.eval()
        prediction = self.C(features, all_valid)  # [1,1]
        return prediction.squeeze(0)

# Compose the frozen embedder and the trained regressor into one predictor.
modelA = ModelA_Full(modelB, modelC)

# Read the test document inside a context manager so the file handle is
# closed as soon as the text has been read.
with open("341016.txt", "r") as f:
    test_text = f.read()
pred = modelA.predict_one(test_text)
pred

tensor([1.4982], device='cuda:0')