# Lab BERT


In [None]:
# Install the libraries used throughout this notebook.
# Per the original author's note, Google Colab already ships with them.
!pip install -U transformers datasets evaluate accelerate bertviz torch torchvision torchaudio -q


In [None]:
# `evaluate` is not preinstalled on Google Colab (per the original author's note).
# NOTE(review): `evaluate` is also listed in the earlier `pip install -U ...` line,
# so this cell looks redundant when that cell has been run.
!pip install evaluate

## Imports y utilidades

In [None]:

import math
import torch
import numpy as np
from transformers import (
    BertTokenizerFast, BertModel, BertForSequenceClassification,
    pipeline, AutoTokenizer, AutoModel, AutoModelForQuestionAnswering
)
from datasets import load_dataset
from tqdm.auto import tqdm

def cosine_sim(a, b, eps=1e-8):
    """Cosine similarity between `a` and `b` along their last axis.

    Each input is divided by its L2 norm over the last axis (plus `eps`, so a
    zero vector does not cause a division by zero), then the element-wise
    product is summed over the last axis. The inputs are combined with NumPy
    broadcasting, so e.g. shapes (n, 1, d) and (1, n, d) give an (n, n) matrix.

    Args:
        a: Array-like whose last axis holds the vector components.
        b: Array-like broadcast-compatible with `a`.
        eps: Small constant added to each norm before dividing.

    Returns:
        Array of similarities with the last axis reduced away.
    """
    norm_a = np.linalg.norm(a, axis=-1, keepdims=True)
    norm_b = np.linalg.norm(b, axis=-1, keepdims=True)
    unit_a = a / (norm_a + eps)
    unit_b = b / (norm_b + eps)
    return (unit_a * unit_b).sum(axis=-1)
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device  # bare expression so the notebook displays the selected device


## Tokenización WordPiece y embeddings contextuales

In [None]:
# Two sentences that both contain the word "banco", encoded together as one input.
text = "El banco está lleno de gente. El banco está junto al río."

# Load the multilingual cased BERT tokenizer and encoder; the encoder is moved
# to the selected device and switched to evaluation mode.
tok = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
model = BertModel.from_pretrained("bert-base-multilingual-cased")
model = model.to(device).eval()

# Encode on the CPU, then move every tensor to the model's device for the
# forward pass; gradients are not needed for inference.
enc = tok(text, return_tensors="pt")
with torch.no_grad():
    out = model(**{name: tensor.to(device) for name, tensor in enc.items()})

# Keep the only item of the batch: one hidden vector per token.
last_hidden = out.last_hidden_state[0]  # [seq_len, hidden]
tokens = tok.convert_ids_to_tokens(enc["input_ids"][0])

print("Tokens:", tokens)
print("Shape embeddings:", last_hidden.shape)


**Reto:** compara los embeddings del primer token `banco` en dos oraciones distintas (ambigüedad semántica).  

**Tip:** separa el texto en dos entradas (una por oración) y compara los dos vectores de `banco` usando la similitud coseno.


In [None]:

# Compare the contextual embedding of the same word in two different sentences.
# Relies on `tok`, `model`, `device` and `cosine_sim` defined in earlier cells.
# NOTE: later cells rebind `model`, so this cell must be run before them.
sent1 = "El banco está lleno de gente."
# Alternative second sentences to experiment with:
#sent2 = "El banco está junto al río."
#sent2 = "El banco de carpinteria esta sucio."
sent2 = "El banco de jugadores reserva del DIM es muy bueno."

target_word = "banco"


def _word_token_span(encoding, sentence, word):
    """Return the [start, end) token span of the first occurrence of `word`.

    The lookup goes through the fast tokenizer's character-to-token alignment
    instead of searching the token list, so it still works when WordPiece
    splits the word into several sub-tokens.

    Args:
        encoding: Single-sentence output of a fast tokenizer for `sentence`.
        sentence: The exact string that was tokenized.
        word: Substring to locate (first occurrence, plain substring match).

    Returns:
        Tuple (start, end) of token indices, `end` exclusive.

    Raises:
        ValueError: If `word` is absent from `sentence` or cannot be aligned
            to tokens. The original code silently fell back to index 0
            (the first token) in that case, giving a meaningless similarity.
    """
    char_start = sentence.find(word)
    if char_start < 0:
        raise ValueError(f"{word!r} does not occur in {sentence!r}")
    first = encoding.char_to_token(char_start)
    last = encoding.char_to_token(char_start + len(word) - 1)
    if first is None or last is None:
        raise ValueError(f"could not align {word!r} to tokens in {sentence!r}")
    return first, last + 1


enc1 = tok(sent1, return_tensors="pt")
enc2 = tok(sent2, return_tensors="pt")

# Forward passes without gradient tracking; results are brought back as NumPy arrays.
with torch.no_grad():
    h1 = model(**{k: v.to(device) for k, v in enc1.items()}).last_hidden_state[0].cpu().numpy()
    h2 = model(**{k: v.to(device) for k, v in enc2.items()}).last_hidden_state[0].cpu().numpy()

toks1 = tok.convert_ids_to_tokens(enc1["input_ids"][0])
toks2 = tok.convert_ids_to_tokens(enc2["input_ids"][0])

# Locate the target word in each sentence (it may span several sub-tokens).
idx1, end1 = _word_token_span(enc1, sent1, target_word)
idx2, end2 = _word_token_span(enc2, sent2, target_word)

# Mean-pool over the word's sub-tokens; with a single token this is just that
# token's vector, which matches the original single-token behaviour.
vec1 = h1[idx1:end1].mean(axis=0, keepdims=True)
vec2 = h2[idx2:end2].mean(axis=0, keepdims=True)

sim = cosine_sim(vec1, vec2)[0]
print("Token sent1:", " ".join(toks1[idx1:end1]), "| Token sent2:", " ".join(toks2[idx2:end2]))
print("Similitud coseno entre 'banco' (contextos distintos):", float(sim))


## Clasificación de textos (análisis de sentimientos) con `pipeline`

In [None]:
# One positive and one negative review to classify.
ejemplos = [
    "Este restaurante fue fantástico, volveré pronto.",
    "El servicio fue terrible y la comida fría.",
]

# Sentiment-analysis pipeline backed by a multilingual BERT checkpoint.
clf = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

# Last expression of the cell: the notebook displays the predictions.
clf(ejemplos)


## NER (Reconocimiento de Entidades)

In [None]:
# English sentence to tag with named entities.
texto_en = "Barack Obama was born in Hawaii and served as President of the United States."

# NER pipeline using the "simple" aggregation strategy for its output.
ner_en = pipeline(
    task="ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
)

# Last expression of the cell: the notebook displays the detected entities.
ner_en(texto_en)

In [None]:
# NER in Spanish.
# Another model available on the Hugging Face Hub: "PlanTL-GOB-ES/roberta-base-bne-capitel-ner"
texto_es = "Shakira nació en Barranquilla y es una cantante colombiana."

ner_es = pipeline(
    task="token-classification",
    model="mrm8488/bert-spanish-cased-finetuned-ner",
    aggregation_strategy="simple",
)

# Last expression of the cell: the notebook displays the detected entities.
ner_es(texto_es)


## Pregunta–Respuesta (SQuAD-like)

In [None]:
# Passage the model has to extract the answer from.
contexto = """
BERT es un modelo de lenguaje basado en Transformers desarrollado por Google AI en 2018.
Fue entrenado en Wikipedia y BookCorpus, y ha alcanzado resultados de estado del arte en múltiples benchmarks.
"""

# Extractive question-answering pipeline.
qa = pipeline(task="question-answering", model="deepset/bert-base-cased-squad2")

# Ask a question about the passage; the notebook displays the returned answer.
qa(question="¿Quién desarrolló BERT?", context=contexto)


## Similitud de oraciones (embeddings BERT + coseno)

In [None]:

# Sentences whose pairwise similarity will be computed.
sentences = [
    "The cat sits on the mat.",
    "A feline is resting on a rug.",
    "We are training a neural network.",
]

# English uncased BERT tokenizer and encoder; the encoder is placed on the
# selected device and put in evaluation mode.
tok_en = BertTokenizerFast.from_pretrained("bert-base-uncased")
mdl_en = BertModel.from_pretrained("bert-base-uncased")
mdl_en = mdl_en.to(device).eval()

def cls_embed(sentence):
    """Embed `sentence` as the final-layer hidden state at token position 0.

    The sentence is tokenized with `tok_en`, run through `mdl_en` on `device`
    without gradient tracking, and the vector at position 0 of the only batch
    item is returned (presumably the [CLS] token for a BERT tokenizer).

    Args:
        sentence: Text to embed.

    Returns:
        NumPy array holding the position-0 hidden vector, on the CPU.
    """
    batch = tok_en(sentence, return_tensors="pt")
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        hidden = mdl_en(**batch).last_hidden_state
    return hidden[0, 0, :].cpu().numpy()

# Stack one embedding per sentence, then broadcast rows against columns to get
# the full pairwise cosine-similarity matrix.
embs = np.stack(list(map(cls_embed, sentences)))
S = cosine_sim(embs[:, None, :], embs[None, :, :])
print("Matriz de similitud (coseno):\n", S.round(3))


## Fine-tuning en clasificación (SST-2, subconjunto)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import evaluate
import random

# Pretrained checkpoint shared by the tokenizer and the classifier.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize(ex):
    """Tokenize the ``sentence`` field of `ex` with truncation enabled."""
    return tokenizer(ex["sentence"], truncation=True)


# Load SST-2 and keep a 200-example shuffled subset of each split so the demo
# runs quickly, then tokenize both subsets in batches.
ds = load_dataset("glue", "sst2")
small_train = ds["train"].shuffle(seed=42).select(range(200)).map(tokenize, batched=True)
small_val = ds["validation"].shuffle(seed=42).select(range(200)).map(tokenize, batched=True)

# Two-label sequence classifier initialised from the checkpoint.
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
model = model.to(device)

# Collator used to pad each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# NOTE(review): `metric` is loaded here but `compute_metrics` below computes
# accuracy directly and never uses it.
metric = evaluate.load("glue", "sst2")

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-item sequence; the first item holds the per-class
            scores and the second the reference labels.

    Returns:
        Dict with a single key, ``"accuracy"``: the fraction of rows whose
        highest-scoring class (argmax over the last axis) equals the label.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    hits = predicted == gold
    return {"accuracy": hits.mean()}

import inspect

# The `evaluation_strategy` argument was renamed to `eval_strategy` in newer
# transformers releases and the old name was later removed. Because this
# notebook installs the latest transformers, pick whichever keyword the
# installed version accepts instead of hard-coding one of them.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Short single-epoch run: evaluate once per epoch, save no checkpoints, and
# enable mixed precision only when a CUDA device is available.
args = TrainingArguments(
    output_dir="bert-sst2-demo",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=10,
    fp16=torch.cuda.is_available(),
    **{_eval_strategy_key: "epoch"},
)

import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Select the keyword the installed
# version accepts so the cell runs on both old and new releases.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=small_train,
    eval_dataset=small_val,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

# Fine-tune on the small training subset, then evaluate on the validation subset.
trainer.train()
eval_res = trainer.evaluate()
eval_res  # bare expression so the notebook displays the evaluation metrics


In [None]:
# BERT example:

from transformers import BertTokenizer, BertModel
import torch

# Load the tokenizer and the model.
# NOTE: this rebinds the notebook-level names `tokenizer`, `model` and `text`
# that earlier cells also define.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Example text, encoded as PyTorch tensors.
text = "BERT is amazing for NLP"
inputs = tokenizer(text, return_tensors="pt")

# Forward pass without gradient tracking to obtain the embeddings.
with torch.no_grad():
    outputs = model(**inputs)

# Final-layer hidden states; position 0 along the sequence axis is the [CLS] token.
last_hidden_state = outputs.last_hidden_state
cls_embedding = last_hidden_state[:, 0, :]
print(cls_embedding.shape)  # (1, 768)