# Modelo de Lenguaje n-gram con KenLM

# SI7003 NLP - SI7016 Applied NLP
# Lecture 04 examples
# This notebook can run on Google Colab.

In [None]:
# Google Colab setup: mount Google Drive (optional).
# The import is guarded so the notebook still runs top to bottom outside Colab,
# where the `google.colab` package is not installed.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
    drive.mount('/content/gdrive')

In [None]:
!pip install numpy transformers bertviz matplotlib seaborn --quiet

In [None]:
# Modelos n-gram y evaluación (repaso)
import numpy as np
from collections import defaultdict, Counter
import math

# Example corpus
corpus = "el gato duerme el gato come el ratón duerme"
tokens = corpus.split()

# Unigram and bigram counts
unigrams = Counter(tokens)
bigrams = Counter(zip(tokens[:-1], tokens[1:]))

# Vocabulary size and total number of tokens
V = len(unigrams)
N = len(tokens)

# MLE probabilities
P_unigram = {w: c/N for w, c in unigrams.items()}
# Normalise each bigram by how often its first word occurs *as a context*
# (i.e. is followed by another token). Dividing by the raw unigram count
# over-counts the corpus-final token, which has no successor, so the
# conditional distribution for that word would not sum to 1.
context_counts = Counter(tokens[:-1])
P_bigram = {bg: c/context_counts[bg[0]] for bg, c in bigrams.items()}

# Log-likelihood and perplexity of a sentence
sentence = "el gato duerme"
s_toks = sentence.split()
logprob = 0.0
for i in range(1, len(s_toks)):
    bigram = (s_toks[i-1], s_toks[i])
    # Unseen bigrams fall back to a uniform 1/V (a crude floor, not a properly smoothed distribution)
    prob = P_bigram.get(bigram, 1/V)
    logprob += math.log(prob)
# Perplexity is normalised by the number of bigram transitions, not the number of tokens
perplexity = math.exp(-logprob / (len(s_toks)-1))
print("Perplexity:", perplexity)

In [None]:
# Red neuronal tipo Bengio 2003
import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy dataset and vocabulary
sentences = ["el gato duerme", "el gato come"]
# Sort the vocabulary: iterating a bare set of strings can yield a different
# order on every interpreter start (string hash randomisation), which would
# make the word <-> index mapping irreproducible between runs.
vocab = sorted(set(" ".join(sentences).split()))
vocab_size = len(vocab)
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}

# Training data: (previous word -> next word) index pairs
X = []
y = []
for s in sentences:
    toks = s.split()
    for prev_word, next_word in zip(toks, toks[1:]):
        X.append(word2idx[prev_word])
        y.append(word2idx[next_word])
X = torch.tensor(X)
y = torch.tensor(y)

# Model
class NNLM(nn.Module):
    """Feed-forward language model: embedding -> tanh hidden layer -> vocabulary logits."""

    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.fc1 = nn.Linear(in_features=emb_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return unnormalised scores over the vocabulary for the word indices in `x`."""
        hidden = self.fc1(self.emb(x)).tanh()
        return self.fc2(hidden)

# Fix the RNG seed so the weight initialisation (and hence the reported loss) is reproducible
torch.manual_seed(0)

model = NNLM(vocab_size, 10, 16)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training: every step uses the full set of toy pairs
for epoch in range(200):
    optimizer.zero_grad()
    out = model(X)
    loss = loss_fn(out, y)
    loss.backward()
    optimizer.step()

# `loss` holds the value computed just before the final optimizer step
print("Entrenamiento completo. Última pérdida:", loss.item())


In [None]:
# ejemplo de red neuronal feed forward
#!pip install torch matplotlib scikit-learn
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Generate synthetic data
X, y = make_moons(n_samples=1000, noise=0.2, random_state=42)

# 2. Preprocessing
# Split BEFORE scaling and fit the scaler on the training rows only: fitting it
# on the full dataset leaks test-set statistics into the training pipeline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X = scaler.transform(X)  # keep `X` standardised, as it was before, for any later use

# Convert to float32 tensors; labels become a column vector via unsqueeze(1)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# 3. Define the feed-forward neural network
class FeedforwardNN(nn.Module):
    """Binary classifier: 2 inputs -> 8 tanh units -> 1 sigmoid output."""

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(2, 8),
            nn.Tanh(),
            nn.Linear(8, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the layer stack and return the sigmoid outputs."""
        out = self.model(x)
        return out

model = FeedforwardNN()

# 4. Loss function and optimiser
criterion = nn.BCELoss()  # binary cross-entropy
optimizer = optim.Adam(model.parameters(), lr=0.01)

# 5. Training (each step uses the whole training set)
epochs = 100
model.train()  # the mode never changes inside the loop, so set it once up front
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Report the loss on every 10th epoch
    if epoch % 10 == 9:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# 6. Evaluation
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    # Threshold the model outputs at 0.5 to obtain hard 0/1 labels
    predictions = y_pred.gt(0.5).float()
    accuracy = predictions.eq(y_test_tensor).float().mean()
    print(f"\nAccuracy on test set: {accuracy:.4f}")

# 7. Visualise the results: test points coloured by predicted class
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(X_test[:, 0], X_test[:, 1], c=predictions.squeeze(), cmap="coolwarm", alpha=0.7)
ax.set_title("Predicciones de la red feedforward")
ax.set_xlabel("x1")
ax.set_ylabel("x2")
ax.grid(True)
plt.show()

In [None]:
# RNN vs LSTM
class SimpleRNN(nn.Module):
    """Recurrent language model: embedding -> nn.RNN -> linear layer over the last time step.

    Args:
        vocab_size: number of distinct token ids (embedding rows and output logits).
        emb_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return vocabulary logits computed from the final time step.

        `x` holds token ids laid out as (batch, seq_len), since the RNN is
        built with batch_first=True; the result has shape (batch, vocab_size).
        """
        x = self.emb(x)
        # Take the sizes from the RNN module itself: `hidden_dim` is only a
        # constructor argument and is not in scope here (it used to raise a
        # NameError). Allocate h0 on the input's device with the same dtype.
        h0 = torch.zeros(
            self.rnn.num_layers, x.size(0), self.rnn.hidden_size,
            dtype=x.dtype, device=x.device,
        )
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])  # keep only the last time step
        return out


In [None]:
# Scaled dot-product attention for a single query
Q = torch.tensor([[0.0, 1.0]])  # query
K = torch.tensor([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])  # keys
V = torch.tensor([[1.0, 0.0], [10.0, 0.0], [100.0, 5.0]])  # values

dk = K.shape[-1]  # key dimensionality used for the scaling factor
scores = torch.matmul(Q, K.t()) / math.sqrt(dk)
weights = torch.softmax(scores, dim=-1)
context = torch.matmul(weights, V)
print("Pesos de atención:", weights)
print("Contexto resultante:", context)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: build Q, K and V by hand (one row per token)
Q = np.array([
    [1, 0, 1, 0],  # query for token 1
    [0, 1, 0, 1],  # query for token 2
    [1, 1, 1, 1],  # query for token 3
])

K = np.array([
    [1, 0, 1, 0],  # key for token 1
    [0, 1, 0, 1],  # key for token 2
    [1, 1, 0, 0],  # key for token 3
])

V = np.array([
    [1, 0, 0, 0],  # value for token 1
    [0, 1, 0, 0],  # value for token 2
    [0, 0, 1, 0],  # value for token 3
])

# Step 2: raw scores, i.e. every query dotted with every key (QK^T)
scores = np.matmul(Q, K.transpose())
print("Raw scores (QK^T):\n", scores)

# Step 3: scale by sqrt(d_k) (here d_k = 4)
dk = Q.shape[-1]
scaled_scores = np.divide(scores, np.sqrt(dk))
print("\nScaled scores:\n", scaled_scores)

# Step 4: apply softmax row by row
def softmax(x):
    """Softmax along the last axis.

    The maximum is subtracted per row (not over the whole array) before
    exponentiating. With a single global maximum, a row whose entries are all
    far below it underflows to zeros and the division yields NaN.
    """
    x = np.asarray(x)
    shifted = x - np.max(x, axis=-1, keepdims=True)  # numerical stability
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=-1, keepdims=True)

# Turn the scaled scores into attention weights
attention_weights = softmax(scaled_scores)
print("\nAttention Weights (softmax):\n", attention_weights)

# Step 5: multiply by V, i.e. take the attention-weighted combination of the value rows
output = np.matmul(attention_weights, V)
print("\nOutput (Attention weighted values):\n", output)

# Step 6: visualise the attention matrix as a heatmap (rows = queries, columns = keys)
token_labels = ['Tok1', 'Tok2', 'Tok3']
query_labels = ['Q1', 'Q2', 'Q3']
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(attention_weights, annot=True, cmap='Blues', xticklabels=token_labels, yticklabels=query_labels, ax=ax)
ax.set_title('Matriz de Atención (peso de cada token sobre los valores)')
ax.set_xlabel('Keys (K)')
ax.set_ylabel('Queries (Q)')
plt.show()

In [None]:
# Notebook: Visualización de atención con BERT y BERTViz (modelo en inglés)

from transformers import AutoTokenizer, AutoModel
from bertviz import head_view, model_view
import torch

# ------------------------------
# Step 1: load the English BERT model
# ------------------------------
model_name = 'bert-base-uncased'  # model trained on English text

model = AutoModel.from_pretrained(model_name, output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval();

# ------------------------------
# Step 2: tokenize the example sentence
# ------------------------------
sentence = "The cat sat on the mat."
# Call the tokenizer directly: `encode_plus` is deprecated in favour of `__call__`.
inputs = tokenizer(sentence, return_tensors='pt', add_special_tokens=True)
input_ids = inputs['input_ids']

# Recover the token strings, used as labels in the visualisation
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

# ------------------------------
# Step 3: run the model and collect the attention weights
# ------------------------------
with torch.no_grad():
    # Forward the whole encoding so the attention mask and segment ids are passed explicitly
    outputs = model(**inputs)
    attention = outputs.attentions  # tuple with one tensor per layer (12 for BERT-base), each of shape (1, n_heads, seq_len, seq_len)

# ------------------------------
# Step 4: visualisation with BERTViz
# ------------------------------
# Per-head view
head_view(attention, tokens=tokens)

# Per-layer view (optional)
# model_view(attention, tokens=tokens)

In [None]:
# Notebook: Visualización de atención con BERT y BERTViz (modelo en español)

from transformers import AutoTokenizer, AutoModel
from bertviz import head_view, model_view
import torch

# ------------------------------
# Step 1: load the Spanish BERT model
# ------------------------------
model_name = 'dccuchile/bert-base-spanish-wwm-cased'  # model trained on Spanish text

model = AutoModel.from_pretrained(model_name, output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval();

# ------------------------------
# Step 2: tokenize the example sentence
# ------------------------------
sentence = "El gato se sentó sobre la alfombra."
# Call the tokenizer directly: `encode_plus` is deprecated in favour of `__call__`.
inputs = tokenizer(sentence, return_tensors='pt', add_special_tokens=True)
input_ids = inputs['input_ids']

# Recover the token strings, used as labels in the visualisation
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

# ------------------------------
# Step 3: run the model and collect the attention weights
# ------------------------------
with torch.no_grad():
    # Forward the whole encoding so the attention mask and segment ids are passed explicitly
    outputs = model(**inputs)
    attention = outputs.attentions  # tuple with one tensor per layer (12 for BERT-base), each of shape (1, n_heads, seq_len, seq_len)

# ------------------------------
# Step 4: visualisation with BERTViz
# ------------------------------
# Per-head view
head_view(attention, tokens=tokens)

# Per-layer view (optional)
# model_view(attention, tokens=tokens)
