# Self-supervised learning of embeddings via a language model based on feed-forward neural networks

<a target="_blank" href="https://colab.research.google.com/github/jaspock/me/blob/main/docs/materials/transformers/assets/notebooks/embeddings-ffnn.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<a href="http://dlsi.ua.es/~japerez/"><img src="https://img.shields.io/badge/Universitat-d'Alacant-5b7c99" style="margin-left:10px"></a>

Notebook and code jointly written by Juan Antonio Pérez and an AI-based code generator in 2025.

This notebook presents a very simple language model based on feed-forward neural networks. Its main purpose is to show not how to implement a language model, but how a self-supervised task can be solved so that word embeddings are learned as a by-product. 

It is assumed that you are already familiar with the basics of PyTorch, but only at an absolute beginner level. This notebook complements a [learning guide](https://dlsi.ua.es/~japerez/materials/transformers/intro/) based on studying the math behind the models by reading the book "[Speech and Language Processing](https://web.stanford.edu/~jurafsky/slp3/)" (3rd edition) by Jurafsky and Martin. It is part of a series of notebooks that are meant to be studied incrementally, so make sure you follow the right order. If your learning is being supervised by a teacher, follow the additional instructions that you may have received. Although you may use a GPU environment to execute the code, the computational requirements for the default settings are so low that you can probably run it on CPU.

In [None]:
%%capture
%pip install torch numpy matplotlib

In [None]:
import os
# set before importing pytorch to avoid all non-deterministic operations on GPU
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

import random
import numpy as np
import torch

def set_seed(seed):
    """Seed every random number generator used here and force determinism.

    The same ``seed`` is given to Python's ``random`` module, to NumPy and to
    PyTorch (CPU generator and all CUDA devices). PyTorch is then switched to
    deterministic algorithms.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)
    torch.use_deterministic_algorithms(True)
    
set_seed(42)  # to ensure reproducibility

In [None]:
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

# Hyperparameters of the model and of the training procedure.
EMBEDDING_DIM = 6  # size of each word embedding vector
HIDDEN_DIM = 24  # number of units in the hidden layer
CONTEXT_SIZE = 3  # number of preceding tokens used to predict the next one
LEARNING_RATE = 0.0005  # learning rate passed to the Adam optimizer
BATCH_SIZE = 32  # number of (context, target) pairs per mini-batch
EPOCHS = 3000  # number of passes over the whole set of training pairs

# With CONTEXT_SIZE = 2 the context that precedes the last animal of a sentence is
# only "<verb> the": the subject is not visible, so the model cannot learn to say
# which animal is being hunted or chased by another.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Toy training corpus: every sentence follows the pattern
# "I see the <animal> <verb> the <animal> ." with tokens separated by spaces.
raw_sentences = [
    "I see the dog chasing the cat .",
    "I see the dog chasing the koala .",
    "I see the dog hunting the cat .",
    "I see the dog hunting the elephant .",
    "I see the dog hunting the koala .",
    "I see the lion chasing the elephant .",
    "I see the lion chasing the cat .",
    "I see the lion chasing the koala .",
    "I see the lion hunting the elephant .",
    "I see the lion hunting the koala .",
    "I see the tiger chasing the koala .",
    "I see the tiger chasing the cat .",
    "I see the tiger hunting the koala .",
]

# Sentence prefixes whose next word is predicted once the model has been trained.
test_phrases = ["I see the dog chasing the", "I see the tiger hunting the"]

In [None]:

# Vocabulary: every lower-cased, whitespace-separated token of the corpus.
# Sorting makes the word <-> index mapping identical on every run.
all_tokens = []
for sentence in raw_sentences:
    all_tokens.extend(sentence.lower().split())
unique_tokens = sorted(set(all_tokens))
word_to_ix = {word: ix for ix, word in enumerate(unique_tokens)}
ix_to_word = dict(enumerate(unique_tokens))
VOCAB_SIZE = len(word_to_ix)

# Training pairs: slide a window over each sentence so that the CONTEXT_SIZE
# tokens before position i form the context and the token at i is the target.
# Sentences that are too short yield an empty range and thus no pairs.
data = []
for sentence in raw_sentences:
    sentence_tokens = sentence.lower().split()
    for i in range(CONTEXT_SIZE, len(sentence_tokens)):
        data.append((sentence_tokens[i - CONTEXT_SIZE:i], sentence_tokens[i]))

print(data)

# Replace words by their vocabulary indices.
context_list = [[word_to_ix[w] for w in context] for context, _ in data]
target_list = [word_to_ix[target] for _, target in data]

context_tensor = torch.tensor(context_list, dtype=torch.long)
target_tensor = torch.tensor(target_list, dtype=torch.long)

# Mini-batches are reshuffled at every epoch.
dataset = TensorDataset(context_tensor, target_tensor)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:

class FeedForwardNN(nn.Module):
    """Feed-forward language model that predicts a word from a fixed context.

    The embeddings of the ``context_size`` input words are concatenated,
    passed through one hidden layer with ReLU activation and projected onto
    the vocabulary. The embedding table ``self.embeddings`` is learned
    together with the rest of the parameters.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, context_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # the hidden layer receives all the context embeddings side by side
        concat_dim = context_size * embedding_dim
        self.linear1 = nn.Linear(concat_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each input row.

        ``inputs`` holds word indices; its first dimension is the batch and
        the remaining embedding values of each row are flattened together.
        """
        batch_size = inputs.size(0)
        flat = self.embeddings(inputs).view(batch_size, -1)
        hidden = self.activation(self.linear1(flat))
        logits = self.linear2(hidden)
        return logits.log_softmax(dim=1)

In [None]:

# Build the model on the selected device. The model outputs log-probabilities
# (log_softmax), which is the input that NLLLoss expects.
model = FeedForwardNN(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, CONTEXT_SIZE).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

print(f"Training the model for {EPOCHS} epochs...")
loss_history = []  # one accumulated loss value per epoch
for epoch in range(EPOCHS):
    total_loss = 0
    for context_batch, target_batch in dataloader:
        # clear the gradients accumulated by the previous step
        model.zero_grad()
        context_batch, target_batch = context_batch.to(device), target_batch.to(device)
        log_probs = model(context_batch)
        loss = loss_function(log_probs, target_batch)
        loss.backward()
        optimizer.step()
        # sum of the loss values of all the batches of the epoch
        total_loss += loss.item()

    loss_history.append(total_loss)
    # report progress every 200 epochs
    if (epoch + 1) % 200 == 0:
        print(f"Época {epoch + 1} - Pérdida: {total_loss:.4f}")

In [None]:

def predict_next_word(model, phrase):
    """Print the most probable next words for ``phrase`` (at most three).

    The phrase is lower-cased and split on whitespace, and only its last
    ``CONTEXT_SIZE`` tokens are given to the model. The model is switched to
    evaluation mode and is left in that mode when the function returns.

    Args:
        model: network that maps a batch of context word indices to
            log-probabilities over the vocabulary.
        phrase: text whose continuation is to be predicted.

    Raises:
        ValueError: if the phrase has fewer than ``CONTEXT_SIZE`` tokens.
        KeyError: if a context token is not in ``word_to_ix``.
    """
    model.eval()
    with torch.no_grad():
        phrase_tokens = phrase.lower().split()
        context_tokens = phrase_tokens[-CONTEXT_SIZE:]

        print(context_tokens)

        # Fail early with a clear message instead of a shape mismatch inside
        # the first linear layer.
        if len(context_tokens) < CONTEXT_SIZE:
            raise ValueError(
                f"The phrase must contain at least {CONTEXT_SIZE} tokens, "
                f"but '{phrase}' has {len(phrase_tokens)}."
            )

        unknown_words = [w for w in context_tokens if w not in word_to_ix]
        if unknown_words:
            raise KeyError(f"Words not in the vocabulary: {unknown_words}")

        # a batch with a single context: shape (1, CONTEXT_SIZE)
        context_idxs = torch.tensor(
            [word_to_ix[w] for w in context_tokens], dtype=torch.long
        ).to(device)
        context_idxs = context_idxs.unsqueeze(0)

        log_probs = model(context_idxs)
        probs = torch.exp(log_probs)

        # topk fails if k exceeds the vocabulary size, so clamp it
        k = min(3, probs.size(1))
        top_probs, top_indices = torch.topk(probs, k)

        top_words = [ix_to_word[ix] for ix in top_indices[0].tolist()]

        print(f"Prediction for '{phrase}':")
        ranked = zip(top_words, top_probs[0].tolist())
        for rank, (word, prob) in enumerate(ranked, start=1):
            print(f"  {rank}. '{word}' (Probability: {prob:.4f})")

# Show the top predictions of the trained model for each test prefix.
print("\n--- Predictions ---")
for phrase in test_phrases:
    predict_next_word(model, phrase)

In [None]:
print("\n--- Embeddings Visualization ---")
def visualize_embeddings(model, word_to_ix):
    """Plot the learned word embeddings of ``model`` in two dimensions.

    Embeddings with more than two dimensions are projected to 2D with PCA;
    otherwise they are plotted as they are. Each word of ``word_to_ix`` is
    drawn as a point labelled with the word itself.
    """
    vectors = model.embeddings.weight.detach().cpu().numpy()

    needs_projection = vectors.shape[1] > 2
    if needs_projection:
        print("Projecting embeddings to 2D using PCA...")
        points = PCA(n_components=2).fit_transform(vectors)
    else:
        points = vectors

    plt.figure(figsize=(10, 8))
    for word, row in word_to_ix.items():
        x, y = points[row]
        # one scatter call per word so that each point gets its own color
        plt.scatter(x, y)
        plt.annotate(
            word,
            (x, y),
            textcoords="offset points",
            xytext=(5, 5),
            ha='center',
        )

    plt.title("Learned Embeddings Visualization")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.grid(True)
    plt.show()

visualize_embeddings(model, word_to_ix)