In [11]:
from bpe.byte_pair_encoder import BytePairEncoder

def prepare_data(training_data, valid_data, test_data):
    """Encode the three data splits with a byte-pair encoder.

    A saved encoder is loaded from ``../models/bpe/model.json`` when that file
    exists; otherwise a new encoder is fitted on ``training_data`` only and
    saved to the same path.

    :param training_data: training split; also used to fit the encoder
    :param valid_data: validation split
    :param test_data: test split
    :return: tuple ``(len(encoder.bpe_codes), encoded_train, encoded_valid,
        encoded_test)``; the notebook uses the first element as vocabulary size
    """
    encoder = BytePairEncoder(
        1000,
        verbose=False,
        model_path="../models/bpe/model.json",
        neural=True,
    )

    # Prefer the persisted model (load() reads model_path); train only when it is missing.
    try:
        encoder.load()
    except FileNotFoundError:
        print("No saved BPE found. Training...")
        encoder.fit(training_data)
        encoder.save()
    else:
        print("Loaded existing BPE model.")

    code_count = len(encoder.bpe_codes)
    encoded_train = encoder.encode(training_data)
    encoded_valid = encoder.encode(valid_data)
    encoded_test = encoder.encode(test_data)
    return code_count, encoded_train, encoded_valid, encoded_test

In [12]:
def load_data(data_dir='../data/shakespeare'):
    """Read the text splits from disk and return them with a toy test string.

    Each file is read through a context manager so its handle is closed as
    soon as the content is in memory. No encoding is passed to ``open``, so
    the platform default text encoding applies.

    :param data_dir: directory holding the four ``Shakespeare_clean_*.txt``
        files; defaults to the location the notebook has always used
    :return: tuple ``(test_string, full_data, training_data, test_data,
        valid_data)`` of ``str``; note that test comes before valid
    :raises FileNotFoundError: if one of the files is missing
    """
    def _read_split(split):
        # `with` guarantees the handle is released even if read() raises.
        with open(f"{data_dir}/Shakespeare_clean_{split}.txt", 'r') as handle:
            return handle.read()

    test_string = "low low low low low lowest lowest newer newer newer newer newer newer wider wider wider new new"
    full_data = _read_split('full')
    training_data = _read_split('train')
    test_data = _read_split('test')
    valid_data = _read_split('valid')

    return test_string, full_data, training_data, test_data, valid_data

In [19]:


class NeuralBigram:
    """Single-layer neural n-gram language model written in NumPy.

    The ``ngram_size - 1`` preceding token ids are embedded, the embeddings
    are concatenated, and one linear layer followed by a softmax predicts the
    next token. Training is plain mini-batch SGD; the parameter update is
    performed inside :meth:`backwards`.
    """

    def __init__(self, embedding_dimension, vocab_size, ngram_size, lr=1e-2):
        """
        :param embedding_dimension: size D of each token embedding
        :param vocab_size: number of distinct token ids V
        :param ngram_size: n-gram order; the context holds ``ngram_size - 1`` tokens
        :param lr: SGD learning rate (decayed in place by :meth:`fit`)
        """
        self.embedding_dimension = embedding_dimension
        self.vocab_size = vocab_size
        self.ngram_size = ngram_size
        self.lr = lr

        # Embedding: vocab_size → embedding_dimension
        self.embedding_matrix = np.random.randn(vocab_size, embedding_dimension) * 0.01

        # Linear layer: (context_size * embedding_dim) → vocab_size
        input_dim = (self.ngram_size - 1) * self.embedding_dimension
        self.linear_W = np.random.randn(input_dim, vocab_size) * 0.01
        self.linear_b = np.zeros((1, vocab_size))

    def _build_ngrams(self, data):
        """Slice a token sequence into context windows and next-token targets.

        :param data: 1-D sequence of integer token ids
        :return: ``(contexts, targets)`` with shapes
            ``(num_samples, ngram_size - 1)`` and ``(num_samples,)``
        :raises ValueError: if ``data`` holds fewer than ``ngram_size`` tokens
        """
        tokens = np.asarray(data, dtype=np.int64)
        context_size = self.ngram_size - 1
        num_samples = len(tokens) - self.ngram_size + 1
        if num_samples < 1:
            raise ValueError(
                f"Need at least {self.ngram_size} tokens to build one n-gram, got {len(tokens)}."
            )

        # Row i is [i, i+1, ..., i+context_size-1]; a single fancy-index lookup
        # then yields every context window without a Python-level loop.
        window_index = np.arange(num_samples)[:, None] + np.arange(context_size)[None, :]
        contexts = tokens[window_index]
        targets = tokens[context_size:]
        return contexts, targets

    def forward(self, x, y=None, target=True):
        """Run the model on a batch of contexts.

        Intermediate activations are cached on ``self`` for :meth:`backwards`.

        :param x: integer array of shape (B, context_size)
        :param y: integer array of shape (B,) with next-token ids; required
            when ``target`` is true
        :param target: when true return ``(loss, logits)``, otherwise return
            the softmax probabilities of shape (B, vocab_size)
        :raises ValueError: if ``target`` is true but ``y`` is missing
        """
        if target and y is None:
            # Indexing the probabilities with None would silently produce a
            # meaningless loss instead of failing.
            raise ValueError("y (target token ids) is required when target=True")

        # Embedding lookup
        self.embeddings = self.embedding_matrix[x]  # (B, context_size, D)
        self.embeddings_flat = self.embeddings.reshape(x.shape[0], -1)  # (B, context_size*D)

        # Linear projection
        self.logits = self.embeddings_flat @ self.linear_W + self.linear_b  # (B, vocab_size)

        # Softmax; subtracting the row maximum keeps exp() from overflowing
        exp_logits = np.exp(self.logits - np.max(self.logits, axis=1, keepdims=True))
        self.probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

        if not target:
            return self.probs

        B = x.shape[0]
        loss = -np.log(self.probs[np.arange(B), y]).mean()
        return loss, self.logits

    def backwards(self, x, y):
        """Backpropagate the cross-entropy loss and apply one SGD step.

        Must be called right after :meth:`forward` on the same batch, because
        it reads the activations cached there.

        :param x: integer array of shape (B, context_size)
        :param y: integer array of shape (B,)
        """
        B = x.shape[0]

        # -------- Softmax + Cross-Entropy Gradient --------
        dlogits = self.probs.copy()
        dlogits[np.arange(B), y] -= 1
        dlogits /= B  # (B, vocab_size)

        # -------- Gradients for linear layer --------
        dW = self.embeddings_flat.T @ dlogits  # (context_size*D, vocab_size)
        db = np.sum(dlogits, axis=0, keepdims=True)  # (1, vocab_size)

        # Gradient through embeddings_flat
        demb_flat = dlogits @ self.linear_W.T  # (B, context_size*D)
        demb = demb_flat.reshape(self.embeddings.shape)  # (B, context_size, D)

        # Update embeddings. np.add.at accumulates over repeated token ids,
        # whereas `matrix[x] -= ...` would keep only one update per id.
        np.add.at(self.embedding_matrix, x, -self.lr * demb)

        # Update weights
        self.linear_W -= self.lr * dW
        self.linear_b -= self.lr * db

    def fit(self, data, epochs=10, batch_size=32, lr_decay=1.0):
        """Train for ``epochs`` passes over ``data``, reshuffling every epoch.

        :param data: 1-D sequence of integer token ids
        :param epochs: number of passes over all n-grams
        :param batch_size: n-grams per SGD step (must be >= 1)
        :param lr_decay: factor multiplied into ``self.lr`` after each epoch
        :raises ValueError: if ``batch_size < 1`` or ``data`` is too short
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Precompute all n-grams once
        contexts, targets = self._build_ngrams(data)
        num_samples = len(targets)

        for epoch in range(epochs):
            # Shuffle data each epoch
            perm = np.random.permutation(num_samples)
            contexts_shuffled = contexts[perm]
            targets_shuffled = targets[perm]

            epoch_loss = 0.0
            for start in range(0, num_samples, batch_size):
                end = min(start + batch_size, num_samples)
                x_batch = contexts_shuffled[start:end]
                y_batch = targets_shuffled[start:end]

                # Forward and backward
                loss, _ = self.forward(x_batch, y_batch, target=True)
                self.backwards(x_batch, y_batch)
                # Weight by batch length so a short final batch is not over-counted.
                epoch_loss += loss * (end - start)

            epoch_loss /= num_samples
            self.lr *= lr_decay  # Optional learning rate decay
            print(f"Epoch {epoch+1}/{epochs} - loss: {epoch_loss:.4f} - lr: {self.lr:.6f}")

    def perplexity(self, data, batch_size=1) -> float:
        """Compute the model's perplexity on an encoded token sequence.

        :param data: 1-D sequence of integer token ids (e.g. validation data)
        :param batch_size: n-grams evaluated per forward pass. Larger values
            are faster and give the same result up to float rounding; values
            below 1 are treated as 1.
        :return: ``exp`` of the mean negative log-likelihood
        :raises ValueError: if ``data`` holds fewer than ``ngram_size`` tokens
        """
        contexts, targets = self._build_ngrams(data)
        num_samples = len(targets)
        step = max(1, int(batch_size))

        nll = 0.0
        for start in range(0, num_samples, step):
            stop = min(start + step, num_samples)
            probs = self.forward(contexts[start:stop], target=False)
            nll += -np.log(probs[np.arange(stop - start), targets[start:stop]]).sum()

        avg_nll = nll / num_samples
        return float(np.exp(avg_nll))

In [24]:
# Load the data. The raw validation/test text gets its own names so the
# encoded splits below never rebind a variable that held a different type.
test_string, full_data, training_data, test_text, valid_text = load_data()

# Prepare data for neural n-gram: BPE-encode every split into token ids
vocab_size, train_data, valid_data, test_data = prepare_data(training_data, valid_text, test_text)

# Weight initialisation and the per-epoch shuffle both draw from NumPy's
# global RNG, so fix the seed to make the numbers below repeatable.
np.random.seed(42)

model = NeuralBigram(embedding_dimension=512, vocab_size=vocab_size, ngram_size=3, lr=0.5)
model.fit(train_data, epochs=40, batch_size=32, lr_decay=0.95)
print(model.perplexity(valid_data))

Loaded existing BPE model.
Epoch 1/40 - loss: 5.8980 - lr: 0.475000
Epoch 2/40 - loss: 4.8416 - lr: 0.451250
Epoch 3/40 - loss: 4.3771 - lr: 0.428687
Epoch 4/40 - loss: 4.1708 - lr: 0.407253
Epoch 5/40 - loss: 4.0434 - lr: 0.386890
Epoch 6/40 - loss: 3.9492 - lr: 0.367546
Epoch 7/40 - loss: 3.8732 - lr: 0.349169
Epoch 8/40 - loss: 3.8074 - lr: 0.331710
Epoch 9/40 - loss: 3.7500 - lr: 0.315125
Epoch 10/40 - loss: 3.6993 - lr: 0.299368
Epoch 11/40 - loss: 3.6534 - lr: 0.284400
Epoch 12/40 - loss: 3.6119 - lr: 0.270180
Epoch 13/40 - loss: 3.5732 - lr: 0.256671
Epoch 14/40 - loss: 3.5382 - lr: 0.243837
Epoch 15/40 - loss: 3.5058 - lr: 0.231646
Epoch 16/40 - loss: 3.4759 - lr: 0.220063
Epoch 17/40 - loss: 3.4484 - lr: 0.209060
Epoch 18/40 - loss: 3.4227 - lr: 0.198607
Epoch 19/40 - loss: 3.3987 - lr: 0.188677
Epoch 20/40 - loss: 3.3763 - lr: 0.179243
Epoch 21/40 - loss: 3.3556 - lr: 0.170281
Epoch 22/40 - loss: 3.3362 - lr: 0.161767
Epoch 23/40 - loss: 3.3182 - lr: 0.153678
Epoch 24/40 - lo

In [26]:
# Re-evaluate perplexity on the encoded validation split; being the cell's
# last expression, the returned float is shown as the cell output.
model.perplexity(valid_data)

99.80881270253751

In [27]:


class NeuralBigram:
    """Neural n-gram language model with one tanh hidden layer, in NumPy.

    The ``ngram_size - 1`` preceding token ids are embedded and concatenated,
    passed through a tanh hidden layer, and projected to vocabulary logits.
    Training is plain mini-batch SGD; the parameter update is performed
    inside :meth:`backwards`.

    NOTE: this class reuses the name of the linear-only ``NeuralBigram``
    defined in an earlier cell, so running this cell rebinds that name.
    """

    def __init__(self, embedding_dimension, vocab_size, ngram_size, lr=1e-2, hidden_layer_size=128):
        """
        :param embedding_dimension: size D of each token embedding
        :param vocab_size: number of distinct token ids V
        :param ngram_size: n-gram order; the context holds ``ngram_size - 1`` tokens
        :param lr: SGD learning rate (decayed in place by :meth:`fit`)
        :param hidden_layer_size: number of tanh units H
        """
        self.embedding_dimension = embedding_dimension
        self.vocab_size = vocab_size
        self.ngram_size = ngram_size
        self.lr = lr
        self.hidden_layer_size = hidden_layer_size

        # Embedding: vocab_size → embedding_dimension
        self.embedding_matrix = np.random.randn(vocab_size, embedding_dimension) * 0.01

        # Hidden layer: (context_size * embedding_dim) → hidden layer size
        input_dim = (self.ngram_size - 1) * self.embedding_dimension
        self.linear_W1 = np.random.randn(input_dim, self.hidden_layer_size) * 0.01
        self.linear_b1 = np.zeros((1, self.hidden_layer_size))

        # Linear layer: hidden layer size → vocab_size
        self.linear_W2 = np.random.randn(self.hidden_layer_size, self.vocab_size) * 0.01
        self.linear_b2 = np.zeros((1, self.vocab_size))

    def _build_ngrams(self, data):
        """Slice a token sequence into context windows and next-token targets.

        :param data: 1-D sequence of integer token ids
        :return: ``(contexts, targets)`` with shapes
            ``(num_samples, ngram_size - 1)`` and ``(num_samples,)``
        :raises ValueError: if ``data`` holds fewer than ``ngram_size`` tokens
        """
        tokens = np.asarray(data, dtype=np.int64)
        context_size = self.ngram_size - 1
        num_samples = len(tokens) - self.ngram_size + 1
        if num_samples < 1:
            raise ValueError(
                f"Need at least {self.ngram_size} tokens to build one n-gram, got {len(tokens)}."
            )

        # Row i is [i, i+1, ..., i+context_size-1]; a single fancy-index lookup
        # then yields every context window without a Python-level loop.
        window_index = np.arange(num_samples)[:, None] + np.arange(context_size)[None, :]
        contexts = tokens[window_index]
        targets = tokens[context_size:]
        return contexts, targets

    def forward(self, x, y=None, target=True):
        """Run the model on a batch of contexts.

        Intermediate activations are cached on ``self`` for :meth:`backwards`.

        :param x: integer array of shape (B, context_size)
        :param y: integer array of shape (B,) with next-token ids; required
            when ``target`` is true
        :param target: when true return ``(loss, logits)``, otherwise return
            the softmax probabilities of shape (B, vocab_size)
        :raises ValueError: if ``target`` is true but ``y`` is missing
        """
        if target and y is None:
            # Indexing the probabilities with None would silently produce a
            # meaningless loss instead of failing.
            raise ValueError("y (target token ids) is required when target=True")

        # Embedding lookup
        self.embeddings = self.embedding_matrix[x]  # (B, context_size, D)
        self.embeddings_flat = self.embeddings.reshape(x.shape[0], -1)  # (B, context_size*D)

        # Hidden layer with tanh activation
        self.hidden_layer = self.embeddings_flat @ self.linear_W1 + self.linear_b1  # (B, H)
        self.hidden_activation = np.tanh(self.hidden_layer)

        # Linear projection to vocab size
        self.logits = self.hidden_activation @ self.linear_W2 + self.linear_b2  # (B, vocab_size)

        # Softmax; subtracting the row maximum keeps exp() from overflowing
        exp_logits = np.exp(self.logits - np.max(self.logits, axis=1, keepdims=True))
        self.probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

        if not target:
            return self.probs

        B = x.shape[0]
        loss = -np.log(self.probs[np.arange(B), y]).mean()
        return loss, self.logits

    def backwards(self, x, y):
        """Backpropagate the cross-entropy loss and apply one SGD step.

        Must be called right after :meth:`forward` on the same batch, because
        it reads the activations cached there.

        :param x: integer array of shape (B, context_size)
        :param y: integer array of shape (B,)
        """
        B = x.shape[0]

        # -------- Softmax + Cross-Entropy Gradient --------
        dlogits = self.probs.copy()
        dlogits[np.arange(B), y] -= 1
        dlogits /= B  # (B, vocab_size)

        # -------- Gradients for output layer (W2, b2) --------
        dW2 = self.hidden_activation.T @ dlogits  # (H, V)
        db2 = np.sum(dlogits, axis=0, keepdims=True)  # (1, V)

        # -------- Backprop into hidden activation --------
        dha = dlogits @ self.linear_W2.T  # (B, H)

        # -------- Backprop through tanh: d/dz tanh(z) = 1 - tanh(z)^2 --------
        dh = dha * (1 - self.hidden_activation ** 2)  # (B, H)

        # -------- Gradients for hidden layer (W1, b1) --------
        dW1 = self.embeddings_flat.T @ dh  # (C*D, H)
        db1 = np.sum(dh, axis=0, keepdims=True)  # (1, H)

        # -------- Backprop into embeddings --------
        demb_flat = dh @ self.linear_W1.T  # (B, C*D)
        demb = demb_flat.reshape(self.embeddings.shape)  # (B, C, D)

        # -------- Parameter updates --------
        # Embeddings. np.add.at accumulates over repeated token ids, whereas
        # `matrix[x] -= ...` would keep only one update per id.
        np.add.at(self.embedding_matrix, x, -self.lr * demb)
        # Hidden layer
        self.linear_W1 -= self.lr * dW1
        self.linear_b1 -= self.lr * db1
        # Output layer
        self.linear_W2 -= self.lr * dW2
        self.linear_b2 -= self.lr * db2

    def fit(self, data, epochs=10, batch_size=32, lr_decay=1.0):
        """Train for ``epochs`` passes over ``data``, reshuffling every epoch.

        :param data: 1-D sequence of integer token ids
        :param epochs: number of passes over all n-grams
        :param batch_size: n-grams per SGD step (must be >= 1)
        :param lr_decay: factor multiplied into ``self.lr`` after each epoch
        :raises ValueError: if ``batch_size < 1`` or ``data`` is too short
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Precompute all n-grams once
        contexts, targets = self._build_ngrams(data)
        num_samples = len(targets)

        for epoch in range(epochs):
            # Shuffle data each epoch
            perm = np.random.permutation(num_samples)
            contexts_shuffled = contexts[perm]
            targets_shuffled = targets[perm]

            epoch_loss = 0.0
            for start in range(0, num_samples, batch_size):
                end = min(start + batch_size, num_samples)
                x_batch = contexts_shuffled[start:end]
                y_batch = targets_shuffled[start:end]

                # Forward and backward
                loss, _ = self.forward(x_batch, y_batch, target=True)
                self.backwards(x_batch, y_batch)
                # Weight by batch length so a short final batch is not over-counted.
                epoch_loss += loss * (end - start)

            epoch_loss /= num_samples
            self.lr *= lr_decay  # Optional learning rate decay
            print(f"Epoch {epoch+1}/{epochs} - loss: {epoch_loss:.4f} - lr: {self.lr:.6f}")

    def perplexity(self, data, batch_size=1) -> float:
        """Compute the model's perplexity on an encoded token sequence.

        :param data: 1-D sequence of integer token ids (e.g. validation data)
        :param batch_size: n-grams evaluated per forward pass. Larger values
            are faster and give the same result up to float rounding; values
            below 1 are treated as 1.
        :return: ``exp`` of the mean negative log-likelihood
        :raises ValueError: if ``data`` holds fewer than ``ngram_size`` tokens
        """
        contexts, targets = self._build_ngrams(data)
        num_samples = len(targets)
        step = max(1, int(batch_size))

        nll = 0.0
        for start in range(0, num_samples, step):
            stop = min(start + step, num_samples)
            probs = self.forward(contexts[start:stop], target=False)
            nll += -np.log(probs[np.arange(stop - start), targets[start:stop]]).sum()

        avg_nll = nll / num_samples
        return float(np.exp(avg_nll))





In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from neural_ngram.utils.preprocessing import get_ngram_batch

class PytorchBigram(nn.Module):
    """Bigram-table language model in PyTorch.

    The only parameter is a ``vocab_size x vocab_size`` embedding table: the
    row looked up for a token id is used directly as the logits for the next
    token.
    """

    def __init__(self, vocab_size, ngram_size=2, lr=1e-3, device=None):
        """
        :param vocab_size: number of distinct token ids
        :param ngram_size: forwarded as ``n`` to ``get_ngram_batch`` in :meth:`fit`
        :param lr: initial Adam learning rate
        :param device: torch device string; when omitted, "mps" is used if
            available and "cpu" otherwise
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.ngram_size = ngram_size
        self.lr = lr

        # Each token directly mapped to logits (like a bigram table)
        self.embedding_layer = nn.Embedding(vocab_size, vocab_size)

        # Pick device automatically (MPS if available on Mac). The getattr
        # guard keeps this working on torch builds without `backends.mps`.
        mps_backend = getattr(torch.backends, "mps", None)
        mps_ready = mps_backend is not None and mps_backend.is_available()
        self.device = device or ("mps" if mps_ready else "cpu")
        self.to(self.device)

    def forward(self, x, targets=None):
        """Look up next-token logits and optionally compute the loss.

        :param x: integer tensor of token ids, shape (B, context_size); a 1-D
            tensor of shape (B,) is treated as (B, 1)
        :param targets: integer tensor holding ``B * context_size`` next-token
            ids, or ``None`` to skip the loss
        :return: ``(logits, loss)`` where ``logits`` has shape
            ``(B * context_size, vocab_size)`` and ``loss`` is the mean
            cross-entropy, or ``None`` when no targets were given
        """
        # Inputs are moved to the model's device here, the same as targets
        # below, so callers can pass tensors from any device.
        x = x.to(self.device)
        if x.dim() == 1:
            x = x.unsqueeze(1)

        logits = self.embedding_layer(x)  # (B, context_size, vocab_size)
        B, T, C = logits.shape
        logits = logits.reshape(B * T, C)  # flatten batch & time

        if targets is None:
            return logits, None

        targets = targets.to(self.device).reshape(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def fit(self, data, epochs=10, steps_per_epoch=10000, batch_size=32, lr_decay=1.0):
        """Train with Adam on batches drawn by ``get_ngram_batch``.

        :param data: encoded token sequence passed through to ``get_ngram_batch``
        :param epochs: number of reporting epochs
        :param steps_per_epoch: optimisation steps per epoch
        :param batch_size: samples requested per step
        :param lr_decay: factor applied to the optimiser learning rate after
            every epoch
        """
        # A fresh optimiser is built on every call, so Adam's moment
        # estimates do not carry over between calls to fit().
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)

        for epoch in range(epochs):
            epoch_loss = 0.0

            for step in range(steps_per_epoch):
                # ---- Random batch ----
                # NOTE(review): assumes get_ngram_batch returns (contexts, targets)
                # array-likes of token ids - confirm against neural_ngram.utils.
                x_batch, y_batch = get_ngram_batch(data, n=self.ngram_size, batch_size=batch_size)

                # Convert to tensors
                x_batch = torch.tensor(x_batch, dtype=torch.long, device=self.device)
                y_batch = torch.tensor(y_batch, dtype=torch.long, device=self.device)

                # ---- Forward pass ----
                _, loss = self.forward(x_batch, targets=y_batch)

                # ---- Backward & update ----
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()

            # Average loss for epoch
            epoch_loss /= steps_per_epoch

            # Learning rate decay (optional)
            for param_group in optimizer.param_groups:
                param_group['lr'] *= lr_decay

            print(f"Epoch {epoch + 1}/{epochs} - loss: {epoch_loss:.4f} - lr: {optimizer.param_groups[0]['lr']:.6f}")

In [40]:
# Rebinds `model`, which an earlier cell bound to the NumPy NeuralBigram, to the PyTorch bigram model.
model = PytorchBigram(vocab_size=vocab_size, ngram_size=2, lr=0.001)
# steps_per_epoch is left at its default of 10000, so 40 epochs means 400000 optimiser steps.
model.fit(train_data, epochs=40, batch_size=64, lr_decay=0.99)

Epoch 1/40 - loss: 6.3138 - lr: 0.000990
Epoch 2/40 - loss: 4.9148 - lr: 0.000980
Epoch 3/40 - loss: 4.3785 - lr: 0.000970
Epoch 4/40 - loss: 4.1906 - lr: 0.000961
Epoch 5/40 - loss: 4.1189 - lr: 0.000951
Epoch 6/40 - loss: 4.0985 - lr: 0.000941
Epoch 7/40 - loss: 4.0963 - lr: 0.000932
Epoch 8/40 - loss: 4.1058 - lr: 0.000923
Epoch 9/40 - loss: 4.1035 - lr: 0.000914
Epoch 10/40 - loss: 4.1139 - lr: 0.000904
Epoch 11/40 - loss: 4.1163 - lr: 0.000895


KeyboardInterrupt: 