In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import re
import time
import torch.nn.functional as F
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, normalizers, decoders

# Params

In [2]:
# Tunable hyperparameters and paths shared by the cells below.
vocab_size = 10000  # NOTE(review): not read by the visible cells; the BPE trainer hard-codes its own 10000 and the models use st_vocab_size / bpe_vocab_size
embedding_dim = 100  # width of the nn.Embedding vectors in every model
hidden_dim = 256  # hidden width of the feedforward / recurrent layers
learning_rate = 0.001  # learning rate passed to every Adam optimizer
batch_size = 64  # DataLoader batch size
num_epochs = 10  # training epochs per model
sequence_length = 10  # input tokens per window; each stored window holds sequence_length + 1 tokens
nhead = 4  # NOTE(review): unused in the visible cells (presumably for a Transformer variant) - confirm
num_encoder_layers = 2  # NOTE(review): unused in the visible cells - confirm
dim_feedforward = 512  # NOTE(review): unused in the visible cells - confirm
file_path = 'LTR.txt'  # training corpus; a relative path, so it resolves against the working directory

In [3]:
def read_text_file(file_path):
    """Return the full contents of ``file_path`` decoded as UTF-8.

    Args:
        file_path (str): Path of the text file to read.

    Returns:
        str: Everything in the file as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Load the whole corpus into memory once; both tokenizer pipelines below read from `text`.
text = read_text_file(file_path)

In [4]:
class LanguageDataset(Dataset):
    """Map-style dataset over pre-indexed token windows.

    Each item is the corresponding entry of ``sequences`` converted to a
    ``torch.long`` tensor.
    """

    def __init__(self, sequences):
        # Only a reference is kept; tensors are created on demand per item.
        self.sequences = sequences

    def __len__(self):
        count = len(self.sequences)
        return count

    def __getitem__(self, idx):
        window = self.sequences[idx]
        return torch.tensor(window, dtype=torch.long)

# Tokenizers

## Standard Tokenizer

### Métodos

In [5]:
def st_preprocess_text(text, sequence_length):
    """Turn raw text into overlapping word windows.

    The text is stripped of every character that is not an ASCII letter,
    digit or whitespace, lowercased, and split on whitespace. A window of
    ``sequence_length + 1`` words is emitted for each starting position, so
    the last word of a window can serve as the target of the preceding ones.

    Args:
        text (str): Raw corpus text.
        sequence_length (int): Number of input words per window.

    Returns:
        list[list[str]]: One list of ``sequence_length + 1`` words per window;
        empty when the text has no more than ``sequence_length`` words.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    words = cleaned.split()

    span = sequence_length + 1
    return [words[start:start + span] for start in range(len(words) - sequence_length)]

def st_build_vocab(sequences):
    """Assign an integer index to every distinct token.

    Tokens are numbered from 1 in order of first appearance; index 0 is
    reserved for the ``'<PAD>'`` entry.

    Args:
        sequences (list[list[str]]): Token windows to scan.

    Returns:
        dict: Mapping from token to index, including ``'<PAD>': 0``.
    """
    vocab = {}
    for seq in sequences:
        for token in seq:
            if token not in vocab:
                # First sighting: the next free index is one past the current size.
                vocab[token] = len(vocab) + 1
    vocab['<PAD>'] = 0
    return vocab

def st_sequences_to_indices(sequences, vocab):
    """Replace every token by its vocabulary index.

    Args:
        sequences (list[list[str]]): Token windows.
        vocab (dict): Token-to-index mapping.

    Returns:
        list[list[int]]: Windows of indices, in the same order as the input.

    Raises:
        KeyError: If a token is missing from ``vocab``.
    """
    indexed = []
    for seq in sequences:
        indexed.append([vocab[token] for token in seq])
    return indexed

### Inicialización

In [6]:
# Word-level pipeline: clean the corpus, cut it into windows of
# sequence_length + 1 words, and map every word to an integer index.
st_sequences = st_preprocess_text(text, sequence_length)
st_vocab = st_build_vocab(st_sequences)
st_indexed_sequences = st_sequences_to_indices(st_sequences, st_vocab)

# Actual number of entries in the word-level vocabulary ('<PAD>' included);
# the word-level models below are sized with this value.
st_vocab_size = len(st_vocab)

# Wrap the indexed windows in a shuffled, batched loader.
st_dataset = LanguageDataset(st_indexed_sequences)
st_data_loader = DataLoader(st_dataset, batch_size=batch_size, shuffle=True)

## Byte-Pair Encoding (BPE) Tokenizer

### Métodos

In [7]:
def bpe_preprocess_text(text):
    """Normalize the text and return it as a list of lines.

    Every character that is not an ASCII letter, digit or whitespace is
    removed and the result is lowercased before splitting on ``'\\n'``.

    Args:
        text (str): Raw corpus text.

    Returns:
        list[str]: The normalized lines (a single empty string for empty input).
    """
    normalized = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    normalized = normalized.lower()
    lines = normalized.split('\n')
    return lines

# Preprocess text into sequences
def bpe_preprocess_text_to_sequences(text, sequence_length, tokenizer):
    """Encode the text with ``tokenizer`` and cut the tokens into windows.

    Args:
        text (str): Text to encode.
        sequence_length (int): Number of input tokens per window.
        tokenizer: Object whose ``encode(text)`` result exposes a ``tokens`` list.

    Returns:
        list[list[str]]: Overlapping windows of ``sequence_length + 1`` token
        strings, one per starting position.
    """
    pieces = tokenizer.encode(text).tokens

    span = sequence_length + 1
    return [pieces[pos:pos + span] for pos in range(len(pieces) - sequence_length)]

# Build vocabulary
def bpe_build_vocab(tokenizer):
    """Return the tokenizer's own token-to-id mapping.

    The ids must be the ones the tokenizer itself uses, because other code
    feeds ``tokenizer.encode(...).ids`` to the model and passes sampled ids
    to ``tokenizer.decode``. Renumbering the entries by dictionary iteration
    order (as this function used to do) makes the training indices disagree
    with those ids.

    Args:
        tokenizer: Object exposing ``get_vocab()`` that returns a
            ``{token: id}`` dictionary.

    Returns:
        dict: A copy of the ``{token: id}`` mapping reported by the tokenizer.
    """
    # Copy so callers can mutate the result without touching tokenizer state.
    # NOTE(review): model sizes are derived from len() of this dict, which
    # presumes the ids run contiguously from 0 - confirm for the tokenizer used.
    return dict(tokenizer.get_vocab())

# Convert sequences to indices
def bpe_sequences_to_indices(sequences, vocab):
    """Map token windows to index windows, falling back to ``'<UNK>'``.

    Args:
        sequences (list[list[str]]): Token windows.
        vocab (dict): Token-to-index mapping; must contain ``'<UNK>'`` whenever
            at least one token is looked up.

    Returns:
        list[list[int]]: Windows of indices, in the same order as the input.
    """
    indexed = []
    for seq in sequences:
        row = []
        for token in seq:
            # The fallback is looked up on every token, exactly as a default argument would be.
            row.append(vocab.get(token, vocab['<UNK>']))
        indexed.append(row)
    return indexed

### Inicialización

In [8]:
# Build an untrained BPE tokenizer: whitespace pre-tokenization, and
# NFD + lowercase + accent stripping as normalization.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()])
# NOTE(review): vocab_size is hard-coded here instead of reusing the `vocab_size` config value.
trainer = trainers.BpeTrainer(vocab_size=10000, special_tokens=["<PAD>", "<UNK>", "<END>"])
# NOTE(review): a ByteLevel decoder is paired with a Whitespace pre-tokenizer - confirm this combination is intended.
tokenizer.decoder = decoders.ByteLevel()


# Train the tokenizer on the cleaned lines, then encode the corpus into windows.
# NOTE(review): training sees the punctuation-stripped `lines`, while the
# encode call below receives the raw `text` - confirm this is intended.
lines = bpe_preprocess_text(text)
tokenizer.train_from_iterator(lines, trainer)
bpe_sequences = bpe_preprocess_text_to_sequences(text, sequence_length, tokenizer)
bpe_vocab = bpe_build_vocab(tokenizer)
bpe_indexed_sequences = bpe_sequences_to_indices(bpe_sequences, bpe_vocab)

# Number of entries in the BPE vocabulary; sizes the BPE model below.
bpe_vocab_size = len(bpe_vocab)

# Wrap the indexed windows in a shuffled, batched loader.
bpe_dataset = LanguageDataset(bpe_indexed_sequences)
bpe_data_loader = DataLoader(bpe_dataset, batch_size=batch_size, shuffle=True)






# Perceptron

In [9]:
class PerceptronLanguageModel(nn.Module):
    """Position-wise feedforward language model.

    Each token is embedded and passed through a two-layer MLP independently
    of its neighbours, so the logits at a position depend only on the token
    at that position.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """
        Create the embedding table and the two linear layers.

        Args:
            vocab_size (int): Number of entries in the vocabulary.
            embedding_dim (int): Size of each token embedding.
            hidden_dim (int): Width of the hidden layer.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """
        Compute next-token logits for every position.

        Args:
            x (torch.Tensor): Token indices of shape (batch_size, sequence_length).

        Returns:
            torch.Tensor: Logits of shape (batch_size, sequence_length, vocab_size).
        """
        vectors = self.embedding(x)
        n_rows, n_steps, _ = vectors.shape
        # Collapse batch and time so the MLP sees one row per token.
        flat = vectors.view(n_rows * n_steps, -1)
        activated = self.fc1(flat).relu()
        logits = self.fc2(activated)
        # Restore the (batch, time, vocab) layout for the caller.
        return logits.view(n_rows, n_steps, -1)

In [10]:
def train_model_perceptron(model, data_loader, criterion, optimizer, num_epochs, device):
    """Train a stateless (feedforward) language model on next-token prediction.

    Each batch is split into inputs (all tokens but the last) and targets
    (all tokens but the first). The number of classes is taken from the
    model's logits, so the function does not depend on any notebook-level
    vocabulary-size variable.

    Args:
        model (nn.Module): Model mapping (batch, seq) indices to (batch, seq, classes) logits.
        data_loader: Iterable of 2-D index tensors, one row per window.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs (int): Number of passes over ``data_loader``.
        device (torch.device): Device the model and batches are moved to.

    Returns:
        float: Wall-clock training time in seconds, summed over all epochs.
    """
    model.to(device)
    total_training_time = 0.0
    for epoch in range(num_epochs):
        model.train()
        start_time = time.time()
        total_loss = 0.0
        for batch in data_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            inputs = batch[:, :-1]
            targets = batch[:, 1:].reshape(-1)  # one target per input position

            outputs = model(inputs)
            # Flatten to (batch * seq, classes); the class count comes from the logits.
            outputs = outputs.reshape(-1, outputs.size(-1))
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        epoch_time = time.time() - start_time
        total_training_time += epoch_time
        avg_loss = total_loss / len(data_loader)
        print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}, Time: {epoch_time:.2f}s')
    return total_training_time

In [11]:
def generate_sentence_perceptron(model, start_sequence, vocab, reverse_vocab, max_length=20):
    """
    Sample a sentence from a stateless language model.

    The prompt is converted to indices and extended one sampled token at a
    time until ``max_length`` tokens are reached or ``'<END>'`` is sampled.

    Args:
        model (nn.Module): Trained model mapping (1, seq) indices to (1, seq, classes) logits.
        start_sequence (list of str): Prompt words; each must be a key of ``vocab``.
        vocab (dict): Word-to-index mapping.
        reverse_vocab (dict): Index-to-word mapping.
        max_length (int): Maximum number of tokens in the result, prompt included.

    Returns:
        str: Prompt and sampled words joined by single spaces.
    """
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    token_ids = [vocab[word] for word in start_sequence]

    steps_left = max_length - len(start_sequence)
    while steps_left > 0:
        steps_left -= 1
        context = torch.tensor([token_ids], dtype=torch.long).to(device)
        with torch.no_grad():
            logits = model(context)

        # Only the logits at the final position decide the next token.
        probs = F.softmax(logits[0, -1, :], dim=-1)
        sampled = torch.multinomial(probs, 1).item()
        token_ids.append(sampled)

        if reverse_vocab[sampled] == '<END>':
            break

    return ' '.join(reverse_vocab[idx] for idx in token_ids)

# RNN

In [12]:
class RNNLanguageModel(nn.Module):
    """Language model built from an embedding, an ``nn.RNN`` and a linear head."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        """
        Create the embedding, recurrent and output layers.

        Args:
            vocab_size (int): Number of entries in the vocabulary.
            embedding_dim (int): Size of each token embedding.
            hidden_dim (int): Size of the recurrent hidden state.
            num_layers (int): Number of stacked recurrent layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """
        Run the model over a batch of index sequences.

        Args:
            x (torch.Tensor): Token indices of shape (batch_size, sequence_length).
            hidden (torch.Tensor): Initial recurrent state.

        Returns:
            torch.Tensor: Logits of shape (batch_size, sequence_length, vocab_size).
            torch.Tensor: Recurrent state after the last step.
        """
        states, hidden = self.rnn(self.embedding(x), hidden)
        return self.fc(states), hidden

    def init_hidden(self, batch_size):
        """
        Build an all-zero initial state.

        Args:
            batch_size (int): Number of sequences in the batch.

        Returns:
            torch.Tensor: Zeros of shape (num_layers, batch_size, hidden_size)
            with the dtype and device of the model's first parameter.
        """
        template = next(self.parameters()).data
        shape = (self.rnn.num_layers, batch_size, self.rnn.hidden_size)
        return torch.zeros(shape, dtype=template.dtype, device=template.device)



def train_model_RNN(model, data_loader, criterion, optimizer, num_epochs, device):
    """Train a recurrent language model on next-token prediction.

    A fresh hidden state is created for every batch. The number of classes
    is read from the model's logits, so the function does not depend on any
    notebook-level vocabulary-size variable.

    Args:
        model (nn.Module): Model exposing ``init_hidden(batch_size)`` and
            ``forward(inputs, hidden) -> (logits, hidden)``.
        data_loader: Iterable of 2-D index tensors, one row per window.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs (int): Number of passes over ``data_loader``.
        device (torch.device): Device the model and batches are moved to.

    Returns:
        float: Wall-clock training time in seconds, summed over all epochs.
    """
    model.to(device)
    total_training_time = 0.0
    for epoch in range(num_epochs):
        model.train()
        start_time = time.time()
        total_loss = 0.0
        for batch in data_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            inputs = batch[:, :-1]
            targets = batch[:, 1:].reshape(-1)  # one target per input position

            # Windows are independent samples, so every batch starts from a zero state.
            hidden = model.init_hidden(inputs.size(0)).to(device)

            outputs, hidden = model(inputs, hidden)
            # Flatten to (batch * seq, classes); the class count comes from the logits.
            outputs = outputs.reshape(-1, outputs.size(-1))
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        epoch_time = time.time() - start_time
        total_training_time += epoch_time
        avg_loss = total_loss / len(data_loader)
        print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')

    return total_training_time

def generate_sentence_RNN(model, start_sequence, vocab, reverse_vocab, max_length=20):
    """
    Sample a sentence from a recurrent language model.

    The prompt is fed through the model once; after that only the newly
    sampled token is fed, because the carried hidden state already encodes
    everything seen so far. Feeding the whole sequence again together with
    the carried state would make the model process the prefix repeatedly.

    Args:
        model (nn.Module): Trained model exposing ``init_hidden(batch_size)``
            and ``forward(inputs, hidden) -> (logits, hidden)``.
        start_sequence (list of str): Prompt words; each must be a key of ``vocab``.
        vocab (dict): Word-to-index mapping.
        reverse_vocab (dict): Index-to-word mapping.
        max_length (int): Maximum number of tokens in the result, prompt included.

    Returns:
        str: Prompt and sampled words joined by single spaces.
    """
    model.eval()  # Set the model to evaluation mode
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Convert the start_sequence to indices
    sequence = [vocab[word] for word in start_sequence]

    # Initialize hidden state
    hidden = model.init_hidden(1).to(device)

    # Tokens the model has not consumed yet: the whole prompt at first,
    # then just the most recently sampled token.
    pending = list(sequence)
    for _ in range(max_length - len(start_sequence)):
        inputs = torch.tensor([pending], dtype=torch.long, device=device)
        with torch.no_grad():
            outputs, hidden = model(inputs, hidden)

        # Distribution over the next token comes from the last position.
        next_token_probs = torch.softmax(outputs[0, -1, :], dim=-1)
        next_token = torch.multinomial(next_token_probs, 1).item()

        sequence.append(next_token)
        pending = [next_token]

        # Stop if the end token is generated (optional)
        if reverse_vocab[next_token] == '<END>':
            break

    # Convert indices back to words
    return ' '.join(reverse_vocab[idx] for idx in sequence)

# LSTM

## Modelo

In [13]:
class LSTMLanguageModel(nn.Module):
    """Language model built from an embedding, an ``nn.LSTM`` and a linear head."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        """
        Create the embedding, recurrent and output layers.

        Args:
            vocab_size (int): Number of entries in the vocabulary.
            embedding_dim (int): Size of each token embedding.
            hidden_dim (int): Size of the LSTM hidden and cell states.
            num_layers (int): Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """
        Run the model over a batch of index sequences.

        Args:
            x (torch.Tensor): Token indices of shape (batch_size, sequence_length).
            hidden (tuple): Initial ``(hidden_state, cell_state)`` pair.

        Returns:
            torch.Tensor: Logits of shape (batch_size, sequence_length, vocab_size).
            tuple: ``(hidden_state, cell_state)`` after the last step.
        """
        states, hidden = self.rnn(self.embedding(x), hidden)
        return self.fc(states), hidden

    def init_hidden(self, batch_size):
        """
        Build all-zero initial hidden and cell states.

        Args:
            batch_size (int): Number of sequences in the batch.

        Returns:
            tuple: Two zero tensors of shape (num_layers, batch_size, hidden_size)
            with the dtype and device of the model's first parameter.
        """
        template = next(self.parameters()).data
        shape = (self.rnn.num_layers, batch_size, self.rnn.hidden_size)
        h0 = torch.zeros(shape, dtype=template.dtype, device=template.device)
        c0 = torch.zeros(shape, dtype=template.dtype, device=template.device)
        return (h0, c0)

## Standard Tokenizer

In [14]:
def train_model_st_lstm(model, data_loader, criterion, optimizer, num_epochs, device):
    """Train an LSTM language model on next-token prediction.

    A fresh ``(hidden, cell)`` state is created for every batch. The number
    of classes is read from the model's logits, so the function does not
    depend on any notebook-level vocabulary-size variable.

    Args:
        model (nn.Module): Model exposing ``init_hidden(batch_size)`` (returning
            a pair of tensors) and ``forward(inputs, hidden) -> (logits, hidden)``.
        data_loader: Iterable of 2-D index tensors, one row per window.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs (int): Number of passes over ``data_loader``.
        device (torch.device): Device the model and batches are moved to.

    Returns:
        float: Wall-clock training time in seconds, summed over all epochs.
    """
    model.to(device)
    total_training_time = 0.0
    for epoch in range(num_epochs):
        model.train()
        start_time = time.time()
        total_loss = 0.0
        for batch in data_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            inputs = batch[:, :-1]
            targets = batch[:, 1:].reshape(-1)  # one target per input position

            # Windows are independent samples, so every batch starts from zero states.
            hidden = model.init_hidden(inputs.size(0))
            hidden = (hidden[0].to(device), hidden[1].to(device))

            outputs, hidden = model(inputs, hidden)
            # Flatten to (batch * seq, classes); the class count comes from the logits.
            outputs = outputs.reshape(-1, outputs.size(-1))
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        epoch_time = time.time() - start_time
        total_training_time += epoch_time
        avg_loss = total_loss / len(data_loader)
        print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')

    return total_training_time

def generate_sentence_st_lstm(model, start_sequence, vocab, reverse_vocab, max_length=20):
    """
    Sample a sentence from an LSTM language model (word-level vocabulary).

    The prompt is fed through the model once; after that only the newly
    sampled token is fed, because the carried ``(hidden, cell)`` state
    already encodes everything seen so far. Feeding the whole sequence again
    together with the carried state would make the model process the prefix
    repeatedly.

    Args:
        model (nn.Module): Trained model exposing ``init_hidden(batch_size)``
            (returning a pair of tensors) and
            ``forward(inputs, hidden) -> (logits, hidden)``.
        start_sequence (list of str): Prompt words; each must be a key of ``vocab``.
        vocab (dict): Word-to-index mapping.
        reverse_vocab (dict): Index-to-word mapping.
        max_length (int): Maximum number of tokens in the result, prompt included.

    Returns:
        str: Prompt and sampled words joined by single spaces.
    """
    model.eval()  # Set the model to evaluation mode
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Convert the start_sequence to indices
    sequence = [vocab[word] for word in start_sequence]

    # Initialize hidden state
    hidden = model.init_hidden(1)
    hidden = (hidden[0].to(device), hidden[1].to(device))

    # Tokens the model has not consumed yet: the whole prompt at first,
    # then just the most recently sampled token.
    pending = list(sequence)
    for _ in range(max_length - len(start_sequence)):
        inputs = torch.tensor([pending], dtype=torch.long, device=device)
        with torch.no_grad():
            outputs, hidden = model(inputs, hidden)

        # Distribution over the next token comes from the last position.
        next_token_probs = torch.softmax(outputs[0, -1, :], dim=-1)
        next_token = torch.multinomial(next_token_probs, 1).item()

        sequence.append(next_token)
        pending = [next_token]

        # Stop if the end token is generated (optional)
        if reverse_vocab[next_token] == '<END>':
            break

    # Convert indices back to words
    return ' '.join(reverse_vocab[idx] for idx in sequence)

## BPE Tokenizer

In [15]:
def train_model_bpe_lstm(model, data_loader, criterion, optimizer, num_epochs, device):
    """Train an LSTM language model on next-token prediction (BPE-indexed data).

    A fresh ``(hidden, cell)`` state is created for every batch. The number
    of classes is read from the model's logits, so the function does not
    depend on any notebook-level vocabulary-size variable.

    Args:
        model (nn.Module): Model exposing ``init_hidden(batch_size)`` (returning
            a pair of tensors) and ``forward(inputs, hidden) -> (logits, hidden)``.
        data_loader: Iterable of 2-D index tensors, one row per window.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs (int): Number of passes over ``data_loader``.
        device (torch.device): Device the model and batches are moved to.

    Returns:
        float: Wall-clock training time in seconds, summed over all epochs.
    """
    model.to(device)
    total_training_time = 0.0
    for epoch in range(num_epochs):
        model.train()
        start_time = time.time()
        total_loss = 0.0
        for batch in data_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            inputs = batch[:, :-1]
            targets = batch[:, 1:].reshape(-1)  # one target per input position

            # Windows are independent samples, so every batch starts from zero states.
            hidden = model.init_hidden(inputs.size(0))
            hidden = (hidden[0].to(device), hidden[1].to(device))

            outputs, hidden = model(inputs, hidden)
            # Flatten to (batch * seq, classes); the class count comes from the logits.
            outputs = outputs.reshape(-1, outputs.size(-1))
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        epoch_time = time.time() - start_time
        total_training_time += epoch_time
        avg_loss = total_loss / len(data_loader)
        print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')

    return total_training_time

def generate_sentence_bpe_lstm(model, start_sequence, tokenizer, vocab, max_length=20):
    """
    Sample a sentence from a recurrent language model trained on BPE tokens.

    Three things differ from a naive implementation:

    * Tokens are converted to and from model indices through ``vocab``, the
      same mapping used to index the training windows, instead of the
      tokenizer's internal ids. The model therefore always sees indices
      from the space it was trained on.
    * The prompt is fed through the model once; afterwards only the newly
      sampled token is fed, because the carried hidden state already encodes
      the history.
    * The result is the sampled tokens themselves joined by spaces, not a
      re-tokenization of decoded text. Sub-word pieces appear as separate
      space-delimited items.

    Args:
        model (nn.Module): Trained model exposing ``init_hidden(batch_size)``
            and ``forward(inputs, hidden) -> (logits, hidden)``.
        start_sequence (list of str): Prompt words.
        tokenizer (Tokenizer): Tokenizer whose ``encode(text)`` result exposes
            a ``tokens`` list.
        vocab (dict): Token-to-index mapping the model was trained with.
        max_length (int): Maximum number of tokens in the result, prompt included.

    Returns:
        str: Prompt and sampled tokens joined by single spaces, with the
        special tokens ``<PAD>``, ``<UNK>`` and ``<END>`` left out.

    Raises:
        KeyError: If a prompt token is missing from ``vocab`` and ``vocab``
            has no ``'<UNK>'`` entry.
    """
    model.eval()  # Set the model to evaluation mode
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    unk_index = vocab.get('<UNK>')
    end_index = vocab.get('<END>')

    # Tokenize the prompt, then map token strings through the training vocabulary.
    start_tokens = []
    for token in tokenizer.encode(' '.join(start_sequence)).tokens:
        index = vocab.get(token, unk_index)
        if index is None:
            raise KeyError(token)
        start_tokens.append(index)

    # Initialize hidden state (a tuple for LSTM-style models, a tensor otherwise).
    hidden = model.init_hidden(1)
    if isinstance(hidden, tuple):
        hidden = tuple(h.to(device) for h in hidden)
    else:
        hidden = hidden.to(device)

    # Tokens the model has not consumed yet: the whole prompt at first,
    # then just the most recently sampled token.
    generated_tokens = list(start_tokens)
    pending = list(start_tokens)
    for _ in range(max_length - len(start_tokens)):
        inputs = torch.tensor([pending], dtype=torch.long, device=device)
        with torch.no_grad():
            outputs, hidden = model(inputs, hidden)

        # Distribution over the next token comes from the last position.
        next_token_probs = torch.softmax(outputs[0, -1, :], dim=-1)
        next_token = torch.multinomial(next_token_probs, 1).item()

        generated_tokens.append(next_token)
        pending = [next_token]

        # Stop if the end token is generated (optional)
        if end_index is not None and next_token == end_index:
            break

    # Map indices back to token strings and drop the special tokens.
    index_to_token = {index: token for token, index in vocab.items()}
    special_tokens = {'<PAD>', '<UNK>', '<END>'}
    pieces = [index_to_token[index] for index in generated_tokens]
    return ' '.join(piece for piece in pieces if piece not in special_tokens)


# GRU

In [16]:
class GRULanguageModel(nn.Module):
    """Language model built from an embedding, an ``nn.GRU`` and a linear head."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        """
        Create the embedding, recurrent and output layers.

        Args:
            vocab_size (int): Number of entries in the vocabulary.
            embedding_dim (int): Size of each token embedding.
            hidden_dim (int): Size of the GRU hidden state.
            num_layers (int): Number of stacked GRU layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """
        Run the model over a batch of index sequences.

        Args:
            x (torch.Tensor): Token indices of shape (batch_size, sequence_length).
            hidden (torch.Tensor): Initial recurrent state.

        Returns:
            torch.Tensor: Logits of shape (batch_size, sequence_length, vocab_size).
            torch.Tensor: Recurrent state after the last step.
        """
        states, hidden = self.rnn(self.embedding(x), hidden)
        return self.fc(states), hidden

    def init_hidden(self, batch_size):
        """
        Build an all-zero initial state.

        Args:
            batch_size (int): Number of sequences in the batch.

        Returns:
            torch.Tensor: Zeros of shape (num_layers, batch_size, hidden_size)
            with the dtype and device of the model's first parameter.
        """
        template = next(self.parameters()).data
        shape = (self.rnn.num_layers, batch_size, self.rnn.hidden_size)
        return torch.zeros(shape, dtype=template.dtype, device=template.device)

def pad_sequences(sequences, max_length):
    """Right-pad every list with zeros up to ``max_length`` items.

    Lists that already have ``max_length`` or more items come back with the
    same contents (nothing is truncated). The inputs are not modified.

    Args:
        sequences (list[list[int]]): Index lists to pad.
        max_length (int): Target length.

    Returns:
        list[list[int]]: New lists, one per input list.
    """
    padded = []
    for seq in sequences:
        missing = max_length - len(seq)
        # A non-positive repeat count yields an empty list, so long inputs pass through.
        padded.append(seq + [0] * missing)
    return padded

def train_model_gru(model, data_loader, criterion, optimizer, num_epochs, device):
    """Train a GRU language model on next-token prediction.

    A fresh hidden state is created for every batch. The number of classes
    is read from the model's logits, so the function does not depend on any
    notebook-level vocabulary-size variable.

    Args:
        model (nn.Module): Model exposing ``init_hidden(batch_size)`` and
            ``forward(inputs, hidden) -> (logits, hidden)``.
        data_loader: Iterable of 2-D index tensors, one row per window.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs (int): Number of passes over ``data_loader``.
        device (torch.device): Device the model and batches are moved to.

    Returns:
        float: Wall-clock training time in seconds, summed over all epochs.
    """
    model.to(device)
    total_training_time = 0.0
    for epoch in range(num_epochs):
        model.train()
        start_time = time.time()
        total_loss = 0.0
        for batch in data_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            inputs = batch[:, :-1]
            targets = batch[:, 1:].reshape(-1)  # one target per input position
            # Windows are independent samples, so every batch starts from a zero state.
            hidden = model.init_hidden(inputs.size(0)).to(device)
            outputs, hidden = model(inputs, hidden)
            # Flatten to (batch * seq, classes); the class count comes from the logits.
            outputs = outputs.reshape(-1, outputs.size(-1))
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        epoch_time = time.time() - start_time
        total_training_time += epoch_time
        avg_loss = total_loss / len(data_loader)
        print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')

    return total_training_time

def generate_sentence_gru(model, start_sequence, vocab, reverse_vocab, max_length=20):
    """
    Sample a sentence from a GRU language model.

    The prompt is fed through the model once; after that only the newly
    sampled token is fed, because the carried hidden state already encodes
    everything seen so far. Feeding the whole sequence again together with
    the carried state would make the model process the prefix repeatedly.

    Args:
        model (nn.Module): Trained model exposing ``init_hidden(batch_size)``
            and ``forward(inputs, hidden) -> (logits, hidden)``.
        start_sequence (list of str): Prompt words; each must be a key of ``vocab``.
        vocab (dict): Word-to-index mapping.
        reverse_vocab (dict): Index-to-word mapping.
        max_length (int): Maximum number of tokens in the result, prompt included.

    Returns:
        str: Prompt and sampled words joined by single spaces.
    """
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    sequence = [vocab[word] for word in start_sequence]
    hidden = model.init_hidden(1).to(device)

    # Tokens the model has not consumed yet: the whole prompt at first,
    # then just the most recently sampled token.
    pending = list(sequence)
    for _ in range(max_length - len(start_sequence)):
        inputs = torch.tensor([pending], dtype=torch.long, device=device)
        with torch.no_grad():
            outputs, hidden = model(inputs, hidden)
        # Distribution over the next token comes from the last position.
        next_token_probs = torch.softmax(outputs[0, -1, :], dim=-1)
        next_token = torch.multinomial(next_token_probs, 1).item()
        sequence.append(next_token)
        pending = [next_token]
        if reverse_vocab[next_token] == '<END>':
            break

    return ' '.join(reverse_vocab[idx] for idx in sequence)

# Model quality (perplexity)

In [17]:
def calculate_perplexity(model, data_loader, criterion, device):
    """Compute the perplexity of ``model`` over ``data_loader``.

    Perplexity is ``exp`` of the token-weighted average of the per-batch
    loss. Models that expose ``init_hidden`` are treated as recurrent and
    called with a fresh hidden state; all others are called with the inputs
    only. The number of classes is read from the logits.

    NOTE(review): a later cell defines ``calculate_perplexity`` again with an
    extra ``vocab_size`` parameter; that definition replaces this one once
    both cells have run.

    Args:
        model (nn.Module): Language model to evaluate.
        data_loader: Iterable of 2-D index tensors, one row per window.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``; the
            weighting below assumes it returns a mean over the targets.
        device (torch.device): Device the model and batches are moved to.

    Returns:
        float: The perplexity.
    """
    model.eval()
    model.to(device)
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in data_loader:
            batch = batch.to(device)
            inputs = batch[:, :-1]
            targets = batch[:, 1:].reshape(-1)

            if hasattr(model, 'init_hidden'):
                hidden = model.init_hidden(inputs.size(0))
                # LSTM-style models return a tuple of states, others a single tensor.
                if isinstance(hidden, tuple):
                    hidden = tuple(h.to(device) for h in hidden)
                else:
                    hidden = hidden.to(device)
                outputs, _ = model(inputs, hidden)
            else:
                outputs = model(inputs)

            outputs = outputs.reshape(-1, outputs.size(-1))
            loss = criterion(outputs, targets)

            # Undo the per-batch mean so batches of different size weigh correctly.
            total_loss += loss.item() * targets.size(0)
            total_tokens += targets.size(0)

    average_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(average_loss)).item()
    return perplexity

In [18]:
def calculate_perplexity(model, data_loader, criterion, device, vocab_size=None):
    """Compute the perplexity of ``model`` over ``data_loader``.

    Perplexity is ``exp`` of the token-weighted average of the per-batch
    loss. Models that expose ``init_hidden`` are treated as recurrent and
    called with a fresh hidden state (a tensor, or a tuple of tensors for
    LSTM-style models); all others are called with the inputs only.

    Args:
        model (nn.Module): Language model to evaluate.
        data_loader: Iterable of 2-D index tensors, one row per window.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``; the
            weighting below assumes it returns a mean over the targets.
        device (torch.device): Device the model and batches are moved to.
        vocab_size (int, optional): Number of classes used to flatten the
            logits. When omitted it is read from the logits' last dimension.

    Returns:
        float: The perplexity.
    """
    model.eval()
    model.to(device)
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in data_loader:
            batch = batch.to(device)
            inputs = batch[:, :-1]
            targets = batch[:, 1:].reshape(-1)

            # Duck-typing on init_hidden covers every recurrent model without
            # listing concrete classes.
            if hasattr(model, 'init_hidden'):
                hidden = model.init_hidden(inputs.size(0))
                if isinstance(hidden, tuple):
                    hidden = tuple(h.to(device) for h in hidden)
                else:
                    hidden = hidden.to(device)
                outputs, _ = model(inputs, hidden)
            else:
                outputs = model(inputs)

            n_classes = outputs.size(-1) if vocab_size is None else vocab_size
            outputs = outputs.reshape(-1, n_classes)
            loss = criterion(outputs, targets)

            # Undo the per-batch mean so batches of different size weigh correctly.
            total_loss += loss.item() * targets.size(0)
            total_tokens += targets.size(0)

    average_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(average_loss)).item()
    return perplexity


# Train and execute

In [19]:
# Determine the device to be used (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Init models: four on the word-level vocabulary, one LSTM on the BPE vocabulary.
# NOTE(review): no random seed is set in the visible cells, so weights, shuffling
# and the resulting numbers differ between runs.
perceptron_model = PerceptronLanguageModel(st_vocab_size, embedding_dim, hidden_dim)
rnn_model = RNNLanguageModel(st_vocab_size, embedding_dim, hidden_dim)
st_lstm_model = LSTMLanguageModel(st_vocab_size, embedding_dim, hidden_dim)
bpe_lstm_model = LSTMLanguageModel(bpe_vocab_size, embedding_dim, hidden_dim)
gru_model = GRULanguageModel(st_vocab_size, embedding_dim, hidden_dim, num_layers=1)

# One shared loss and one Adam optimizer per model, all with the same learning rate.
criterion = nn.CrossEntropyLoss()
perceptron_optimizer = optim.Adam(perceptron_model.parameters(), lr=learning_rate)
rnn_optimizer = optim.Adam(rnn_model.parameters(), lr=learning_rate)
st_lstm_optimizer = optim.Adam(st_lstm_model.parameters(), lr=learning_rate)
bpe_lstm_optimizer = optim.Adam(bpe_lstm_model.parameters(), lr=learning_rate)
gru_optimizer = optim.Adam(gru_model.parameters(), lr=learning_rate)

# Train models
# NOTE(review): each perplexity below is computed on the same loader the model was
# trained on, so it measures fit to the training data, not held-out quality.
print("Training perceptron model...")
perceptron_training_time = train_model_perceptron(perceptron_model, st_data_loader, criterion, perceptron_optimizer, num_epochs, device)
perceptron_perplexity = calculate_perplexity(perceptron_model, st_data_loader, criterion, device, st_vocab_size)
print(f"Perceptron Model - Training Time: {perceptron_training_time:.2f}s, Perplexity: {perceptron_perplexity:.2f}")

print("Training RNN model...")
rnn_training_time = train_model_RNN(rnn_model, st_data_loader, criterion, rnn_optimizer, num_epochs, device)
rnn_perplexity = calculate_perplexity(rnn_model, st_data_loader, criterion, device, st_vocab_size)
print(f"RNN Model - Training Time: {rnn_training_time:.2f}s, Perplexity: {rnn_perplexity:.2f}")

print("Training LSTM model...")
st_lstm_training_time = train_model_st_lstm(st_lstm_model, st_data_loader, criterion, st_lstm_optimizer, num_epochs, device)
st_lstm_perplexity = calculate_perplexity(st_lstm_model, st_data_loader, criterion, device, st_vocab_size)
print(f"LSTM Model - Training Time: {st_lstm_training_time:.2f}s, Perplexity: {st_lstm_perplexity:.2f}")

# NOTE(review): this perplexity is per BPE token while the others are per word,
# so the values are presumably not directly comparable - confirm before ranking models.
print("Training LSTM model with BPE Tokenizer...")
bpe_lstm_training_time = train_model_bpe_lstm(bpe_lstm_model, bpe_data_loader, criterion, bpe_lstm_optimizer, num_epochs, device)
bpe_lstm_perplexity = calculate_perplexity(bpe_lstm_model, bpe_data_loader, criterion, device, bpe_vocab_size)
print(f"LSTM Model BPE Tokenizer - Training Time: {bpe_lstm_training_time:.2f}s, Perplexity: {bpe_lstm_perplexity:.2f}")

print("Training GRU model...")
gru_training_time= train_model_gru(gru_model, st_data_loader, criterion, gru_optimizer, num_epochs, device)
gru_perplexity = calculate_perplexity(gru_model, st_data_loader, criterion, device, st_vocab_size)
print(f"GRU Model - Training Time: {gru_training_time:.2f}s, Perplexity: {gru_perplexity:.2f}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Training perceptron model...
Epoch 1, Average Loss: 5.0584, Time: 15.46s
Epoch 2, Average Loss: 4.6170, Time: 15.43s
Epoch 3, Average Loss: 4.5067, Time: 15.39s
Epoch 4, Average Loss: 4.4581, Time: 15.35s
Epoch 5, Average Loss: 4.4327, Time: 15.31s
Epoch 6, Average Loss: 4.4169, Time: 15.13s
Epoch 7, Average Loss: 4.4064, Time: 15.20s
Epoch 8, Average Loss: 4.3985, Time: 15.21s
Epoch 9, Average Loss: 4.3927, Time: 15.29s
Epoch 10, Average Loss: 4.3880, Time: 15.22s
Perceptron Model - Training Time: 152.99s, Perplexity: 78.22
Training RNN model...
Epoch 1, Average Loss: 4.8614
Epoch 2, Average Loss: 3.9672
Epoch 3, Average Loss: 3.6250
Epoch 4, Average Loss: 3.4204
Epoch 5, Average Loss: 3.2819
Epoch 6, Average Loss: 3.1819
Epoch 7, Average Loss: 3.1054
Epoch 8, Average Loss: 3.0458
Epoch 9, Average Loss: 2.9972
Epoch 10, Average Loss: 2.9574
RNN Model - Training Time: 172.80s, Perplexity: 17.64
Training LSTM model...
Epoch 1, Average Loss: 4.9459
Epoch 2, Average Loss: 3.9084
Epoch 3, 

In [20]:
# Recap: total training time (seconds) and perplexity for each trained model,
# using the values computed in the training cell above.
print("Perceptron time: ", perceptron_training_time, "Perceptron perplexity: ", perceptron_perplexity)
print("RNN time: ", rnn_training_time, "RNN perplexity: ", rnn_perplexity)
print("LSTM time: ", st_lstm_training_time, "LSTM perplexity: ", st_lstm_perplexity)
print("BPE LSTM time: ", bpe_lstm_training_time, "BPE LSTM perplexity: ", bpe_lstm_perplexity)
print("GRU time: ", gru_training_time, "GRU perplexity: ", gru_perplexity)

Perceptron time:  152.98582983016968 Perceptron perplexity:  78.21942901611328
RNN time:  172.80421090126038 RNN perplexity:  17.63882827758789
LSTM time:  185.8695924282074 LSTM perplexity:  11.535888671875
BPE LSTM time:  156.26988101005554 BPE LSTM perplexity:  13.371254920959473
GRU time:  184.4695906639099 GRU perplexity:  13.15131664276123


In [None]:
# Prompt shared by every model. The word-level generators look each word up
# in st_vocab directly, so a word missing from the corpus raises KeyError.
start_sequence = ['this', 'will', 'be']

# Invert the word-level vocabulary once instead of rebuilding it for every call.
st_reverse_vocab = {index: word for word, index in st_vocab.items()}

# Tokens are sampled with torch.multinomial, so the sentences differ between runs.
generated_sentence_perceptron = generate_sentence_perceptron(perceptron_model, start_sequence, st_vocab, st_reverse_vocab)
print(f"Perceptron: {generated_sentence_perceptron}")

generated_sentence_RNN = generate_sentence_RNN(rnn_model, start_sequence, st_vocab, st_reverse_vocab)
print(f"RNN: {generated_sentence_RNN}")

generated_sentence_st_lstm = generate_sentence_st_lstm(st_lstm_model, start_sequence, st_vocab, st_reverse_vocab)
print(f"LSTM: {generated_sentence_st_lstm}")

generated_sentence_bpe_lstm = generate_sentence_bpe_lstm(bpe_lstm_model, start_sequence, tokenizer, bpe_vocab)
print(f"LSTM BPE: {generated_sentence_bpe_lstm}")

generated_sentence_gru = generate_sentence_gru(gru_model, start_sequence, st_vocab, st_reverse_vocab)
print(f"GRU: {generated_sentence_gru}")

Perceptron: this will be good they are the fire and foe to the shire against the master he did well youve
RNN: this will be the last disaster by the ground and its there were by many springs but they have seized
LSTM: this will be the silent town and night came forward to him again and with him went landroval and meneldor
LSTM BPE: this will be apart judge wreck inquisit judge ot studied bler leaps game preci pil ished attentively sway icing elrohir zing
GRU: this will be laid to thought if he won it for a time he thought the mournful howling of wolves
