<a href="https://colab.research.google.com/github/TimotheeeNiven/IntroML_TNiven/blob/main/Homework5_TNiven_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, IterableDataset, TensorDataset
import torch.utils.data as data
import time
import requests
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F
from torch.optim.lr_scheduler import LambdaLR
import random
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Provided text sequence
# Training corpus for Problem 1: a short multi-paragraph English passage.
# The character vocabulary and every (sequence, next-character) training
# example built below are derived from this single string.
text = """Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text.

At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model.

One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks.

Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time.

Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants.

In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology."""

# Function to preprocess text into sequences and labels
def create_sequences_and_labels(text, max_length):
    """Slide a fixed-width window over ``text`` and pair it with what follows.

    Args:
        text: Source string to cut into training examples.
        max_length: Number of characters in every input window.

    Returns:
        A ``(sequences, labels)`` tuple of equal-length lists, where
        ``sequences[k]`` is ``text[k:k + max_length]`` and ``labels[k]`` is
        the single character immediately after that window.
    """
    window_starts = range(len(text) - max_length)
    sequences = [text[start:start + max_length] for start in window_starts]
    labels = [text[start + max_length] for start in window_starts]
    return sequences, labels

# Function to create vocabulary and mappings
def create_vocab_mappings(text):
    """Build the character vocabulary of ``text`` and its two lookup tables.

    Args:
        text: String whose distinct characters form the vocabulary.

    Returns:
        A ``(chars, ix_to_char, char_to_ix)`` tuple: the sorted list of
        distinct characters, a dict from index to character, and the inverse
        dict from character to index.
    """
    chars = sorted(set(text))
    ix_to_char = dict(enumerate(chars))
    char_to_ix = {symbol: position for position, symbol in ix_to_char.items()}
    return chars, ix_to_char, char_to_ix

# Function to convert sequences and labels to tensors
def convert_to_tensors(sequences, labels, char_to_ix):
    """Encode character sequences and labels as ``torch.long`` index tensors.

    Args:
        sequences: Iterable of strings (or other iterables of characters).
        labels: Iterable of single characters, one per sequence.
        char_to_ix: Mapping from character to integer index.

    Returns:
        ``(X, y)`` where ``X`` holds the encoded sequences row by row and
        ``y`` holds the encoded labels.

    Raises:
        KeyError: If a character is missing from ``char_to_ix``.
    """
    lookup = char_to_ix.__getitem__
    X = torch.tensor(
        [list(map(lookup, seq)) for seq in sequences],
        dtype=torch.long,
    )
    y = torch.tensor(list(map(lookup, labels)), dtype=torch.long)
    return X, y

# Prepare dataset for different sequence lengths
max_lengths = [10, 20, 30]
# Maps sequence length -> (X_train, X_val, y_train, y_val, vocab_size)
datasets = {}

# The vocabulary depends only on the text, not on the window length, so it is
# built once instead of being recomputed on every loop iteration.
chars, ix_to_char, char_to_ix = create_vocab_mappings(text)

for max_length in max_lengths:
    sequences, labels = create_sequences_and_labels(text, max_length)
    X, y = convert_to_tensors(sequences, labels, char_to_ix)
    # Fixed random_state keeps the 80/20 split reproducible across runs.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    datasets[max_length] = (X_train, X_val, y_train, y_val, len(chars))

# Define Transformer model
class CharTransformer(nn.Module):
    """Next-character classifier: embedding -> Transformer encoder -> linear.

    Args:
        input_size: Vocabulary size (number of rows in the embedding table).
        hidden_size: Embedding / model width (``d_model`` of the encoder).
        output_size: Number of output classes (logits per example).
        num_layers: Number of stacked encoder layers.
        nhead: Number of attention heads per encoder layer.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        # batch_first=True is required: forward() feeds (batch, seq, hidden)
        # tensors. With the default (batch_first=False) the encoder would
        # treat the batch axis as the sequence axis and attend across
        # unrelated samples instead of across the characters of one sample.
        encoder_layers = nn.TransformerEncoderLayer(hidden_size, nhead, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits for the character following each input sequence.

        Args:
            x: Integer tensor of character indices, shape ``(batch, seq_len)``.

        Returns:
            Float tensor of shape ``(batch, output_size)``.
        """
        embedded = self.embedding(x)
        transformer_output = self.transformer_encoder(embedded)
        # NOTE(review): no positional encoding is added to the embeddings, so
        # character order is visible only through the choice of the last
        # position below - consider adding one.
        output = self.fc(transformer_output[:, -1, :])  # Encoder output at the last sequence position
        return output

# Hyperparameters
hidden_size = 128
num_layers = 4
nhead = 2
learning_rate = 0.001
epochs = 50

# Train and validate models for different sequence lengths.
# Each epoch is a single full-batch gradient step followed by a full-batch
# validation pass; metrics are reported every fifth epoch.
for max_length, split in datasets.items():
    X_train, X_val, y_train, y_val, vocab_size = split
    print(f"Training and validating for sequence length {max_length}")

    # A fresh model, loss and optimizer per sequence length.
    model = CharTransformer(vocab_size, hidden_size, vocab_size, num_layers, nhead)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    start_time = time.time()
    for epoch in range(epochs):
        # Training step on the whole training split
        model.train()
        optimizer.zero_grad()
        train_logits = model(X_train)
        loss = criterion(train_logits, y_train)
        loss.backward()
        optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            val_logits = model(X_val)
            val_loss = criterion(val_logits, y_val)
            predicted = torch.max(val_logits, 1).indices
            val_accuracy = (predicted == y_val).float().mean()

        is_report_epoch = (epoch+1) % 5 == 0
        if is_report_epoch:
            print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Validation Accuracy: {val_accuracy.item()}')

    # Wall-clock time of the epoch loop (includes the validation passes)
    execution_time = time.time() - start_time
    print(f"Execution time for training: {execution_time} seconds")
    print("\n")


Training and validating for sequence length 10




Epoch 5, Loss: 3.0526015758514404, Validation Loss: 2.9355547428131104, Validation Accuracy: 0.14255765080451965
Epoch 10, Loss: 2.856675148010254, Validation Loss: 2.712320566177368, Validation Accuracy: 0.2159329205751419
Epoch 15, Loss: 2.5810840129852295, Validation Loss: 2.5576460361480713, Validation Accuracy: 0.2515723407268524
Epoch 20, Loss: 2.4728055000305176, Validation Loss: 2.5014548301696777, Validation Accuracy: 0.2725366950035095
Epoch 25, Loss: 2.3905084133148193, Validation Loss: 2.4713757038116455, Validation Accuracy: 0.2704402506351471
Epoch 30, Loss: 2.3448147773742676, Validation Loss: 2.441974639892578, Validation Accuracy: 0.26834380626678467
Epoch 35, Loss: 2.3021578788757324, Validation Loss: 2.4251315593719482, Validation Accuracy: 0.276729553937912
Epoch 40, Loss: 2.2780842781066895, Validation Loss: 2.409914255142212, Validation Accuracy: 0.2704402506351471
Epoch 45, Loss: 2.250457763671875, Validation Loss: 2.3948702812194824, Validation Accuracy: 0.26834

Problem 2

In [None]:
# Step 1: Download the dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# A timeout stops a stalled connection from hanging the cell, and
# raise_for_status() stops an HTTP error page from being silently used as
# the training corpus.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text  # This is the entire text data

# Step 2: Prepare the dataset
sequence_length = 20
# Create a character mapping to integers
chars = sorted(set(text))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}

# Encode the text into integers
encoded_text = [char_to_int[ch] for ch in text]

# Create sequences and targets.
# unfold() yields every length-`sequence_length` sliding window as a view of
# the encoded tensor, so no per-window Python list is materialised. The final
# window has no following character and is dropped; targets[i] is the
# character right after sequences[i].
encoded_tensor = torch.tensor(encoded_text, dtype=torch.long)
sequences = encoded_tensor.unfold(0, sequence_length, 1)[:-1]
targets = encoded_tensor[sequence_length:]

# Step 3: Create a dataset class
class CharDataset(Dataset):
    """Map-style dataset pairing each input sequence with its target.

    Args:
        sequences: Indexable collection of input sequences.
        targets: Indexable collection of targets, aligned with ``sequences``.
    """

    def __init__(self, sequences, targets):
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        """Return the number of examples (the length of ``sequences``)."""
        return len(self.sequences)

    def __getitem__(self, index):
        """Return the ``(sequence, target)`` pair stored at ``index``."""
        sample = self.sequences[index]
        label = self.targets[index]
        return sample, label

# Instantiate the dataset
dataset = CharDataset(sequences, targets)

# Step 4: Create data loaders
batch_size = 128
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
# Seed the split so the train/test partition is reproducible across runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# The training loop moves batches with non_blocking=True, which can only
# overlap the copy with compute when the host tensors are in pinned memory.
# Pinning is enabled only when CUDA is available.
pin_memory = torch.cuda.is_available()
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, pin_memory=pin_memory)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size, pin_memory=pin_memory)

In [None]:
class CharModel(nn.Module):
    """Next-character classifier: embedding -> Transformer encoder -> linear.

    Args:
        input_size: Vocabulary size (number of rows in the embedding table).
        hidden_size: Embedding / model width (``d_model`` of the encoder).
        output_size: Number of output classes (logits per example).
        model_type: Architecture selector; only ``'Transformer'`` is accepted.
        num_layers: Number of stacked encoder layers.
        num_heads: Number of attention heads per encoder layer.
        dim_feedforward: Width of the feed-forward sublayer in each layer.
        dropout: Dropout probability used inside the encoder layers.

    Raises:
        ValueError: If ``model_type`` is not ``'Transformer'``.
    """

    def __init__(self, input_size, hidden_size, output_size, model_type='Transformer', num_layers=2, num_heads=2, dim_feedforward=256, dropout=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        if model_type == 'Transformer':
            # batch_first=True is required: forward() feeds (batch, seq,
            # hidden) tensors. With the default (batch_first=False) the
            # encoder would treat the batch axis as the sequence axis and
            # attend across unrelated samples in the mini-batch.
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=hidden_size,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )
            self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        else:
            raise ValueError("Invalid model type. Choose 'Transformer'.")
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits for the character following each input sequence.

        Args:
            x: Integer tensor of character indices, shape ``(batch, seq_len)``.

        Returns:
            Float tensor of shape ``(batch, output_size)``.
        """
        embedded = self.embedding(x)
        transformer_output = self.transformer_encoder(embedded)
        # NOTE(review): no positional encoding is added to the embeddings, so
        # character order is visible only through the choice of the last
        # position below - consider adding one.
        output = self.fc(transformer_output[:, -1, :])
        return output

In [None]:
# Train and evaluate function
def train_evaluate(model_type, train_loader, val_loader, device):
    """Train a fresh ``CharModel`` and report its final-epoch metrics.

    Reads the module-level names ``chars`` (vocabulary), ``hidden_size``,
    ``learning_rate`` and ``epochs`` at call time.

    Args:
        model_type: Forwarded to ``CharModel`` as its ``model_type``.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        device: Device the model and every batch are moved to.

    Returns:
        ``(train_loss, val_loss, val_accuracy, execution_time)`` for the last
        epoch; losses are per-sample averages and ``execution_time`` is the
        wall-clock duration of the whole epoch loop in seconds. A metric is
        ``nan`` when no epoch ran or its loader produced no samples.
    """
    model = CharModel(len(chars), hidden_size, len(chars), model_type).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Defined up front so that epochs == 0 returns NaN metrics instead of
    # raising UnboundLocalError at the return statement.
    nan = float("nan")
    epoch_train_loss = epoch_val_loss = epoch_val_accuracy = nan

    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        train_seen = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device, non_blocking=True), targets.to(device, non_blocking=True)  # Move data to device
            optimizer.zero_grad()
            output = model(inputs)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()
            # criterion returns the batch mean; re-weight by batch size so the
            # epoch figure is a true per-sample average.
            train_loss += loss.item() * inputs.size(0)
            train_seen += inputs.size(0)

        # Normalise by the samples actually processed, which stays correct
        # with drop_last loaders and needs no len(dataset).
        epoch_train_loss = train_loss / train_seen if train_seen else nan

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device, non_blocking=True), targets.to(device, non_blocking=True)  # Move data to device
                val_output = model(inputs)
                loss = criterion(val_output, targets)
                val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(val_output, 1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()

        # Guard against an empty validation loader (previously a
        # ZeroDivisionError).
        epoch_val_loss = val_loss / total if total else nan
        epoch_val_accuracy = correct / total if total else nan

        print(f'Epoch {epoch+1}, Train Loss: {epoch_train_loss}, Validation Loss: {epoch_val_loss}, Validation Accuracy: {epoch_val_accuracy}')

    end_time = time.time()
    execution_time = end_time - start_time

    return epoch_train_loss, epoch_val_loss, epoch_val_accuracy, execution_time


In [None]:
maxlen_values = [20, 30, 50]
# Define parameters
hidden_size = 512
# NOTE: train_evaluate builds CharModel from (vocab size, hidden_size,
# vocab size, model_type) only, so the next four values are not forwarded to
# the model; they currently equal CharModel's own defaults.
num_layers = 2
num_heads = 2
dim_feedforward = 256
dropout = 0.1
learning_rate = 0.0001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 20


def _build_loaders_for_length(encoded, seq_len, batch_size, train_fraction=0.8, seed=42):
    """Build train/validation loaders of ``seq_len``-character windows.

    Args:
        encoded: Integer-encoded corpus (list or 1-D tensor of indices).
        seq_len: Number of characters in every input window.
        batch_size: Batch size for both loaders.
        train_fraction: Share of the examples assigned to the training split.
        seed: Seed for the random split, so every run uses the same partition.

    Returns:
        ``(train_loader, val_loader)``; only the training loader shuffles.

    Raises:
        ValueError: If ``seq_len`` is not positive or the corpus is too short
            to yield at least one (window, next-character) example.
    """
    encoded = torch.as_tensor(encoded, dtype=torch.long)
    if seq_len < 1 or encoded.numel() <= seq_len:
        raise ValueError(
            f"Cannot build windows of length {seq_len} from {encoded.numel()} tokens."
        )
    # Every sliding window as a view; the last one has no following
    # character, so it is dropped to stay aligned with the targets.
    windows = encoded.unfold(0, seq_len, 1)[:-1]
    next_tokens = encoded[seq_len:]
    full_dataset = TensorDataset(windows, next_tokens)

    n_train = int(len(full_dataset) * train_fraction)
    n_val = len(full_dataset) - n_train
    train_part, val_part = torch.utils.data.random_split(
        full_dataset, [n_train, n_val], generator=torch.Generator().manual_seed(seed)
    )
    return (
        DataLoader(train_part, shuffle=True, batch_size=batch_size),
        DataLoader(val_part, shuffle=False, batch_size=batch_size),
    )


# Train and evaluate models for different sequence lengths
results = {}
for maxlen in maxlen_values:
    print(f"\nTraining models for sequence length: {maxlen}")
    # Rebuild the data for this window length. Reusing the loaders built with
    # sequence_length = 20 would make every iteration the same experiment.
    maxlen_train_loader, maxlen_val_loader = _build_loaders_for_length(encoded_text, maxlen, batch_size)
    results[maxlen] = {}
    for model_type in ['Transformer']:
        print(f"\nTraining {model_type} model...")
        loss, val_loss, val_accuracy, execution_time = train_evaluate(model_type, maxlen_train_loader, maxlen_val_loader, device)
        results[maxlen][model_type] = {
            'loss': loss,
            'val_loss': val_loss,
            'val_accuracy': val_accuracy,
            'execution_time': execution_time
        }

# Print and compare results
for maxlen, models_data in results.items():
    print(f"\nResults for sequence length: {maxlen}")
    # Named `metrics` rather than `data` so the `torch.utils.data as data`
    # module alias imported at the top of the file is not shadowed.
    for model_type, metrics in models_data.items():
        print(f"\n{model_type} Model:")
        print(f"Training Loss: {metrics['loss']}")
        print(f"Validation Loss: {metrics['val_loss']}")
        print(f"Validation Accuracy: {metrics['val_accuracy']}")
        print(f"Execution Time: {metrics['execution_time']} seconds")



Training models for sequence length: 20

Training Transformer model...
Epoch 1, Train Loss: 2.511823572262447, Validation Loss: 2.4801008787083543, Validation Accuracy: 0.27047405581082595
Epoch 2, Train Loss: 2.4833527778428857, Validation Loss: 2.470777820613664, Validation Accuracy: 0.27020957077216184
Epoch 3, Train Loss: 2.478055991993177, Validation Loss: 2.4696921855142455, Validation Accuracy: 0.2701512944077104
Epoch 4, Train Loss: 2.4750124252402315, Validation Loss: 2.4670280606407604, Validation Accuracy: 0.2691157682393814
Epoch 5, Train Loss: 2.472941410287177, Validation Loss: 2.466399253943329, Validation Accuracy: 0.2695012888042138
Epoch 6, Train Loss: 2.4716557105294377, Validation Loss: 2.465298949567296, Validation Accuracy: 0.2681609324218312
Epoch 7, Train Loss: 2.4707366181172024, Validation Loss: 2.4655798448111383, Validation Accuracy: 0.26311778549815085
Epoch 8, Train Loss: 2.4699570494612284, Validation Loss: 2.4663895775701894, Validation Accuracy: 0.2702