# Positional Encoding Mechanism

In [1]:
# Write and run code here

class PositionalEncoder(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    The encoding table is computed once at construction time and stored as a
    buffer named ``pe`` with shape ``(1, max_length, d_model)``. Even columns
    hold sine values and odd columns hold cosine values.

    Args:
        d_model: Size of the embedding dimension.
        max_length: Number of sequence positions to precompute encodings for.
    """

    def __init__(self, d_model, max_length):
        super(PositionalEncoder, self).__init__()
        self.d_model = d_model
        self.max_length = max_length

        # Column vector of positions 0 .. max_length - 1, shape (max_length, 1).
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        # One frequency per sine/cosine pair: 10000 ** (-2i / d_model).
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float)
            * -(math.log(10000.0) / d_model)
        )
        # Broadcasts to (max_length, ceil(d_model / 2)).
        angles = position * div_term

        pe = torch.zeros(max_length, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency before assigning the cosines.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a buffer (not a parameter): it is part of the module
        # state but is not returned by parameters().
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` with the positional encodings added.

        Args:
            x: Embeddings whose dimension 1 is the sequence axis and whose
                last dimension is ``d_model``; ``x.size(1)`` must not exceed
                ``max_length`` for the addition to broadcast.

        Returns:
            A tensor of the same shape as ``x``.
        """
        # Slice the table to the sequence length of this batch.
        return x + self.pe[:, :x.size(1)]


- 	The code that translates the positional encoding mechanism into a usable format for the computer. 	 	

What are classes in Python?
- 	They define the properties (variables) and functionalities (methods) that objects of that class will have 		
- 	Unlike functions, classes don't directly perform actions. They define a structure for objects. 
- 	A class is more like a cookbook category, like "desserts." It defines what a dessert is in general (properties) and might include various recipes (methods) for different desserts (objects)

# Multi Head Attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are projected with separate linear layers, split
    into ``num_heads`` heads of size ``d_model // num_heads``, attended
    independently, recombined, and passed through an output projection.

    Args:
        d_model: Size of the embedding dimension.
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )

        self.num_heads = num_heads
        self.d_model = d_model
        self.head_dim = d_model // num_heads

        # Linear projections for queries, keys, values and the merged output.
        self.query_linear = nn.Linear(d_model, d_model)
        self.key_linear = nn.Linear(d_model, d_model)
        self.value_linear = nn.Linear(d_model, d_model)
        self.output_linear = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        """Reshape ``(batch, seq, d_model)`` to ``(batch * heads, seq, head_dim)``."""
        x = x.view(batch_size, -1, self.num_heads, self.head_dim)
        # Move the head axis next to the batch axis, then fold the two together.
        return x.permute(0, 2, 1, 3).contiguous().view(
            batch_size * self.num_heads, -1, self.head_dim
        )

    def combine_heads(self, x, batch_size):
        """Inverse of ``split_heads``: back to ``(batch, seq, d_model)``."""
        x = x.view(batch_size, self.num_heads, -1, self.head_dim)
        return x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)

    def compute_attention(self, query, key, mask=None):
        """Return softmax-normalized attention weights.

        Args:
            query: Tensor of shape ``(batch * heads, query_len, head_dim)``.
            key: Tensor of shape ``(batch * heads, key_len, head_dim)``.
            mask: Optional tensor broadcastable to
                ``(batch * heads, query_len, key_len)``; positions where the
                mask equals 0 are excluded from attention.

        Returns:
            Weights of shape ``(batch * heads, query_len, key_len)`` whose
            last dimension sums to 1.
        """
        # Transpose only the last two axes so every (batch, head) slice gets
        # its own query_len x key_len score matrix; scale by sqrt(head_dim)
        # to keep the logits in a range where softmax is not saturated.
        scores = torch.matmul(query, key.transpose(-2, -1)) / (self.head_dim ** 0.5)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-1e20"))

        return F.softmax(scores, dim=-1)

    def forward(self, query, key, value, mask=None):
        """Apply multi-head attention.

        Args:
            query: Tensor of shape ``(batch, query_len, d_model)``.
            key: Tensor of shape ``(batch, key_len, d_model)``.
            value: Tensor of shape ``(batch, key_len, d_model)``.
            mask: Optional mask forwarded to ``compute_attention``.

        Returns:
            Tensor of shape ``(batch, query_len, d_model)``.
        """
        batch_size = query.size(0)

        q = self.split_heads(self.query_linear(query), batch_size)
        k = self.split_heads(self.key_linear(key), batch_size)
        v = self.split_heads(self.value_linear(value), batch_size)

        weights = self.compute_attention(q, k, mask)
        attended = torch.matmul(weights, v)

        return self.output_linear(self.combine_heads(attended, batch_size))

This class sets up the building blocks for performing multi-head attention, which allows the model to focus on different aspects of the input sequence simultaneously.

# Feed Forward Sub Layer

In [None]:
class FeedForwardSubLayer(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        # Expand d_model -> d_ff, then project d_ff -> d_model.
        self.fc1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.fc2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply ``fc1``, the ReLU activation, then ``fc2`` to ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

- Specify in the __init__() method the sizes of the two linear fully connected layers. 
- Apply a forward pass through the two linear layers, using the ReLU() activation in between.

# Encoder Transformers

## An Encoder Layer

In [None]:
# Complete the initialization of elements in the encoder layer
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added back to its input
    (residual connection), and is then layer-normalized.

    Args:
        d_model: Feature size of the hidden states.
        num_heads: Number of attention heads passed to ``MultiHeadAttention``.
        d_ff: Hidden size passed to ``FeedForwardSubLayer``.
        dropout: Dropout probability shared by both sub-layers.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForwardSubLayer(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        """Run ``x`` through self-attention and feed-forward with residuals."""
        # Self-attention: the same tensor is used as query, key and value.
        attended = self.self_attn(x, x, x, mask)
        x = x + self.dropout(attended)
        x = self.norm1(x)

        transformed = self.feed_forward(x)
        x = x + self.dropout(transformed)
        return self.norm2(x)

Complete the implementation of the EncoderLayer class to initialize all its inner elements one by one.

## Encoder transformer body and head

In [None]:
#BODY
class TransformerEncoder(nn.Module):
    """Encoder body: token embedding, positional encoding, stacked layers.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Feature size of the embeddings and hidden states.
        num_layers: Number of ``EncoderLayer`` modules to stack.
        num_heads: Number of attention heads in each layer.
        d_ff: Hidden size of each layer's feed-forward sub-layer.
        dropout: Dropout probability used inside each layer.
        max_sequence_length: Number of positions given to ``PositionalEncoder``.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoder(d_model, max_sequence_length)

        # Define a stack of multiple encoder layers
        self.layers = nn.ModuleList(
            [EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)]
        )

    def forward(self, x, mask):
        """Embed the token ids in ``x`` and pass them through every layer.

        Args:
            x: Tensor of token ids fed to the embedding layer.
            mask: Attention mask handed unchanged to every encoder layer.

        Returns:
            The output of the last encoder layer.
        """
        x = self.embedding(x)
        x = self.positional_encoding(x)
        for layer in self.layers:
            x = layer(x, mask)
        return x

#HEAD
class ClassifierHead(nn.Module):
    """Classification head that scores the first sequence position.

    Args:
        d_model: Feature size of the incoming hidden states.
        num_classes: Number of output classes.
    """

    def __init__(self, d_model, num_classes):
        super().__init__()
        # Linear projection from hidden features to one raw score per class.
        self.fc = nn.Linear(in_features=d_model, out_features=num_classes)

    def forward(self, x):
        """Return log class probabilities computed from position 0 of ``x``."""
        first_position = x[:, 0, :]
        raw_scores = self.fc(first_position)
        # log_softmax over the class axis turns raw scores into log-probabilities.
        return F.log_softmax(raw_scores, dim=-1)

- Define a stack of multiple encoder layers in the __init__() method.
- Complete the forward() method. Note that the process starts by converting the original sequence tokens in x into embeddings.

- Add final linear layer to project encoder results into raw classification outputs.
- Apply the necessary function to map raw classification outputs into log class probabilities.

## Testing the encoder transformer

In [None]:
# Smoke test for the encoder body and classification head.
# NOTE(review): vocab_size, batch_size, sequence_length, d_model, num_layers,
# num_heads, d_ff, dropout and num_classes are not defined in this cell; they
# are assumed to be set earlier in the session -- confirm.

# Random token ids of shape (batch_size, sequence_length)
input_sequence = torch.randint(0, vocab_size, (batch_size, sequence_length))
# Random 0/1 mask of shape (sequence_length, sequence_length)
mask = torch.randint(0, 2, (sequence_length, sequence_length))

# Instantiate the encoder transformer's body and head
encoder = TransformerEncoder(vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length=sequence_length)

classifier = ClassifierHead(d_model, num_classes)

# Forward pass: encoder body first, then the classification head
output = encoder(input_sequence, mask)
classification = classifier(output)

print("Classification outputs for a batch of ", batch_size, "sequences:")
print(classification)

`output:
    Classification outputs for a batch of  8 sequences:
    tensor([[ 0.3724,  0.0636,  0.5129],
            [-0.1837,  0.5669, -0.9256],
            [-0.1848, -0.2706,  0.1537],
            [ 0.0478,  0.2004, -0.2376],
            [ 0.6299,  0.4149,  0.2964],
            [ 1.3734, -0.0549, -0.0309],
            [-0.0408,  0.3052, -0.1994],
            [ 0.5111,  0.5409,  0.2535]], grad_fn=<AddmmBackward0>)`

# Decoder Transformers

## Building a decoder body and head

In [None]:
class TransformerDecoder(nn.Module):
    """Decoder body plus a next-token prediction head.

    Args:
        vocab_size: Size of the embedding table and of the output layer.
        d_model: Feature size of the embeddings and hidden states.
        num_layers: Number of ``DecoderLayer`` modules to stack.
        num_heads: Number of attention heads in each layer.
        d_ff: Hidden size of each layer's feed-forward sub-layer.
        dropout: Dropout probability used inside each layer.
        max_sequence_length: Number of positions given to ``PositionalEncoder``.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length):
        super(TransformerDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Use the positional encoding class defined in this file.
        self.positional_encoding = PositionalEncoder(d_model, max_sequence_length)
        self.layers = nn.ModuleList(
            [DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)]
        )

        # Add a linear layer (head) for next-word prediction
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x, self_mask, encoder_output=None, cross_mask=None):
        """Return log-probabilities over the vocabulary for every position.

        Args:
            x: Tensor of token ids fed to the embedding layer.
            self_mask: Mask for the decoder's (causal) self-attention.
            encoder_output: Optional encoder hidden states. When given, each
                layer is called with the extra cross-attention arguments.
            cross_mask: Optional mask for cross-attention; only used together
                with ``encoder_output``.

        Returns:
            Log-softmax of the head's output over the last dimension.
        """
        x = self.embedding(x)
        x = self.positional_encoding(x)
        for layer in self.layers:
            if encoder_output is None:
                # Decoder-only call, kept for layers without cross-attention.
                x = layer(x, self_mask)
            else:
                x = layer(x, self_mask, encoder_output, cross_mask)

        # Apply the forward pass through the model head
        x = self.fc(x)
        return F.log_softmax(x, dim=-1)

## Testing the decoder transformer

In [None]:
# Smoke test for the decoder-only transformer.
# NOTE(review): vocab_size, batch_size, sequence_length and the model
# hyperparameters are not defined in this cell; assumed to be set earlier.
input_sequence = torch.randint(0, vocab_size, (batch_size, sequence_length))

# Create a triangular attention mask for causal attention:
# triu(..., diagonal=1) marks the entries above the diagonal, and 1 - (...)
# flips that, so the mask is True on and below the diagonal and False above it.
self_attention_mask = (1 - torch.triu(torch.ones(1, sequence_length, sequence_length), diagonal=1)).bool()

# Instantiate the decoder transformer
decoder = TransformerDecoder(vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length=sequence_length)

# Forward pass with the causal mask, then inspect the result
output = decoder(input_sequence, self_attention_mask)
print(output.shape)
print(output)

NameError: name 'torch' is not defined

# Combined Transformers

## Incorporating cross-attention in a decoder

In [None]:
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, cross-attention, feed-forward.

    Each sub-layer output goes through dropout, is added back to its input
    (residual connection), and is then layer-normalized.

    Args:
        d_model: Feature size of the hidden states.
        num_heads: Number of attention heads in both attention modules.
        d_ff: Hidden size passed to ``FeedForwardSubLayer``.
        dropout: Dropout probability shared by all sub-layers.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()

        # Causal (masked) self-attention and encoder-decoder cross-attention
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForwardSubLayer(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, causal_mask, encoder_output, cross_mask):
        """Run the three sub-layers in order and return the updated ``x``."""
        # 1) Self-attention over the decoder input.
        self_attended = self.self_attn(x, x, x, causal_mask)
        x = x + self.dropout(self_attended)
        x = self.norm1(x)

        # 2) Cross-attention: queries come from the decoder, keys and values
        #    from the encoder output.
        cross_attended = self.cross_attn(x, encoder_output, encoder_output, cross_mask)
        x = x + self.dropout(cross_attended)
        x = self.norm2(x)

        # 3) Feed-forward sub-layer.
        transformed = self.feed_forward(x)
        x = x + self.dropout(transformed)
        return self.norm3(x)

## Testing out an encoder-decoder transformer

In [None]:
# Create a batch of random input sequences
# NOTE(review): vocab_size, batch_size, sequence_length and the model
# hyperparameters are not defined in this cell; assumed to be set earlier.
input_sequence = torch.randint(0, vocab_size, (batch_size, sequence_length))
padding_mask = torch.randint(0, 2, (sequence_length, sequence_length))
# Attention blocks the positions where the mask equals 0
# (`masked_fill(mask == 0, ...)` in MultiHeadAttention.compute_attention), so
# a causal mask must be 1 on and below the diagonal and 0 above it.
causal_mask = torch.tril(torch.ones(sequence_length, sequence_length))

# Instantiate the two transformer bodies
encoder = TransformerEncoder(vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length=sequence_length)
decoder = TransformerDecoder(vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length=sequence_length)

# Pass the necessary masks as arguments to the encoder and the decoder
encoder_output = encoder(input_sequence, padding_mask)
decoder_output = decoder(input_sequence, causal_mask, encoder_output, padding_mask)
print("Batch's output shape: ", decoder_output.shape)

# Transformer assembly bottom-up

Putting together the main building blocks of an encoder-only transformer architecture, using a bottom-up approach.

The following classes, their attributes, and their core functions have been defined for you:
- `PositionalEncoding(nn.Module)`: positional encoding for input embeddings.
- `MultiHeadAttention(nn.Module)`: multi-head attention layer.
- `FeedForward(nn.Module)`: feed-forward layer.
- `EncoderLayer(nn.Module)`: a replicable encoder layer that glues together multi-head attention and feed-forward layers, along with layer normalizations and dropouts.

Your next task is to finalize assembling the highest-level components of the encoder transformer: the `TransformerEncoder` and `Transformer` classes.

In [None]:
# Initialize positional encoding layer and stack of EncoderLayer modules
class TransformerEncoder(nn.Module):
    """Encoder stack: embedding, positional encoding, dropout, layers.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Feature size of the embeddings and hidden states.
        num_heads: Number of attention heads in each layer.
        num_layers: Number of ``EncoderLayer`` modules to stack.
        d_ff: Hidden size of each layer's feed-forward sub-layer.
        max_seq_len: Number of positions given to the positional encoding.
        dropout: Dropout probability for the embeddings and for each layer.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_len, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # NOTE(review): `PositionalEncoding` is assumed to be provided by the
        # exercise environment; confirm it is in scope before running.
        self.positional_encoding = PositionalEncoding(d_model, max_seq_len)
        self.layers = nn.ModuleList(
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        """Embed ``x``, add positions, apply dropout, then run every layer."""
        embedded = self.embedding(x)
        hidden = self.dropout(self.positional_encoding(embedded))

        # Pass the sequence through each layer in the encoder
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)

        return hidden

class Transformer(nn.Module):
    """Encoder-only transformer wrapping a single ``TransformerEncoder``.

    All constructor arguments are forwarded, in the same order, to
    ``TransformerEncoder``.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_len, dropout):
        super().__init__()
        # Initialize the encoder stack of the Transformer
        self.encoder = TransformerEncoder(
            vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_len, dropout
        )

    def forward(self, src, src_mask):
        """Return the encoder output for ``src`` under ``src_mask``."""
        return self.encoder(src, src_mask)

Add the whole stack of components and layers into a Transformer class object: You'll need to initialize an attribute containing the whole encoder stack.

# Harnessing Pre-trained LLMs

## Text Classification

In [None]:
# Load the tokenizer and pre-trained model
# NOTE(review): model_name is not defined in this cell; assumed to be set
# earlier in the session -- confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

text = ["The best movie I've ever watched!", "What an awful movie. I regret watching it."]

# Tokenize the inputs and pass them to the LLM to perform classification inference.
# padding=True is passed so both sentences can be returned as one "pt" tensor batch.
inputs = tokenizer(text, return_tensors="pt", padding=True)
outputs = model(**inputs)
logits = outputs.logits

# Index of the highest logit per input is taken as the predicted class
predicted_classes = torch.argmax(logits, dim=1).tolist()
for idx, predicted_class in enumerate(predicted_classes):
    print(f"Predicted class for \"{text[idx]}\": {predicted_class}")

## Text Summarization

In [None]:
# NOTE(review): dataset, tokenizer and model are not defined in this cell;
# assumed to be loaded earlier in the session -- confirm.
print(f"Number of instances: {len(dataset['train'])}")

# Show the names of features in the training fold of the dataset
print(f"Feature names: {dataset['train'].column_names}")

# Encode the input example, obtain the summary, and decode it
# (the example is the second-to-last training row's 'review_sents' field)
example = dataset['train'][-2]['review_sents']
# The "summarize: " prefix is prepended to the text; input is truncated to 512 tokens
input_ids = tokenizer.encode("summarize: " + example, return_tensors="pt", max_length=512, truncation=True)

# Generate with max_length=150, then decode the first returned sequence
summary_ids = model.generate(input_ids, max_length=150)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("\nOriginal Text (first 400 characters): \n", example[:400])
print("\nGenerated Summary: \n", summary)

- Display the names of the features in the data, by accessing the downloaded 'train' fold.
- Use the necessary variables and methods to encode the input example, pass it to the model to generate a summary, and decode the summary.

## Text Translation

In [None]:
# Checkpoint used for both the tokenizer and the model below
model_name = "Helsinki-NLP/opus-mt-en-es"

# Load the tokenizer and the model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

english_inputs = ["Hello", "Thank you", "How are you?", "Sorry", "Goodbye"]

# Encode the inputs, generate translations, decode, and print them
# (one phrase at a time; only the first generated sequence is decoded)
for english_input in english_inputs:
    input_ids = tokenizer.encode(english_input, return_tensors = "pt")
    translated_ids = model.generate(input_ids) 
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    print(f"English: {english_input} | Spanish: {translated_text}")

- Use the appropriate task-specific classes and methods to load the tokenizer and the model (the classes needed have been already imported for you, as usual!).
- Complete the instructions to encode the input sequences, generate translations, and decode them. For encodings, use an extra argument to return them as PyTorch tensors.

## Q&A Task

In [None]:
# Load a specific subset of the dataset
from datasets import load_dataset
mlqa = load_dataset("xtreme", name="MLQA.en.en")

# Use the first question/context pair of the test split
question = mlqa["test"]["question"][0]
context = mlqa["test"]["context"][0]
print("Question: ", question)
print("Context: ", context)

# Define the checkpoint once so the tokenizer and the model always match
model_ckp = "deepset/minilm-uncased-squad2"

# Initialize the tokenizer using the model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckp)

# Tokenize the inputs returning the result as tensors
inputs = tokenizer(question, context, return_tensors="pt")
print("First five encoded tokens: ", inputs["input_ids"][0][:5])

## Extract and decode the answer ##
# Initialize the LLM upon the model checkpoint
model = AutoModelForQuestionAnswering.from_pretrained(model_ckp)

# Inference only: disable gradient tracking while running the model
with torch.no_grad():
    outputs = model(**inputs)

# Get the most likely start and end answer position from the raw LLM outputs
start_idx = torch.argmax(outputs.start_logits)
# +1 because the slice below excludes its end index
end_idx = torch.argmax(outputs.end_logits) + 1

# Access the tokenized inputs tensor to get the answer span
answer_span = inputs["input_ids"][0][start_idx:end_idx]

# Decode the answer span to get the extracted answer text
answer = tokenizer.decode(answer_span)
print("Answer: ", answer)

# Fine-Tuning and Transfer Learning

In [None]:
# Load a pre-trained LLM, specifying its use for binary classification.
# The checkpoint name is the first positional argument of from_pretrained;
# passing it as `model=` leaves that required argument missing.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Set up training arguments with a batch size of 8 per GPU and 5 epochs
training_args = TrainingArguments(
    output_dir="./smaller_bert_finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=5,
)
# Set up trainer, assigning previously set up training arguments
# NOTE(review): tokenized_datasets is not defined in this cell; assumed to be
# prepared earlier in the session -- confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

In [None]:
# Initialize the trainer and assign a training and validation set to it
# NOTE(review): compute_metrics, emotions_encoded and tokenizer are not defined
# in this cell; assumed to be set earlier in the session -- confirm.
trainer = Trainer(model=model, args=training_args,
    			compute_metrics=compute_metrics,
    			train_dataset=emotions_encoded["train"],
    			eval_dataset=emotions_encoded["validation"],
    			tokenizer=tokenizer
)

# Training loop to fine-tune the model
# (left commented out, so the inference below runs on `model` without this training step)
#trainer.train()

input_texts = ["It's dark and rainy outside", "I love penguins!"]

# Tokenize the input sequences and pass them to the model
inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
# Inference only: gradient tracking is disabled
with torch.no_grad():
    outputs = model(**inputs)

# Obtain class labels from raw predictions (index of the highest logit per input)
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

for i, predicted_label in enumerate(predicted_labels):
    print(f"\n Input Text {i + 1}: {input_texts[i]}")
    print(f"Predicted Label: {predicted_label}")

# Evaluation of LLM

## Classification Metrics

In [None]:
# Pass the four input texts (without labels) to the pipeline
# NOTE(review): sentiment_analysis and test_examples are not defined in this
# cell; each example is read as a mapping with "text" and "label" keys.
predictions = sentiment_analysis ([example["text"] for example in test_examples])

# Ground-truth labels, and predictions mapped to 1 ("POSITIVE") or 0 (anything else)
true_labels = [example["label"] for example in test_examples]
predicted_labels = [1 if pred["label"] == "POSITIVE" else 0 for pred in predictions]

# Load the accuracy metric
accuracy = evaluate.load("accuracy")

# Compare the true labels with the predicted labels
result = accuracy.compute(references=true_labels, predictions=predicted_labels)
print(result)

In [None]:
# Load the accuracy, precision, recall and F1 score metrics.
# They must be loaded before they are described below.
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

# Obtain a description of each metric.
# `help()` prints to the console and returns None, so wrapping it in print()
# adds a stray "None"; the metric's own description text is printed instead.
print(accuracy.description)
print(precision.description)
print(recall.description)
print(f1.description)

# Pass the examples to the pipeline, and obtain a list of predicted labels
# NOTE(review): test_examples is not defined in this cell; assumed to be set
# earlier in the session -- confirm.
sentiment_analysis = pipeline("sentiment-analysis")
predictions = sentiment_analysis(list(test_examples))

# Map pipeline labels to 1 ("POSITIVE") or 0 (anything else)
predicted_labels = [1 if pred["label"] == "POSITIVE" else 0 for pred in predictions]
test_labels = [1, 1, 1, 1, 0, 0, 0]

# Compute the metrics by comparing real and predicted labels
print(precision.compute(references=test_labels, predictions=predicted_labels))
print(recall.compute(references=test_labels, predictions=predicted_labels))
print(f1.compute(references=test_labels, predictions=predicted_labels))

## Specialized Metrics