In [1]:
import torch #type: ignore
import torch.nn as nn #type: ignore
import numpy #type: ignore


In [2]:
# Scratch check: arange with step=2 over [0, 6) yields the even indices (see the cell output below).
print(torch.arange(start=0, end=6, step=2))

tensor([0, 2, 4])


In [3]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch of embeddings.

    For position ``pos`` and even feature index ``i`` the encoding is
    ``sin(pos / 10000 ** (i / d_model))`` at column ``i`` and
    ``cos(pos / 10000 ** (i / d_model))`` at column ``i + 1``.
    """

    def __init__(self):
        super(PositionalEncoding, self).__init__()

    def forward(self, batch_X):
        """Return ``batch_X`` plus the positional encodings.

        Args:
            batch_X: Tensor whose shape unpacks as (batch, max_sentence_length, d_model).

        Returns:
            Tensor of the same shape, dtype and device as ``batch_X``.
        """
        _, max_sentence_length, d_model = batch_X.shape
        device = batch_X.device

        # Positions must be floating point: writing sin/cos values into an integer
        # tensor would truncate almost every entry to 0.
        positions = torch.arange(
            start=0, end=max_sentence_length, dtype=torch.float32, device=device
        ).unsqueeze(-1)  # (max_sentence_length, 1)
        embedding_dimensions = torch.arange(
            start=0, end=d_model, step=2, dtype=torch.float32, device=device
        )  # even feature indices 0, 2, 4, ...
        angles = positions / (10000 ** (embedding_dimensions / d_model))  # (length, ceil(d_model / 2))

        positional_encodings = torch.zeros(
            max_sentence_length, d_model, dtype=torch.float32, device=device
        )
        positional_encodings[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns.
        positional_encodings[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Broadcasting adds the same (length, d_model) table to every example in the batch.
        return batch_X + positional_encodings.to(batch_X.dtype)



In [4]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention (self- or cross-attention).

    Args:
        d_model: Width of the input and output features; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        use_mask: When True, a causal (lower-triangular) mask is combined with the
            padding mask so a query cannot attend to later key positions.
    """

    def __init__(self, d_model, num_heads, use_mask=False):
        super(MultiHeadAttention, self).__init__()
        if d_model % num_heads != 0:
            raise ValueError(f"d_model ({d_model}) must be divisible by num_heads ({num_heads})")

        # nn.Linear stores its weight as (out_features, in_features) and broadcasts over
        # leading dimensions, so the batch dimension needs no special handling.
        self.use_mask = use_mask
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.d_v = self.d_k
        self.W_Q = nn.Linear(in_features=d_model, out_features=d_model)
        self.W_K = nn.Linear(in_features=d_model, out_features=d_model)
        self.W_V = nn.Linear(in_features=d_model, out_features=d_model)
        self.W_O = nn.Linear(in_features=d_model, out_features=d_model)

    def create_mask(self, batch_size, sentence_length, padding_mask, use_attention_mask, key_length=None):
        """Build the boolean attention mask; True marks positions that must NOT be attended to.

        Args:
            batch_size: Number of examples in the batch.
            sentence_length: Number of query positions.
            padding_mask: (batch_size, key_length) tensor, non-zero for real tokens and
                zero for padding, describing the KEY positions.
            use_attention_mask: When True, additionally hide keys after the query position.
            key_length: Number of key positions; defaults to ``sentence_length``.

        Returns:
            Bool tensor of shape (batch_size, sentence_length, key_length).
        """
        if key_length is None:
            key_length = sentence_length

        # Padding hides key COLUMNS: (batch, key) -> (batch, 1, key) -> (batch, query, key).
        # Masking query rows instead would leave padded keys visible to every real query.
        keep = (padding_mask != 0).unsqueeze(1).expand(batch_size, sentence_length, key_length)
        if use_attention_mask:
            causal = torch.tril(
                torch.ones(sentence_length, key_length, device=padding_mask.device)
            ) != 0
            keep = keep & causal
        return ~keep

    def _split_heads(self, projected, head_dim):
        """(batch, length, num_heads * head_dim) -> (batch, num_heads, length, head_dim)."""
        batch_size, length, _ = projected.shape
        # Split the FEATURE axis into heads; positions must stay on their own axis.
        return projected.reshape(batch_size, length, self.num_heads, head_dim).permute(0, 2, 1, 3)

    def forward(self, batch_X, padding_mask, dropout_rate, encoder_output=None):
        """Attend from ``batch_X`` to itself, or to ``encoder_output`` when it is given.

        Args:
            batch_X: (batch, query_length, d_model) tensor supplying the queries.
            padding_mask: (batch, key_length) tensor, non-zero for real key tokens.
                For cross-attention it must describe ``encoder_output``.
            dropout_rate: Accepted for interface compatibility; no dropout is applied
                inside this module.
            encoder_output: Optional (batch, key_length, d_model) tensor. When given,
                keys and values are projected from it and queries from ``batch_X``.

        Returns:
            Tensor of shape (batch, query_length, d_model).
        """
        batch_size, sentence_length, d_model = batch_X.shape
        memory = batch_X if encoder_output is None else encoder_output
        key_length = memory.shape[1]

        Q = self._split_heads(self.W_Q(batch_X), self.d_k)
        K = self._split_heads(self.W_K(memory), self.d_k)
        V = self._split_heads(self.W_V(memory), self.d_v)

        # torch.matmul multiplies over the last two dimensions and broadcasts the rest.
        attention_scores = torch.matmul(Q, K.permute(0, 1, 3, 2)) / (self.d_k ** 0.5)

        mask = self.create_mask(batch_size, sentence_length, padding_mask, self.use_mask, key_length)
        mask = mask.unsqueeze(1).to(attention_scores.device)  # broadcast over heads
        # Fill the SCORES (not the scale factor). The most negative finite value is used
        # instead of -inf so a row with every key masked yields uniform weights, not NaN.
        attention_scores = attention_scores.masked_fill(mask, torch.finfo(attention_scores.dtype).min)

        scaled_attention_scores = nn.functional.softmax(attention_scores, dim=-1)
        scaled_dot_product_attention = torch.matmul(scaled_attention_scores, V)  # (batch, heads, query_length, d_v)

        # Concatenate all the heads back into the feature axis.
        scaled_dot_product_attention = scaled_dot_product_attention.permute(0, 2, 1, 3).reshape(
            batch_size, sentence_length, d_model
        )

        return self.W_O(scaled_dot_product_attention)  # (batch, query_length, d_model)

In [5]:
class FFN(nn.Module):
    """Position-wise feed-forward block: Linear -> activation -> Linear.

    The hidden layer is four times as wide as ``d_model``; the output width equals
    the input width.
    """

    def __init__(self, d_model, activation=nn.ReLU()):
        super().__init__()
        hidden_width = 4 * d_model
        expand = nn.Linear(d_model, hidden_width)
        contract = nn.Linear(hidden_width, d_model)
        self.ffn = nn.Sequential(expand, activation, contract)

    def forward(self, batch_X):
        """Apply the feed-forward stack to the last dimension of ``batch_X``."""
        transformed = self.ffn(batch_X)
        return transformed

In [6]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a feed-forward network, each wrapped in
    a residual connection followed by layer normalization."""

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.mha = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.ffn = FFN(d_model=d_model)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)

    def forward(self, batch_X, padding_mask, dropout_rate):
        """Run the block on ``batch_X``.

        ``padding_mask`` and ``dropout_rate`` are handed to the attention module
        unchanged; this layer applies no dropout of its own.
        """
        attended = self.mha(batch_X, padding_mask, dropout_rate)
        after_attention = self.layer_norm1(batch_X + attended)

        fed_forward = self.ffn(after_attention)
        return self.layer_norm2(after_attention + fed_forward)

In [7]:
class Encoder(nn.Module):
    """Stack of ``num_layers`` encoder layers applied one after another.

    Layers are registered as submodules named ``EncoderLayer0``, ``EncoderLayer1``, ...
    """

    def __init__(self, d_model, num_layers, num_heads):
        super().__init__()
        self.num_layers = num_layers
        for index in range(num_layers):
            self.add_module(
                f"EncoderLayer{index}",
                EncoderLayer(d_model=d_model, num_heads=num_heads),
            )

    def forward(self, batch_X, padding_mask, dropout_rate):
        """Feed ``batch_X`` through every layer in registration order."""
        hidden = batch_X
        for index in range(self.num_layers):
            layer = getattr(self, f"EncoderLayer{index}")
            hidden = layer(hidden, padding_mask, dropout_rate)
        return hidden


In [8]:
class DecoderLayer(nn.Module):
    """One decoder block: causally masked self-attention, attention involving the
    encoder output, and a feed-forward network. Each sub-layer is wrapped in a
    residual connection followed by layer normalization."""

    def __init__(self, d_model, num_heads):
        super(DecoderLayer, self).__init__()
        self.mha1 = MultiHeadAttention(d_model=d_model, num_heads=num_heads, use_mask=True)
        self.layernorm1 = nn.LayerNorm(normalized_shape=d_model)
        self.mha2 = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.layernorm2 = nn.LayerNorm(normalized_shape=d_model)
        self.ffn = FFN(d_model=d_model)
        self.layernorm3 = nn.LayerNorm(normalized_shape=d_model)

    def forward(self, batch_X, encoder_output, padding_mask, dropout_rate, encoder_padding_mask=None):
        """Run the block on the decoder states ``batch_X``.

        Args:
            batch_X: Decoder-side input states.
            encoder_output: Encoder states handed to the second attention module.
            padding_mask: Padding mask of the decoder sequence (used by self-attention).
            dropout_rate: Forwarded to both attention modules.
            encoder_padding_mask: Optional padding mask of the ENCODER sequence for the
                second attention module. When omitted, ``padding_mask`` is reused there,
                which is the previous behaviour and is only meaningful if source and
                target happen to share the same padding pattern.

        Returns:
            Tensor with the same shape as ``batch_X``.
        """
        # The second attention looks at encoder tokens, so its padding mask should
        # describe the encoder sequence whenever the caller can provide it.
        cross_attention_mask = padding_mask if encoder_padding_mask is None else encoder_padding_mask

        batch_X = self.layernorm1(batch_X + self.mha1(batch_X, padding_mask, dropout_rate))
        batch_X = self.layernorm2(batch_X + self.mha2(batch_X, cross_attention_mask, dropout_rate, encoder_output))
        batch_X = self.layernorm3(batch_X + self.ffn(batch_X))
        return batch_X


In [9]:
class Decoder(nn.Module):
    """Stack of ``num_layers`` decoder layers applied one after another.

    Layers are registered as submodules named ``DecoderLayer0``, ``DecoderLayer1``, ...
    """

    def __init__(self, d_model, num_layers, num_heads):
        super(Decoder, self).__init__()
        self.num_layers = num_layers
        layer_list = [DecoderLayer(d_model=d_model, num_heads=num_heads) for _ in range(num_layers)]
        for l in range(num_layers):
            self.add_module(f"DecoderLayer{l}", layer_list[l])

    def forward(self, batch_X, encoder_output, padding_mask, dropout_rate, encoder_padding_mask=None):
        """Feed ``batch_X`` through every decoder layer in registration order.

        Args:
            batch_X: Decoder-side input states.
            encoder_output: Encoder states passed to every layer.
            padding_mask: Padding mask of the decoder sequence.
            dropout_rate: Forwarded to every layer.
            encoder_padding_mask: Optional padding mask of the encoder sequence. It is
                forwarded as a fifth positional argument only when provided, so layers
                are called exactly as before when it is omitted.

        Returns:
            Output of the last layer.
        """
        extra_args = () if encoder_padding_mask is None else (encoder_padding_mask,)
        for l in range(self.num_layers):
            layer = self._modules[f"DecoderLayer{l}"]
            batch_X = layer(batch_X, encoder_output, padding_mask, dropout_rate, *extra_args)
        return batch_X

In [10]:
class Transformer(nn.Module):
    """Encoder-decoder model that maps source token ids and shifted target token ids
    to a probability distribution over the vocabulary for every target position."""

    def __init__(self, vocab_size, d_model, num_heads, num_layers, dropout_rate):
        super().__init__()
        self.vocab_size = vocab_size
        self.dropout_rate = dropout_rate
        self.positional_encoding = PositionalEncoding()

        # Encoder side.
        self.encoder_embedding = nn.Embedding(vocab_size, d_model)
        self.encoder_embedding_dropout = nn.Dropout(dropout_rate)  # created but not used in forward
        self.encoder = Encoder(d_model=d_model, num_layers=num_layers, num_heads=num_heads)

        # Decoder side.
        self.decoder_embedding = nn.Embedding(vocab_size, d_model)
        self.decoder_embedding_dropout = nn.Dropout(dropout_rate)  # created but not used in forward
        self.decoder = Decoder(d_model=d_model, num_layers=num_layers, num_heads=num_heads)
        self.decoder_dropout = nn.Dropout(dropout_rate)

        # Output head.
        self.linear = nn.Linear(d_model, vocab_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, encoder_input, shifted_decoder_input, encoder_padding_masks, decoder_padding_masks):
        """Compute per-position vocabulary probabilities.

        Args:
            encoder_input: Source token ids.
            shifted_decoder_input: Target token ids fed to the decoder.
            encoder_padding_masks: Padding mask handed to the encoder.
            decoder_padding_masks: Padding mask handed to the decoder.

        Returns:
            Softmax output over the last (vocabulary) dimension.
        """
        source_embeddings = self.encoder_embedding(encoder_input)
        target_embeddings = self.decoder_embedding(shifted_decoder_input)

        positioned_source = self.positional_encoding(source_embeddings)
        encoder_output = self.encoder(positioned_source, encoder_padding_masks, self.dropout_rate)

        # NOTE(review): only decoder_padding_masks reaches the decoder; the encoder
        # padding mask is not forwarded for the attention over encoder_output.
        positioned_target = self.positional_encoding(target_embeddings)
        decoder_output = self.decoder(positioned_target, encoder_output, decoder_padding_masks, self.dropout_rate)

        # NOTE(review): dropout is applied to the vocabulary scores right before the
        # softmax, so in training mode some scores are zeroed — confirm this is intended.
        vocabulary_scores = self.decoder_dropout(self.linear(decoder_output))
        output_probabilities = self.softmax(vocabulary_scores)
        return output_probabilities


In [11]:
class TransformerLoss(nn.Module):
    """Mean negative log-likelihood of the target tokens, skipping padding targets."""

    def __init__(self):
        super(TransformerLoss, self).__init__()

    def forward(self, decoder_output, target_sequences, padding_vocab_index):
        """Compute the loss from per-position vocabulary PROBABILITIES.

        Args:
            decoder_output: (batch_size, sentence_length, vocab_size) tensor; each row
                over the last dimension holds the probability of every vocabulary entry
                (i.e. the values have already been passed through a softmax).
            target_sequences: (batch_size, sentence_length) tensor of correct vocabulary
                indices.
            padding_vocab_index: Target value to ignore when averaging.

        Returns:
            Scalar tensor: mean of ``-log p(target)`` over the non-ignored positions.
        """
        batch_size, sentence_length, vocab_size = decoder_output.shape
        flattened_probabilities = decoder_output.reshape(batch_size * sentence_length, vocab_size)
        flattened_target_sequences = target_sequences.reshape(batch_size * sentence_length)

        # The input is already normalized, so take its log and use NLL directly.
        # cross_entropy would apply log_softmax a second time and flatten the
        # distribution. Clamping keeps log() finite when a probability underflows to 0.
        smallest_positive = torch.finfo(flattened_probabilities.dtype).tiny
        log_probabilities = torch.log(flattened_probabilities.clamp_min(smallest_positive))

        return nn.functional.nll_loss(input=log_probabilities,
                                      target=flattened_target_sequences,
                                      reduction='mean',
                                      ignore_index=padding_vocab_index)


In [13]:
from transformers import AutoTokenizer # type: ignore 
from datasets import load_dataset # type: ignore

# Load the 'iwslt2017-fr-en' configuration of the 'iwslt2017' dataset.
wmt_dataset = load_dataset('iwslt2017', 'iwslt2017-fr-en')
# Load the pretrained "gpt2" tokenizer, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)


In [14]:
# Register a '[PAD]' special token only when the tokenizer does not define one yet.
# NOTE(review): presumably required because tokenize() pads every example to max_length — confirm.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def tokenize(examples):
    """Tokenize a batch of translation pairs with the module-level ``tokenizer``.

    Reads the ``'en'`` and ``'fr'`` entries of every item in ``examples['translation']``
    and encodes both sides padded/truncated to 128 tokens. French ids and mask become
    the encoder inputs; English ids and mask become the labels and decoder mask.
    """
    english_texts = [pair['en'] for pair in examples['translation']]
    french_texts = [pair['fr'] for pair in examples['translation']]

    encoding_options = dict(padding='max_length', truncation=True, max_length=128)
    english_encoded = tokenizer(english_texts, **encoding_options)
    french_encoded = tokenizer(french_texts, **encoding_options)

    # Every value is expected to be shaped (batch_size, max_length).
    features = {}
    features['input_token_ids'] = french_encoded['input_ids']
    features['encoder_attention_mask'] = french_encoded['attention_mask']  # mask for padded sequences
    features['decoder_attention_mask'] = english_encoded['attention_mask']
    features['labels'] = english_encoded['input_ids']
    return features

# Run tokenize over the dataset in batched mode, so each call receives a batch of examples.
tokenized_datasets = wmt_dataset.map(tokenize, batched=True)




In [15]:
# Run configuration: device, vocabulary size, model size and DataLoader/training settings.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
PIN_MEMORY = torch.cuda.is_available()
# len(tokenizer) counts the base vocabulary plus any added special tokens, so it stays
# correct whether or not the '[PAD]' token had to be added (vocab_size alone excludes
# added tokens, and a hard-coded "+ 1" is only right when exactly one token was added).
VOCAB_SIZE = len(tokenizer)
BATCH_SIZE = 64
D_MODEL = 512
NUM_HEADS = 8
NUM_LAYERS = 6
DROPOUT_RATE = 0.1
NUM_WORKERS = 8
PREFETCH_FACTOR = 2
PERSISTENT_WORKERS = True
SHUFFLE = True
NUM_EPOCHS = 15
WARMUP_STEPS = 4000

In [16]:
# Scratch check: softmax over the last dimension, then argmax over the same dimension
# returns the index of the largest probability in each row (see the cell output below).
test = torch.tensor([[1., 2., 3.], [1., 2., 3.]])
test = nn.functional.softmax(test, dim=-1)
print(test)
test = torch.argmax(test, dim=-1)
print(test)

tensor([[0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652]])
tensor([2, 2])


In [17]:
def learning_rate_lambda_function(step_number):
    """Warmup-then-decay learning-rate factor for a zero-based ``step_number``.

    Grows linearly with the step during the first ``WARMUP_STEPS`` steps, then decays
    with the inverse square root of the step; the whole curve is scaled by
    ``D_MODEL ** -0.5``.
    """
    scale = D_MODEL ** (-0.5)
    step = step_number + 1  # shift so that step 0 does not divide by zero
    inverse_sqrt_decay = step ** (-0.5)
    linear_warmup = step * WARMUP_STEPS ** (-1.5)
    return scale * min(inverse_sqrt_decay, linear_warmup)

def collate_function(batch):
    """Stack the per-example feature lists of ``batch`` into batch tensors.

    Each example is a mapping with the keys ``'input_token_ids'``,
    ``'encoder_attention_mask'``, ``'decoder_attention_mask'`` and ``'labels'``.
    Returns a dict of stacked tensors under the (pluralized) output keys.
    """
    def stack_field(field_name):
        rows = [torch.tensor(example[field_name]) for example in batch]
        return torch.stack(rows)

    collated = {}
    collated['input_token_ids'] = stack_field('input_token_ids')
    collated['encoder_attention_masks'] = stack_field('encoder_attention_mask')
    collated['decoder_attention_masks'] = stack_field('decoder_attention_mask')
    collated['output_labels'] = stack_field('labels')
    return collated

def generate_overfit(model, encoder_inputs, encoder_padding_masks, start_token, max_length, device):
    """Greedily decode ``max_length`` tokens for every example in the batch.

    Args:
        model: Callable as ``model(encoder_inputs, decoder_inputs, encoder_padding_masks,
            decoder_padding_masks)`` returning (batch, current_length, vocab_size) scores.
        encoder_inputs: (batch, source_length) tensor of source token ids.
        encoder_padding_masks: Padding mask that accompanies ``encoder_inputs``.
        start_token: Integer token id that every decoded sequence starts with.
        max_length: Number of tokens to generate after the start token.
        device: Device on which decoding tensors are created.

    Returns:
        (batch, max_length + 1) tensor of token ids, start token included.
        The model is left in eval mode.
    """
    model.eval()
    encoder_inputs = encoder_inputs.to(device)
    encoder_padding_masks = encoder_padding_masks.to(device)

    # Must be known before the start-token column can be built.
    batch_size = encoder_inputs.shape[0]
    decoder_inputs = torch.full((batch_size, 1), start_token, dtype=torch.long, device=device)  # (batch_size, 1)

    with torch.no_grad():
        for _ in range(max_length):
            curr_sentence_length = decoder_inputs.shape[1]
            # Nothing generated so far is padding.
            decoder_padding_masks = torch.ones(batch_size, curr_sentence_length, device=device)
            outputs = model(encoder_inputs, decoder_inputs, encoder_padding_masks, decoder_padding_masks)  # (batch_size, length, vocab_size)
            # Only the last position predicts the next token; argmax picks its most likely id.
            next_tokens = torch.argmax(outputs[:, -1, :], dim=-1, keepdim=True)  # (batch_size, 1)
            decoder_inputs = torch.cat([decoder_inputs, next_tokens], dim=-1)

    return decoder_inputs


In [21]:
from torch.optim import Adam

# Build an Adam optimizer over the parameters of a small linear layer.
# NOTE(review): presumably a minimal reproduction of the import error recorded in this cell's output — confirm.
m = nn.Linear(in_features=10, out_features=10)
opt = Adam(m.parameters(), lr=1e-4)

AttributeError: partially initialized module 'torch._dynamo' has no attribute 'config' (most likely due to a circular import)

In [19]:
from torch.utils.data import DataLoader # type: ignore
from torch.optim import Adam # type: ignore
from torch.optim.lr_scheduler import LambdaLR # type: ignore

# Build the model, loss, optimizer and learning-rate schedule.
num_epochs = NUM_EPOCHS
model = Transformer(
    vocab_size=VOCAB_SIZE, 
    d_model=D_MODEL, 
    num_heads=NUM_HEADS, 
    num_layers=NUM_LAYERS,
    dropout_rate=DROPOUT_RATE
)
model.to(DEVICE)
loss_function = TransformerLoss()
parameters = model.parameters()
# LambdaLR multiplies the optimizer's base lr by the lambda's return value.
# learning_rate_lambda_function already returns the absolute warmup/decay rate, so the
# base lr must be 1.0; a base of 1e-4 would shrink every step size by another 10,000x.
optimizer = Adam(parameters, lr=1.0)
scheduler = LambdaLR(optimizer, learning_rate_lambda_function)

# First overfit to small dataset to ensure that the model is working
small_loader = DataLoader(
    tokenized_datasets['train'].select(range(128)), # select first 128 examples
    batch_size=BATCH_SIZE, 
    #shuffle=SHUFFLE, 
    collate_fn=collate_function, 
    #pin_memory=PIN_MEMORY, 
    #num_workers=NUM_WORKERS,
    #prefetch_factor=PREFETCH_FACTOR,
    #persistent_workers=PERSISTENT_WORKERS
)

for epoch in range(100):
    model.train()
    print("CURRENT EPOCH: " + str(epoch))
    for batch_index, batch in enumerate(small_loader):
        optimizer.zero_grad()
        input_token_ids = batch['input_token_ids'].to(DEVICE)
        encoder_padding_masks = batch['encoder_attention_masks'].to(DEVICE)
        decoder_padding_masks = batch['decoder_attention_masks'].to(DEVICE)
        output_labels = batch['output_labels'].to(DEVICE)

        # Teacher forcing: the decoder sees the start token followed by the labels shifted
        # right by one. The start column is sized from the actual batch (the last batch
        # may be smaller than BATCH_SIZE) and created on the same device as the labels.
        start_token_batch = torch.full(
            (output_labels.shape[0], 1),
            tokenizer.bos_token_id,
            dtype=output_labels.dtype,
            device=output_labels.device,
        )
        shifted_output_labels = torch.cat([start_token_batch, output_labels[:, :-1]], dim=-1)

        decoder_outputs = model(input_token_ids, shifted_output_labels, encoder_padding_masks, decoder_padding_masks)
        loss = loss_function(decoder_outputs, output_labels, tokenizer.pad_token_id)
        print("overfit batch loss: " + str(loss.item()))
        loss.backward()
        optimizer.step()
        scheduler.step()

# for batch_index, batch in enumerate(small_loader):
#     encoder_inputs = batch['input_token_ids']
#     start_token = tokenizer.bos_token_id
#     max_length = batch.shape[1]
#     device = DEVICE
#     generate_overfit(model, encoder_inputs, encoder_padding_masks, start_token, max_length, device)



AttributeError: partially initialized module 'torch._dynamo' has no attribute 'config' (most likely due to a circular import)

In [None]:
# DataLoader settings shared by the train, validation and test loaders.
loader_options = dict(
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
    collate_fn=collate_function,
    pin_memory=PIN_MEMORY,
    num_workers=NUM_WORKERS,
    prefetch_factor=PREFETCH_FACTOR,
    persistent_workers=PERSISTENT_WORKERS,
)
train_loader = DataLoader(tokenized_datasets['train'], **loader_options)
validation_loader = DataLoader(tokenized_datasets['validation'], **loader_options)
test_loader = DataLoader(tokenized_datasets['test'], **loader_options)

for epoch in range(num_epochs):
    model.train()
    for batch_index, batch in enumerate(train_loader):
        optimizer.zero_grad()
        # Keys match what collate_function returns.
        input_token_ids = batch['input_token_ids'].to(DEVICE)
        encoder_padding_masks = batch['encoder_attention_masks'].to(DEVICE)
        decoder_padding_masks = batch['decoder_attention_masks'].to(DEVICE)
        output_labels = batch['output_labels'].to(DEVICE)

        # Teacher forcing: start token followed by the labels shifted right by one, so
        # the decoder input has the same length as the labels it is scored against.
        start_token_batch = torch.full(
            (output_labels.shape[0], 1),
            tokenizer.bos_token_id,
            dtype=output_labels.dtype,
            device=output_labels.device,
        )
        shifted_output_labels = torch.cat([start_token_batch, output_labels[:, :-1]], dim=-1)

        decoder_outputs = model(input_token_ids, shifted_output_labels, encoder_padding_masks, decoder_padding_masks)
        # Padding targets are excluded from the loss.
        loss = loss_function(decoder_outputs, output_labels, tokenizer.pad_token_id)
        print("training batch loss: " + str(loss.item()))
        loss.backward()
        optimizer.step()
        scheduler.step()