In [1]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

warnings.filterwarnings("ignore")
RUN_EXAMPLES = True

In [37]:
def clones(module, N):
    """Build an ``nn.ModuleList`` containing ``N`` deep copies of ``module``.

    Each copy is produced with ``copy.deepcopy``, so the copies start with
    the same parameter values as ``module`` but do not share storage with it
    or with each other.

    Args:
        module: The ``nn.Module`` to duplicate.
        N: Number of copies to create.

    Returns:
        An ``nn.ModuleList`` of length ``N``.
    """
    duplicates = []
    for _ in range(N):
        duplicates.append(copy.deepcopy(module))
    return nn.ModuleList(duplicates)

In [12]:
class Create_Embedding(nn.Module):
    """Token embedding lookup whose output is scaled by ``sqrt(num_embeds)``.

    Args:
        num_embeds: Size of each embedding vector.
        vocab: Number of rows in the embedding table (valid ids are
            ``0 .. vocab - 1``).
    """

    def __init__(self, num_embeds, vocab):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab, embedding_dim=num_embeds)
        self.num_embeds = num_embeds

    def forward(self, input):
        """Look up the embeddings for ``input`` and scale them.

        Args:
            input: Integer tensor of token ids.

        Returns:
            Tensor with one trailing axis of size ``num_embeds`` appended to
            the shape of ``input``.
        """
        # Multiply the looked-up vectors by sqrt(embedding size).
        scale = math.sqrt(self.num_embeds)
        token_vectors = self.embeddings(input)
        return token_vectors * scale
    
# Smoke test: embed a single sequence of ten token ids (shape (1, 10)).
src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
# 512-dimensional embeddings over an 11-entry table, so ids 0..10 are valid.
embedding = Create_Embedding(512, 11)
embedded_input = embedding(src)
# Only the shape is printed; the values depend on the random initialisation.
print(embedded_input.shape)


torch.Size([1, 10, 512])


In [30]:
class Layer_Norm(nn.Module):
    """Normalise the last axis of a tensor, then apply a learned scale and shift.

    Args:
        num_embeddings: Size of the last axis of the tensors to normalise.
        eps: Constant added to the standard deviation before dividing.
    """

    def __init__(self, num_embeddings, eps = 1e-6):
        super().__init__()
        # Learned per-feature scale (starts at 1) and shift (starts at 0);
        # both have shape (num_embeddings,).
        self.gamma = nn.Parameter(torch.ones(num_embeddings))
        self.beta = nn.Parameter(torch.zeros(num_embeddings))
        self.eps = eps

    def forward(self, embedded_input):
        """Return ``gamma * (x - mean) / (std + eps) + beta`` over the last axis.

        Args:
            embedded_input: Tensor whose last axis has size ``num_embeddings``.

        Returns:
            Tensor of the same shape as ``embedded_input``.
        """
        # Statistics are taken per position; keepdim=True lets them broadcast back.
        centered = embedded_input - embedded_input.mean(dim=-1, keepdim=True)
        # eps is added to the standard deviation itself, not to the variance.
        spread = embedded_input.std(dim=-1, keepdim=True) + self.eps
        return self.gamma * (centered / spread) + self.beta
    
# Smoke test: normalise the 512-wide embeddings produced by the embedding cell.
layer_norm = Layer_Norm(512)
layer_norm_output = layer_norm.forward(embedded_input)
# Normalisation must not change the shape.
print(layer_norm_output.shape)

torch.Size([1, 10, 512])


In [31]:
class Multi_Headed_Attention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values with three separate
    linear layers, split into ``heads`` heads along the embedding axis,
    attended per head, concatenated, and passed through an output projection.

    Args:
        num_embeddings: Size of the embedding (last) axis of the input.
        heads: Number of attention heads; must divide ``num_embeddings``.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``num_embeddings`` is not divisible by ``heads``.
    """

    def __init__(self, num_embeddings, heads, dropout = 0.1):
        super().__init__()
        if num_embeddings % heads != 0:
            raise ValueError(
                f"num_embeddings ({num_embeddings}) must be divisible by heads ({heads})"
            )
        self.num_embeddings = num_embeddings
        self.num_heads = heads
        # Per-head feature size (E // H).
        self.num_headings = self.num_embeddings // self.num_heads
        self.query_linear_layer = nn.Linear(self.num_embeddings, self.num_embeddings)
        self.key_linear_layer = nn.Linear(self.num_embeddings, self.num_embeddings)
        self.value_linear_layer = nn.Linear(self.num_embeddings, self.num_embeddings)
        self.output_linear_layer = nn.Linear(self.num_embeddings, self.num_embeddings)
        self.dropout = nn.Dropout(p=dropout)

    def _split_heads(self, projected):
        """Reshape (B, C, E) into (B, H, C, E//H) without mixing positions."""
        num_batches = projected.shape[0]
        # Split the embedding axis first, then move the head axis forward.
        # Viewing straight to (B, H, C, E//H) would scramble sequence
        # positions into heads.
        return projected.view(
            num_batches, -1, self.num_heads, self.num_headings
        ).transpose(1, 2)

    def forward(self, embedded_input):
        """Apply self-attention to ``embedded_input``.

        Args:
            embedded_input: Tensor of shape (B, C, E) with E == ``num_embeddings``.

        Returns:
            Tensor of shape (B, C, E).
        """
        num_batches, _, _ = embedded_input.shape

        # Each of query / key / value has its own projection.
        querys = self._split_heads(self.query_linear_layer(embedded_input))   # (B, H, C, E//H)
        keys = self._split_heads(self.key_linear_layer(embedded_input))       # (B, H, C, E//H)
        values = self._split_heads(self.value_linear_layer(embedded_input))   # (B, H, C, E//H)

        # Scaled dot-product attention: dividing by sqrt(d_k) keeps the
        # logits from growing with the head size before the softmax.
        attention_filter = torch.matmul(querys, keys.transpose(-2, -1))       # (B, H, C, C)
        attention_filter = attention_filter / math.sqrt(self.num_headings)
        attention_filter = attention_filter.softmax(-1)                       # (B, H, C, C)
        attention_filter = self.dropout(attention_filter)                     # (B, H, C, C)
        filtered_values = torch.matmul(attention_filter, values)              # (B, H, C, E//H)

        # Concatenate the heads back together: (B, H, C, E//H) -> (B, C, E).
        output = (
            filtered_values.transpose(1, 2)
            .contiguous()
            .view(num_batches, -1, self.num_heads * self.num_headings)
        )

        return self.output_linear_layer(output)

# Smoke test: 8-head self-attention over the 512-wide normalised embeddings.
multi_headed_attention = Multi_Headed_Attention(512, 8)
multi_headed_attention_output = multi_headed_attention.forward(layer_norm_output)
# Attention returns a tensor with the same shape as its input.
print(multi_headed_attention_output.shape)

torch.Size([1, 10, 512])


In [32]:
class Add_And_Norm(nn.Module):
    """Residual connection followed by layer normalisation.

    Args:
        num_embeddings: Size of the last axis of the tensors being combined.
    """

    def __init__(self, num_embeddings):
        super().__init__()
        self.num_embeddings = num_embeddings
        self.layer_norm = Layer_Norm(self.num_embeddings)

    def forward(self, input_previous_layer, output_previous_layer):
        """Return the layer-normalised sum of a sub-layer's input and output.

        Args:
            input_previous_layer: Tensor that was fed into the sub-layer.
            output_previous_layer: Tensor the sub-layer produced; must be
                broadcast-compatible with ``input_previous_layer``.

        Returns:
            The normalised sum, with the same shape as the sum.
        """
        residual_connection = input_previous_layer + output_previous_layer
        # Use this module's own norm (and its own gamma/beta), not a
        # notebook-level global that may not exist on a fresh kernel.
        return self.layer_norm(residual_connection)
    
# Smoke test: combine the attention input and output with a residual add + norm.
add_and_norm = Add_And_Norm(512)
add_and_norm_output = add_and_norm(layer_norm_output, multi_headed_attention_output)
# The result keeps the shape of its two inputs.
print(add_and_norm_output.shape)

torch.Size([1, 10, 512])


In [34]:
class Feed_Forward(nn.Module):
    """Two-layer position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        num_embeddings: Size of the input and output feature axis.
        feed_forward_dimensions: Width of the hidden layer.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, num_embeddings, feed_forward_dimensions, dropout=0.1):
        super().__init__()
        self.num_embeddings = num_embeddings
        self.feed_forward_dimensions = feed_forward_dimensions
        # Expand to the hidden width, then project back to the model width.
        self.feed_forward_1 = nn.Linear(num_embeddings, feed_forward_dimensions)
        self.feed_forward_2 = nn.Linear(feed_forward_dimensions, num_embeddings)
        self.dropout = nn.Dropout(dropout)

    def forward(self, embedded_input):
        """Apply the feed-forward network to the last axis of ``embedded_input``.

        Args:
            embedded_input: Tensor whose last axis has size ``num_embeddings``.

        Returns:
            Tensor of the same shape as ``embedded_input``.
        """
        hidden = self.feed_forward_1(embedded_input)
        hidden = hidden.relu()
        hidden = self.dropout(hidden)
        return self.feed_forward_2(hidden)
    
# Smoke test: 512 -> 2048 -> 512 feed-forward network on the add-and-norm output.
feed_forward = Feed_Forward(512, 2048)
feed_forward_output = feed_forward(add_and_norm_output)
# The network projects back to the input width, so the shape is unchanged.
print(feed_forward_output.shape)

torch.Size([1, 10, 512])


In [58]:
class _Encoder_Layer(nn.Module):
    """One encoder layer: a self-attention and a feed-forward sub-layer with their norms."""

    def __init__(self, num_embeddings, num_heads, feed_forward_dimensions):
        super().__init__()
        self.pre_attention_norm = Layer_Norm(num_embeddings)
        self.multi_headed_attention = Multi_Headed_Attention(num_embeddings, num_heads)
        self.attention_add_and_norm = Add_And_Norm(num_embeddings)
        self.pre_feed_forward_norm = Layer_Norm(num_embeddings)
        self.feed_forward = Feed_Forward(num_embeddings, feed_forward_dimensions)
        self.feed_forward_add_and_norm = Add_And_Norm(num_embeddings)
        self.output_norm = Layer_Norm(num_embeddings)

    def forward(self, embedded_input):
        # Attention sub-layer: the residual is taken from the normalised input.
        attention_input = self.pre_attention_norm(embedded_input)
        attention_output = self.multi_headed_attention(attention_input)
        attended = self.attention_add_and_norm(attention_input, attention_output)

        # Feed-forward sub-layer, built the same way.
        feed_forward_input = self.pre_feed_forward_norm(attended)
        feed_forward_output = self.feed_forward(feed_forward_input)
        combined = self.feed_forward_add_and_norm(feed_forward_input, feed_forward_output)

        return self.output_norm(combined)


class Encoder(nn.Module):
    """Stack of ``num_layers`` encoder layers followed by a final layer norm.

    Every layer owns its own attention, feed-forward and normalisation
    parameters; nothing is shared between layers.

    Args:
        num_embeddings: Size of the embedding (last) axis of the input.
        num_heads: Number of attention heads in every layer.
        num_layers: Number of encoder layers to stack.
        feed_forward_dimensions: Hidden width of each feed-forward sub-layer.
            Defaults to ``4 * num_embeddings`` (2048 for 512-wide embeddings).
    """

    def __init__(self, num_embeddings, num_heads, num_layers, feed_forward_dimensions=None):
        super().__init__()
        if feed_forward_dimensions is None:
            # The hidden width is independent of the head count; it must not
            # be set from num_heads.
            feed_forward_dimensions = 4 * num_embeddings
        self.num_embeddings = num_embeddings
        self.num_layers = num_layers
        self.feed_forward_dimensions = feed_forward_dimensions
        # Build each layer separately so the layers do not share weights.
        self.layers = nn.ModuleList(
            [
                _Encoder_Layer(num_embeddings, num_heads, feed_forward_dimensions)
                for _ in range(num_layers)
            ]
        )
        self.layer_norm = Layer_Norm(num_embeddings)

    def forward(self, embedded_input):
        """Run ``embedded_input`` through every layer, then the final norm.

        Args:
            embedded_input: Tensor of shape (B, C, E) with E == ``num_embeddings``.

        Returns:
            Tensor of shape (B, C, E).
        """
        hidden = embedded_input
        for layer in self.layers:
            hidden = layer(hidden)
        return self.layer_norm(hidden)

In [59]:
# End-to-end smoke test: embed one sequence of ten token ids and encode it.
src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
embedding = Create_Embedding(512, 11)
embedded_input = embedding(src)
# 512-wide embeddings, 8 attention heads, 4 encoder layers.
encoder = Encoder(512, 8, 4)
output = encoder.forward(embedded_input)
# Prints the full tensor; values vary per run because no random seed is set.
print(output)

tensor([[[ 1.3089,  1.1938, -0.3771,  ..., -0.7220,  0.7530,  0.3284],
         [ 0.2051,  1.4245, -1.2449,  ..., -0.5768, -0.4670,  1.0837],
         [ 0.0998,  1.0674, -1.7092,  ...,  0.4688, -0.0969,  1.6535],
         ...,
         [ 0.2010,  1.1745,  0.0808,  ..., -0.3659, -0.6513,  0.0759],
         [ 0.3338,  1.2236,  0.7162,  ...,  0.9650,  1.7414,  1.5314],
         [ 1.1617,  0.5882, -0.9717,  ...,  0.2102, -1.9405,  0.3313]]],
       grad_fn=<AddBackward0>)
