<a href="https://colab.research.google.com/github/kgreed4/no_hate_transformer/blob/dombuford/encoder_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install torch



In [16]:
import torch
import torch.nn as nn
import math

In [59]:
#input embeddings

class InputEmbeddings(nn.Module):
  """Token-id embedding lookup whose output is scaled by sqrt(embedding_size).

  Args:
    embedding_size: length of each embedding vector.
    vocab_size: number of rows in the embedding table.
  """

  def __init__(self, embedding_size:int, vocab_size:int):
    super().__init__()
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    # one learnable vector of length embedding_size per vocabulary id
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)

  def forward(self, x):
    """Return the embeddings of the ids in x, multiplied by sqrt(embedding_size)."""
    scale = math.sqrt(self.embedding_size)
    looked_up = self.embedding(x)
    return looked_up * scale


In [47]:
# positional encoding

class PositionalEncoding(nn.Module):
  """Add a fixed sinusoidal position table to the input, then apply dropout.

  The table is precomputed once for seq_len positions and stored as a
  non-persistent buffer: it follows the module across .to()/.cuda()/.double()
  calls but is not written to state_dict.

  Args:
    embedding_size: length of each embedding vector (even or odd).
    seq_len: number of positions the precomputed table covers; forward()
      inputs must not have more positions than this along dim 1.
    dropout: dropout probability applied after the table is added.
  """

  def __init__(self, embedding_size, seq_len, dropout):
    super().__init__()
    self.embedding_size = embedding_size
    self.seq_len = seq_len
    self.dropout = nn.Dropout(dropout)

    #matrix of size (seq_len, embedding_size); built as a local so the name
    #'positionalEncoding' is still free when register_buffer is called
    table = torch.zeros(seq_len, embedding_size)

    #column vector of positions, shape (seq_len, 1)
    position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
    dividing_term = torch.exp(torch.arange(0, embedding_size, 2).float() * (-math.log(10000.0) / embedding_size))
    angles = position * dividing_term

    #apply sin to even columns of the encoding and cosine to the odd columns
    table[:, 0::2] = torch.sin(angles)
    #with an odd embedding_size there is one fewer odd column than even columns
    table[:, 1::2] = torch.cos(angles[:, : embedding_size // 2])

    #add batch dimension -> (1, seq_len, embedding_size) and register as a buffer;
    #persistent=False keeps state_dict identical to the unregistered version
    self.register_buffer('positionalEncoding', table.unsqueeze(0), persistent=False)

  def forward(self, x):
    """Add the first x.shape[1] rows of the table to x and apply dropout.

    The slice on dim 1 means x is expected to be (batch, positions, embedding_size).
    """
    #buffers are not parameters, so no gradient flows into the table
    x = x + self.positionalEncoding[:, :x.shape[1], :]
    return self.dropout(x)

In [4]:
#layer normalization
class LayerNormalization(nn.Module):
  """Normalize over the last dimension, then apply one learnable gain and one learnable bias.

  Args:
    eps: constant added to the standard deviation before dividing.
  """

  def __init__(self, eps:float):
    super().__init__()
    self.eps = eps
    # a single scalar gain and a single scalar bias (shape (1,)), shared by every feature
    self.alpha = nn.Parameter(torch.ones(1))
    self.bias = nn.Parameter(torch.zeros(1))

  def forward(self, x):
    """Return alpha * (x - mean) / (std + eps) + bias, with statistics taken over dim -1."""
    centered = x - x.mean(dim=-1, keepdim=True)
    # Tensor.std with its default (sample) correction, as in the formula above
    spread = x.std(dim=-1, keepdim=True) + self.eps
    return self.alpha * centered / spread + self.bias

In [5]:
#feed forward layer
class FeedForward(nn.Module):
  """Two linear layers with a ReLU and dropout in between.

  Args:
    embedding_size: input and output feature size.
    d_ff: feature size of the hidden layer.
    dropout: dropout probability applied after the ReLU.
  """

  def __init__(self, embedding_size:int, d_ff:int, dropout:float):
    super().__init__()
    self.embedding_size = embedding_size
    self.d_ff = d_ff
    self.dropout = nn.Dropout(dropout)
    # expand to d_ff features, then project back to embedding_size
    self.linear_1 = nn.Linear(in_features=embedding_size, out_features=d_ff)
    self.linear_2 = nn.Linear(in_features=d_ff, out_features=embedding_size)

  def forward(self, x):
    """Apply linear_1 -> ReLU -> dropout -> linear_2 to the last dimension of x."""
    hidden = torch.relu(self.linear_1(x))
    return self.linear_2(self.dropout(hidden))


In [72]:
class MultiHeadAttentionBlock(nn.Module):
  """Multi-head scaled dot-product attention with learned q/k/v and output projections.

  Args:
    num_heads: number of attention heads; must divide embedding_size.
    embedding_size: feature size of the inputs and of the output.
    dropout: dropout probability applied to the attention weights.

  After every forward() call the attention weights of that call are kept in
  self.attention_scores.
  """

  def __init__(self, num_heads, embedding_size, dropout):
    super().__init__()
    self.num_heads = num_heads
    self.embedding_size = embedding_size
    self.dropout = nn.Dropout(dropout)
    #so that the embedding can be divided equally into number of heads there are
    assert embedding_size % num_heads == 0, "embedding_size is not divisible by num of heads"
    #integer division: exact because of the check above, and never goes through a float
    self.d_k = embedding_size // num_heads

    self.W_q = nn.Linear(embedding_size, embedding_size)
    self.W_k = nn.Linear(embedding_size, embedding_size)
    self.W_v = nn.Linear(embedding_size, embedding_size)
    self.W_o = nn.Linear(embedding_size, embedding_size)

  def _split_heads(self, t):
    """Reshape (batch, seq_len, embedding_size) into (batch, num_heads, seq_len, d_k)."""
    return t.view(t.shape[0], t.shape[1], self.num_heads, self.d_k).transpose(1, 2)

  def forward(self, q, k, v, mask=None):
    """Project q, k and v, attend per head, merge the heads and apply W_o.

    Args:
      q, k, v: tensors of shape (batch, seq_len, embedding_size).
      mask: optional tensor broadcastable to (batch, num_heads, seq_len_q, seq_len_k);
        positions where it equals 0 (or False) receive no attention. None
        (the default) attends to every position.

    Returns:
      Tensor of shape (batch, seq_len_q, embedding_size).
    """
    query = self._split_heads(self.W_q(q))
    key = self._split_heads(self.W_k(k))
    value = self._split_heads(self.W_v(v))

    #calculate the attention output and keep the attention weights for inspection
    x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, self.dropout, mask)

    #(batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, embedding_size)
    x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.num_heads * self.d_k)

    return self.W_o(x)

  @staticmethod
  def attention(query, key, value, dropout, mask=None):
    """Scaled dot-product attention over the last two dimensions.

    Args:
      query, key, value: tensors whose last dimension is d_k.
      dropout: callable applied to the softmax weights, or None to skip it.
      mask: optional tensor broadcastable to the score matrix; entries equal
        to 0 (or False) are excluded from the softmax.

    Returns:
      (weights @ value, weights), where weights are the post-softmax
      (and post-dropout) attention weights.
    """
    d_k = query.shape[-1]

    attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
      #the most negative finite value gives exactly zero weight after softmax,
      #while a fully masked row still yields finite numbers instead of NaN
      attention_scores = attention_scores.masked_fill(mask == 0, torch.finfo(attention_scores.dtype).min)
    attention_scores = attention_scores.softmax(dim=-1)  #(Batch, num_heads, seq_len, seq_len)
    if dropout is not None:
      attention_scores = dropout(attention_scores)

    return (attention_scores @ value), attention_scores



In [70]:
#define the encoder layer
class EncoderLayer(nn.Module):
  """One encoder layer: self-attention, then feed-forward, each followed by a residual add and a layer norm.

  Args:
    embedding_size: feature size passed to the attention and feed-forward blocks.
    num_heads: number of attention heads.
    d_ff: hidden size of the feed-forward block.
    dropout: dropout probability passed to both sub-blocks.
    eps: epsilon passed to both layer normalizations.
  """

  def __init__(self, embedding_size, num_heads, d_ff, dropout, eps): #will need to update inputs
    super().__init__()
    self.multiHeadAttention = MultiHeadAttentionBlock(num_heads, embedding_size, dropout)
    self.feedForward = FeedForward(embedding_size, d_ff, dropout)
    # index 0 normalizes after attention, index 1 after the feed-forward block
    self.layerNormalization = nn.ModuleList([LayerNormalization(eps), LayerNormalization(eps)])

  def forward(self, x):
    """Run x through both sub-blocks and return the second normalized output."""
    norm_after_attention, norm_after_feed_forward = self.layerNormalization
    # self-attention: x serves as query, key and value; adding x back is the residual connection
    x = norm_after_attention(x + self.multiHeadAttention(x, x, x))
    return norm_after_feed_forward(x + self.feedForward(x))


In [73]:
class Encoder(nn.Module):
  """Input embeddings plus positional encoding, followed by a stack of n encoder layers.

  Args:
    n: number of EncoderLayer instances in the stack.
    num_heads, embedding_size, d_ff, dropout, eps: forwarded to every EncoderLayer.
    vocab_size: forwarded to InputEmbeddings.
    seq_len: forwarded to PositionalEncoding.
  """

  def __init__(self, n, num_heads, embedding_size, vocab_size, seq_len, d_ff, dropout, eps):
    super().__init__()
    self.inputEmbeddings = InputEmbeddings(embedding_size, vocab_size)
    self.pe = PositionalEncoding(embedding_size, seq_len, dropout)
    self.encoderLayer = nn.ModuleList(
      EncoderLayer(embedding_size, num_heads, d_ff, dropout, eps) for _ in range(n)
    )

  def forward(self, x):
    """Embed x, add the positional encoding, then apply every encoder layer in order."""
    hidden = self.pe(self.inputEmbeddings(x))
    for layer in self.encoderLayer:
      hidden = layer(hidden)
    return hidden


In [74]:
from numpy import random
from transformers import BertTokenizer
# Demo: build an encoder and run one tokenized sentence through it.
n = 6
num_heads = 8
embedding_size = 512
seq_len = 7
d_ff = 1024
dropout = 0.1
eps = 10**-6
batch_size = 32

input_seq = "Hello my name is Dominique"
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The embedding table needs one row for every id the tokenizer can emit; a
# hard-coded size smaller than the tokenizer's vocabulary makes nn.Embedding
# raise IndexError as soon as a larger id shows up.
vocab_size = tokenizer.vocab_size

# Tokenize the input text, truncated to the seq_len positions the encoder is built for
tokenized_input = tokenizer(input_seq, return_tensors='pt', truncation=True, max_length=seq_len)

#encoderlayer = EncoderLayer(embedding_size, num_heads, d_ff, dropout, eps)
encoder = Encoder(n, num_heads, embedding_size, vocab_size, seq_len, d_ff, dropout, eps)
print(encoder(tokenized_input['input_ids']))

tensor([[[ 1.3966e+00,  2.4102e-01,  9.5467e-01,  ..., -2.1335e-01,
          -3.4617e-01,  8.7963e-01],
         [ 4.2103e-01,  6.5657e-01, -2.7865e-01,  ...,  6.0341e-01,
          -2.1286e-03,  4.3383e-01],
         [-7.7296e-01, -2.2518e+00, -1.0065e+00,  ..., -9.1954e-02,
           5.4797e-01, -1.2536e-01],
         ...,
         [-6.4488e-01, -1.1524e+00,  2.1089e-01,  ...,  4.1641e-01,
          -3.0807e-01,  2.6539e+00],
         [-2.3036e+00, -2.3276e+00,  1.1196e+00,  ...,  6.7676e-01,
          -6.8076e-01, -7.9725e-01],
         [-7.4339e-01,  2.9110e-01,  4.1816e-02,  ...,  2.3869e-01,
          -7.6795e-01, -3.7369e-01]]], grad_fn=<AddBackward0>)
