# Compare with Actual Distilbert

In [2]:
import torch
from distilbert_by_hand import DistilBertByHand
from transformers import DistilBertTokenizer, DistilBertModel

In [9]:
sentence = "The cat sat on the mat."

# My implementation
my_transformer = DistilBertByHand()

# Actual implementation
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
their_transformer = DistilBertModel.from_pretrained('distilbert-base-uncased')
inputs = tokenizer(sentence, return_tensors="pt")

# Compare the outputs
my_output = my_transformer(sentence)
with torch.no_grad():  
    their_output = their_transformer(**inputs)

print(my_output)
print(their_output[0][0])

tensor([[ 0.7931, -0.2998, -0.5067,  ..., -0.7640, -0.0722,  0.8590],
        [-0.2309,  0.9762,  0.4279,  ..., -0.0098,  0.1503,  0.1877],
        [-0.3172, -0.0491, -0.2106,  ...,  0.0203,  0.4648,  1.4245],
        ...,
        [ 0.7091, -0.0809, -0.6096,  ..., -0.3787, -0.7070, -0.9288],
        [ 0.3666, -0.3846, -0.2389,  ...,  0.7856,  0.6769,  0.5238],
        [-0.1419,  0.5372,  0.7171,  ...,  0.3744,  0.3691,  0.2721]])
tensor([[-0.2713, -0.0781, -0.0216,  ..., -0.0853,  0.4197,  0.1664],
        [-0.2253,  0.0514, -0.1776,  ..., -0.0192,  1.0410, -0.4474],
        [-0.1074, -0.0692,  0.1663,  ..., -0.2652,  0.3477,  0.3385],
        ...,
        [ 0.2359, -0.0804, -0.0106,  ..., -0.1970,  0.3074,  0.1018],
        [ 0.3169, -0.1647, -0.3697,  ...,  0.0730,  0.0621, -0.6574],
        [ 0.3583,  0.1961,  0.0775,  ..., -0.1091,  0.0866, -0.5101]])


In [7]:
their_transformer.eval()  # evaluation mode

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

# Old work below this point

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel

model = DistilBertModel.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

distilbert_weights = model.state_dict()

embedding_dimension = 768
num_heads = 6
head_size = embedding_dimension // num_heads 

def layer_norm(x, weight, bias, eps=1e-6):
    
    mean = x.mean(dim=-1, keepdim=True)
    std_dev = x.std(dim=-1, keepdim=True)
    x_normalized = (x - mean) / (std_dev + eps)
    output = weight * x_normalized + bias
    
    return output

def get_head_tensor(X_expanded, layer, Q_K_or_V):
    
    #Weight matrix W_Q, W_K, or W_V
    weight_matrix = distilbert_weights['transformer.layer.' + str(layer) + '.attention.' + Q_K_or_V.lower() + '_lin.weight']
    head_divided_weight_matrix = weight_matrix.view(num_heads, head_size, embedding_dimension)

    #Bias matrix b_Q, b_K, or b_V
    bias_matrix = distilbert_weights['transformer.layer.' + str(layer) + '.attention.' + Q_K_or_V.lower() + '_lin.bias']
    head_divided_bias_matrix = bias_matrix.view(num_heads, head_size)

    # Multiply X with W_Q, W_K, or W_V
    head_matrices = torch.matmul(X_expanded, head_divided_weight_matrix.transpose(1, 2)) + head_divided_bias_matrix.unsqueeze(1)

    # Reshape to get the head tensor
    head_matrices = head_matrices.squeeze(1)
    
    return head_matrices

def embed(sentence):
    
    distilbert_weights = model.state_dict()
    
    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors="pt")
    inputs = inputs["input_ids"][0]
    tokens_length = len(inputs)
    
    # Full token embeddings
    W = distilbert_weights['embeddings.word_embeddings.weight']

    # Sentence token embeddings
    X = W[inputs]
    
    # Positional embeddings
    P_full = distilbert_weights['embeddings.position_embeddings.weight']
    P = P_full[:tokens_length, :]

    # Add position embeddings to token embeddings
    X = X + P

    # Normalize
    X = layer_norm(X, distilbert_weights['embeddings.LayerNorm.weight'], distilbert_weights['embeddings.LayerNorm.bias'])
    
    return X

tokens_len = X.shape[0] #TODO: redundant, initialize when creating a class

def attention(X, layer):
    
    # For pytorch broadcasting to work, we need to expand the tensor to (1, 9, 768)
    X_expanded = X.unsqueeze(0)  # Shape: (1, 9, 768)
    
    # Query, Key, and Value heads
    Q = get_head_tensor(X_expanded, layer, 'Q')
    K = get_head_tensor(X_expanded, layer, 'K')
    V = get_head_tensor(X_expanded, layer, 'V')

    # Attention Weights
    A = torch.softmax(torch.matmul(Q, K.transpose(1, 2) / torch.sqrt(torch.tensor(head_size).float())),dim=-1)

    # Update V
    V = torch.matmul(A, V)

    # Concatenating the heads
    V = V.view(tokens_len,embedding_dimension)

    #Linear layer
    W_out_lin = distilbert_weights['transformer.layer.' + str(layer) + '.attention.out_lin.weight']
    b_out_lin = distilbert_weights['transformer.layer.' + str(layer) + '.attention.out_lin.bias']
    b_out_lin_matrix = b_out_lin.repeat(tokens_len, 1)

    residual = torch.matmul(V, W_out_lin) + b_out_lin_matrix  #TODO: Need to transpose W_out_lin as per copilot suggestion?

    # Residual Connections
    X = X + residual

    # Normalize
    W_sa = distilbert_weights['transformer.layer.' + str(layer) + '.sa_layer_norm.weight']
    b_sa = distilbert_weights['transformer.layer.' + str(layer) + '.sa_layer_norm.bias']
    X = layer_norm(X, W_sa, b_sa)
    
    return X

def feed_forward(X, layer):
    
    # ff Linear 1
    W_ff1 = distilbert_weights['transformer.layer.' + str(layer) + '.ffn.lin1.weight']
    b_ff1 = distilbert_weights['transformer.layer.' + str(layer) + '.ffn.lin1.bias']
    b_ff1_matrix = b_ff1.repeat(9, 1)

    FF_data = torch.matmul(X, W_ff1.transpose(0,1) ) + b_ff1_matrix

    # FF ReLU
    FF_data = torch.relu(FF_data)

    # FF Linear 2
    W_ff2 = distilbert_weights['transformer.layer.' + str(layer) + '.ffn.lin2.weight']
    b_ff2 = distilbert_weights['transformer.layer.' + str(layer) + '.ffn.lin2.bias']
    b_ff2_matrix = b_ff2.repeat(9, 1)

    X = torch.matmul(FF_data, W_ff2.transpose(0,1) ) + b_ff2_matrix

    # Normalize
    W_ff = distilbert_weights['transformer.layer.' + str(layer) + '.output_layer_norm.weight']
    b_ff = distilbert_weights['transformer.layer.' + str(layer) + '.output_layer_norm.bias']
    X = layer_norm(X, W_ff, b_ff)
    
    return X


In [90]:
X = embed("The cat sat on the mat.")
for layer in range(6):
    X = attention(X, layer)
    X = feed_forward(X, layer)
print(X)

tensor([[-0.9836, -0.1272, -0.6853,  ...,  1.0959,  0.5143,  0.6237],
        [ 0.2990, -0.2386,  0.7141,  ..., -0.2033,  0.0946,  0.0958],
        [ 1.0037, -0.1444,  0.3477,  ..., -0.0689, -0.6273,  0.4990],
        ...,
        [ 1.1924, -0.2387,  1.2874,  ...,  1.0004, -0.8292, -0.3271],
        [-0.2179,  0.1382,  1.5083,  ...,  0.2565,  1.0773, -0.4967],
        [ 0.4736,  0.0803,  0.5177,  ...,  1.0466, -0.1284, -0.4433]])


In [107]:
transformer = DistilBERTProcessor()
X = transformer("The cat sat on the mat.")
print(X)

tensor([[-0.9836, -0.1272, -0.6853,  ...,  1.0959,  0.5143,  0.6237],
        [ 0.2990, -0.2386,  0.7141,  ..., -0.2033,  0.0946,  0.0958],
        [ 1.0037, -0.1444,  0.3477,  ..., -0.0689, -0.6273,  0.4990],
        ...,
        [ 1.1924, -0.2387,  1.2874,  ...,  1.0004, -0.8292, -0.3271],
        [-0.2179,  0.1382,  1.5083,  ...,  0.2565,  1.0773, -0.4967],
        [ 0.4736,  0.0803,  0.5177,  ...,  1.0466, -0.1284, -0.4433]])


In [102]:
class DistilBERTProcessor:
    
    def __init__(self):
        
        self.model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.distilbert_weights = self.model.state_dict()
        self.embedding_dimension = 768
        self.num_heads = 6
        self.head_size = self.embedding_dimension // self.num_heads
        self.tokens_len = None  # This will be initialized during embedding

    def layer_norm(self, x, weight, bias, eps=1e-6):
        
        mean = x.mean(dim=-1, keepdim=True)
        std_dev = x.std(dim=-1, keepdim=True)
        x_normalized = (x - mean) / (std_dev + eps)
        output = weight * x_normalized + bias
        
        return output

    def get_head_tensor(self, X_expanded, layer, Q_K_or_V):
        
        #Weight matrix W_Q, W_K, or W_V
        weight_matrix = self.distilbert_weights['transformer.layer.' + str(layer) + '.attention.' + Q_K_or_V.lower() + '_lin.weight']
        head_divided_weight_matrix = weight_matrix.view(self.num_heads, self.head_size, self.embedding_dimension)
        
        #Bias matrix b_Q, b_K, or b_V
        bias_matrix = self.distilbert_weights['transformer.layer.' + str(layer) + '.attention.' + Q_K_or_V.lower() + '_lin.bias']
        head_divided_bias_matrix = bias_matrix.view(self.num_heads, self.head_size)
        
        # Multiply X with W_Q, W_K, or W_V
        head_matrices = torch.matmul(X_expanded, head_divided_weight_matrix.transpose(1, 2)) + head_divided_bias_matrix.unsqueeze(1)
        
        # Reshape to get the head tensor
        head_matrices = head_matrices.squeeze(1)
        
        return head_matrices

    def embed(self, sentence):
        
        # Tokenize the sentence
        inputs = self.tokenizer(sentence, return_tensors="pt")
        inputs = inputs["input_ids"][0]
        
        # Initialize tokens_len
        self.tokens_len = len(inputs)
        
        # Full token embeddings
        W = self.distilbert_weights['embeddings.word_embeddings.weight']
        
        # Sentence token embeddings
        X = W[inputs]
        
        # Positional embeddings
        P_full = self.distilbert_weights['embeddings.position_embeddings.weight']
        P = P_full[:self.tokens_len, :]
        
        # Add position embeddings to token embeddings
        X = X + P
        
        # Normalize
        X = self.layer_norm(X, self.distilbert_weights['embeddings.LayerNorm.weight'], self.distilbert_weights['embeddings.LayerNorm.bias'])
        
        return X

    def attention(self, X, layer):
        
        # For pytorch broadcasting to work, we need to expand the tensor to (1, self.token_length, 768)
        X_expanded = X.unsqueeze(0)
        
        # Query, Key, and Value heads
        Q = self.get_head_tensor(X_expanded, layer, 'Q')
        K = self.get_head_tensor(X_expanded, layer, 'K')
        V = self.get_head_tensor(X_expanded, layer, 'V')
        
        # Attention Weights
        A = torch.softmax(torch.matmul(Q, K.transpose(1, 2) / torch.sqrt(torch.tensor(self.head_size).float())), dim=-1)
        
        # Update V
        V = torch.matmul(A, V)
        
        # Concatenating the heads
        V = V.view(self.tokens_len, self.embedding_dimension)
        
        # Linear layer
        W_out_lin = self.distilbert_weights['transformer.layer.' + str(layer) + '.attention.out_lin.weight']
        b_out_lin = self.distilbert_weights['transformer.layer.' + str(layer) + '.attention.out_lin.bias']
        b_out_lin_matrix = b_out_lin.repeat(self.tokens_len, 1)
        
        residual = torch.matmul(V, W_out_lin) + b_out_lin_matrix #TODO: Need to transpose W_out_lin as per copilot suggestion?

        # Residual Connections
        X = X + residual
        
        # Normalize
        W_sa = self.distilbert_weights['transformer.layer.' + str(layer) + '.sa_layer_norm.weight']
        b_sa = self.distilbert_weights['transformer.layer.' + str(layer) + '.sa_layer_norm.bias']
        
        X = self.layer_norm(X, W_sa, b_sa)
        
        return X

    def feed_forward(self, X, layer):
        
        # FF Linear 1
        W_ff_l1 = self.distilbert_weights['transformer.layer.' + str(layer) + '.ffn.lin1.weight']
        b_ff_l1 = self.distilbert_weights['transformer.layer.' + str(layer) + '.ffn.lin1.bias']
        b_ff_l1_matrix = b_ff_l1.repeat(self.tokens_len, 1)
        
        FF_data = torch.matmul(X, W_ff_l1.transpose(0, 1)) + b_ff_l1_matrix
        
        # FF ReLU
        FF_data = torch.relu(FF_data)
        
        # FF Linear 2
        W_ff_l2 = self.distilbert_weights['transformer.layer.' + str(layer) + '.ffn.lin2.weight']
        b_ff_l2 = self.distilbert_weights['transformer.layer.' + str(layer) + '.ffn.lin2.bias']
        b_ff_l2_matrix = b_ff_l2.repeat(self.tokens_len, 1)
        
        X = torch.matmul(FF_data, W_ff_l2.transpose(0, 1)) + b_ff_l2_matrix
        
        # Normalize
        W_ff = self.distilbert_weights['transformer.layer.' + str(layer) + '.output_layer_norm.weight']
        b_ff = self.distilbert_weights['transformer.layer.' + str(layer) + '.output_layer_norm.bias']
        
        X = self.layer_norm(X, W_ff, b_ff)
        
        return X

    def run_layers(self, X):
        
        for layer in range(6):
        
            X = self.attention(X, layer)
            X = self.feed_forward(X, layer)
            
        return X
    
    def __call__(self, sentence):
        
        X = self.embed(sentence)
        
        X = self.run_layers(X)
        
        return X