In [161]:
import numpy as np

# Let's use very small embeddings (3D) for visualization
# Simplified word embeddings (normally these would be learned)
word_embeddings = {
    "I":      np.array([1, 0, 0]),
    "like":   np.array([0, 1, 0]),
    "Batman": np.array([0, 0, 1])
}

# Simplified weight matrices (normally these would be learned)
W_query = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1]
])

W_key = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1]
])

W_value = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1]
])

# Let's look at what happens with the word "I"
word = "I"
embedding = word_embeddings[word]

print(f"Processing word: {word}")
print(f"Original embedding: {embedding}")

# Create Q, K, V for this word
Q = np.dot(embedding, W_query)
K = np.dot(embedding, W_key)
V = np.dot(embedding, W_value)

print("\nFor word 'I':")
print(f"Query vector (what I'm looking for): {Q}")
print(f"Key vector (what I match against): {K}")
print(f"Value vector (what information I carry): {V}")

# Now let's see how "I" interacts with "like"
other_word = "like"
other_K = np.dot(word_embeddings[other_word], W_key)

# Calculate attention score between "I" and "like"
attention_score = np.dot(Q, other_K)
print(f"\nAttention score between '{word}' and '{other_word}': {attention_score}")

# Visual representation
print("\nVisual Representation:")
print("Word 'I' looking for matches:")
print("┌─────────┐")
print("│   I    │ ──Query──> [1,0,0] • Compare with Keys of:")
print("└─────────┘")
print("                         │")
print("                         ├──> I     [1,0,0] = Score: 1.0")
print("                         ├──> like  [0,1,0] = Score: 0.0")
print("                         └──> Batman[0,0,1] = Score: 0.0")

Processing word: I
Original embedding: [1 0 0]

For word 'I':
Query vector (what I'm looking for): [1 0 0]
Key vector (what I match against): [1 0 0]
Value vector (what information I carry): [1 0 0]

Attention score between 'I' and 'like': 0

Visual Representation:
Word 'I' looking for matches:
┌─────────┐
│   I    │ ──Query──> [1,0,0] • Compare with Keys of:
└─────────┘
                         │
                         ├──> I     [1,0,0] = Score: 1.0
                         ├──> like  [0,1,0] = Score: 0.0
                         └──> Batman[0,0,1] = Score: 0.0


In [162]:
import numpy as np

class SelfAttention:
    def __init__(self, embedding_dim):
        self.embedding_dim = embedding_dim
        self.W_query = np.random.randn(embedding_dim, embedding_dim)
        self.W_key = np.random.randn(embedding_dim, embedding_dim)
        self.W_value = np.random.randn(embedding_dim, embedding_dim)

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    def forward(self, X):
        Q = np.dot(X, self.W_query)
        K = np.dot(X, self.W_key)
        V = np.dot(X, self.W_value)
        
        attention_scores = np.dot(Q, K.T)
        attention_scores = attention_scores / np.sqrt(self.embedding_dim)
        attention_weights = self.softmax(attention_scores)
        
        output = np.dot(attention_weights, V)
        return output, attention_weights

# Process sentences
sentence1 = ["I", "like", "Batman"]
sentence2 = ["I", "like", "Dwight", "Schrute", "from", "The", "Office"]

# Combine unique words from both sentences
unique_words = list(set(sentence1 + sentence2))
vocab_size = len(unique_words)
word_to_idx = {word: idx for idx, word in enumerate(unique_words)}

# Create one-hot encodings
def create_one_hot(sentence, word_to_idx, vocab_size):
    encodings = np.zeros((len(sentence), vocab_size))
    for i, word in enumerate(sentence):
        encodings[i][word_to_idx[word]] = 1
    return encodings

# Process each sentence separately
X1 = create_one_hot(sentence1, word_to_idx, vocab_size)
X2 = create_one_hot(sentence2, word_to_idx, vocab_size)

# Initialize and apply self-attention
attention = SelfAttention(embedding_dim=vocab_size)

# Process sentence 1
output1, attention_weights1 = attention.forward(X1)
print("\nAttention weights for 'I like Batman':")
for i, word1 in enumerate(sentence1):
    for j, word2 in enumerate(sentence1):
        print(f"{word1} -> {word2}: {attention_weights1[i][j]:.3f}")

# Process sentence 2
output2, attention_weights2 = attention.forward(X2)
print("\nAttention weights for 'I like Dwight Schrute from The Office':")
for i, word1 in enumerate(sentence2):
    for j, word2 in enumerate(sentence2):
        print(f"{word1} -> {word2}: {attention_weights2[i][j]:.3f}")


Attention weights for 'I like Batman':
I -> I: 0.203
I -> like: 0.498
I -> Batman: 0.298
like -> I: 0.511
like -> like: 0.363
like -> Batman: 0.126
Batman -> I: 0.097
Batman -> like: 0.322
Batman -> Batman: 0.581

Attention weights for 'I like Dwight Schrute from The Office':
I -> I: 0.071
I -> like: 0.174
I -> Dwight: 0.077
I -> Schrute: 0.280
I -> from: 0.043
I -> The: 0.103
I -> Office: 0.252
like -> I: 0.090
like -> like: 0.064
like -> Dwight: 0.153
like -> Schrute: 0.219
like -> from: 0.259
like -> The: 0.086
like -> Office: 0.130
Dwight -> I: 0.166
Dwight -> like: 0.082
Dwight -> Dwight: 0.139
Dwight -> Schrute: 0.219
Dwight -> from: 0.120
Dwight -> The: 0.161
Dwight -> Office: 0.113
Schrute -> I: 0.092
Schrute -> like: 0.197
Schrute -> Dwight: 0.264
Schrute -> Schrute: 0.059
Schrute -> from: 0.330
Schrute -> The: 0.025
Schrute -> Office: 0.032
from -> I: 0.314
from -> like: 0.087
from -> Dwight: 0.096
from -> Schrute: 0.254
from -> from: 0.140
from -> The: 0.035
from -> Office:

In [163]:
multihead_W_query = np.random.randn(3, 6, 6)
multihead_W_query

array([[[-1.14853278,  0.24294323, -0.77837852,  0.46768282,
          1.10207346, -1.37347654],
        [-0.33810614,  0.67845663, -1.38402979, -1.17511333,
          0.78840684,  0.20958856],
        [-1.00998464,  0.95613032, -1.40618625,  0.77488933,
         -0.02233755, -1.59799296],
        [ 1.18096428,  0.67515626, -0.73140025,  0.57697706,
          0.91110612,  0.07475426],
        [ 0.02173382,  2.14251567, -2.75189226, -1.16521825,
          0.65411177, -0.18243588],
        [-0.45791856, -1.55752755,  0.17662621, -0.1791015 ,
          1.21998309,  1.12747096]],

       [[-0.69879681,  0.85213555, -0.15920229, -1.70878087,
         -2.71404885, -1.87139264],
        [-0.783748  ,  0.30179782,  0.35268233, -0.29266392,
          0.50183542, -2.12717599],
        [-0.45232925, -0.49042193, -0.9220487 ,  0.07075109,
          0.40954338, -1.03936272],
        [-0.12082068,  1.88814436,  0.15811987,  0.77821047,
         -0.09576802, -0.07125079],
        [-1.08351856,  1.051

In [164]:
class Dropout:
    def __init__(self, p=0.1):
        self.p = p
    def forward(self, x, training=True):
        if not training or self.p == 0:
            return x
        
        mask = np.random.binomial(1, 1-self.p, x.shape) / (1-self.p)
        return x * mask
    

In [165]:
class MultiHeadAttention:
    def __init__(self, embedding_dim, num_head, dropout=0.1):
        self.embedding_dim = embedding_dim
        self.num_head = num_head

        self.head_dim = embedding_dim // num_head

        self.W_query = [np.random.randn(embedding_dim, self.head_dim) for _ in range(num_head)]
        self.W_key = [np.random.randn(embedding_dim, self.head_dim) for _ in range(num_head)]
        self.W_value = [np.random.randn(embedding_dim, self.head_dim) for _ in range(num_head)]

        total_head_dim = self.head_dim * num_head
        self.W_output = np.random.randn(total_head_dim, embedding_dim)

        self.dropout = Dropout(dropout)
    
    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)
    
    def attention(self, Q, K, V, mask=None):
        batch_size = Q.shape[0]
        
        # Reshape for batch matrix multiplication
        Q_flat = Q.reshape(-1, Q.shape[-1])
        K_flat = K.reshape(-1, K.shape[-1])
        V_flat = V.reshape(-1, V.shape[-1])

        scores = np.dot(Q_flat, K_flat.T)
        scores = scores / np.sqrt(self.head_dim)

        if mask is not None:
            scores = scores * mask + -1e9 * (1 - mask)
        
        weights = self.softmax(scores)
        return np.dot(weights, V_flat), weights
    
    def forward(self, X, memory = None, mask=None,training=True):
        head_outputs = []
        all_attention_weights = []

        for head in range(self.num_head):
            Q = np.dot(X, self.W_query[head])

            if memory is not None:
                K = np.dot(memory, self.W_key[head])
                V = np.dot(memory, self.W_value[head])
            else:
                K = np.dot(X, self.W_key[head])
                V = np.dot(X, self.W_value[head])

            head_output, attention_weights = self.attention(Q, K, V)
            dropout = self.dropout.forward(head_output, training)
            head_outputs.append(dropout)
            all_attention_weights.append(attention_weights)
        
        multi_head_output = np.concatenate(head_outputs, axis = -1)

        final_output = np.dot(multi_head_output, self.W_output)

        final_output = self.dropout.forward(final_output, training)

        return final_output, all_attention_weights
    

In [166]:
#Feed Forward Network
class FFN():
    def __init__(self, dmodel, d_ff): #dmodel = 512 & d_ff = 2048 as per paper
        self.W1 = np.random.randn(dmodel, d_ff)
        self.W2 = np.random.randn(d_ff, dmodel)
        self.b1 = np.random.randn(d_ff)
        self.b2 = np.random.randn(dmodel)

    def reLU(self, X):
        return np.maximum(0, X)
    
    def forward(self, X):
        self.Z1 = np.dot(X,self.W1) + self.b1
        self.A1 = self.reLU(self.Z1)
        self.Z2 = np.dot(self.A1,self.W2) + self.b2
        return self.Z2

In [167]:
class PositionalEncoding:
    def __init__(self, dmodel,max_seq_length):
        self.encoding = np.zeros((max_seq_length, dmodel))
        for pos in range(max_seq_length):
            for j in range(int(dmodel/2)):
                 denominator = 10000**(2*j/dmodel)
                 self.encoding[pos, 2*j] = np.sin(pos/denominator)
                 self.encoding[pos, 2*j+1] = np.cos((pos/denominator))
    
    def forward(self,X):
        seq_length = X.shape[1]
        positions_needed = self.encoding[:seq_length]
        return X + positions_needed

In [168]:
class Embedding:
    def __init__(self, vocab_size, dmodel):
        self.embedding_matrix = np.random.randn(vocab_size, dmodel)
        self.dmodel = dmodel
        pass
    def forward(self,x):
        return self.embedding_matrix[x] * np.sqrt(self.dmodel) 
        

In [169]:
class LayerNormalization:
    def __init__(self, dmodel, epsilon = 1e-12):
        self.gamma = np.ones(dmodel)
        self.beta = np.zeros(dmodel)
        self.epsilon = epsilon

    def forward(self, x):
        mean = np.mean(x, axis=-1, keepdims=True)
        variance = np.var(x, axis=-1, keepdims=True)
        normalize = (x - mean) / np.sqrt(variance + self.epsilon)
        scale_shift = self.gamma * normalize + self.beta
        return scale_shift
    

In [170]:
class EncoderLayer:
    def __init__(self, dmodel, num_heads, d_ff, dropout=0.1):
        self.mha = MultiHeadAttention(dmodel, num_heads)

        self.ffn = FFN(dmodel,d_ff)

        self.norm1 = LayerNormalization(dmodel) #after mha
        self.norm2 = LayerNormalization(dmodel) #after ffn
        self.dropout = Dropout(dropout)

    def forward(self, X, training=True):
        atten_output, atten_weights = self.mha.forward(X)
        dropout1 = self.dropout.forward(atten_output, training)
        output1 = self.norm1.forward(X + dropout1)

        ffn_output = self.ffn.forward(output1)
        dropout2 = self.dropout.forward(ffn_output, training)
        output2 = self.norm2.forward(output1 + dropout2)

        return output2, atten_weights
    

In [171]:
class Encoder:
    def __init__(self, num_layers, dmodel, num_heads, d_ff):
        self.layers = [EncoderLayer(dmodel,num_heads,d_ff) for _ in range(num_layers)]

    def forward(self, X, training=True):
        attention_weights = []
        for layer in self.layers:
            outs, weights = layer.forward(X)
            attention_weights.append(weights)
        
        return outs, attention_weights

In [172]:
class DecoderLayer:
    def __init__(self, dmodel, num_heads, d_ff, dropout=0.1):
        self.masked_mha = MultiHeadAttention(dmodel, num_heads)

        self.mha = MultiHeadAttention(dmodel, num_heads)

        self.ffn = FFN(dmodel, d_ff)

        self.norm1 = LayerNormalization(dmodel)
        self.norm2 = LayerNormalization(dmodel)
        self.norm3 = LayerNormalization(dmodel)

        self.dropout = Dropout(dropout)
    
    def create_mask(self, size):
        mask = np.tril(np.ones((size,size)))
        return mask
    
    def forward(self, X, encoded_output, training=True):
        seq_length = X.shape[1]
        mask = self.create_mask(seq_length)

        masked_atten_output, masked_atten_weights = self.masked_mha.forward(X, memory=None, mask=mask)
        dropout1 = self.dropout.forward(masked_atten_output, training)
        out1 = self.norm1.forward(X + dropout1)

        atten_output, atten_weights = self.mha.forward(out1, memory=encoded_output)
        dropout2 = self.dropout.forward(atten_output, training)
        out2 = self.norm2.forward(out1 + dropout2)

        ffn_output = self.ffn.forward(out2)
        dropout3 = self.dropout.forward(ffn_output, training)
        out3 = self.norm3.forward(out2 + dropout3)

        return out3, (masked_atten_weights, atten_weights)

In [173]:
class Decoder:
    def __init__(self, num_layers, dmodel, num_heads, d_ff):
        self.layers = [DecoderLayer(dmodel, num_heads, d_ff) for _ in range(num_layers)]
    
    def forward(self, X, encoded_output, training=True):
        attention_weights = []
        for layer in self.layers:
            outs, weights = layer.forward(X, encoded_output)
            attention_weights.append(weights)
        
        return outs, attention_weights

In [174]:
class Transformer:
    def __init__(self, num_layers, dmodel, num_heads, d_ff, input_vocab_size, target_vocab_size, dropout=0.1):
        self.dmodel = dmodel
        #Embedding
        self.input_embedding = Embedding(input_vocab_size, dmodel)
        self.output_embedding = Embedding(target_vocab_size, dmodel)

        #Positional Encoding
        self.positional_encoding = PositionalEncoding(dmodel, max_seq_length = 5000)

        #Dropout
        self.dropout = Dropout(dropout)

        #Encoder and Decoder
        self.encoder = Encoder(num_layers, dmodel, num_heads, d_ff)
        self.decoder = Decoder(num_layers, dmodel, num_heads, d_ff)

        #Final linear layer and softmax
        self.final_layer = np.random.randn(dmodel, target_vocab_size)

    def forward(self, input_seq, target_seq, training=True):
        #Input embedding + Positional Encoding
        input_embedded = self.input_embedding.forward(input_seq)
        input_encoded = self.positional_encoding.forward(input_embedded)
        input_dropped = self.dropout.forward(input_encoded, training)


        #Encoder
        encoded_output, encoder_attention = self.encoder.forward(input_dropped, training)

        #Target embedding + Positional Encoding
        target_embedded = self.output_embedding.forward(target_seq)
        target_encoded = self.positional_encoding.forward(target_embedded)
        target_dropped = self.dropout.forward(target_encoded, training)

        #Decoder
        decoder_output, decoder_attention = self.decoder.forward(target_dropped, encoded_output, training)

        #Final linear layer and softmax
        logits = np.dot(decoder_output, self.final_layer)
        probs = self.softmax(logits)
        
        return probs, (encoder_attention, decoder_attention)

    def softmax(self,x):
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

# Let's test it
if __name__ == "__main__":
    # Test parameters (smaller for testing)
    num_layers = 2
    d_model = 6
    num_heads = 2
    d_ff = 8
    input_vocab_size = 1000
    target_vocab_size = 1000
    
    # Create transformer
    transformer = Transformer(num_layers, d_model, num_heads, d_ff, 
                           input_vocab_size, target_vocab_size)
    
    # Create sample inputs
    batch_size = 1
    seq_length = 3
    input_seq = np.random.randint(0, input_vocab_size, (batch_size, seq_length))
    target_seq = np.random.randint(0, target_vocab_size, (batch_size, seq_length))
    
    # Forward pass
    output, attention = transformer.forward(input_seq, target_seq)
    
    print("Input sequence shape:", input_seq.shape)
    print("Target sequence shape:", target_seq.shape)
    print("Output shape:", output.shape)

Input sequence shape: (1, 3)
Target sequence shape: (1, 3)
Output shape: (1, 3, 1000)


In [175]:
#Traning
def softmax(x, axis=-1):
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)
#learning rate
def get_learning_rate(step, dmodel, warmup_steps=4000):
    return dmodel ** (-0.5) * min(step ** (-0.5), step * warmup_steps ** (-0.5))

#Loss function
def cross_entropy_loss(predictions, targets, smoothing=0.1):
    vocab_size = predictions.shape[-1]

    confidence = 1.0 - smoothing
    smoothing_labels = smoothing / vocab_size


    batch_size, seq_length, vocab_size = predictions.shape
    predictions = predictions.reshape(-1, vocab_size)
    targets = targets.reshape(-1)

    one_hot = np.zeros_like(predictions)
    one_hot[np.arange(len(targets)), targets] = 1
    smoothed_targets = one_hot * confidence + smoothing_labels

    log_probs = -np.log(predictions + 1e-10)
    loss = np.mean(np.sum(smoothed_targets * log_probs, axis=1))
     
    return loss

if __name__ == "__main__":
    # Test learning rate
    step = 1000
    d_model = 512
    lr = get_learning_rate(step, d_model)
    print(f"Learning rate at step {step}: {lr:.6f}")
    
    # Test loss function
    batch_size = 2
    seq_length = 3
    vocab_size = 5
    
    # Create test data
    logits = np.random.randn(batch_size, seq_length, vocab_size)
    predictions = softmax(logits, axis=-1)
    targets = np.random.randint(0, vocab_size, (batch_size, seq_length))
    
    # Print shapes for debugging
    print("\nShapes:")
    print("Predictions:", predictions.shape)
    print("Targets:", targets.shape)
    
    # Calculate loss
    loss = cross_entropy_loss(predictions, targets)
    print(f"\nLoss value: {loss:.4f}")

Learning rate at step 1000: 0.001398

Shapes:
Predictions: (2, 3, 5)
Targets: (2, 3)

Loss value: 1.8229
