In [1]:
import numpy as np
import gensim.downloader as api
from scipy.special import softmax

In [None]:
# Load pre-trained GloVe word vectors through gensim's downloader API.
# NOTE(review): api.load presumably downloads the model on first use and reads a
# local cache afterwards, so this cell may be slow on a fresh machine — confirm.
print("Loading GloVe embeddings...")
glove_vectors = api.load("glove-wiki-gigaword-300")  # model name indicates 300-dimensional vectors
print("GloVe embeddings loaded.")

In [None]:
# Our input sentence, tokenized by splitting on whitespace.
# No lower-casing or punctuation handling happens at this point.
sentence = "The dog chased the cat which was scared"
words = sentence.split()
seq_length = len(words)  # number of tokens in the sentence

In [None]:
# Inspect the token list produced by the whitespace split.
words

In [None]:
# Inspect the number of tokens in the sentence.
seq_length

In [None]:
# Hyperparameters
embedding_dim = 300  # width of the GloVe vectors; the positional encodings are built with the same width so the two can be summed
max_seq_length = 512  # number of positions the positional-encoding table is built for by default

In [None]:
def get_positional_encoding(seq_length, d_model, max_len=None):
    """Return sinusoidal positional encodings for the leading positions.

    Implements the formulation from "Attention Is All You Need", where each
    pair of columns ``(2k, 2k + 1)`` shares a single frequency:

        PE[pos, 2k]     = sin(pos / 10000 ** (2k / d_model))
        PE[pos, 2k + 1] = cos(pos / 10000 ** (2k / d_model))

    Args:
        seq_length: Number of leading positions to return. The table is
            sliced with ``[:seq_length]``, so a value larger than ``max_len``
            yields only ``max_len`` rows.
        d_model: Width of each encoding vector. Odd widths are supported; the
            last column is then a sine column without a cosine partner.
        max_len: Number of positions the table is built for. ``None`` (the
            default) uses the module-level ``max_seq_length``, looked up at
            call time so that re-running the configuration cell takes effect
            without re-defining this function.

    Returns:
        A float64 ``numpy.ndarray`` holding the first ``seq_length`` rows of
        the ``(max_len, d_model)`` encoding table.
    """
    if max_len is None:
        max_len = max_seq_length

    # Column vector of positions and row vector of the even column indices 2k.
    positions = np.arange(max_len, dtype=float)[:, np.newaxis]
    even_dims = np.arange(0, d_model, 2, dtype=float)

    # One angle per (position, frequency); broadcasting gives (max_len, ceil(d_model / 2)).
    angles = positions / np.power(10000.0, even_dims / d_model)

    positional_encoding = np.zeros((max_len, d_model))
    positional_encoding[:, 0::2] = np.sin(angles)
    # For odd d_model there is one fewer odd column than even columns.
    positional_encoding[:, 1::2] = np.cos(angles[:, : d_model // 2])
    return positional_encoding[:seq_length]

# Step 1: Look up a GloVe vector for every lower-cased word; words missing
# from the vocabulary fall back to the 'unk' entry
word_embeddings = np.array([
    glove_vectors[token if token in glove_vectors else 'unk']
    for token in map(str.lower, words)
])

# Step 2: Build one positional vector per token
positional_embeddings = get_positional_encoding(seq_length=seq_length, d_model=embedding_dim)

# Step 3: Element-wise sum of word vectors and positional vectors
input_embeddings = np.add(word_embeddings, positional_embeddings)

In [None]:
# Shape of the word-embedding matrix: one row per token, one column per vector component.
word_embeddings.shape

In [None]:
# Shape of the positional encodings; expected to equal word_embeddings.shape so the two can be summed.
positional_embeddings.shape

In [None]:
# Display the combined (word + positional) input matrix.
input_embeddings

In [None]:
# Shape of the combined input matrix that the Q/K/V projections are applied to.
input_embeddings.shape

In [None]:
# Step 4: Create Query, Key, and Value projection matrices.
# These random matrices stand in for learned weights. A seeded generator makes
# the cell reproducible under Restart & Run All and idempotent when re-run.
# Entries are zero-mean and scaled by 1/sqrt(embedding_dim) so the projected
# values stay O(1); all-positive uniform [0, 1) weights would add a large common
# offset to every Q·K score and tend to collapse the softmax to near one-hot rows.
weight_rng = np.random.default_rng(42)
weight_scale = 1.0 / np.sqrt(embedding_dim)
W_query = weight_rng.standard_normal((embedding_dim, embedding_dim)) * weight_scale
W_key = weight_rng.standard_normal((embedding_dim, embedding_dim)) * weight_scale
W_value = weight_rng.standard_normal((embedding_dim, embedding_dim)) * weight_scale

In [None]:
# Step 5: Project the input matrix into query, key and value spaces
Q = input_embeddings @ W_query
K = input_embeddings @ W_key
V = input_embeddings @ W_value

In [None]:
# Step 6: Raw attention scores — dot product of every query row with every key row
attention_scores = Q @ K.T

In [None]:
# Step 7: Scale the attention scores by sqrt(d_k); the key vectors have
# embedding_dim components here, so d_k == embedding_dim.
# The scaled scores are rebuilt from Q and K instead of being divided in place,
# so re-running this cell cannot shrink them a second time.
attention_scores = np.dot(Q, K.T) / np.sqrt(embedding_dim)

In [None]:
# Step 8: Normalise each row of scores with softmax so every query's weights sum to 1
attention_weights = softmax(attention_scores, axis=-1)

In [None]:
# Step 9: Each output row is the attention-weighted sum of the value rows
output = attention_weights @ V

# Print results: for every word, the three words it attends to most strongly
print("\nSelf-Attention Results:")
for i, word in enumerate(words):
    print(f"\nWord: {word}")
    print("Top 3 words this word pays attention to:")
    # A stable ascending sort on the negated weight gives descending weights
    # while keeping sentence order among equal weights.
    top_attention = sorted(enumerate(attention_weights[i]), key=lambda pair: -pair[1])[:3]
    for idx, weight in top_attention:
        print(f"  {words[idx]}: {weight:.4f}")

# Analyze relationships: report every word whose strongest attention target
# is a position other than its own
print("\nInteresting relationships:")
for i, word in enumerate(words):
    max_attention = attention_weights[i].argmax()
    if max_attention == i:
        continue  # attends mostly to itself; nothing to report
    print(f"'{word}' pays most attention to '{words[max_attention]}'")


# Visualize attention weights as a text table: rows are the attending words,
# columns are the attended-to words.
print("\nAttention Weight Matrix:")
# Width of every cell; widened when a word would not fit in 10 characters.
col_width = max(10, max((len(word) for word in words), default=0) + 1)
# The header starts with a blank corner cell so each column label sits above
# its own column of values rather than above the row labels.
print(" " * col_width + "".join(f"{word:>{col_width}}" for word in words))
for i, word in enumerate(words):
    row_cells = "".join(f"{attention_weights[i, j]:>{col_width}.2f}" for j in range(seq_length))
    print(f"{word:>{col_width}}{row_cells}")