In [1]:
import numpy as np
import gensim.downloader as api
from scipy.special import softmax

In [3]:
# Load pre-trained GloVe embeddings
print("Loading GloVe embeddings...")
glove_vectors = api.load("glove-wiki-gigaword-300")  # 300-dimensional GloVe embeddings
print("GloVe embeddings loaded.")

Loading GloVe embeddings...
GloVe embeddings loaded.


In [19]:
# Our input sentence
sentence = "The dog chased the cat which was scared"
words = sentence.split()
seq_length = len(words)

In [20]:
words

['The', 'dog', 'chased', 'the', 'cat', 'which', 'was', 'scared']

In [21]:
seq_length

8

In [22]:
# Hyperparameters
embedding_dim = 300  # GloVe vectors are 300-dimensional
max_seq_length = 512  # Maximum sequence length for positional embeddings

In [6]:
def get_positional_encoding(seq_length, d_model, max_len=max_seq_length):
    positional_encoding = np.zeros((max_len, d_model))
    for pos in range(max_len):
        for i in range(0, d_model, 2):
            positional_encoding[pos, i] = np.sin(pos / (10000 ** ((2 * i)/d_model)))
            positional_encoding[pos, i + 1] = np.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
    return positional_encoding[:seq_length]

# Step 1: Create word embeddings using GloVe
word_embeddings = np.array([glove_vectors[word.lower()] if word.lower() in glove_vectors else glove_vectors['unk'] for word in words])

# Step 2: Create positional embeddings
positional_embeddings = get_positional_encoding(seq_length, embedding_dim)

# Step 3: Combine word embeddings and positional embeddings
input_embeddings = word_embeddings + positional_embeddings

In [7]:
word_embeddings.shape

(8, 300)

In [8]:
positional_embeddings.shape

(8, 300)

In [9]:
input_embeddings

array([[ 0.04656   ,  1.21318001, -0.0074364 , ...,  1.0090611 ,
        -0.20988999,  1.053913  ],
       [ 0.73104098,  1.40159875,  0.84722602, ...,  1.33936   ,
         0.57994003,  1.068149  ],
       [ 0.79341743, -0.09352756,  1.14576686, ...,  1.074944  ,
         0.26975002,  1.30592   ],
       ...,
       [-1.18124428,  0.22839546, -1.00623184, ...,  0.67298999,
        -0.05374394,  0.58175999],
       [-0.2138425 ,  0.82379838, -0.96039976, ...,  0.78083999,
        -0.43185993,  0.919652  ],
       [ 1.03910661,  0.90790557, -0.0365134 , ...,  1.35398   ,
         0.27280008,  1.73144001]])

In [10]:
input_embeddings.shape

(8, 300)

In [11]:
# Step 4: Create Query, Key, and Value matrices
W_query = np.random.rand(embedding_dim, embedding_dim)
W_key = np.random.rand(embedding_dim, embedding_dim)
W_value = np.random.rand(embedding_dim, embedding_dim)

In [12]:
# Step 5: Compute Q, K, V
Q = np.dot(input_embeddings, W_query)
K = np.dot(input_embeddings, W_key)
V = np.dot(input_embeddings, W_value)

In [13]:
# Step 6: Compute attention scores
attention_scores = np.dot(Q, K.T)

In [14]:
# Step 7: Scale the attention scores
attention_scores /= np.sqrt(embedding_dim)

In [15]:
# Step 8: Apply softmax to get attention weights
attention_weights = softmax(attention_scores, axis=1)

In [16]:
# Step 9: Compute the weighted sum
output = np.dot(attention_weights, V)

# Print results
print("\nSelf-Attention Results:")
for i, word in enumerate(words):
    print(f"\nWord: {word}")
    print(f"Top 3 words this word pays attention to:")
    top_attention = sorted(enumerate(attention_weights[i]), key=lambda x: x[1], reverse=True)[:3]
    for idx, weight in top_attention:
        print(f"  {words[idx]}: {weight:.4f}")

# Analyze relationships
print("\nInteresting relationships:")
for i, word in enumerate(words):
    max_attention = np.argmax(attention_weights[i])
    if i != max_attention:
        print(f"'{word}' pays most attention to '{words[max_attention]}'")


# Visualize attention weights
print("\nAttention Weight Matrix:")
for i, word in enumerate(words):
    print(f"{word:>10}", end="")
print()
for i, word in enumerate(words):
    print(f"{word:>10}", end="")
    for j in range(seq_length):
        print(f"{attention_weights[i, j]:>10.2f}", end="")
    print()


Self-Attention Results:

Word: The
Top 3 words this word pays attention to:
  chased: 1.0000
  the: 0.0000
  The: 0.0000

Word: dog
Top 3 words this word pays attention to:
  chased: 1.0000
  the: 0.0000
  The: 0.0000

Word: chased
Top 3 words this word pays attention to:
  chased: 1.0000
  the: 0.0000
  The: 0.0000

Word: the
Top 3 words this word pays attention to:
  chased: 1.0000
  the: 0.0000
  The: 0.0000

Word: cat
Top 3 words this word pays attention to:
  chased: 1.0000
  the: 0.0000
  The: 0.0000

Word: which
Top 3 words this word pays attention to:
  chased: 1.0000
  the: 0.0000
  The: 0.0000

Word: was
Top 3 words this word pays attention to:
  chased: 1.0000
  the: 0.0000
  The: 0.0000

Word: scared
Top 3 words this word pays attention to:
  chased: 1.0000
  the: 0.0000
  The: 0.0000

Interesting relationships:
'The' pays most attention to 'chased'
'dog' pays most attention to 'chased'
'the' pays most attention to 'chased'
'cat' pays most attention to 'chased'
'which' pay