# Represent The Input

In [2]:
import numpy as np
from scipy.special import softmax

In [3]:
# Toy input: three token vectors (one per row), each with d_model = 4 features.
print("Step 1: Input : 3 inputs, d_model=4")
x = np.array(
    [
        [1.0, 0.0, 1.0, 0.0],  # Input 1
        [0.0, 2.0, 0.0, 2.0],  # Input 2
        [1.0, 1.0, 1.0, 1.0],  # Input 3
    ]
)
print(x)

Step 1: Input : 3 inputs, d_model=4
[[1. 0. 1. 0.]
 [0. 2. 0. 2.]
 [1. 1. 1. 1.]]


# Initializing the Weight Matrices
`w_query` is trained to produce the queries (Q),
`w_key` is trained to produce the keys (K), and
`w_value` is trained to produce the values (V).

In [4]:
# Hand-picked (untrained) query projection: 4 rows (d_model) by 3 columns.
print("Step 2: weights 3 dimensions x d_model = 4")
print("w_query")
w_query = np.array(
    [
        [1, 0, 1],
        [1, 0, 0],
        [0, 0, 1],
        [0, 1, 1],
    ]
)

print(w_query)

Step 2: weights 3 dimensions x d_model = 4
w_query
[[1 0 1]
 [1 0 0]
 [0 0 1]
 [0 1 1]]


In [5]:
# Hand-picked (untrained) key projection: 4 rows (d_model) by 3 columns.
print("w_key")
w_key = np.array(
    [
        [0, 0, 1],
        [1, 1, 0],
        [0, 1, 0],
        [1, 1, 0],
    ]
)
print(w_key)


w_key
[[0 0 1]
 [1 1 0]
 [0 1 0]
 [1 1 0]]


In [6]:
# Hand-picked (untrained) value projection: 4 rows (d_model) by 3 columns.
print("w_value")
w_value = np.array(
    [
        [0, 2, 0],
        [0, 3, 0],
        [1, 0, 3],
        [1, 1, 0],
    ]
)
print(w_value)


w_value
[[0 2 0]
 [0 3 0]
 [1 0 3]
 [1 1 0]]


## Matrix multiplication to obtain Q, K, V
where,  Q = Query, 
        K = Key, &
        V = Value

In [7]:
print("Step 3: Matrix multiplication to obtain Q,K,V")
print("Query: x * w_query")
# Project every input row through the query weights: (3, 4) @ (4, 3) -> (3, 3).
Q = x @ w_query
print(Q)


Step 3: Matrix multiplication to obtain Q,K,V
Query: x * w_query
[[1. 0. 2.]
 [2. 2. 2.]
 [2. 1. 3.]]


In [8]:
print("Key: x * w_key")
# Project every input row through the key weights: (3, 4) @ (4, 3) -> (3, 3).
K = x @ w_key
print(K)

Key: x * w_key
[[0. 1. 1.]
 [4. 4. 0.]
 [2. 3. 1.]]


In [9]:
print("Value: x * w_value")
# Project every input row through the value weights: (3, 4) @ (4, 3) -> (3, 3).
V = x @ w_value
print(V)

Value: x * w_value
[[1. 2. 3.]
 [2. 8. 0.]
 [2. 6. 3.]]


# Scaled Attention Scores

[![image.png](https://i.postimg.cc/3R0W108H/image.png)](https://postimg.cc/xJY0TCNs)

In [10]:
print("Step 4: Scaled Attention Scores")
# Divisor applied to the raw dot products. With a value of 1 the scores are
# left unchanged, i.e. no scaling actually happens in this walkthrough.
# NOTE(review): scaled dot-product attention normally divides by sqrt(d_k);
# confirm that a divisor of 1 is the intended simplification here.
k_d = 1
# Entry [i, j] is the dot product of query row i with key row j.
attention_scores = np.matmul(Q, K.T) / k_d
print(attention_scores)

Step 4: Scaled Attention Scores
[[ 2.  4.  4.]
 [ 4. 16. 12.]
 [ 4. 12. 10.]]


# Scaled softmax attention scores for each vector

In [11]:
print("Step 5: Scaled softmax attention_scores for each vector")
# Normalise each row (one row per query) into a probability distribution.
# axis=1 applies the softmax independently to every row, so this works for
# any number of inputs instead of hard-coding three per-row calls.
# The result is written back in place because later cells read the
# probabilities from `attention_scores`.
# NOTE: re-running this cell without re-running Step 4 applies the softmax
# a second time, since the raw scores are overwritten here.
attention_scores[:] = softmax(attention_scores, axis=1)

# Print one row per line, matching the per-vector presentation of this step.
for score_row in attention_scores:
    print(score_row)

Step 5: Scaled softmax attention_scores for each vector
[0.06337894 0.46831053 0.46831053]
[6.03366485e-06 9.82007865e-01 1.79861014e-02]
[2.95387223e-04 8.80536902e-01 1.19167711e-01]


# The final attention representations


[![image.png](https://i.postimg.cc/9FdqRkFt/image.png)](https://postimg.cc/JGhhSPXG)  

In [12]:
print("Step 6: attention value obtained by score1/k_d * V")
# Show the three value vectors that are about to be weighted.
print(V[0])
print(V[1])
print(V[2])

# Weight each value vector by the matching entry of row 0 of
# `attention_scores`, i.e. the scores computed for the first input.
# (The former `attention_scores[0].reshape(-1,1)` assignment was removed:
# its result was overwritten on the very next line and never used.)
print("Attention 1")
attention1 = attention_scores[0][0] * V[0]
print(attention1)
print("Attention 2")
attention2 = attention_scores[0][1] * V[1]
print(attention2)
print("Attention 3")
attention3 = attention_scores[0][2] * V[2]
print(attention3)

Step 6: attention value obtained by score1/k_d * V
[1. 2. 3.]
[2. 8. 0.]
[2. 6. 3.]
Attention 1
[0.06337894 0.12675788 0.19013681]
Attention 2
[0.93662106 3.74648425 0.        ]
Attention 3
[0.93662106 2.80986319 1.40493159]


# Summing up Results

In [13]:
print("Step7: summed the results to create the first line of the output matrix")
# Element-wise sum of the three weighted value vectors, accumulated left to
# right; the result is the attention output for the first input.
attention_input1 = np.add(np.add(attention1, attention2), attention3)
print(attention_input1)

Step7: summed the results to create the first line of the output matrix
[1.93662106 6.68310531 1.59506841]


# Steps 1 to 7 for all the inputs

##### The output varies on every run of the notebook because the vectors are randomly generated


In [14]:
print("Step 8: Step 1 to 7 for inputs 1 to 3")
"""We assume we have 3 results with learned weights (they were not
trained in this example.
We assume we are implementing the original Transformer paper.We will
have 3 results of 64 dimensions each"""
# Placeholder for a full head output: uniform random values in [0, 1) with
# one row per input and 64 columns. No seed is set, so values differ per run.
attention_head1 = np.random.random(size=(3, 64))
print(attention_head1)

Step 8: Step 1 to 7 for inputs 1 to 3
[[0.25767291 0.77554309 0.60612191 0.95769962 0.13205772 0.96357352
  0.73418955 0.75544633 0.88867426 0.73904519 0.56345214 0.21762382
  0.94107205 0.65346795 0.10236107 0.47355059 0.1558973  0.60805941
  0.16209452 0.245118   0.41071432 0.14583567 0.42604864 0.7196642
  0.19346888 0.04518761 0.90596878 0.38972131 0.20927568 0.24657575
  0.61964242 0.466403   0.61673801 0.4274784  0.70908773 0.51518149
  0.6578018  0.0388737  0.97837623 0.05621202 0.62778133 0.95112959
  0.90514695 0.31809963 0.34013051 0.54520668 0.39115608 0.90203515
  0.46755755 0.89460512 0.71718006 0.73815075 0.45034516 0.00895124
  0.30055165 0.38351807 0.40010213 0.29549375 0.94648697 0.485085
  0.50928744 0.60024328 0.18597511 0.11954859]
 [0.23612013 0.31702388 0.9980949  0.44083918 0.08332787 0.16938221
  0.73181141 0.40210155 0.79449801 0.76471491 0.70241131 0.66087309
  0.05482241 0.09874774 0.22118279 0.19335745 0.58610748 0.16062148
  0.42957841 0.56585077 0.6879454 

# The output of the heads of the attention sub-layer

In [15]:
print("Step 9: We assume we have trained the 8 heads of the attention sub-layer")
# Random stand-ins for the outputs of the eight heads, drawn one after the
# other in the order z0h1 ... z7h8; each is a (3, 64) array of values in [0, 1).
z0h1, z1h2, z2h3, z3h4, z4h5, z5h6, z6h7, z7h8 = (
    np.random.random((3, 64)) for _ in range(8)
)
print("shape of one head", z0h1.shape, "dimension of 8 heads", 8 * 64)

Step 9: We assume we have trained the 8 heads of the attention sub-layer
shape of one head (3, 64) dimension of 8 heads 512


# Concatenation of the output of the heads

In [16]:
print("Step 10: Concantenation of heads 1 to 8 to obtain the original 8x64=512 ouput dimension of the model")
# Join the eight (3, 64) head outputs side by side along the column axis,
# producing one (3, 512) matrix with the heads in order 1 to 8.
output_attention = np.concatenate(
    [z0h1, z1h2, z2h3, z3h4, z4h5, z5h6, z6h7, z7h8],
    axis=1,
)
print(output_attention)

Step 10: Concantenation of heads 1 to 8 to obtain the original 8x64=512 ouput dimension of the model
[[0.51487787 0.13279593 0.72858981 ... 0.05213404 0.2413309  0.79108552]
 [0.33775124 0.54400559 0.70856328 ... 0.98473891 0.6014465  0.13930668]
 [0.67731707 0.19359002 0.76657562 ... 0.93789978 0.02204074 0.50358362]]
