In [34]:
#Computational Complexity in NLP Models

In [37]:
# Per-layer computational complexity
# Comparing the computation time of:
# self-attention = O(n^2 * d)
# recurrent = O(n * d^2)

In [2]:
import numpy as np
import time

In [35]:
# Define Sequence Length and Dimensionality.
# Set the sequence length `n` and the representation dimensionality `d`.

In [3]:
# Sequence length (n) and representation dimensionality (d) used by the
# NumPy timing cells that follow.
n, d = 512, 512

In [36]:
# Generate Random Input Sequence
# Create a random input sequence of shape `(n, d)`.

In [4]:
# Random input: one row per position, one column per feature.
input_shape = (n, d)
input_seq = np.random.rand(*input_shape)

In [5]:
# Simulation of a self-attention layer, O(n^2 * d): one d-dimensional dot
# product for every (i, j) pair of positions.
# The timer is started and stopped inside this single cell so the measurement
# covers only the loop, not the pause between executing two separate cells.
start_time = time.perf_counter()
for i in range(n):
    for j in range(n):
        _ = np.dot(input_seq[i], input_seq[j])
at = time.perf_counter() - start_time

# Print the stored measurement (not a fresh clock reading) so the displayed
# value is exactly the `at` that the comparison cell uses later.
print(f"Self-attention computation time: {at} seconds")

Self-attention computation time: 0.2576613426208496 seconds


In [7]:
# Simulation of a recurrent layer, O(n * d^2): for each of the n positions,
# every hidden unit j is updated from every hidden unit k.
# NOTE(review): this triple loop runs entirely in the Python interpreter, while
# the self-attention cell hands each inner product to np.dot, so the two
# timings reflect interpreter overhead as much as operation counts — confirm
# this is the intended comparison.
start_time = time.perf_counter()
hidden_state = np.zeros(d)
for i in range(n):
    for j in range(d):
        for k in range(d):
            hidden_state[j] += input_seq[i, j] * hidden_state[k]
rt = time.perf_counter() - start_time

# Print the stored measurement rather than sampling the clock a second time,
# so the displayed value is exactly the `rt` used by the comparison cell.
print(f"Recurrent layer computation time: {rt} seconds")

Recurrent layer computation time: 36.8253710269928 seconds


In [8]:
# Combined wall-clock time of the two simulations.
total = at + rt

# Share of the combined time spent in self-attention, as a percentage
# rounded to two decimals.
attention_share = at / total
percentage_at = round(attention_share * 100, 2)

# Display the share.
print(f"The percentage of 'computational time for attention' in the sum of 'attention' and 'recurrent' is {percentage_at}%")

# x: how many times longer the recurrent simulation took than self-attention,
# rounded to two decimals.
slowdown = rt / at
x = round(slowdown, 2)

The percentage of 'computational time for attention' in the sum of 'attention' and 'recurrent' is 0.69%


In [10]:
# Install torch package
%pip install torch

import torch

if torch.cuda.is_available():
  print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
  print("No GPU available")


Note: you may need to restart the kernel to use updated packages.
No GPU available


In [11]:
# PyTorch version
import torch
import time

# define the sequence length and representation dimensionality
n = 512
d = 512

# Use GPU if available, otherwise stick with cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# define the inputs
input_seq = torch.rand(n, d, device=device)

# simulation of self-attention layer O(n^2*d)
start_time = time.time()
_ = torch.mm(input_seq, input_seq.t())
at = time.time() - start_time
print(f"Self-attention computation time: {at} seconds")

# simulation of recurrent layer O(n*d^2)
start_time = time.time()
hidden_state = torch.zeros(d, device=device)
for i in range(n):
    for j in range(d):
        for k in range(d):
            hidden_state[j] += input_seq[i, j] * hidden_state[k]
            ct = time.time() - start_time
            if ct>at*10:
              break

rt = time.time() - start_time
print(f"Recurrent layer computation time: {rt} seconds")

# Calculate the total
total = at + rt

# Calculate the percentage of at
percentage_at = round((at / total) * 100, 2)

# Output the result
print(f"The percentage of self-attention computation in the sum of self-attention and recurrent computation is {percentage_at}%")
     

cpu
Self-attention computation time: 0.0007717609405517578 seconds
Recurrent layer computation time: 2.078526020050049 seconds
The percentage of self-attention computation in the sum of self-attention and recurrent computation is 0.04%


In [38]:
# TensorFlow Implementation.
# Using TensorFlow to simulate the self-attention layer.

In [12]:
import tensorflow as tf
import numpy as np
import time

# Problem size: n positions, each represented by a d-dimensional vector.
n = 32768
d = 12288

# Random float32 input of shape (n, d).
input_shape = (n, d)
input_seq = tf.random.normal(shape=input_shape, dtype=tf.float32)

# Self-attention stand-in, O(n^2*d): multiply the input by its own transpose
# and time the call.
start_time = time.time()
_ = tf.matmul(input_seq, input_seq, transpose_b=True)
elapsed = time.time() - start_time

at = elapsed
print(f"Self-attention computation time: {at} seconds")

Self-attention computation time: 52.23362469673157 seconds


In [39]:
# Transformer Architecture
# Implementation of the Transformer architecture step-by-step.

In [14]:
import numpy as np
from scipy.special import softmax

In [15]:
print("Step 1: Input : 3 inputs, d_model=4")
# One row per input; each row holds d_model=4 values.
input_rows = (
    (1.0, 0.0, 1.0, 0.0),  # Input 1
    (0.0, 2.0, 0.0, 2.0),  # Input 2
    (1.0, 1.0, 1.0, 1.0),  # Input 3
)
x = np.array(input_rows)
print(x)

Step 1: Input : 3 inputs, d_model=4
[[1. 0. 1. 0.]
 [0. 2. 0. 2.]
 [1. 1. 1. 1.]]


In [16]:
print("Step 2: weights 3 dimensions x d_model=4")
print("w_query")
# Query weights: 4 rows (d_model) by 3 columns.
query_rows = (
    (1, 0, 1),
    (1, 0, 0),
    (0, 0, 1),
    (0, 1, 1),
)
w_query = np.array(query_rows)
print(w_query)

Step 2: weights 3 dimensions x d_model=4
w_query
[[1 0 1]
 [1 0 0]
 [0 0 1]
 [0 1 1]]


In [17]:
print("w_key")
# Key weights: 4 rows (d_model) by 3 columns.
key_rows = (
    (0, 0, 1),
    (1, 1, 0),
    (0, 1, 0),
    (1, 1, 0),
)
w_key = np.array(key_rows)
print(w_key)

w_key
[[0 0 1]
 [1 1 0]
 [0 1 0]
 [1 1 0]]


In [18]:
print("w_value")
# Value weights: 4 rows (d_model) by 3 columns.
value_rows = (
    (0, 2, 0),
    (0, 3, 0),
    (1, 0, 3),
    (1, 1, 0),
)
w_value = np.array(value_rows)
print(w_value)

w_value
[[0 2 0]
 [0 3 0]
 [1 0 3]
 [1 1 0]]


In [19]:
print("Step 3: Matrix multiplication to obtain Q,K,V")
print("Query: x * w_query")
# Matrix product of the inputs with the query weights.
Q = x @ w_query
print(Q)

Step 3: Matrix multiplication to obtain Q,K,V
Query: x * w_query
[[1. 0. 2.]
 [2. 2. 2.]
 [2. 1. 3.]]


In [20]:
print("Key: x * w_key")
# Matrix product of the inputs with the key weights.
K = x @ w_key
print(K)

Key: x * w_key
[[0. 1. 1.]
 [4. 4. 0.]
 [2. 3. 1.]]


In [21]:
print("Value: x * w_value")
# Matrix product of the inputs with the value weights.
V = x @ w_value
print(V)

Value: x * w_value
[[1. 2. 3.]
 [2. 8. 0.]
 [2. 6. 3.]]


In [22]:
print("Step 4: Scaled Attention Scores")
# Scaling divisor: the square root of 3 is simplified to 1 for this example.
k_d = 1
# Every query row against every key row, then divided by the scaling divisor.
raw_scores = np.matmul(Q, K.T)
attention_scores = raw_scores / k_d
print(attention_scores)

Step 4: Scaled Attention Scores
[[ 2.  4.  4.]
 [ 4. 16. 12.]
 [ 4. 12. 10.]]


In [23]:
print("Step 5: Scaled softmax attention_scores for each vector")
# Compute from Q and K rather than normalising `attention_scores` in place, so
# re-running this cell never applies softmax to already-normalised values.
raw_scores = (Q @ K.transpose()) / k_d
# axis=1 normalises each row on its own, giving one distribution per query row.
attention_scores = softmax(raw_scores, axis=1)
for score_row in attention_scores:
    print(score_row)

Step 5: Scaled softmax attention_scores for each vector
[0.06337894 0.46831053 0.46831053]
[6.03366485e-06 9.82007865e-01 1.79861014e-02]
[2.95387223e-04 8.80536902e-01 1.19167711e-01]


In [24]:
print("Step 6: attention value obtained by score1/k_d * V")
print(V[0])
print(V[1])
print(V[2])
# Scale each value row by the matching entry of the first row of
# attention_scores. (A reshaped copy of that row used to be assigned to
# `attention1` and was overwritten on the very next line; that dead
# assignment has been removed.)
print("Attention 1")
attention1 = attention_scores[0][0] * V[0]
print(attention1)
print("Attention 2")
attention2 = attention_scores[0][1] * V[1]
print(attention2)
print("Attention 3")
attention3 = attention_scores[0][2] * V[2]
print(attention3)


Step 6: attention value obtained by score1/k_d * V
[1. 2. 3.]
[2. 8. 0.]
[2. 6. 3.]
Attention 1
[0.06337894 0.12675788 0.19013681]
Attention 2
[0.93662106 3.74648425 0.        ]
Attention 3
[0.93662106 2.80986319 1.40493159]


In [25]:
print("Step 7: summed the results to create the first line of the output matrix")
# Add the three weighted value vectors element-wise, left to right.
partial_sum = attention1 + attention2
attention_input1 = partial_sum + attention3
print(attention_input1)

Step 7: summed the results to create the first line of the output matrix
[1.93662106 6.68310531 1.59506841]


In [26]:
print("Step 8: Step 1 to 7 for inputs 1 to 3")
# Stand-in for the result of steps 1-7 with learned weights: nothing is
# trained in this example, so random values are used instead.
# Following the original Transformer paper, there are 3 results of
# 64 dimensions each.
head_shape = (3, 64)
attention_head1 = np.random.random(size=head_shape)
print(attention_head1)

Step 8: Step 1 to 7 for inputs 1 to 3
[[0.21962113 0.23332887 0.24878814 0.94922195 0.11339154 0.64616943
  0.14884462 0.05633224 0.79976097 0.53969387 0.04226782 0.17803405
  0.42711866 0.15817828 0.98084094 0.32622351 0.93611146 0.32823529
  0.44039233 0.578684   0.36234751 0.51296216 0.39565728 0.68134302
  0.01266175 0.4989774  0.09607441 0.67107334 0.01587283 0.96229691
  0.82608546 0.49660398 0.72202009 0.20145437 0.124417   0.67969621
  0.83618465 0.44481095 0.66211576 0.87787996 0.69272964 0.21599088
  0.67345826 0.66400322 0.40880414 0.11889926 0.55761756 0.97270575
  0.03783002 0.67044453 0.05130788 0.27384528 0.83421293 0.79341759
  0.81594851 0.83862227 0.07235664 0.00217145 0.71107092 0.34004817
  0.38062029 0.97075716 0.62579064 0.34505933]
 [0.05460562 0.35403144 0.16110127 0.85504687 0.08574397 0.95234895
  0.56794388 0.11438117 0.37297142 0.35361358 0.73072716 0.13568564
  0.53444797 0.26974749 0.71811678 0.8074681  0.20803768 0.77733578
  0.62595064 0.86510122 0.81456

In [27]:
print("Step 9: We assume we have trained the 8 heads of the attention sublayer")
# Eight random (3, 64) arrays stand in for the eight heads; they are drawn in
# order, one per name.
heads_count = 8
head_shape = (3, 64)
(z0h1, z1h2, z2h3, z3h4,
 z4h5, z5h6, z6h7, z7h8) = (
    np.random.random(head_shape) for _ in range(heads_count)
)
print("shape of one head", z0h1.shape, "dimension of 8 heads", head_shape[1] * heads_count)

Step 9: We assume we have trained the 8 heads of the attention sublayer
shape of one head (3, 64) dimension of 8 heads 512


In [28]:
print("Step 10: Concantenation of heads 1 to 8 to obtain the original 8x64=512 ouput dimension of the model")
# Place the eight heads side by side, in head order, along the column axis.
all_heads = [z0h1, z1h2, z2h3, z3h4, z4h5, z5h6, z6h7, z7h8]
output_attention = np.hstack(all_heads)
print(output_attention)

Step 10: Concantenation of heads 1 to 8 to obtain the original 8x64=512 ouput dimension of the model
[[0.45113299 0.92588276 0.13288348 ... 0.43085643 0.31020755 0.31293182]
 [0.42274075 0.78297358 0.11060556 ... 0.17981774 0.55364099 0.84168673]
 [0.59465149 0.53783302 0.70572621 ... 0.38161594 0.52783578 0.06286411]]


In [29]:
#Hugging Face Transformer Model

In [30]:
!pip -q install transformers

In [31]:
from transformers import pipeline

In [33]:
# Install tf-keras package
%pip install tf-keras

traslator = pipeline("translation_en_to_fr")
print(traslator("It is easy to translate languages with transformers", 
                max_length=40))

Collecting tf-keras
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: tf-keras
Successfully installed tf-keras-2.18.0


No model was supplied, defaulted to google-t5/t5-base and revision a9723ea (https://huggingface.co/google-t5/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


Note: you may need to restart the kernel to use updated packages.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/t5-base/a90903540cc02cbeb7ff9f823f1a80eb778c7e22426a0e620b01c77a5ec8f5b4?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1737439645&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNzQzOTY0NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby90NS1iYXNlL2E5MDkwMzU0MGNjMDJjYmViN2ZmOWY4MjNmMWE4MGViNzc4YzdlMjI0MjZhMGU2MjBiMDFjNzdhNWVjOGY1YjQ%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=Q6gotejgEgF3WVzCX1MJ69dsB7GDwlReLOHBB9hM6qMs%7EML22Hefwjv%7EiUNQ9j7KGWZsb-KmO4BJ2LaX2AVRJr1inPnAV1egTuD2knuuFNSqlB90yLcQB%7ExfVkH27HraxF9WWRpHBhPRU8QVp2Swp5FxEbsAZniEWV1TuAN4TWzqMjVloh2LF8uQA2npAq39k7pUwxt%7E0iqV1Kbu4wWrxeTUB%7ELTDpllLmY-9aZ4CGCwOZA1DIEjoVCkbk4D5JVaqQAB6wLyTzwKD%7E7bSUFHOaAH3R%7E%7EL0hX6ig%7EYdwA7ycujiPlRrMQ91n1MkwOBbO91IYkBAZqvi2Y-m4diOyAbA__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSConnectionPool(host='cdn-lfs.hf.

model.safetensors:  96%|#########6| 860M/892M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/t5-base/a90903540cc02cbeb7ff9f823f1a80eb778c7e22426a0e620b01c77a5ec8f5b4?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1737439645&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNzQzOTY0NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby90NS1iYXNlL2E5MDkwMzU0MGNjMDJjYmViN2ZmOWY4MjNmMWE4MGViNzc4YzdlMjI0MjZhMGU2MjBiMDFjNzdhNWVjOGY1YjQ%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=Q6gotejgEgF3WVzCX1MJ69dsB7GDwlReLOHBB9hM6qMs%7EML22Hefwjv%7EiUNQ9j7KGWZsb-KmO4BJ2LaX2AVRJr1inPnAV1egTuD2knuuFNSqlB90yLcQB%7ExfVkH27HraxF9WWRpHBhPRU8QVp2Swp5FxEbsAZniEWV1TuAN4TWzqMjVloh2LF8uQA2npAq39k7pUwxt%7E0iqV1Kbu4wWrxeTUB%7ELTDpllLmY-9aZ4CGCwOZA1DIEjoVCkbk4D5JVaqQAB6wLyTzwKD%7E7bSUFHOaAH3R%7E%7EL0hX6ig%7EYdwA7ycujiPlRrMQ91n1MkwOBbO91IYkBAZqvi2Y-m4diOyAbA__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSConnectionPool(host='cdn-lfs.hf.

model.safetensors:  98%|#########7| 870M/892M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/t5-base/a90903540cc02cbeb7ff9f823f1a80eb778c7e22426a0e620b01c77a5ec8f5b4?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1737439645&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNzQzOTY0NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby90NS1iYXNlL2E5MDkwMzU0MGNjMDJjYmViN2ZmOWY4MjNmMWE4MGViNzc4YzdlMjI0MjZhMGU2MjBiMDFjNzdhNWVjOGY1YjQ%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=Q6gotejgEgF3WVzCX1MJ69dsB7GDwlReLOHBB9hM6qMs%7EML22Hefwjv%7EiUNQ9j7KGWZsb-KmO4BJ2LaX2AVRJr1inPnAV1egTuD2knuuFNSqlB90yLcQB%7ExfVkH27HraxF9WWRpHBhPRU8QVp2Swp5FxEbsAZniEWV1TuAN4TWzqMjVloh2LF8uQA2npAq39k7pUwxt%7E0iqV1Kbu4wWrxeTUB%7ELTDpllLmY-9aZ4CGCwOZA1DIEjoVCkbk4D5JVaqQAB6wLyTzwKD%7E7bSUFHOaAH3R%7E%7EL0hX6ig%7EYdwA7ycujiPlRrMQ91n1MkwOBbO91IYkBAZqvi2Y-m4diOyAbA__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSConnectionPool(host='cdn-lfs.hf.

model.safetensors:  98%|#########7| 870M/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use mps:0


[{'translation_text': "Il est facile de traduire des langues à l'aide de transformateurs"}]
