In [5]:
import numpy as np
import torch
from math import exp

# Introduction to Generalist Policies

## Understanding Policy

A policy $(\pi)$ is a model that maps an agent’s states to actions. The policy gives the probability of taking action $A$ when in state $S$, written $\pi(A \mid S)$.

<br>

<div>
<img src="img/policy2.png" width="380" style="margin-right: 60px;"/> <img src="img/policy3.png" width="380" style="margin-right: 60px;"/> <img src="img/feedbackloop.png" width="370"/> 
</div>


# Architecture

<img src="img/transformer.png" width="740"/>

<img src="img/architecture.png" width="800"/>

## Understanding Embedding

In [7]:
sentence = 'Put the cube on the table'

# Word -> index lookup: strip commas, split on whitespace, sort the tokens
# (uppercase letters sort before lowercase ones) and number them in that order.
# A word that occurs more than once keeps the index of its last occurrence,
# so the resulting indices are not necessarily contiguous.
dc = dict(
    (word, position)
    for position, word in enumerate(sorted(sentence.replace(',', '').split()))
)

print(dc)


{'Put': 0, 'cube': 1, 'on': 2, 'table': 3, 'the': 5}


We use the dictionary to assign an integer index to each word:

In [8]:
# Replace every word of the sentence, in its original order, by the integer
# stored for it in `dc`, and pack the indices into a tensor.
sentence_int = torch.tensor([
    dc[token]
    for token in sentence.replace(',', '').split()
])
print(sentence_int)

tensor([0, 5, 1, 2, 5, 3])


In [9]:
vocab_size = 50_000

# Seed first so the randomly initialised embedding table is reproducible.
torch.manual_seed(123)
embed = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=3)

# Look up one 3-dimensional vector per token index; detach() yields a tensor
# that is not tracked by autograd.
embedded_sentence = embed(sentence_int).detach()

print(embedded_sentence, embedded_sentence.shape, sep='\n')

tensor([[ 0.3374, -0.1778, -0.3035],
        [ 0.2692, -0.0770, -1.0205],
        [-0.5880,  0.3486,  0.6603],
        [-0.2196, -0.3792,  0.7671],
        [ 0.2692, -0.0770, -1.0205],
        [-1.1925,  0.6984, -1.4097]])
torch.Size([6, 3])


In [14]:
torch.manual_seed(123)

# Input width is the size of one embedding vector; the query and key widths
# are equal, the value width is chosen independently.
d = embedded_sentence.size(1)
d_q, d_k, d_v = 2, 2, 4

# Projection matrices. They are drawn in the order query, key, value, which
# matters because all three consume the same seeded random stream.
W_query = torch.nn.Parameter(torch.rand(d, d_q))
W_key = torch.nn.Parameter(torch.rand(d, d_k))
W_value = torch.nn.Parameter(torch.rand(d, d_v))

# Project the second input element (index 1) into query, key and value vectors.
x_2 = embedded_sentence[1]
query_2 = torch.matmul(x_2, W_query)
key_2 = torch.matmul(x_2, W_key)
value_2 = torch.matmul(x_2, W_value)

print(query_2.shape, key_2.shape, value_2.shape, sep='\n')

torch.Size([2])
torch.Size([2])
torch.Size([4])


In [16]:
# Project every embedded token at once: one key row and one value row per token.
keys = torch.matmul(embedded_sentence, W_key)
values = torch.matmul(embedded_sentence, W_value)

print("keys.shape:", keys.shape)
print("values.shape:", values.shape)

keys.shape: torch.Size([6, 2])
values.shape: torch.Size([6, 4])


In [17]:
# Unnormalised score between query_2 and the key row at index 4 (dot product).
omega_24 = torch.dot(query_2, keys[4])
print(omega_24)

tensor(0.5869, grad_fn=<DotBackward0>)


In [18]:
# Unnormalised scores between query_2 and every key row: one value per token.
omega_2 = torch.matmul(query_2, keys.T)
print(omega_2)

tensor([ 0.2432,  0.5869, -0.5191, -0.1851,  0.5869,  0.4730],
       grad_fn=<SqueezeBackward4>)


# Understanding Query, Key, Value

<img src="img/attention.png" width="700"/>

# Softmax Function

<img src="img/softmax.png" width="500"/>

In [None]:
def softmax(input_vector):
    """Return the softmax of ``input_vector`` as a list rounded to 3 decimals.

    Args:
        input_vector: Iterable of real numbers (the logits).

    Returns:
        A list with one probability per input element, each rounded to three
        decimal places. Because of the rounding the values may not sum to
        exactly 1. An empty input yields an empty list.
    """
    values = list(input_vector)
    if not values:
        return []

    # Subtract the largest logit before exponentiating. Softmax is unchanged by
    # adding a constant to every input, and this keeps every exponent <= 0 so
    # exp() cannot raise OverflowError for large logits.
    largest = max(values)
    exponents = [exp(value - largest) for value in values]

    # Normalise so the (unrounded) values sum to 1.
    sum_of_exponents = sum(exponents)

    # Round to 3 decimal places for display.
    probabilities = [round(e / sum_of_exponents, 3) for e in exponents]

    return probabilities

print(softmax([1.3, 5.1, 2.2, 0.7, 1.1]))

[0.02, 0.903, 0.05, 0.011, 0.017]


In [None]:
# Define the output layer values (logits)
# Define the output layer values (logits)
logits = np.array([1.3, 5.1, 2.2, 0.7, 1.1])

# Implement the softmax function
def softmax(i, axis=None):
    """Numerically stable softmax using NumPy.

    Note: this definition rebinds the name ``softmax``, replacing the
    list-based version defined in the previous cell.

    Args:
        i: Array-like of logits.
        axis: Axis along which to normalise. The default ``None`` normalises
            over all elements, so the whole result sums to 1.

    Returns:
        Array of the same shape as ``i`` holding the softmax probabilities.
        An empty input yields an empty array.
    """
    i = np.asarray(i)
    if i.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(i)

    # Shift by the maximum so every exponent is <= 0. The result is
    # mathematically identical, but np.exp can no longer overflow to inf
    # (which would turn the division below into nan).
    shifted = i - np.max(i, axis=axis, keepdims=True)
    exp_i = np.exp(shifted) # Compute the exponentials of the shifted values
    return exp_i / np.sum(exp_i, axis=axis, keepdims=True) # Normalize by the sum of exponentials

# Apply the softmax function
probabilities = softmax(logits)

# Print the resulting probabilities
print(probabilities)

[0.02019046 0.90253769 0.04966053 0.01108076 0.01653055]


## Self-Attention vs Multi-Head Attention

- Self-attention: The fundamental building block of transformer-based LLMs that allows models to weigh the importance of different parts of the input data.
- Multi-head attention: An extension of self-attention that allows the model to simultaneously focus on information from different representation subspaces.
- Cross-attention: A variant that enables the model to attend to two different sequences, which makes it useful in tasks like translation or summarization.
- Causal attention: A variant that ensures the prediction for each token depends only on the preceding tokens, which is important for text generation, where each prediction should be based only on the prior context.
