# Encoder Input Transformation

In [947]:
# Source sentence fed to the encoder; it is split on whitespace during pre-tokenization.
encoder_sequence = "I love you !"

## One Hot Tokenization

In [None]:
# Encoder vocabulary: maps each known token to its column index in the one-hot encoding.
encoder_token_to_idx = {
    "I": 0,
    "love": 1,
    "you": 2,
    "!": 3,
    "pizza": 4
}

# Reverse lookup (index -> token), built by inverting the mapping above.
encoder_idx_to_token = {idx: token for token, idx in encoder_token_to_idx.items() }
encoder_idx_to_token

### Pre-Tokenization

In [None]:
# Number of rows allocated for every one-hot matrix; rows beyond the actual
# token count are never written and stay all-zero.
max_length = 6

In [None]:
# Whitespace tokenization: str.split() already returns a fresh list of tokens.
pre_tokens = encoder_sequence.split()
pre_tokens

In [None]:
# All-zero matrix with max_length rows and one column per encoder vocabulary entry.
# Each row is a separate list, so writing to one row does not affect the others.
oh_encoding = [[0] * len(encoder_token_to_idx) for i in range(max_length)]
oh_encoding

In [None]:
# Set a single 1 in row i at the vocabulary index of the i-th token.
# A token missing from encoder_token_to_idx raises KeyError; more tokens than
# max_length raises IndexError.
for i, token in enumerate(pre_tokens):
    j = encoder_token_to_idx[token]
    oh_encoding[i][j] = 1

oh_encoding

## Learnable Input Embedding

In [None]:
# Embedding width.
# NOTE(review): `dim` is not referenced anywhere else in the visible source; the
# weight matrices below hard-code three columns instead.
dim = 3  # We want dim < len(encoder_token_to_idx)

In [None]:
def dot_product_matrix_multiplicaton(matrix_a, matrix_b):
    """Multiply two matrices given as lists of rows.

    Args:
        matrix_a: Left operand; its shape is taken from ``len(matrix_a)`` and
            ``len(matrix_a[0])``.
        matrix_b: Right operand; its shape is taken from ``len(matrix_b)`` and
            ``len(matrix_b[0])``.

    Returns:
        A new list of ``len(matrix_a)`` rows with ``len(matrix_b[0])`` entries
        each, where entry ``[i][j]`` is the sum over ``k`` of
        ``matrix_a[i][k] * matrix_b[k][j]``.

    Raises:
        AssertionError: If the column count of ``matrix_a`` differs from the
            row count of ``matrix_b``.
        IndexError: If either matrix has no rows.
    """
    rows_a, cols_a = len(matrix_a), len(matrix_a[0])
    rows_b, cols_b = len(matrix_b), len(matrix_b[0])

    assert cols_a == rows_b, "please rearrange matrices so that n_cols of matrix_a equals n_row of matrix_b"

    product = []
    for row_idx in range(rows_a):
        left_row = matrix_a[row_idx]
        # Each output cell is the dot product of one row of A with one column of B.
        product.append([
            sum(left_row[inner] * matrix_b[inner][col_idx] for inner in range(rows_b))
            for col_idx in range(cols_b)
        ])

    return product

In [None]:
# Encoder embedding weights: one row per encoder vocabulary entry (5), three columns.
embedding_transformation = [[ 0.0101,  0.1608, -0.0674],
        [-0.2484,  0.2732, -0.2361],
        [-0.4152,  0.3621,  0.1320],
        [-0.0577, -0.1929,  0.2178],
        [-0.3691,  0.1721, -0.3601]]

# Multiplying a one-hot row by the weights selects the matching weight row;
# the all-zero padding rows produce all-zero embeddings.
input_embedding = dot_product_matrix_multiplicaton(oh_encoding, embedding_transformation)
input_embedding

In [None]:
def add_matrices(matrix_a, matrix_b):
    """Add two matrices element-wise and return the sum as a new list of rows.

    Neither input is modified. Empty inputs are accepted and yield ``[]``.

    Shapes are not validated:
      * rows are paired with ``zip``, so surplus rows of the longer matrix
        are dropped;
      * within a row pair the length of the ``matrix_a`` row decides how many
        entries are added, so surplus entries of the ``matrix_b`` row are
        ignored.

    Args:
        matrix_a: Left operand, a list of rows.
        matrix_b: Right operand, a list of rows.

    Returns:
        A new list of rows where entry ``[r][c]`` is
        ``matrix_a[r][c] + matrix_b[r][c]``.

    Raises:
        IndexError: If a row of ``matrix_b`` is shorter than the row of
            ``matrix_a`` it is paired with.
    """
    return [
        [value + row_b[col] for col, value in enumerate(row_a)]
        for row_a, row_b in zip(matrix_a, matrix_b)
    ]

In [None]:
# Hand-written positional vectors: one row per position (six rows), three columns.
positional_encoding = [[ 1, 0, 0],
        [ 0, 1, 0],
        [ 0, 0, 1],
        [ 1, 1, 0],
        [ 1, 0, 1],
        [ 1, 1, 1]]


# Element-wise sum of the token embeddings and the positional vectors.
input_embedding_with_positional_encoding = add_matrices(
    input_embedding, positional_encoding)

input_embedding_with_positional_encoding

# Encoder Layer

In [None]:
# Input of the encoder layer: the position-augmented embeddings.
latent_space = input_embedding_with_positional_encoding

## Keys

In [None]:
# Key projection weights (3 x 3).
k = [[-0.2409, -0.5735,  0.3882],
        [ 0.0843, -0.1487,  0.2438],
        [ 0.2335, -0.3259, -1.3235]]

# `k` is rebound from the weights to the projected keys (one row per position).
k = dot_product_matrix_multiplicaton(latent_space, k)
k

## Queries

In [None]:
# Query projection weights (3 x 3).
q = [[-0.1427,  0.0482, -0.0687],
        [-0.2875,  1.1645, -0.4308],
        [ 0.5597, -0.2897, -0.2003]]

# `q` is rebound from the weights to the projected queries (one row per position).
q = dot_product_matrix_multiplicaton(latent_space, q)
q

## Attention

In [None]:
def transpose_matrix(matrix):
    """Return the transpose of ``matrix`` as a new list of lists.

    ``zip(*matrix)`` yields one tuple per column; each tuple is converted to a
    list. Because ``zip`` stops at the shortest row, ragged input is truncated
    to the length of its shortest row. An empty matrix yields ``[]``.
    """
    # Approach from https://stackoverflow.com/questions/6473679/transpose-list-of-lists
    return [list(column) for column in zip(*matrix)]

In [None]:
# Transpose the projected keys so the queries can be multiplied with them.
k_transposed = transpose_matrix(k)
k_transposed

In [None]:
# Raw attention scores: entry [i][j] is the dot product of query row i and key row j.
# No scaling factor is applied.
attention_scores = dot_product_matrix_multiplicaton(q, k_transposed)
attention_scores

In [None]:
import math

# Euler's number. Kept as a module-level name for any code that still reads
# `e`; the softmax below uses math.exp directly.
e = math.e


def softmax_1d(vector):
    """Return the softmax of a flat sequence of numbers as a new list.

    The largest entry is subtracted from every entry before exponentiating.
    This leaves the result mathematically unchanged but keeps ``math.exp``
    from overflowing on large scores.

    Args:
        vector: Sequence of numbers. ``float("-inf")`` entries are allowed
            and receive probability 0.

    Returns:
        A list of the same length whose entries sum to 1, or ``[]`` for an
        empty ``vector``.

    Raises:
        ZeroDivisionError: If every entry is ``float("-inf")``.
    """
    if not vector:
        return []

    peak = max(vector)
    # Only shift by a finite maximum; inf - inf would otherwise produce NaN.
    shift = peak if math.isfinite(peak) else 0.0

    exped = [math.exp(item - shift) for item in vector]
    total = sum(exped)
    return [item / total for item in exped]


def softmax_matrix(matrix):
    """Apply ``softmax_1d`` to every row of ``matrix`` and return the new rows."""
    return [softmax_1d(row) for row in matrix]

In [None]:
# Normalize each row of scores into attention weights with a row-wise softmax.
softmax_attention_scores = softmax_matrix(attention_scores)
softmax_attention_scores

## Values

In [None]:
# Value projection weights (3 x 3).
v = [[ 2.5655, -0.1657, -0.5686],
        [-0.3721,  0.4021, -0.0208],
        [ 0.2262, -0.5495,  0.4100]]

# `v` is rebound from the weights to the projected values (one row per position).
v = dot_product_matrix_multiplicaton(latent_space, v)
v

In [None]:
# Attention output: each row is the attention-weighted sum of the value rows.
# This rebinds `latent_space`, so later reads of that name see the encoder output.
latent_space = dot_product_matrix_multiplicaton(softmax_attention_scores, v)
latent_space

### Repeat with different k, q and v for each Encoder Layer

In [None]:
# Keep the final encoder output under its own name for use by the decoder.
encoder_latent_space = latent_space

# Decoder Input Transformation

In [None]:
# Decoder input so far: a start-of-sequence marker followed by the first target token.
decoder_sequence = "<sos> Ich"

## One Hot Tokenization

In [None]:
# Decoder vocabulary: maps each known target token to its column index in the
# one-hot encoding.
decoder_token_to_idx = {
    "Pizza": 0,
    "Ich": 1,
    "dich": 2,
    "liebe": 3,
    "!": 4,
    "<sos>": 5
}

# Reverse lookup (index -> token), used to turn predicted indices back into tokens.
decoder_idx_to_token = {idx: token for token, idx in decoder_token_to_idx.items() }
decoder_idx_to_token

### Pre-Tokenization

In [None]:
# Whitespace tokenization of the decoder input; rebinds `pre_tokens`.
pre_tokens = decoder_sequence.split()
pre_tokens

In [None]:
# Rebinds `oh_encoding` to an all-zero matrix with max_length rows and one
# column per decoder vocabulary entry.
oh_encoding = [[0] * len(decoder_token_to_idx) for i in range(max_length)]
oh_encoding

In [None]:
# Set a single 1 in row i at the decoder vocabulary index of the i-th token.
# A token missing from decoder_token_to_idx raises KeyError; more tokens than
# max_length raises IndexError.
for i, token in enumerate(pre_tokens):
    j = decoder_token_to_idx[token]
    oh_encoding[i][j] = 1

oh_encoding

## Learnable Input Embedding

In [None]:
# Decoder embedding weights: one row per decoder vocabulary entry (6), three columns.
embedding_transformation = [[ 0.0977, -0.3945, 1.2683],
        [ 0.0096, 1.0847,  0.0539],
        [-0.3053,  0.2832,  0.2126],
        [-0.0951,  0.2297,  0.0180],
        [ 0.0416, -0.1804,  0.1314],
        [-0.1219, -0.1792,  1.1145]]

# Multiplying a one-hot row by the weights selects the matching weight row;
# this rebinds `input_embedding` to the decoder embeddings.
input_embedding = dot_product_matrix_multiplicaton(oh_encoding, embedding_transformation)
input_embedding

In [None]:
# Hand-written positional vectors: one row per position (six rows), three columns.
positional_encoding = [[ 1, 0, 0],
        [ 0, 1, 0],
        [ 0, 0, 1],
        [ 1, 1, 0],
        [ 1, 0, 1],
        [ 1, 1, 1]]


# Element-wise sum of the decoder token embeddings and the positional vectors.
input_embedding_with_positional_encoding = add_matrices(
    input_embedding, positional_encoding)

input_embedding_with_positional_encoding

# Decoder Layer

In [None]:
# Display the encoder output; the decoder's keys and values are projected from it.
encoder_latent_space

In [None]:
# Input of the decoder layer: the position-augmented decoder embeddings.
decoder_latent_space = input_embedding_with_positional_encoding
decoder_latent_space

## Keys

In [None]:
# Decoder-layer key projection weights (3 x 3).
k = [[-0.0857, 0.0989, 0.3189],
        [-0.4483, -1.2625,  0.1883],
        [-0.0205,  0.1175, -1.0054]]

# Keys are projected from the encoder output; `k` is rebound to the result.
k = dot_product_matrix_multiplicaton(encoder_latent_space, k)
k

## Queries

In [None]:
# Decoder-layer query projection weights (3 x 3).
q = [[ 0.5223, -0.4201,  0.2190],
        [ 0.3426,  0.1201, 1.0258],
        [ 0.3770, -0.2533, -0.3659]]

# Queries are projected from the decoder's own latent space, while keys and
# values come from the encoder output. Using `latent_space` here would feed the
# encoder output in as queries too, so the decoder input would never influence
# the result.
q = dot_product_matrix_multiplicaton(decoder_latent_space, q)
q

## Attention

In [None]:
# Transpose the projected keys so the queries can be multiplied with them.
k_transposed = transpose_matrix(k)
k_transposed

In [None]:
# Raw attention scores: entry [i][j] is the dot product of query row i and key row j.
# No scaling factor is applied.
attention_scores = dot_product_matrix_multiplicaton(q, k_transposed)
attention_scores

In [None]:
def causalize_attention(attention_scores):
    """Apply a causal mask to a matrix of raw (pre-softmax) attention scores.

    Row ``i`` keeps its scores for columns ``0..i``; every later column is
    replaced by ``float("-inf")``. The input is not modified.

    Masked slots are filled with ``-inf`` rather than ``0`` because the result
    is meant to be passed through a softmax: ``exp(0) == 1`` would still give
    a masked position a share of the attention weight, whereas ``exp(-inf)``
    is exactly ``0``.

    Args:
        attention_scores: List of rows of numeric scores. Rows may be shorter
            than the number of rows; such rows are returned unmasked.

    Returns:
        A new list of rows with the same shape as ``attention_scores``, or
        ``[]`` for empty input.
    """
    masked_out = float("-inf")

    result = []
    for i, row in enumerate(attention_scores):
        visible = i + 1
        # Slicing clamps at the row length, and a negative repeat count
        # yields an empty list, so short rows pass through unchanged.
        result.append(list(row[:visible]) + [masked_out] * (len(row) - visible))

    return result

In [None]:
# Apply the causal mask: row i keeps its own scores only for columns 0..i.
attention_scores = causalize_attention(attention_scores)
attention_scores

In [None]:
# Normalize each row of masked scores into attention weights with a row-wise softmax.
softmax_attention_scores = softmax_matrix(attention_scores)
softmax_attention_scores

## Values

In [None]:
# Decoder-layer value projection weights (3 x 3).
v = [[-0.3140,  2.4510, -1.5762],
        [ 1.2975, -0.1267, 2.5162],
        [-0.3554, -0.5260,  2.4740]]

# Values are projected from the encoder output; `v` is rebound to the result.
v = dot_product_matrix_multiplicaton(encoder_latent_space, v)
v

In [None]:
# Attention output: each row is the attention-weighted sum of the value rows.
# This rebinds `decoder_latent_space` to the decoder layer's output.
decoder_latent_space = dot_product_matrix_multiplicaton(softmax_attention_scores, v)
decoder_latent_space

### Repeat with different k, q and v for each Decoder Layer

In [None]:
# Output of the decoder layer, used as input to the language model head.
last_hidden_state = decoder_latent_space

# Language Model Head

In [None]:
# Language model head weights: three rows (hidden width), one column per
# decoder vocabulary entry (6).
language_model_head_transformation = [[1.2591, 0.2340,  0.2995, -0.3929, 1.2186, -0.2409],
        [-0.0096,  -1.0262, -3.1889, -0.2528, -0.0658, -1.1641],
        [ 1.2318,  1.0766,  2.3248, -0.2726,  2.3647, -1.0103]]

# One row of vocabulary scores (logits) per position.
logits = dot_product_matrix_multiplicaton(last_hidden_state, language_model_head_transformation)
logits

In [None]:
# Turn each row of logits into a probability distribution over the decoder vocabulary.
softmax_logits = softmax_matrix(logits)
softmax_logits

In [None]:
# Greedy decoding: for each position that holds an actual decoder input token,
# print the vocabulary entry with the highest probability in that row.
for i, row in enumerate(softmax_logits):
    # Rows at or beyond len(pre_tokens) belong to padding positions; stop there.
    if i >= len(pre_tokens):
        break

    row_max = max(row)
    # list.index returns the first occurrence, so ties resolve to the lowest index.
    idx_max = row.index(row_max)
    print(decoder_idx_to_token[idx_max])