<a href="https://colab.research.google.com/github/jagvgithub/Attention-is-All-You-Need/blob/main/Attention_is_all_you_need.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Jose Antonio Gómez Vargas

## Redes neuronales y Deep Learning

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import LayerNormalization, Dense, Embedding, Dropout

# Scaled dot-product attention
def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    Args:
        q: Query tensor. Its last axis must match the last axis of ``k``.
        k: Key tensor. The size of its last axis is used as ``d_k``.
        v: Value tensor, multiplied by the attention weights.
        mask: Optional tensor broadcastable to the logits' shape. Positions
            where the mask is 1 (or True) receive a large negative bias so
            they get ~0 weight after the softmax. ``None`` disables masking.

    Returns:
        A tuple ``(output, attention_weights)`` where ``attention_weights``
        is the softmax over the last axis of the scaled logits and
        ``output`` is ``attention_weights @ v``.
    """
    matmul_qk = tf.matmul(q, k, transpose_b=True)

    # Scale in the dtype of the logits instead of a hard-coded float32 so
    # that non-float32 inputs (e.g. float64) do not hit a dtype mismatch.
    logits_dtype = matmul_qk.dtype
    dk = tf.cast(tf.shape(k)[-1], logits_dtype)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        # Cast so bool/int masks are accepted. NOTE: the -1e9 bias assumes a
        # dtype that can represent it (float32/float64).
        scaled_attention_logits += tf.cast(mask, logits_dtype) * -1e9

    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)
    return output, attention_weights

# Multi-head attention layer
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    Queries, keys and values are each projected with a ``Dense(d_model)``
    layer, split into ``num_heads`` heads of size ``d_model // num_heads``,
    attended independently, concatenated back to ``d_model`` features and
    passed through a final ``Dense(d_model)`` projection.

    Args:
        d_model: Feature size of the projections and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads, **kwargs):
        super().__init__(**kwargs)

        # Explicit exception rather than ``assert``: asserts are stripped
        # when Python runs with -O, which would let a bad config through.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by "
                f"num_heads ({num_heads})"
            )

        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        self.dense = Dense(d_model)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({"d_model": self.d_model, "num_heads": self.num_heads})
        return config

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to (batch_size, num_heads, -1, depth).

        The last axis is split into (num_heads, depth) and the head axis is
        moved in front of the sequence axis.
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        """Apply multi-head attention.

        Args:
            v: Value tensor.
            k: Key tensor.
            q: Query tensor; its first axis is taken as the batch size.
            mask: Optional mask passed unchanged to
                ``scaled_dot_product_attention``.

        Returns:
            A tuple ``(output, attention_weights)``: the projected attention
            output with ``d_model`` features on the last axis, and the
            per-head attention weights.
        """
        batch_size = tf.shape(q)[0]

        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        # Undo split_heads: move the head axis back next to depth, then merge
        # (num_heads, depth) into a single d_model axis.
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))

        output = self.dense(concat_attention)
        return output, attention_weights

# Positional encoding
class PositionalEncoding(tf.keras.layers.Layer):
    """Add fixed sinusoidal positional encodings to the input.

    A table with ``position`` rows and ``d_model`` columns is precomputed at
    construction time: even feature indices hold ``sin`` of the angle, odd
    feature indices hold ``cos``. ``call`` adds the first ``seq_len`` rows of
    that table to the input.

    Args:
        position: Number of positions (rows) to precompute.
        d_model: Feature size (columns) of the encoding.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, position, d_model, **kwargs):
        super().__init__(**kwargs)
        self.position = position
        self.d_model = d_model
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({"position": self.position, "d_model": self.d_model})
        return config

    def positional_encoding(self, position, d_model):
        """Build the (1, position, d_model) float32 encoding table."""
        feature_index = tf.range(d_model)[tf.newaxis, :]
        angle_rads = self.get_angles(
            tf.range(position)[:, tf.newaxis],
            feature_index,
            d_model
        )
        # tf.Tensor objects are immutable, so slice assignment such as
        # ``angle_rads[:, 0::2] = ...`` raises TypeError. Select sin for even
        # columns and cos for odd columns with a broadcast mask instead.
        is_even_column = tf.equal(feature_index % 2, 0)
        encoding = tf.where(
            is_even_column, tf.math.sin(angle_rads), tf.math.cos(angle_rads)
        )
        return tf.cast(encoding[tf.newaxis, ...], dtype=tf.float32)

    def get_angles(self, pos, i, d_model):
        """Return ``pos / 10000 ** (2 * (i // 2) / d_model)`` as float32.

        ``pos`` and ``i`` may be integer tensors; they are cast to float32
        before the arithmetic because TensorFlow does not implicitly promote
        int32 operands when combined with float32.
        """
        exponent = tf.cast(2 * (i // 2), tf.float32) / tf.cast(d_model, tf.float32)
        angle_rates = 1.0 / tf.pow(10000.0, exponent)
        return tf.cast(pos, tf.float32) * angle_rates

    def call(self, inputs):
        """Add the encoding for the first ``tf.shape(inputs)[1]`` positions."""
        seq_len = tf.shape(inputs)[1]
        # Match the input dtype so the addition also works for non-float32
        # activations.
        encoding = tf.cast(self.pos_encoding[:, :seq_len, :], inputs.dtype)
        return inputs + encoding

# Full network (simplified: encoder stack followed by a vocabulary projection)
def transformer(vocab_size, num_layers, units, d_model, num_heads, dropout_rate):
    """Build an encoder-only Transformer as a Keras functional model.

    Each of the ``num_layers`` encoder layers consists of two sublayers, both
    wrapped as ``LayerNormalization(x + Dropout(sublayer(x)))``:

    1. multi-head self-attention (``MultiHeadAttention``),
    2. a position-wise feed-forward network ``Dense(units, relu)`` followed
       by ``Dense(d_model)``.

    Args:
        vocab_size: Size of the embedding table and of the output
            projection. Also used as the number of precomputed positional
            encodings, so input sequences must not be longer than this.
        num_layers: Number of stacked encoder layers.
        units: Hidden size of the feed-forward sublayer.
        d_model: Embedding / model feature size.
        num_heads: Number of attention heads; must divide ``d_model``.
        dropout_rate: Dropout rate applied to each sublayer output.

    Returns:
        A ``tf.keras.Model`` mapping token ids of shape (batch, seq_len) to
        unnormalized scores (logits) over ``vocab_size`` for every position.
    """
    inputs = tf.keras.layers.Input(shape=(None,))
    # No padding mask is built here; attention attends to every position.
    enc_padding_mask = None

    # Input encoding: token embedding plus positional encoding.
    enc_output = Embedding(vocab_size, d_model)(inputs)
    enc_output = PositionalEncoding(vocab_size, d_model)(enc_output)

    # Encoder layers.
    for _layer_index in range(num_layers):
        # Self-attention sublayer with residual connection.
        attn_output, _attn_weights = MultiHeadAttention(d_model, num_heads)(
            enc_output, enc_output, enc_output, enc_padding_mask
        )
        attn_output = Dropout(dropout_rate)(attn_output)
        enc_output = LayerNormalization(epsilon=1e-6)(
            tf.keras.layers.Add()([enc_output, attn_output])
        )

        # Position-wise feed-forward sublayer with residual connection.
        ffn_output = Dense(units, activation="relu")(enc_output)
        ffn_output = Dense(d_model)(ffn_output)
        ffn_output = Dropout(dropout_rate)(ffn_output)
        enc_output = LayerNormalization(epsilon=1e-6)(
            tf.keras.layers.Add()([enc_output, ffn_output])
        )

    # Final projection: no activation, so the model emits logits.
    outputs = Dense(vocab_size)(enc_output)
    return tf.keras.Model(inputs=inputs, outputs=outputs)

# Example: build, compile and summarize the model
sample_transformer = transformer(
    vocab_size=8500,
    num_layers=6,
    units=2048,
    d_model=512,
    num_heads=8,
    dropout_rate=0.1
)

# The model's last layer is Dense(vocab_size) with no activation, so it emits
# logits. The string loss 'sparse_categorical_crossentropy' assumes
# probabilities (from_logits=False); use the loss object with
# from_logits=True so the softmax is applied inside the loss.
sample_transformer.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
)
sample_transformer.summary()


---
Cierre del ejercicio

---

