<a href="https://colab.research.google.com/github/juhumkwon/source_code/blob/main/Encoder_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import tensorflow as tf
import numpy as np

# Sentence and tokenization: split on whitespace and look each word up in a
# small fixed vocabulary (index 0 is reserved for padding).
sentence = "I go to school"
vocab = {
    '<pad>': 0,
    'I': 1,
    'go': 2,
    'to': 3,
    'school': 4,
}
token_ids = [vocab[token] for token in sentence.split()]  # [1, 2, 3, 4]
input_tokens = tf.constant([token_ids])  # (1, seq_len): a batch of one sentence

# Hyperparameters
d_model = 512                  # embedding / model width
num_heads = 8                  # number of attention heads
depth = d_model // num_heads   # per-head feature width
max_len = 10                   # number of positions to precompute encodings for

# Embedding lookup
embedding = tf.keras.layers.Embedding(
    input_dim=len(vocab),
    output_dim=d_model,
)
x = embedding(input_tokens)  # (1, seq_len, d_model)
seq_len = tf.shape(x)[1]

# Positional encoding
def get_positional_encoding(position, d_model):
    """Build a sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Number of feature columns per position.

    Returns:
        A float32 tensor of shape (1, position, d_model). Even-indexed columns
        hold the sine of the angle and odd-indexed columns hold the cosine.
    """
    positions = np.arange(position)[:, np.newaxis]   # (position, 1)
    dims = np.arange(d_model)[np.newaxis, :]         # (1, d_model)

    # Columns 2i and 2i+1 share the same exponent 2i / d_model.
    exponents = (2 * (dims // 2)) / np.float32(d_model)
    table = positions / np.power(10000, exponents)   # (position, d_model)

    # Overwrite the raw angles in place: sin on even columns, cos on odd ones.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Add a leading axis of size 1 so the table broadcasts over the batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

# Precompute encodings for max_len positions, then add the first seq_len of
# them to the embeddings (the leading axis of size 1 broadcasts over the batch).
pos_encoding = get_positional_encoding(position=max_len, d_model=d_model)
x = x + pos_encoding[:, :seq_len, :]

# Query, key, value and output projections. The four Dense layers are created
# in the order Wq, Wk, Wv, Wo.
Wq, Wk, Wv, Wo = (tf.keras.layers.Dense(d_model) for _ in range(4))

# Project the position-aware embeddings; each result is (1, seq_len, d_model).
q, k, v = Wq(x), Wk(x), Wv(x)

# Split heads: (batch, seq_len, num_heads, depth) -> (batch, num_heads, seq_len, depth)
def split_heads(x, batch_size):
    """Split the last axis of ``x`` into separate attention heads.

    Args:
        x: Tensor to split; it is reshaped to
            (batch_size, -1, num_heads, depth) using the module-level
            ``num_heads`` and ``depth``.
        batch_size: Size of the leading (batch) axis.

    Returns:
        The reshaped tensor with axes 1 and 2 swapped, i.e. laid out as
        (batch_size, num_heads, seq_len, depth).
    """
    per_head = tf.reshape(x, (batch_size, -1, num_heads, depth))
    # Move the head axis ahead of the sequence axis.
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

# Read the batch size from the input, then split q, k and v (in that order)
# into per-head tensors of shape (batch, num_heads, seq_len, depth).
batch_size = tf.shape(x)[0]
q, k, v = (split_heads(projected, batch_size) for projected in (q, k, v))

# Scaled dot-product attention
def scaled_dot_product_attention(q, k, v):
    """Compute softmax(q @ k^T / sqrt(d_k)) @ v.

    Args:
        q: Query tensor whose last two axes are (seq_len_q, depth).
        k: Key tensor whose last two axes are (seq_len_k, depth).
        v: Value tensor whose last two axes are (seq_len_k, depth_v).

    Returns:
        A tuple ``(output, attention_weights)`` where ``output`` has trailing
        axes (seq_len_q, depth_v) and ``attention_weights`` has trailing axes
        (seq_len_q, seq_len_k), normalized by softmax over the last axis.
    """
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # Cast the key depth to the logits' dtype rather than a fixed float32, so
    # the division also works for float16/bfloat16 inputs (for example under a
    # mixed-precision policy). For float32 inputs the result is unchanged.
    dk = tf.cast(tf.shape(k)[-1], matmul_qk.dtype)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # Normalize over the key axis so each query's weights sum to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
    return output, attention_weights

# Run attention on all heads at once; scaled_attention is
# (batch, num_heads, seq_len, depth) and attn_weights holds the softmax weights.
scaled_attention, attn_weights = scaled_dot_product_attention(q=q, k=k, v=v)

# concat heads: (batch, seq_len, d_model)
def concat_heads(x):
    """Merge per-head attention outputs back into a single feature axis.

    Args:
        x: Tensor laid out as (batch, num_heads, seq_len, depth).

    Returns:
        Tensor of shape (batch, seq_len, d_model), using the module-level
        ``d_model`` as the merged feature width.
    """
    x = tf.transpose(x, perm=[0, 2, 1, 3])  # (batch, seq_len, num_heads, depth)
    # Take the batch size from the tensor itself instead of the module-level
    # ``batch_size``, so the result cannot silently depend on a global that was
    # computed from a different input.
    return tf.reshape(x, (tf.shape(x)[0], -1, d_model))

# Merge the heads back together before the output projection.
concat_attention = concat_heads(scaled_attention)  # (1, seq_len, d_model)

# Final projection through the Wo dense layer
output = Wo(concat_attention)  # (1, seq_len, d_model)

# Print the shape of the final encoder output (the label is Korean for
# "final encoder output").
print("최종 인코더 출력:", output.shape)

최종 인코더 출력: (1, 4, 512)
