In [48]:
import numpy as np

# --- Hyperparameters ---
# These constants are passed to DecoderOnlyTransformer when the demo model is
# constructed further down in this file.

# Passed as d_model: width of the token embeddings and of every hidden vector.
D_MODEL = 64
# Passed as n_heads: number of attention heads in each MultiHeadAttention.
N_HEADS = 4
# Passed as d_ff: hidden width of each FeedForwardNetwork.
D_FF = 128
# Passed as max_seq_len: number of rows in the positional-encoding table.
MAX_SEQ_LEN = 20
# Passed as n_layers: number of DecoderBlock instances stacked in the model.
N_LAYERS = 2

# --- UTILITIES ---

def initialize_weights(shape):
    """Return a random array of `shape` drawn uniformly from [-b, b).

    The bound is b = sqrt(1 / shape[0]), i.e. the first dimension of
    `shape` is treated as the fan-in of the layer.
    """
    bound = np.sqrt(1 / shape[0])
    return np.random.uniform(low=-bound, high=bound, size=shape)

def relu(x):
    """Element-wise rectified linear unit: max(0, x)."""
    # Keep 0 as the first operand: np.maximum returns its first argument on
    # ties, which matters for signed zeros.
    floor = 0
    return np.maximum(floor, x)

def softmax(x):
    """Softmax over the last axis of `x`.

    The per-row maximum is subtracted before exponentiating so that large
    inputs do not overflow; this does not change the result.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=-1, keepdims=True)

def create_causal_mask(seq_len):
    """Return a (seq_len, seq_len) boolean mask for causal attention.

    Entry [i, j] is True when j > i, i.e. when position i must NOT attend to
    the later position j (strict upper triangle).
    """
    idx = np.arange(seq_len)
    # Column index greater than row index <=> strictly above the diagonal.
    return idx[np.newaxis, :] > idx[:, np.newaxis]

# --- TRANSFORMER COMPONENTS ---

class TokenEmbedding:
    """Lookup table mapping integer token ids to d_model-sized vectors."""

    def __init__(self, vocab_size, d_model):
        # One row per vocabulary entry, randomly initialised.
        table_shape = (vocab_size, d_model)
        self.weight = initialize_weights(table_shape)

    def forward(self, x):
        """Return the embedding rows selected by the integer id array `x`."""
        embedded = self.weight[x]
        return embedded

def sinusoidal_positional_encoding(max_len, d_model):
    """Build the (max_len, d_model) sinusoidal position table.

    Even columns hold sin(pos * w_k) and odd columns hold cos(pos * w_k),
    where w_k = exp(-2k * ln(10000) / d_model) for k = 0, 1, ...
    """
    positions = np.arange(0, max_len)[:, np.newaxis]
    even_dims = np.arange(0, d_model, 2)
    inv_freq = np.exp(even_dims * -(np.log(10000.0) / d_model))
    angles = positions * inv_freq

    table = np.zeros((max_len, d_model))
    table[:, 0::2] = np.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns,
    # so only the first d_model // 2 frequencies are used for the cosines.
    table[:, 1::2] = np.cos(angles[:, :d_model // 2])
    return table

class PositionalEncoding:
    """Adds a fixed sinusoidal position table to a batch of embeddings."""

    def __init__(self, max_len, d_model):
        # Precomputed table with one row per position: (max_len, d_model).
        self.pe = sinusoidal_positional_encoding(max_len, d_model)

    def forward(self, x):
        """Return `x` plus the position rows for its sequence length.

        Args:
            x: array whose axis 1 is the sequence axis and whose last axis
               matches the table width.

        Raises:
            ValueError: if the sequence is longer than the table. Without
                this check the addition either fails with an opaque
                broadcasting error or, for a one-row table, silently reuses
                position 0 for every token.
        """
        seq_len = x.shape[1]
        max_len = self.pe.shape[0]
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the maximum supported "
                f"length {max_len}"
            )
        return x + self.pe[:seq_len, :]

def scaled_dot_product_attention(Q, K, V, mask=None):
    """Compute softmax(Q K^T / sqrt(d_k)) V over the last two axes.

    Args:
        Q, K, V: arrays of shape (..., seq, d_k); any leading axes are
            treated as batch axes by np.matmul.
        mask: optional boolean array broadcastable to the score matrix
            (..., seq_q, seq_k). True marks positions that must not be
            attended to.

    Returns:
        (output, attention_weights), where attention_weights has the shape
        of the score matrix and output has the shape of Q with V's last
        axis.
    """
    d_k = Q.shape[-1]
    scores = np.matmul(Q, K.swapaxes(-2, -1)) / np.sqrt(d_k)

    if mask is not None:
        # A (seq, seq) mask already broadcasts over any leading axes of
        # `scores`. Expanding it to 4-D first would add a spurious leading
        # axis to the result whenever `scores` has fewer than 4 dimensions.
        # Masked scores get a large negative value so that softmax assigns
        # them (effectively) zero weight.
        scores = np.where(mask, -1e9, scores)

    attention_weights = softmax(scores)
    output = np.matmul(attention_weights, V)

    return output, attention_weights

class MultiHeadAttention:
    """Multi-head self-attention with square (d_model, d_model) projections."""

    def __init__(self, d_model, n_heads):
        """Create the Q/K/V/output projection matrices.

        Raises:
            ValueError: if d_model is not divisible by n_heads; the head
                split in `split_heads` requires an exact division.
        """
        if d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
            )
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.Wq = initialize_weights((d_model, d_model))
        self.Wk = initialize_weights((d_model, d_model))
        self.Wv = initialize_weights((d_model, d_model))
        self.Wo = initialize_weights((d_model, d_model))

    def split_heads(self, x):
        """Reshape (batch, seq, d_model) to (batch * n_heads, seq, d_k).

        Heads of the same batch element are adjacent along the first axis
        (index = batch_index * n_heads + head_index).
        """
        batch_size, seq_len, _ = x.shape
        x = x.reshape(batch_size, seq_len, self.n_heads, self.d_k)
        return x.transpose(0, 2, 1, 3).reshape(-1, seq_len, self.d_k)

    def combine_heads(self, x, batch_size, seq_len):
        """Inverse of `split_heads`: return a (batch, seq, d_model) array."""
        x = x.reshape(batch_size, self.n_heads, seq_len, self.d_k)
        x = x.transpose(0, 2, 1, 3)
        return x.reshape(batch_size, seq_len, self.d_model)

    def forward(self, x, mask):
        """Apply self-attention to `x` of shape (batch, seq, d_model).

        Args:
            x: input activations, (batch, seq, d_model).
            mask: attention mask forwarded to scaled_dot_product_attention.

        Returns:
            (output, single_head_attn): output is (batch, seq, d_model);
            single_head_attn is the (seq, seq) attention matrix of head 0
            for batch element 0.
        """
        batch_size, seq_len, _ = x.shape
        Q, K, V = [np.matmul(x, W) for W in [self.Wq, self.Wk, self.Wv]]

        Q_split, K_split, V_split = [self.split_heads(v) for v in [Q, K, V]]

        attn_output, attention_weights_all_heads = scaled_dot_product_attention(Q_split, K_split, V_split, mask)

        attn_output_combined = self.combine_heads(attn_output, batch_size, seq_len)
        output = np.matmul(attn_output_combined, self.Wo)

        # Restore explicit (batch, head) axes before picking a head. The
        # reshape also absorbs any extra leading singleton axis the attention
        # function may have added, so the result is always (seq, seq).
        per_head_weights = attention_weights_all_heads.reshape(
            batch_size, self.n_heads, seq_len, seq_len
        )
        single_head_attn = per_head_weights[0, 0]

        return output, single_head_attn

class FeedForwardNetwork:
    """Two-layer position-wise MLP: d_model -> d_ff -> d_model with ReLU."""

    def __init__(self, d_model, d_ff):
        # Expansion layer.
        self.W1 = initialize_weights((d_model, d_ff))
        self.B1 = np.zeros((1, d_ff))
        # Projection back to the model width.
        self.W2 = initialize_weights((d_ff, d_model))
        self.B2 = np.zeros((1, d_model))

    def forward(self, x):
        """Apply the MLP to every vector along the last axis of `x`.

        The input is flattened to 2-D for the matrix products and the result
        is reshaped back to the input's original shape.
        """
        in_shape = x.shape
        tokens = x.reshape(-1, in_shape[-1])

        hidden = relu(tokens @ self.W1 + self.B1)
        projected = hidden @ self.W2 + self.B2

        return projected.reshape(in_shape)

class LayerNorm:
    """Layer normalisation over the last axis with a scale and a shift."""

    def __init__(self, d_model, epsilon=1e-6):
        self.gamma = np.ones(d_model)   # per-feature scale
        self.beta = np.zeros(d_model)   # per-feature shift
        self.epsilon = epsilon          # added to the variance for stability

    def forward(self, x):
        """Normalise each last-axis vector to zero mean / unit variance,
        then apply gamma and beta."""
        reduce_kwargs = dict(axis=-1, keepdims=True)
        mu = np.mean(x, **reduce_kwargs)
        var = np.var(x, **reduce_kwargs)

        centered = x - mu
        std = np.sqrt(var + self.epsilon)
        return self.gamma * (centered / std) + self.beta

class DecoderBlock:
    """One pre-norm decoder layer: self-attention then feed-forward, each
    wrapped in a residual connection."""

    def __init__(self, d_model, n_heads, d_ff):
        self.norm1 = LayerNorm(d_model)
        self.attn = MultiHeadAttention(d_model, n_heads)
        self.norm2 = LayerNorm(d_model)
        self.ffn = FeedForwardNetwork(d_model, d_ff)

    def forward(self, x, causal_mask):
        """Run the block on `x` and return (new_x, single_head_attn).

        `single_head_attn` is passed through unchanged from the attention
        sub-layer.
        """
        # Attention sub-layer: normalise first, then add the result back.
        attn_out, single_head_attn = self.attn.forward(
            self.norm1.forward(x), causal_mask
        )
        after_attn = x + attn_out

        # Feed-forward sub-layer, same pre-norm + residual pattern.
        ffn_out = self.ffn.forward(self.norm2.forward(after_attn))
        result = after_attn + ffn_out

        return result, single_head_attn

class OutputLayer:
    """Projects hidden states to vocabulary logits and probabilities."""

    def __init__(self, d_model, vocab_size, tied_weights):
        # Weight tying: the projection is the transpose of the matrix passed
        # in (a view, not a copy). `d_model` is accepted but not used here.
        self.W = tied_weights.T
        self.B = np.zeros((1, vocab_size))

    def forward(self, x):
        """Return (logits, probabilities) for a 3-D input.

        Both outputs keep the first two axes of `x`; the last axis is the
        width of the projection. Probabilities are the softmax of the logits
        over that last axis.
        """
        n_batch, n_seq = x.shape[0], x.shape[1]
        flat = x.reshape(-1, x.shape[-1])
        flat_logits = flat @ self.W + self.B
        logits = flat_logits.reshape(n_batch, n_seq, -1)
        return logits, softmax(logits)

class DecoderOnlyTransformer:
    """Decoder-only transformer: embeddings, stacked decoder blocks, a final
    layer norm and an output projection tied to the token embedding."""

    def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff, max_seq_len):
        self.n_layers = n_layers
        self.token_embed = TokenEmbedding(vocab_size, d_model)
        self.pos_embed = PositionalEncoding(max_seq_len, d_model)
        self.decoder_blocks = [
            DecoderBlock(d_model, n_heads, d_ff) for _ in range(n_layers)
        ]
        self.final_norm = LayerNorm(d_model)
        # The output projection reuses the token embedding matrix.
        self.output_layer = OutputLayer(d_model, vocab_size, self.token_embed.weight)

    def forward(self, input_tokens):
        """Run the model on a (batch, seq) array of token ids.

        Returns:
            logits: output-layer logits for every position.
            next_token_probs: probabilities at the last position of each
                sequence.
            attention_matrices: list with the attention map returned by each
                decoder block, in layer order.
        """
        _, seq_len = input_tokens.shape
        causal_mask = create_causal_mask(seq_len)

        hidden = self.pos_embed.forward(self.token_embed.forward(input_tokens))

        attention_matrices = []
        for block in self.decoder_blocks:
            hidden, head_attn = block.forward(hidden, causal_mask)
            attention_matrices.append(head_attn)

        normed = self.final_norm.forward(hidden)
        logits, probabilities = self.output_layer.forward(normed)

        # Only the distribution after the final token is the next-token
        # prediction.
        return logits, probabilities[:, -1, :], attention_matrices



In [57]:
def clean_text_strictly(text):
    """Lower-case `text`, delete the characters . , ; : " ! ? and strip
    leading/trailing whitespace."""
    drop_punctuation = str.maketrans('', '', '.,;:"!?')
    return text.lower().translate(drop_punctuation).strip()

def build_vocab_from_corpus(corpus_text):
    """Build a word-level vocabulary from `corpus_text`.

    Returns:
        (vocab, id_to_token, size): token -> id mapping, the inverse
        id -> token mapping, and the number of tokens. Ids 0-3 are the
        special tokens; the cleaned corpus words follow in sorted order.
    """
    special_tokens = ["<PAD>", "<UNK>", "<SOS>", "<EOS>"]

    # Split the cleaned corpus into words and keep each distinct word once.
    words = clean_text_strictly(corpus_text).split()
    tokens = special_tokens + sorted(set(words))

    vocab = dict(zip(tokens, range(len(tokens))))
    id_to_token = dict(zip(vocab.values(), vocab.keys()))

    return vocab, id_to_token, len(tokens)

def simple_tokenize(text, vocab):
    """Convert `text` to a (1, n_tokens) int32 array of vocabulary ids.

    The text is cleaned with clean_text_strictly and split on whitespace;
    words missing from `vocab` map to the id of "<UNK>".
    """
    words = clean_text_strictly(text).split()
    row = [vocab.get(word, vocab["<UNK>"]) for word in words]
    # Add a leading batch axis of size 1.
    return np.asarray(row, dtype=np.int32)[np.newaxis, :]

In [60]:
# Interactive demo: build a vocabulary from a user-supplied corpus, create a
# randomly initialised model, and print its next-token prediction for a
# user-supplied prompt.
print("="*60)
print("## SETUP TRANSFORMER INTERAKTIF (PYTHON/NUMPY ONLY) ##")
print("="*60)

# Step 1: the corpus defines the vocabulary (and therefore the model size).
print("LANGKAH 1: Masukkan Korpus untuk membangun Kosakata:")
corpus_input = input("Korpus: \n> ")

TOKEN_TO_ID, ID_TO_TOKEN, VOCAB_SIZE_TEST = build_vocab_from_corpus(corpus_input)

print(f"\n✅ Kosakata Dibangun. Ukuran Vocab: {VOCAB_SIZE_TEST}")
print(f"Token: {list(ID_TO_TOKEN.values())}")
print("-" * 60)

gpt_model = DecoderOnlyTransformer(
    vocab_size=VOCAB_SIZE_TEST,
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    d_ff=D_FF,
    max_seq_len=MAX_SEQ_LEN
)

# Step 2: tokenize the prompt and run a single forward pass.
print("\nLANGKAH 2: Masukkan Teks Masukan untuk Prediksi Token Berikutnya:")
input_text = input("Input Teks (Contoh: Kucing suka tidur): \n> ")

input_data = simple_tokenize(input_text, TOKEN_TO_ID)

if input_data.shape[1] == 0:
    print("❌ ERROR: Input teks tidak mengandung token yang valid.")
elif input_data.shape[1] > MAX_SEQ_LEN:
    # The positional-encoding table only has MAX_SEQ_LEN rows, so a longer
    # prompt cannot be encoded; report it instead of crashing in the model.
    print(f"❌ ERROR: Input teks melebihi panjang maksimum {MAX_SEQ_LEN} token.")
else:
    logits, next_token_probs, attention_matrices = gpt_model.forward(input_data)

    last_token_id = input_data[0, -1]
    last_token_str = ID_TO_TOKEN.get(last_token_id, '<UNK>')
    INPUT_SEQ_LEN = input_data.shape[1]

    print("\n" + "="*50)
    print("1. LOGITS (Skor Mentah Sebelum Softmax)")
    print(f"Shape: {logits.shape} (Batch=1, Seq_Len={INPUT_SEQ_LEN}, Vocab={VOCAB_SIZE_TEST})")
    print(f"Logits untuk Posisi Terakhir ('{last_token_str}'):\n{logits[0, -1, :]}")
    print("="*50)

    print("\n2. NEXT SOFTMAX PREDICTION (Probabilitas Token Berikutnya)")
    probs_vector = next_token_probs[0]
    TOP_N = 5
    # Indices sorted by descending probability; the slice yields fewer than
    # TOP_N entries when the vocabulary is smaller than TOP_N.
    top_indices = np.argsort(probs_vector)[::-1][:TOP_N]

    print(f"Prediksi {TOP_N} Token Teratas (setelah '{input_text}'):")
    print("-" * 40)
    for rank, idx in enumerate(top_indices):
        token = ID_TO_TOKEN[idx]
        prob = probs_vector[idx]
        logit_value = logits[0, -1, idx]

        print(f"Rank {rank+1}: '{token}' \t| Probabilitas: {prob*100:.3f}% \t| Logit: {logit_value:.4f}")
    print("-" * 40)

## SETUP TRANSFORMER INTERAKTIF (PYTHON/NUMPY ONLY) ##
LANGKAH 1: Masukkan Korpus untuk membangun Kosakata:
Korpus: 
> Kucing lucu suka tidur di sofa dan makan ikan

✅ Kosakata Dibangun. Ukuran Vocab: 13
Token: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'dan', 'di', 'ikan', 'kucing', 'lucu', 'makan', 'sofa', 'suka', 'tidur']
------------------------------------------------------------

LANGKAH 2: Masukkan Teks Masukan untuk Prediksi Token Berikutnya:
Input Teks (Contoh: Kucing suka tidur): 
> dinosaurus

1. LOGITS (Skor Mentah Sebelum Softmax)
Shape: (1, 1, 13) (Batch=1, Seq_Len=1, Vocab=13)
Logits untuk Posisi Terakhir ('<UNK>'):
[-1.11077793 -0.105291    0.90916034  0.27682617 -1.21726434 -2.02807764
  0.10318151 -0.06852905  1.11791946  0.21859016 -0.82952143  1.30458056
  1.30945751]

2. NEXT SOFTMAX PREDICTION (Probabilitas Token Berikutnya)
Prediksi 5 Token Teratas (setelah 'dinosaurus'):
----------------------------------------
Rank 1: 'tidur' 	| Probabilitas: 18.870% 	| Logit: 1.3095