In [2]:
!pip install tensorflow==2.19.0
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
print("GPU available:",
tf.config.list_physical_devices('GPU'))

TensorFlow version: 2.19.0
GPU available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
import requests
import numpy as np
# Unduh teks Shakespeare
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
text = requests.get(url).text
print(f"Panjang teks: {len(text)} karakter")
print("Contoh:\n", text[:500])

Panjang teks: 1115394 karakter
Contoh:
 First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [4]:
# Bangun vocabulary
vocab = sorted(set(text))
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")
# Mapping char â†” index
char_to_idx = {ch: i for i, ch in enumerate(vocab)}
idx_to_char = np.array(vocab)
# Encode teks menjadi integer
text_as_int = np.array([char_to_idx[c] for c in text])
# Parameter
seq_length = 100 # Panjang konteks
batch_size = 64
buffer_size = 10000

# Buat dataset pasangan (input, target)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text
dataset = sequences.map(split_input_target)
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

Vocabulary size: 65


In [5]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model, max_len=5000):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.pos_encoding = self.positional_encoding(max_len, d_model)

    def positional_encoding(self, position, d_model):
        # Learned positional embedding (lebih sederhana)
        return tf.Variable(tf.random.normal((1, position, d_model)))

    def call(self, x):
        seq_len = tf.shape(x)[1]
        x = self.embedding(x) * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[:, :seq_len, :]
        return x

In [6]:
class CausalSelfAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % num_heads == 0
        self.depth = d_model // num_heads
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        # Skala dot-product attention
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # Causal mask
        if mask is None:
            seq_len = tf.shape(q)[2]
            mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
            mask = mask[tf.newaxis, tf.newaxis, :, :]
        scaled_attention_logits += (mask * -1e9)
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        output = tf.reshape(output, (batch_size, -1, self.d_model))
        return self.dense(output)

In [7]:
    def call(self, v, k, q, mask=None):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        # Skala dot-product attention
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # Causal mask
        if mask is None:
            seq_len = tf.shape(q)[2]
            mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
            mask = mask[tf.newaxis, tf.newaxis, :, :]
        scaled_attention_logits += (mask * -1e9)
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        output = tf.reshape(output, (batch_size, -1, self.d_model))
        return self.dense(output)

In [8]:
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])

In [9]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.mha = CausalSelfAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training):
        attn_output = self.mha(x, x, x) # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

In [10]:
class GPT(tf.keras.Model):
    def __init__(self, vocab_size, d_model, num_layers, num_heads, dff, max_len=1000, rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.pos_embedding = PositionalEmbedding(vocab_size, d_model, max_len)
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)
        self.final_layer = tf.keras.layers.Dense(vocab_size)

    def call(self, x, training):
        x = self.pos_embedding(x) # (batch, seq_len, d_model)
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.dec_layers[i](x, training=training)
        logits = self.final_layer(x) # (batch, seq_len, vocab_size)
        return logits

In [15]:
# Hyperparameter
vocab_size = len(vocab)
d_model = 128
num_layers = 4
um_heads = 8
dff = 512
max_len = 512 # Increased max_len to accommodate longer generated sequences (e.g., 6 + 300 = 306)
# Inisialisasi model
model = GPT(vocab_size, d_model, num_layers, num_heads, dff, max_len)
# Optimizer dan loss
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def loss_function(real, pred):
# Opsional: mask padding jika ada (di dataset ini, tidak ada padding sebenarnya)
# Tapi kita pertahankan untuk umum
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)
@tf.function
def train_step(inp, tar):
    with tf.GradientTape() as tape:
        predictions = model(inp, training=True)
        loss = loss_function(tar, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss # Kembalikan loss untuk logging

# Pelatihan
EPOCHS = 50
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    num_batches = 0
    for batch, (inp, tar) in enumerate(dataset):
        loss = train_step(inp, tar)
        epoch_loss += loss
        num_batches += 1
        if batch % 100 == 0:
            avg_loss = epoch_loss / num_batches
            print(f'Epoch {epoch+1} Batch {batch} Avg Loss {avg_loss:.4f}')

Epoch 1 Batch 0 Avg Loss 4.3424
Epoch 1 Batch 100 Avg Loss 3.0122
Epoch 2 Batch 0 Avg Loss 2.3697
Epoch 2 Batch 100 Avg Loss 2.3185
Epoch 3 Batch 0 Avg Loss 2.1703
Epoch 3 Batch 100 Avg Loss 2.1082
Epoch 4 Batch 0 Avg Loss 1.9953
Epoch 4 Batch 100 Avg Loss 1.9536
Epoch 5 Batch 0 Avg Loss 1.8618
Epoch 5 Batch 100 Avg Loss 1.8432
Epoch 6 Batch 0 Avg Loss 1.7827
Epoch 6 Batch 100 Avg Loss 1.7604
Epoch 7 Batch 0 Avg Loss 1.7434
Epoch 7 Batch 100 Avg Loss 1.6958
Epoch 8 Batch 0 Avg Loss 1.6333
Epoch 8 Batch 100 Avg Loss 1.6480
Epoch 9 Batch 0 Avg Loss 1.6365
Epoch 9 Batch 100 Avg Loss 1.6120
Epoch 10 Batch 0 Avg Loss 1.5501
Epoch 10 Batch 100 Avg Loss 1.5799
Epoch 11 Batch 0 Avg Loss 1.6062
Epoch 11 Batch 100 Avg Loss 1.5562
Epoch 12 Batch 0 Avg Loss 1.5635
Epoch 12 Batch 100 Avg Loss 1.5333
Epoch 13 Batch 0 Avg Loss 1.4904
Epoch 13 Batch 100 Avg Loss 1.5127
Epoch 14 Batch 0 Avg Loss 1.4881
Epoch 14 Batch 100 Avg Loss 1.4965
Epoch 15 Batch 0 Avg Loss 1.5245
Epoch 15 Batch 100 Avg Loss 1.482

In [13]:
def generate_text(model, start_string, length=100, temperature=1.0):
    # Encode string awal menjadi indeks
    input_ids = [char_to_idx.get(s, 0) for s in start_string] # gunakan 0 jika karakter tidak dikenal
    input_ids = tf.expand_dims(input_ids, 0) # shape: (1, seq_len)
    text_generated = []
    # Jalankan inference autoregressive
    for i in range(length):
        # Prediksi distribusi probabilitas untuk seluruh urutan
        predictions = model(input_ids, training=False) # shape: (1, seq_len, vocab_size)
        # Ambil prediksi untuk posisi terakhir
        last_pred = predictions[:, -1, :] # shape: (1, vocab_size)
        # Terapkan temperature scaling (opsional, untuk variasi)
        last_pred = last_pred / temperature
        # Sampling dari distribusi
        predicted_id = tf.random.categorical(last_pred, num_samples=1) # shape: (1, 1)
        predicted_id = tf.squeeze(predicted_id, axis=-1).numpy()[0] # scalar int
        # Tambahkan ke input berikutnya
        input_ids = tf.concat([input_ids, [[predicted_id]]], axis=1)
        # Tambahkan ke hasil
        text_generated.append(idx_to_char[predicted_id])
    return start_string + ''.join(text_generated)

In [20]:
# Generate teks
generated = generate_text(model, start_string="ROMEO:",
length=300, temperature=0.8)
print(generated)

ROMEO:
Come I do you think, and you well not stay to have perceived
Your wells?

ESCALUS:
Therefore it imor goty, y y oringre.

Wind be, matines were y y?
May howas mes, g, he ahe he corenoure t ay I ale onourest thles e moure bour me thithede con. ples s; abestsigles t t t gre wes! t t hot,
be t t we t b
