# Mini Transformer - Topic Classification

## Iskandar Muda Rizky Parlambang (18221109)

# Import Library

In [31]:
!pip install -U datasets



In [32]:
!pip uninstall -y -q tensorflow keras tensorflow-estimator tensorflow-text
!pip install -q -U tensorflow-text tensorflow

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.11.0 requires tensorflow==2.18.0, but you have tensorflow 2.19.0 which is incompatible.
tf-keras 2.18.0 requires tensorflow<2.19,>=2.18, but you have tensorflow 2.19.0 which is incompatible.[0m[31m
[0m

In [33]:
from datasets import load_dataset
import logging
import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_text
from transformers import BertTokenizer
from torch.utils.data import DataLoader
import tensorflow_text as tf_text
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Load Dataset

In [34]:
dataset = load_dataset("valurank/Topic_Classification", split="train")

dataset = dataset.filter(lambda x: x['article_text'] is not None and len(str(x['article_text']).strip()) > 0)

# Encode Label
labels = dataset["topic"]
unique_labels = sorted(set(labels))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {v: k for k, v in label2id.items()}

def encode_label(data):
    data["topic"] = label2id[data["topic"]]
    return data

dataset = dataset.map(encode_label)
dataset = dataset.shuffle(seed=42)

train_dataset = dataset.select(range(0, 17500))
val_dataset = dataset.select(range(17500, len(dataset)))

# Tokenizer

In [35]:
# Memakai BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size
pad_token_id = tokenizer.pad_token_id

# Batches

In [36]:
def make_batches(dataset, tokenizer, batch_size=32):
    def tokenize_function(data):
        return tokenizer(
            data["article_text"],
            truncation=True,
            padding="max_length",
            max_length=64
        )

    tokenized = dataset.map(tokenize_function, batched=True)
    tokenized.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "topic"])

    def gen():
        for data in tokenized:
            yield (
                {
                    "input_ids": data["input_ids"],
                    "attention_mask": data["attention_mask"]
                },
                data["topic"]
            )

    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                "input_ids": tf.TensorSpec(shape=(64,), dtype=tf.int64),
                "attention_mask": tf.TensorSpec(shape=(64,), dtype=tf.int64),
            },
            tf.TensorSpec(shape=(), dtype=tf.int64)
        )
    ).repeat().batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [37]:
train_batches = make_batches(train_dataset, tokenizer)
val_batches = make_batches(val_dataset, tokenizer)

Map:   0%|          | 0/17500 [00:00<?, ? examples/s]

Map:   0%|          | 0/4962 [00:00<?, ? examples/s]

# Positional Encoding

In [38]:
def positional_encoding(length, depth):
    depth = depth / 2
    positions = np.arange(length)[:, np.newaxis]
    depths = np.arange(depth)[np.newaxis, :] / depth
    angle_rates = 1 / (10000**depths)
    angle_rads = positions * angle_rates
    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)
    return tf.cast(pos_encoding, dtype=tf.float32)

In [39]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=False)
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

# Attention

In [40]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

In [41]:
class GlobalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output = self.mha(query=x, value=x, key=x)
        x = self.add([x, attn_output])
        return self.layernorm(x)

# Feed Forward

In [42]:
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
            tf.keras.layers.Dropout(dropout_rate)
        ])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        x = self.add([x, self.seq(x)])
        return self.layer_norm(x)

# Encoder

In [43]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.self_attention = GlobalSelfAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = FeedForward(d_model, dff, dropout_rate)

    def call(self, x):
        x = self.self_attention(x)
        x = self.ffn(x)
        return x

In [44]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size):
        super().__init__()
        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.enc_layers = [
            EncoderLayer(d_model=d_model, num_heads=num_heads, dff=dff)
            for _ in range(num_layers)
        ]

    def call(self, x):
        x = self.pos_embedding(x)
        for layer in self.enc_layers:
            x = layer(x)
        return x

# Topic Classifier

In [45]:
class TopicClassifier(tf.keras.Model):
    def __init__(self, encoder, num_classes):
        super().__init__()
        self.encoder = encoder
        self.global_avg_pool = tf.keras.layers.GlobalAveragePooling1D()
        self.classifier = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, x):
        input_ids = x["input_ids"]
        x = self.encoder(input_ids)
        x = self.global_avg_pool(x)
        return self.classifier(x)

# Build Model

In [46]:
model = TopicClassifier(
    encoder=Encoder(
        num_layers=2,
        d_model=128,
        num_heads=4,
        dff=512,
        vocab_size=vocab_size
    ),
    num_classes=len(label2id)
)

In [47]:
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train Model

In [48]:
model.fit(
    train_batches,
    validation_data=val_batches,
    epochs=5,
    steps_per_epoch=len(train_dataset) // 32,
    validation_steps=len(val_dataset) // 32
)

Epoch 1/5
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m300s[0m 533ms/step - accuracy: 0.1138 - loss: 4.1987 - val_accuracy: 0.4502 - val_loss: 2.2141
Epoch 2/5
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 580ms/step - accuracy: 0.5566 - loss: 1.7361 - val_accuracy: 0.5107 - val_loss: 2.1703
Epoch 3/5
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 589ms/step - accuracy: 0.8230 - loss: 0.6678 - val_accuracy: 0.4929 - val_loss: 2.5046
Epoch 4/5
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 590ms/step - accuracy: 0.9479 - loss: 0.2175 - val_accuracy: 0.4889 - val_loss: 2.8825
Epoch 5/5
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 591ms/step - accuracy: 0.9737 - loss: 0.1104 - val_accuracy: 0.4919 - val_loss: 3.0395


<keras.src.callbacks.history.History at 0x7c2d4c240950>

# Test

In [49]:
def predict_articles(model, tokenizer, articles, label_map, max_length=64):
    encoded = tokenizer(
        articles,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="tf"
    )
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    preds = model(
        {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        },
        training=False
    )

    pred_ids = tf.argmax(preds, axis=1).numpy()

    for i, article in enumerate(articles):
        print(f"\nArticle {i+1}:\n{article[:250]}...")
        print(f"Predicted Topic: {label_map[pred_ids[i]]}")

In [50]:
test_articles = [
    "Taylor Swift was spotted leaving a studio in LA with rumors swirling about a new album.",
    "Lionel Messi scored twice and created two more as Inter Miami warmed up for the Club World Cup in style with a 5-1 win over the Columbus Crew in Major League Soccer on Saturday.",
    "NASA’s new space telescope has captured unprecedented images of distant galaxies.",
    "It's been 40 years since the controversial activist group Guerrilla Girls formed. Their most powerful campaign, the naked poster, broke new ground – and has had a lasting influence.",
    "Bitcoin prices soared after major tech companies announced crypto adoption plans.",
    "A recent study found that drinking coffee can help reduce the risk of heart disease.",
]

predict_articles(model, tokenizer, test_articles, id2label)


Article 1:
Taylor Swift was spotted leaving a studio in LA with rumors swirling about a new album....
Predicted Topic: Global Organizations

Article 2:
Lionel Messi scored twice and created two more as Inter Miami warmed up for the Club World Cup in style with a 5-1 win over the Columbus Crew in Major League Soccer on Saturday....
Predicted Topic: Soccer

Article 3:
NASA’s new space telescope has captured unprecedented images of distant galaxies....
Predicted Topic: Cosmology & The Universe

Article 4:
It's been 40 years since the controversial activist group Guerrilla Girls formed. Their most powerful campaign, the naked poster, broke new ground – and has had a lasting influence....
Predicted Topic: Climate Change

Article 5:
Bitcoin prices soared after major tech companies announced crypto adoption plans....
Predicted Topic: Crypto Trading & Speculation

Article 6:
A recent study found that drinking coffee can help reduce the risk of heart disease....
Predicted Topic: Epidemics & Ou

In [51]:
def evaluate_model(model, dataset, tokenizer, label_map, max_length=64):
    true_labels = []
    pred_labels = []

    for batch in dataset:
        input_ids = batch[0]["input_ids"]
        attention_mask = batch[0]["attention_mask"]
        labels = batch[1].numpy()

        preds = model(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask
            },
            training=False
        )
        pred_ids = tf.argmax(preds, axis=1).numpy()

        true_labels.extend(labels)
        pred_labels.extend(pred_ids)

    unique_label_ids = sorted(set(true_labels + pred_labels))

    print("Classification Report:\n")
    print(classification_report(
        true_labels,
        pred_labels,
        labels=unique_label_ids,
        target_names=[label_map[i] for i in unique_label_ids],
        zero_division=0
    ))

    acc = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, average="weighted", zero_division=0)
    recall = recall_score(true_labels, pred_labels, average="weighted", zero_division=0)
    f1 = f1_score(true_labels, pred_labels, average="weighted", zero_division=0)

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")

In [52]:
evaluate_model(model, val_batches.take(len(val_dataset) // 32), tokenizer, id2label)

Classification Report:

                                            precision    recall  f1-score   support

                 AI Policy and Regulations       0.38      0.61      0.47        31
                               AI Research       0.22      0.09      0.13        22
                               AI Startups       0.22      0.11      0.15        18
                                 Adventure       0.31      0.27      0.29        15
                         Aerobics & Cardio       0.00      0.00      0.00        16
               Africa Business & Economics       0.00      0.00      0.00         5
                           Africa politics       0.55      0.50      0.52        22
                               Agriculture       0.68      0.37      0.48        35
                           Art and Culture       0.62      0.48      0.54        61
                 Asia Business & Economics       0.23      0.23      0.23        13
                             Asia Politics       0.

# Eksperimen 3 Hyperparameter

In [53]:
def run_experiment(config, train_batches, val_batches, label_map, steps_per_epoch, validation_steps):
    print(f"\nTraining with config: {config}")

    model = TopicClassifier(
        encoder=Encoder(
            num_layers=config["num_layers"],
            d_model=config["d_model"],
            num_heads=config["num_heads"],
            dff=config["dff"],
            vocab_size=vocab_size
        ),
        num_classes=len(label_map)
    )

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    model.fit(
        train_batches,
        validation_data=val_batches,
        epochs=5,
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        verbose=2
    )

    print(f"\nEvaluation Result for config: {config}")
    evaluate_model(model, val_batches.take(validation_steps), tokenizer, label_map)

In [54]:
configs = [
    {"num_layers": 2, "d_model": 128, "num_heads": 4, "dff": 512},
    {"num_layers": 2, "d_model": 128, "num_heads": 4, "dff": 216},
    {"num_layers": 2, "d_model": 128, "num_heads": 2, "dff": 512},
]

steps_per_epoch = len(train_dataset) // 32
validation_steps = len(val_dataset) // 32

for config in configs:
    run_experiment(config, train_batches, val_batches, id2label, steps_per_epoch, validation_steps)


Training with config: {'num_layers': 2, 'd_model': 128, 'num_heads': 4, 'dff': 512}
Epoch 1/5
546/546 - 307s - 562ms/step - accuracy: 0.2276 - loss: 3.5068 - val_accuracy: 0.4661 - val_loss: 2.2265
Epoch 2/5
546/546 - 298s - 546ms/step - accuracy: 0.6131 - loss: 1.4813 - val_accuracy: 0.5145 - val_loss: 2.0867
Epoch 3/5
546/546 - 322s - 590ms/step - accuracy: 0.8632 - loss: 0.5200 - val_accuracy: 0.4956 - val_loss: 2.5140
Epoch 4/5
546/546 - 300s - 550ms/step - accuracy: 0.9548 - loss: 0.1759 - val_accuracy: 0.4911 - val_loss: 2.8533
Epoch 5/5
546/546 - 322s - 590ms/step - accuracy: 0.9799 - loss: 0.0894 - val_accuracy: 0.4855 - val_loss: 3.0452

Evaluation Result for config: {'num_layers': 2, 'd_model': 128, 'num_heads': 4, 'dff': 512}
Classification Report:

                                            precision    recall  f1-score   support

                 AI Policy and Regulations       0.55      0.39      0.45        31
                               AI Research       0.28      

# Pre LN

# Attention

In [55]:
class BaseAttentionPreLN(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.add = tf.keras.layers.Add()
        self.layernorm = tf.keras.layers.LayerNormalization()

In [56]:
class GlobalSelfAttentionPreLN(BaseAttentionPreLN):
    def call(self, x):
        norm_x = self.layernorm(x)
        attn_output = self.mha(query=norm_x, value=norm_x, key=norm_x)
        x = self.add([x, attn_output])
        return x

# Feed Forward

In [57]:
class FeedForwardPreLN(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
            tf.keras.layers.Dropout(dropout_rate)
        ])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        norm_x = self.layer_norm(x)
        x = self.add([x, self.seq(norm_x)])
        return x

# Encoder

In [58]:
class EncoderLayerPreLN(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.self_attention = GlobalSelfAttentionPreLN(num_heads=num_heads, key_dim=d_model)
        self.ffn = FeedForwardPreLN(d_model, dff, dropout_rate)

    def call(self, x):
        x = self.self_attention(x)
        x = self.ffn(x)
        return x

In [59]:
class EncoderPreLN(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size):
        super().__init__()
        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.enc_layers = [
            EncoderLayerPreLN(d_model=d_model, num_heads=num_heads, dff=dff)
            for _ in range(num_layers)
        ]

    def call(self, x):
        x = self.pos_embedding(x)
        for layer in self.enc_layers:
            x = layer(x)
        return x

# Topic Classifier

In [60]:
class TopicClassifier(tf.keras.Model):
    def __init__(self, encoder, num_classes):
        super().__init__()
        self.encoder = encoder
        self.global_avg_pool = tf.keras.layers.GlobalAveragePooling1D()
        self.classifier = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, x):
        input_ids = x["input_ids"]
        x = self.encoder(input_ids)
        x = self.global_avg_pool(x)
        return self.classifier(x)

# Build Model

In [61]:
model = TopicClassifier(
    encoder=EncoderPreLN(
        num_layers=2,
        d_model=128,
        num_heads=4,
        dff=512,
        vocab_size=vocab_size
    ),
    num_classes=len(label2id)
)

In [62]:
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train Model

In [63]:
model.fit(
    train_batches,
    validation_data=val_batches,
    epochs=5,
    steps_per_epoch=len(train_dataset) // 32,
    validation_steps=len(val_dataset) // 32
)

Epoch 1/5
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m298s[0m 529ms/step - accuracy: 0.1379 - loss: 4.0660 - val_accuracy: 0.4712 - val_loss: 2.1355
Epoch 2/5
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 590ms/step - accuracy: 0.6142 - loss: 1.4901 - val_accuracy: 0.4964 - val_loss: 2.2810
Epoch 3/5
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 528ms/step - accuracy: 0.8855 - loss: 0.4277 - val_accuracy: 0.4998 - val_loss: 2.9833
Epoch 4/5
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m281s[0m 516ms/step - accuracy: 0.9580 - loss: 0.1621 - val_accuracy: 0.5067 - val_loss: 3.1067
Epoch 5/5
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 476ms/step - accuracy: 0.9831 - loss: 0.0790 - val_accuracy: 0.5012 - val_loss: 3.3171


<keras.src.callbacks.history.History at 0x7c2d3bfd1c10>

# Test

In [64]:
def predict_articles(model, tokenizer, articles, label_map, max_length=64):
    encoded = tokenizer(
        articles,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="tf"
    )
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    preds = model(
        {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        },
        training=False
    )

    pred_ids = tf.argmax(preds, axis=1).numpy()

    for i, article in enumerate(articles):
        print(f"\nArticle {i+1}:\n{article[:250]}...")
        print(f"Predicted Topic: {label_map[pred_ids[i]]}")

In [65]:
test_articles = [
    "Taylor Swift was spotted leaving a studio in LA with rumors swirling about a new album.",
    "Lionel Messi scored twice and created two more as Inter Miami warmed up for the Club World Cup in style with a 5-1 win over the Columbus Crew in Major League Soccer on Saturday.",
    "NASA’s new space telescope has captured unprecedented images of distant galaxies.",
    "It's been 40 years since the controversial activist group Guerrilla Girls formed. Their most powerful campaign, the naked poster, broke new ground – and has had a lasting influence.",
    "Bitcoin prices soared after major tech companies announced crypto adoption plans.",
    "A recent study found that drinking coffee can help reduce the risk of heart disease.",
]

predict_articles(model, tokenizer, test_articles, id2label)


Article 1:
Taylor Swift was spotted leaving a studio in LA with rumors swirling about a new album....
Predicted Topic: Baseball

Article 2:
Lionel Messi scored twice and created two more as Inter Miami warmed up for the Club World Cup in style with a 5-1 win over the Columbus Crew in Major League Soccer on Saturday....
Predicted Topic: Soccer

Article 3:
NASA’s new space telescope has captured unprecedented images of distant galaxies....
Predicted Topic: Physics

Article 4:
It's been 40 years since the controversial activist group Guerrilla Girls formed. Their most powerful campaign, the naked poster, broke new ground – and has had a lasting influence....
Predicted Topic: Baseball

Article 5:
Bitcoin prices soared after major tech companies announced crypto adoption plans....
Predicted Topic: Baseball

Article 6:
A recent study found that drinking coffee can help reduce the risk of heart disease....
Predicted Topic: Baseball


# Evaluation

In [66]:
def evaluate_model(model, dataset, tokenizer, label_map, max_length=64):
    true_labels = []
    pred_labels = []

    for batch in dataset:
        input_ids = batch[0]["input_ids"]
        attention_mask = batch[0]["attention_mask"]
        labels = batch[1].numpy()

        preds = model(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask
            },
            training=False
        )
        pred_ids = tf.argmax(preds, axis=1).numpy()

        true_labels.extend(labels)
        pred_labels.extend(pred_ids)

    unique_label_ids = sorted(set(true_labels + pred_labels))

    print("Classification Report:\n")
    print(classification_report(
        true_labels,
        pred_labels,
        labels=unique_label_ids,
        target_names=[label_map[i] for i in unique_label_ids],
        zero_division=0
    ))

    acc = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, average="weighted", zero_division=0)
    recall = recall_score(true_labels, pred_labels, average="weighted", zero_division=0)
    f1 = f1_score(true_labels, pred_labels, average="weighted", zero_division=0)

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")

In [67]:
evaluate_model(model, val_batches.take(len(val_dataset) // 32), tokenizer, id2label)

Classification Report:

                                            precision    recall  f1-score   support

                 AI Policy and Regulations       0.35      0.42      0.38        31
                               AI Research       0.31      0.45      0.37        22
                               AI Startups       0.29      0.33      0.31        18
                                 Adventure       0.45      0.33      0.38        15
                         Aerobics & Cardio       0.50      0.31      0.38        16
               Africa Business & Economics       0.00      0.00      0.00         5
                           Africa politics       0.82      0.41      0.55        22
                               Agriculture       0.50      0.51      0.51        35
                           Art and Culture       0.46      0.56      0.50        61
                 Asia Business & Economics       0.22      0.54      0.31        13
                             Asia Politics       0.

# Eksperimen 3 Hyperparameter

In [68]:
def run_experiment(config, train_batches, val_batches, label_map, steps_per_epoch, validation_steps):
    print(f"\nTraining with config: {config}")

    model = TopicClassifier(
        encoder=EncoderPreLN(
            num_layers=config["num_layers"],
            d_model=config["d_model"],
            num_heads=config["num_heads"],
            dff=config["dff"],
            vocab_size=vocab_size
        ),
        num_classes=len(label_map)
    )

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    model.fit(
        train_batches,
        validation_data=val_batches,
        epochs=5,
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        verbose=2
    )

    print(f"\nEvaluation Result for config: {config}")
    evaluate_model(model, val_batches.take(validation_steps), tokenizer, label_map)

In [69]:
configs = [
    {"num_layers": 2, "d_model": 128, "num_heads": 4, "dff": 512},
    {"num_layers": 2, "d_model": 128, "num_heads": 4, "dff": 216},
    {"num_layers": 2, "d_model": 128, "num_heads": 2, "dff": 512},
]

steps_per_epoch = len(train_dataset) // 32
validation_steps = len(val_dataset) // 32

for config in configs:
    run_experiment(config, train_batches, val_batches, id2label, steps_per_epoch, validation_steps)


Training with config: {'num_layers': 2, 'd_model': 128, 'num_heads': 4, 'dff': 512}
Epoch 1/5
546/546 - 256s - 469ms/step - accuracy: 0.2913 - loss: 3.1466 - val_accuracy: 0.4917 - val_loss: 2.0346
Epoch 2/5
546/546 - 248s - 453ms/step - accuracy: 0.6903 - loss: 1.1562 - val_accuracy: 0.5010 - val_loss: 2.3709
Epoch 3/5
546/546 - 263s - 481ms/step - accuracy: 0.9062 - loss: 0.3259 - val_accuracy: 0.5111 - val_loss: 2.8465
Epoch 4/5
546/546 - 262s - 480ms/step - accuracy: 0.9658 - loss: 0.1388 - val_accuracy: 0.4996 - val_loss: 3.1328
Epoch 5/5
546/546 - 261s - 478ms/step - accuracy: 0.9803 - loss: 0.0919 - val_accuracy: 0.5167 - val_loss: 3.2356

Evaluation Result for config: {'num_layers': 2, 'd_model': 128, 'num_heads': 4, 'dff': 512}
Classification Report:

                                            precision    recall  f1-score   support

                 AI Policy and Regulations       0.51      0.61      0.56        31
                               AI Research       0.45      

# Ringkasan Perbandingan Performa

## Berdasarkan hasil evaluasi terhadap dua pendekatan arsitektur transformer, yaitu Post-Layer Normalization dan Pre-Layer Normalization, dapat disimpulkan bahwa secara umum performa keduanya berada pada tingkat yang relatif sebanding dalam hal accuracy, precision, recall, dan F1-score.

## Salah satu temuan penting adalah Pre-LN cenderung memberikan performa yang lebih stabil dan konsisten di berbagai konfigurasi hyperparameter. Meskipun tidak selalu menghasilkan skor tertinggi, Pre-LN menunjukkan hasil yang cukup merata tanpa fluktuasi besar. Sementara itu, Post-LN menunjukkan lebih sensitif terhadap perubahan hyperparameter. Hal ini memperlihatkan bahwa Post-LN mungkin memerlukan tuning yang lebih hati-hati dan Pre-LN mendukung pelatihan yang lebih stabil.