In [1]:
!pip install -U datasets



In [64]:
from datasets import load_dataset

# Load hanya split 'train'
dataset = load_dataset("valurank/Topic_Classification", split="train")

dataset = dataset.filter(lambda x: x['article_text'] is not None and len(str(x['article_text']).strip()) > 0)

labels = dataset["topic"]

unique_labels = sorted(set(labels))
label2id = {label: i for i, label in enumerate(unique_labels)}

# 4. Ubah semua label string ke angka
def encode_label(example):
    example["topic"] = label2id[example["topic"]]
    return example

dataset = dataset.map(encode_label)

train_dataset = dataset.select(range(0, 17500))
val_dataset = dataset.select(range(17500, len(dataset)))

In [3]:
!pip uninstall -y -q tensorflow keras tensorflow-estimator tensorflow-text
!pip install -q -U tensorflow-text tensorflow

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.11.0 requires tensorflow==2.18.0, but you have tensorflow 2.19.0 which is incompatible.
tf-keras 2.18.0 requires tensorflow<2.19,>=2.18, but you have tensorflow 2.19.0 which is incompatible.[0m[31m
[0m

In [65]:
import logging
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

import tensorflow_text

from transformers import BertTokenizer
from torch.utils.data import DataLoader

In [66]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [67]:
def make_batches(dataset, tokenizer, batch_size=32):
    def tokenize_function(example):
        return tokenizer(
            example["article_text"],
            truncation=True,
            padding="max_length",
            max_length=64
        )

    tokenized = dataset.map(tokenize_function, batched=True)

    tokenized.set_format("torch", columns=["input_ids", "attention_mask", "topic"])
    return DataLoader(tokenized, batch_size=batch_size, shuffle=True)

In [68]:
train_batches = make_batches(train_dataset, tokenizer)
val_batches = make_batches(val_dataset, tokenizer)

Map:   0%|          | 0/17500 [00:00<?, ? examples/s]

Map:   0%|          | 0/4962 [00:00<?, ? examples/s]

In [69]:
for batch in train_batches:
    break

input = batch["input_ids"]          # (batch_size, seq_len)
attension_mask = batch["attention_mask"]     # (batch_size, seq_len)
labels = batch["topic"]       # (batch_size,)

print(input.shape)           # torch.Size([32, 64])
print(labels.shape)           # torch.Size([32, 64])
print(labels.shape)    # torch.Size([32])

print(input[0][:10])         # input_ids baris pertama
print(labels[0])       # label pertama (int)

torch.Size([32, 64])
torch.Size([32])
torch.Size([32])
tensor([  101,  2188,  2739,  1006,  3746,  4923,  1024, 28180, 14758,  1007])
tensor(95)


In [70]:
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1)

  return tf.cast(pos_encoding, dtype=tf.float32)

In [71]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x

In [72]:
vocab_size = tokenizer.vocab_size

embed_article = PositionalEmbedding(vocab_size=vocab_size, d_model=128)

article_emb = embed_article(input)

In [73]:
article_emb._keras_mask

<tf.Tensor: shape=(32, 64), dtype=bool, numpy=
array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])>

In [74]:
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [75]:
sample_gsa = GlobalSelfAttention(num_heads=2, key_dim=512)

print(article_emb.shape)
print(sample_gsa(article_emb).shape)

(32, 64, 128)
(32, 64, 128)


In [76]:
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x)
    return x

In [77]:
sample_ffn = FeedForward(128, 512)

print(article_emb.shape)
print(sample_ffn(article_emb).shape)

(32, 64, 128)
(32, 64, 128)


In [78]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [79]:
sample_encoder_layer = EncoderLayer(d_model=128, num_heads=8, dff=512)

print(article_emb.shape)
print(sample_encoder_layer(article_emb).shape)

(32, 64, 128)
(32, 64, 128)


In [80]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

    # Add dropout.
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x  # Shape `(batch_size, seq_len, d_model)`.

In [81]:
# Instantiate the encoder.
sample_encoder = Encoder(num_layers=4,
                         d_model=128,
                         num_heads=8,
                         dff=512,
                         vocab_size=vocab_size)

sample_encoder_output = sample_encoder(input, training=False)

# Print the shape.
print(input.shape)
print(sample_encoder_output.shape)  # Shape `(batch_size, input_seq_len, d_model)`.



torch.Size([32, 64])
(32, 64, 128)


In [82]:
class TopicClassifier(tf.keras.Model):
  def __init__(self, encoder, num_classes):
    super().__init__()
    self.encoder = encoder
    self.global_avg_pool = tf.keras.layers.GlobalAveragePooling1D()
    self.classifier = tf.keras.layers.Dense(num_classes, activation='softmax')

  def call(self, x):
    x = self.encoder(x)                   # (batch, seq_len, d_model)
    x = self.global_avg_pool(x)          # (batch, d_model)
    return self.classifier(x)            # (batch, num_classes)


In [83]:
num_classes = len(label2id)

model = TopicClassifier(sample_encoder, num_classes)

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)


In [84]:
def convert_to_tf_dataset(dataloader):
    for batch in dataloader:
        yield tf.convert_to_tensor(batch['input_ids']), tf.convert_to_tensor(batch['topic'])

train_tf_dataset = tf.data.Dataset.from_generator(
    lambda: convert_to_tf_dataset(train_batches),
    output_signature=(
        tf.TensorSpec(shape=(None, 64), dtype=tf.int64),
        tf.TensorSpec(shape=(None,), dtype=tf.int64)
    )
).prefetch(tf.data.AUTOTUNE)

val_tf_dataset = tf.data.Dataset.from_generator(
    lambda: convert_to_tf_dataset(val_batches),
    output_signature=(
        tf.TensorSpec(shape=(None, 64), dtype=tf.int64),
        tf.TensorSpec(shape=(None,), dtype=tf.int64)
    )
).prefetch(tf.data.AUTOTUNE)


In [85]:
model.fit(
    train_tf_dataset,
    validation_data=val_tf_dataset,
    epochs=5
)


Epoch 1/5
    547/Unknown [1m881s[0m 2s/step - accuracy: 0.0239 - loss: 4.4769



[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m953s[0m 2s/step - accuracy: 0.0239 - loss: 4.4767 - val_accuracy: 0.0000e+00 - val_loss: 8.7841
Epoch 2/5
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m933s[0m 2s/step - accuracy: 0.0286 - loss: 4.3797 - val_accuracy: 0.0000e+00 - val_loss: 9.0337
Epoch 3/5
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m920s[0m 2s/step - accuracy: 0.0283 - loss: 4.3718 - val_accuracy: 0.0000e+00 - val_loss: 9.3323
Epoch 4/5
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m919s[0m 2s/step - accuracy: 0.0284 - loss: 4.3575 - val_accuracy: 0.0000e+00 - val_loss: 9.4706
Epoch 5/5
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m941s[0m 2s/step - accuracy: 0.0303 - loss: 4.3558 - val_accuracy: 0.0000e+00 - val_loss: 9.5749


<keras.src.callbacks.history.History at 0x7cbb13916e50>

In [88]:
def predict_articles(model, tokenizer, articles, label_map, max_length=64):
    """
    Memprediksi topik dari daftar artikel teks.

    model:      Trained TopicClassifier model (tf.keras.Model)
    tokenizer:  Hugging Face tokenizer (e.g., BertTokenizer)
    articles:   List of string (artikel-artikel)
    label_map:  Dict mapping int label -> nama topik (id2label)
    max_length: Panjang maksimum token input
    """
    # Tokenisasi semua artikel
    encoded = tokenizer(
        articles,
        truncation=True,
        padding="max_length",
        max_length=64,
        return_tensors="tf"
    )

    input_ids = encoded["input_ids"]

    # Prediksi
    preds = model(input_ids, training=False)
    pred_ids = tf.argmax(preds, axis=1).numpy()

    # Tampilkan hasil
    for i, article in enumerate(articles):
        print(f"\n📝 Article {i+1}:\n{article[:250]}...")  # Print sebagian isi artikel
        print(f"🔮 Predicted Topic: {label_map[pred_ids[i]]}")


In [89]:
# Buat mapping id → label (jika belum)
id2label = {v: k for k, v in label2id.items()}

# Contoh artikel uji
test_articles = [
    "Taylor Swift was spotted leaving a studio in LA with rumors swirling about a new album.",
    "The government has passed a new education reform bill aimed at reducing school fees.",
    "NASA’s new space telescope has captured unprecedented images of distant galaxies.",
    "The latest football championship saw an unexpected win by the underdog team.",
    "Bitcoin prices soared after major tech companies announced crypto adoption plans.",
    "A recent study found that drinking coffee can help reduce the risk of heart disease.",
]

# Prediksi topiknya
predict_articles(model, tokenizer, test_articles, id2label)



📝 Article 1:
Taylor Swift was spotted leaving a studio in LA with rumors swirling about a new album....
🔮 Predicted Topic: Epidemics & Outbreaks

📝 Article 2:
The government has passed a new education reform bill aimed at reducing school fees....
🔮 Predicted Topic: Epidemics & Outbreaks

📝 Article 3:
NASA’s new space telescope has captured unprecedented images of distant galaxies....
🔮 Predicted Topic: Epidemics & Outbreaks

📝 Article 4:
The latest football championship saw an unexpected win by the underdog team....
🔮 Predicted Topic: Epidemics & Outbreaks

📝 Article 5:
Bitcoin prices soared after major tech companies announced crypto adoption plans....
🔮 Predicted Topic: Epidemics & Outbreaks

📝 Article 6:
A recent study found that drinking coffee can help reduce the risk of heart disease....
🔮 Predicted Topic: Epidemics & Outbreaks
