# **SMS SPAM DETECTION - FINE-TUNING INDOBERT**

## **Install and Import Libraries**

In [1]:
# Install the Hugging Face `transformers` package and TensorFlow; -q keeps pip's output quiet.
# NOTE(review): versions are unpinned, so a later re-run may pull different releases.
!pip install -q transformers tensorflow

In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.metrics import classification_report

## **Load Dataset**

In [3]:
# Read the three dataset splits into DataFrames.
# NOTE(review): the train/validation files use absolute /content paths while the
# test file is resolved relative to the working directory — confirm this is intended.
train = pd.read_csv(filepath_or_buffer='/content/train (1).csv')
val = pd.read_csv(filepath_or_buffer='/content/validation.csv')
test = pd.read_csv(filepath_or_buffer='testing.csv')

In [4]:
# Character count of every training message; the maximum is printed to help
# judge how long the tokenized sequences may get.
message_lengths = train['Pesan'].apply(len)
print(message_lengths.max())

427


## **Tokenize and Transform Dataset**

In [5]:
# Hugging Face Hub identifier of the pretrained checkpoint; reused later when loading the model.
model_name = 'indobenchmark/indobert-base-p1'
# Load the tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
def encode(text, tokenizer, max_length=500):
  """Tokenize a collection of messages into TensorFlow model inputs.

  Args:
    text: Object exposing ``.tolist()`` (e.g. a pandas Series of strings).
    tokenizer: Callable tokenizer; it is invoked with truncation and padding
      enabled and asked for TensorFlow tensors.
    max_length: Upper bound on the tokenized sequence length (default 500).

  Returns:
    A ``(input_ids, attention_mask)`` tuple taken from the tokenizer output.
  """
  batch = tokenizer(
      text.tolist(),
      truncation=True,
      padding=True,
      max_length=max_length,
      return_tensors='tf',
  )
  input_ids = batch['input_ids']
  attention_mask = batch['attention_mask']
  return input_ids, attention_mask

In [7]:
# Tokenize the 'Pesan' (message) column of each split with the shared tokenizer.
# Each split is encoded in its own call, so padding is applied per split.
train_input_ids, train_attention_mask = encode(text=train['Pesan'], tokenizer=tokenizer)
val_input_ids, val_attention_mask = encode(text=val['Pesan'], tokenizer=tokenizer)
test_input_ids, test_attention_mask = encode(text=test['Pesan'], tokenizer=tokenizer)

In [8]:
# Every column after the first one is used as the label of each split.
# NOTE(review): labels are stored as float32 although they are later fed to a
# sparse categorical loss — presumably a single 0/1 column; confirm against the CSVs.
train_labels = tf.convert_to_tensor(value=train.iloc[:, 1:].to_numpy(), dtype=tf.float32)
val_labels = tf.convert_to_tensor(value=val.iloc[:, 1:].to_numpy(), dtype=tf.float32)
test_labels = tf.convert_to_tensor(value=test.iloc[:, 1:].to_numpy(), dtype=tf.float32)

In [9]:
# Pair the training features (as a name -> tensor mapping) with their labels,
# one dataset element per message.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(input_ids=train_input_ids, attention_mask=train_attention_mask), train_labels)
)

In [10]:
# Pair the validation features (as a name -> tensor mapping) with their labels,
# one dataset element per message.
val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(input_ids=val_input_ids, attention_mask=val_attention_mask), val_labels)
)

In [11]:
# Pair the test features (as a name -> tensor mapping) with their labels,
# one dataset element per message.
test_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(input_ids=test_input_ids, attention_mask=test_attention_mask), test_labels)
)

In [12]:
# Number of messages per batch, shared by all three splits.
batch_size = 3

# Only the training split is shuffled; the buffer spans the whole training set.
train_dataset = train_dataset.shuffle(buffer_size=len(train)).batch(batch_size=batch_size)
# Validation and test keep their original order.
val_dataset = val_dataset.batch(batch_size=batch_size)
test_dataset = test_dataset.batch(batch_size=batch_size)

## **Load Model**

In [13]:
# Load the pretrained checkpoint with a sequence-classification head configured for 2 labels.
# The load log reports the 'classifier' layer as newly initialized, so it is learned during fine-tuning.
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

tf_model.h5:   0%|          | 0.00/656M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## **Fine-Tuning IndoBERT**

In [14]:
# Custom fine-tuning loop with early stopping on the validation loss.
# The weights of the best epoch are snapshotted and restored once training ends,
# so the model used afterwards is the one with the lowest validation loss rather
# than whatever the last epoch produced.
optimizer = Adam(learning_rate=3e-5)
# The model returns raw logits, hence from_logits=True.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Early-stopping state.
wait = 0                       # consecutive epochs without improvement
best_val_loss = float('inf')   # lowest validation loss seen so far
best_weights = None            # model weights captured at best_val_loss
tolerance = 3                  # epochs without improvement before stopping
min_delta = 1e-4               # minimum decrease that counts as an improvement

num_epoch = 5
for epoch in range(num_epoch):
  print(f"Epoch {epoch+1}/{num_epoch}")

  train_loss_total = 0.0
  train_steps = 0

  for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
    with tf.GradientTape() as tape:
      logits = model(x_batch_train, training=True).logits
      loss_value = loss_fn(y_batch_train, logits)

    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Convert to a Python float so the running total does not hold on to tensors
    # and the f-string format spec applies to a plain number.
    step_loss = float(loss_value)
    train_loss_total += step_loss
    train_steps += 1

    if step % 50 == 0:
      print(f"training loss at step {step}: {step_loss: .4f}")

  # max(..., 1) avoids a ZeroDivisionError when a dataset yields no batches.
  train_loss_avg = train_loss_total / max(train_steps, 1)

  val_loss_total = 0.0
  val_steps = 0

  for x_batch_val, y_batch_val in val_dataset:
    val_logits = model(x_batch_val, training=False).logits
    val_loss_total += float(loss_fn(y_batch_val, val_logits))
    val_steps += 1

  val_loss_avg = val_loss_total / max(val_steps, 1)

  print("=========================================================")
  print(f"Training loss: {train_loss_avg: .4f}")
  print(f"Validation loss: {val_loss_avg: .4f}")
  print("=========================================================")

  if val_loss_avg < best_val_loss - min_delta:
    best_val_loss = val_loss_avg
    # Snapshot the weights of the best epoch so they can be restored later.
    best_weights = model.get_weights()
    wait = 0
    print("Validation loss improved")
  else:
    wait += 1
    print(f"No improvement in validation loss. Tolerance: {wait}/{tolerance}")

  if wait >= tolerance:
    print("Early stopping triggered, Stopping training...")
    break

# wait > 0 means the final epoch(s) were worse than the best one: roll back.
if best_weights is not None and wait > 0:
  model.set_weights(best_weights)
  print(f"Restored weights from the best epoch (validation loss: {best_val_loss: .4f})")

Epoch 1/5
training loss at step 0:  0.6982
training loss at step 50:  0.0853
training loss at step 100:  2.0642
training loss at step 150:  0.0028
training loss at step 200:  0.0050
Training loss:  0.1823
Validation loss:  0.1537
Validation loss improved
Epoch 2/5
training loss at step 0:  0.0280
training loss at step 50:  0.4609
training loss at step 100:  1.1029
training loss at step 150:  0.0055
training loss at step 200:  1.4185
Training loss:  0.1028
Validation loss:  0.1377
Validation loss improved
Epoch 3/5
training loss at step 0:  0.0014
training loss at step 50:  0.0302
training loss at step 100:  0.0308
training loss at step 150:  0.0185
training loss at step 200:  0.0051
Training loss:  0.0862
Validation loss:  0.1709
No improvement in validation loss. Tolerance: 1/3
Epoch 4/5
training loss at step 0:  0.0013
training loss at step 50:  0.9167
training loss at step 100:  0.0075
training loss at step 150:  0.0007
training loss at step 200:  0.0011
Training loss:  0.0594
Valid

## **Evaluation**

In [15]:
# Run the model over the batched test set; the predicted class is the index of
# the largest logit.
preds = model.predict(test_dataset)
pred_labels = tf.argmax(preds.logits, axis=1)

# Gather the ground-truth labels by walking the same batched dataset, which is
# not shuffled, so they are collected in a fixed order.
true_labels = np.array(
    [row for _, batch_labels in test_dataset for row in batch_labels.numpy()]
)

# Class names passed to the report, in label order.
target_names = ['ham', 'spam']
report = classification_report(true_labels, pred_labels, target_names=target_names)
print(report)

              precision    recall  f1-score   support

         ham       1.00      0.97      0.99       111
        spam       0.98      1.00      0.99       118

    accuracy                           0.99       229
   macro avg       0.99      0.99      0.99       229
weighted avg       0.99      0.99      0.99       229



In [21]:
# Classify one hand-written message with the fine-tuned model.
label_mapping = {0: "ham", 1: "spam"}
test_text = "KAMU BERHASIL MEMENANGKAN UNDIAN"

# Same tokenizer settings as used for the dataset splits.
test_encodings = tokenizer(
    test_text,
    truncation=True,
    padding=True,
    max_length=500,
    return_tensors="tf",
)

# NOTE(review): this rebinds `preds`, which the evaluation cell also assigns.
preds = model(test_encodings)
predicted_indices = tf.argmax(preds.logits, axis=1).numpy()
pred_label = predicted_indices[0]  # only one message was passed in

print(f"Predicted label: {label_mapping[pred_label]}")

Predicted label: spam


## **Save Model and Tokenizer**

In [17]:
# Write the fine-tuned model and its tokenizer into the same 'indobert_ham_spam' directory.
model.save_pretrained('indobert_ham_spam')
tokenizer.save_pretrained('indobert_ham_spam')

('indobert_ham_spam/tokenizer_config.json',
 'indobert_ham_spam/special_tokens_map.json',
 'indobert_ham_spam/vocab.txt',
 'indobert_ham_spam/added_tokens.json',
 'indobert_ham_spam/tokenizer.json')

In [18]:
import shutil
from google.colab import files

# Zip the contents of the saved-model directory into indobert_ham_spam.zip ...
shutil.make_archive(base_name='indobert_ham_spam', format='zip', root_dir='indobert_ham_spam')
# ... and hand the archive to the Colab download helper.
files.download('indobert_ham_spam.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>