In [None]:
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

def tokenize_text(text, max_length=500):
    """Tokenize a collection of texts into padded TensorFlow tensors.

    Args:
        text: A pandas Series (or any object exposing ``to_list()``), a single
            string, or any other iterable of strings.
        max_length: Maximum number of tokens kept per text; longer inputs are
            truncated.

    Returns:
        A ``(token_ids, attention_mask)`` tuple read from the tokenizer output
        keys ``'input_ids'`` and ``'attention_mask'``.
    """
    # Normalise the input to a plain list so callers are not forced to pass a
    # pandas Series. A bare string is wrapped rather than split into characters.
    if isinstance(text, str):
        texts = [text]
    elif hasattr(text, 'to_list'):
        texts = text.to_list()
    else:
        texts = list(text)

    # padding=True pads to the longest sequence of this call, so separate calls
    # may yield different sequence lengths.
    encoded = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='tf')
    token_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']

    return token_ids, attention_mask

In [None]:
# Tokenize the 'text' column of every split. Each call pads to the longest
# sequence within that split only.
strengthtrain_token_ids, strengthtrain_attention_mask = tokenize_text(
    df_strengthness_train['text']
)
strengthval_token_ids, strengthval_attention_mask = tokenize_text(
    df_strengthness_val['text']
)
strengthtest_token_ids, strengthtest_attention_mask = tokenize_text(
    df_strengthness_test['text']
)

In [None]:
# Every column after the first is taken as a label column and cast to float32.
# NOTE(review): assumes column 0 is the text column and the rest are one-hot
# class indicators — confirm against the dataframe schema.
label_strengthness_train = tf.convert_to_tensor(
    df_strengthness_train.iloc[:, 1:].to_numpy(), dtype=tf.float32
)
label_strengthness_test = tf.convert_to_tensor(
    df_strengthness_test.iloc[:, 1:].to_numpy(), dtype=tf.float32
)
label_strengthness_val = tf.convert_to_tensor(
    df_strengthness_val.iloc[:, 1:].to_numpy(), dtype=tf.float32
)

In [None]:
# Pair each split's model inputs (as a feature dict) with its label tensor and
# slice them row by row into a tf.data.Dataset.
formatted_strengthness_train = tf.data.Dataset.from_tensor_slices((
    dict(input_ids=strengthtrain_token_ids, attention_mask=strengthtrain_attention_mask),
    label_strengthness_train,
))
formatted_strengthness_test = tf.data.Dataset.from_tensor_slices((
    dict(input_ids=strengthtest_token_ids, attention_mask=strengthtest_attention_mask),
    label_strengthness_test,
))
formatted_strengthness_val = tf.data.Dataset.from_tensor_slices((
    dict(input_ids=strengthval_token_ids, attention_mask=strengthval_attention_mask),
    label_strengthness_val,
))

In [None]:
batch_size = 8

# NOTE: these statements rebind the same names they read from, so re-running
# this cell on its own would batch already-batched datasets. Re-run the dataset
# construction cell first.

# Only the training split is shuffled; the buffer covers the whole split.
formatted_strengthness_train = (
    formatted_strengthness_train
    .shuffle(len(df_strengthness_train))
    .batch(batch_size)
)
# Evaluation splits keep their original order.
formatted_strengthness_test = formatted_strengthness_test.batch(batch_size)
formatted_strengthness_val = formatted_strengthness_val.batch(batch_size)

In [None]:
# The classification head is sized from the label tensor: one output per label column.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=label_strengthness_train.shape[1],
)

tf_model.h5:   0%|          | 0.00/656M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# from_logits=True: the loss is fed the model's raw logits, so no softmax is
# applied before computing it.
loss = CategoricalCrossentropy(from_logits=True)
optimizer = Adam(learning_rate=3e-5)

In [None]:
# Custom training loop with validation after every epoch and patience-based
# early stopping on the average validation loss.
tolerance = 3      # consecutive non-improving epochs allowed before stopping
min_delta = 1e-4   # minimum drop in validation loss that counts as an improvement
num_epoch = 3
# NOTE(review): with tolerance == num_epoch, `wait` can only reach `tolerance`
# on the final epoch, so early stopping cannot cut training short. Lower
# `tolerance` or raise `num_epoch` if early stopping is intended.
wait = 0
best_loss = float('inf')

for epoch in range(num_epoch):
    # Show a 1-based epoch index so the last epoch reads "N/N".
    print(f"Epoch {epoch + 1}/{num_epoch}")
    print("="*30)
    train_loss_total = 0.0
    train_step = 0

    for x_batch_train, y_batch_train in formatted_strengthness_train:
        with tf.GradientTape() as tape:
            logits = model(x_batch_train, training=True).logits
            loss_value = loss(y_batch_train, logits)

        gradients = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(gradients, model.trainable_weights))

        # Accumulate a plain Python float rather than a tensor.
        batch_loss = float(loss_value)
        train_loss_total += batch_loss
        train_step += 1
        if train_step % 100 == 0:
            print(f"Training loss (step {train_step}): {batch_loss:.4f}")

    # Computed once per epoch; max() guards against an empty training dataset.
    train_loss_avg = train_loss_total / max(train_step, 1)

    # Validation accumulators are reset here, once per epoch, not inside the
    # training loop.
    val_loss_total = 0.0
    val_step = 0
    for x_batch_val, y_batch_val in formatted_strengthness_val:
        val_logits = model(x_batch_val, training=False).logits
        val_loss_value = loss(y_batch_val, val_logits)

        val_loss_total += float(val_loss_value)
        val_step += 1
    # Counting batches avoids relying on len(dataset) being defined.
    val_loss_avg = val_loss_total / max(val_step, 1)
    print("="*30)
    print(f"Training loss: {train_loss_avg:.4f}")
    print(f"Validation loss: {val_loss_avg:.4f}")

    # An epoch counts as an improvement only if it beats the best validation
    # loss by more than min_delta.
    if val_loss_avg < best_loss - min_delta:
        best_loss = val_loss_avg
        wait = 0
        print("Loss Berkurang")
    else:
        wait += 1
        print ("Loss Meningkat")

    if wait >= tolerance:
        print("Early stopping triggered")
        break

Tidak menemukan checkpoint. Mulai dari awal...
Epoch 0/3
----------
Training loss (step 100): 0.5439
Training loss (step 200): 0.5064
Training loss (step 300): 0.3042
Training loss (step 400): 0.5635
Training loss (step 500): 0.3013
Training loss (step 600): 0.2695
Training loss (step 700): 0.2263
Training loss (step 800): 0.0874
Training loss (step 900): 0.2808
Training loss (step 1000): 0.7972
Training loss (step 1100): 0.4569
Training loss (step 1200): 0.4214
Training loss (step 1300): 0.5282
Training loss (step 1400): 0.5789
Training loss (step 1500): 0.3204
Training loss: 0.3855
Validation loss: 0.3858
Loss Berkurang
Epoch 1/3
----------
Training loss (step 100): 0.2647
Training loss (step 200): 0.2639
Training loss (step 300): 0.4639
Training loss (step 400): 0.6202
Training loss (step 500): 0.3695
Training loss (step 600): 0.2164
Training loss (step 700): 0.5444
Training loss (step 800): 0.2564
Training loss (step 900): 0.4184
Training loss (step 1000): 0.0707
Training loss (ste

In [None]:
threshold = 0.5  # not referenced in this cell; predictions use argmax below

# Run the model over the test split and turn logits into class probabilities.
preds = model.predict(formatted_strengthness_test)
probs = tf.nn.softmax(preds.logits, axis=-1).numpy()
preds_label = probs.argmax(axis=1)

# Gather the label rows batch by batch in dataset order, then reduce each row
# to the index of its largest entry.
true_labels = [
    row
    for _, batch_labels in formatted_strengthness_test
    for row in batch_labels.numpy()
]
true_labels = np.argmax(true_labels, axis=1)

# NOTE(review): assumes the label columns are ordered weak, moderate, strong —
# confirm against the dataframe.
target_names = ['hs_weak', 'hs_moderate', 'hs_strong']

report = classification_report(
    true_labels,
    preds_label,
    target_names=target_names,
    zero_division=0,
)
accuracy = accuracy_score(true_labels, preds_label)

print(report)
print(f"Accuracy: {accuracy}")

              precision    recall  f1-score   support

     hs_weak       0.87      0.99      0.93      3122
 hs_moderate       0.00      0.00      0.00       458
   hs_strong       0.75      0.80      0.77       137

    accuracy                           0.86      3717
   macro avg       0.54      0.60      0.57      3717
weighted avg       0.76      0.86      0.81      3717

Accuracy: 0.8638687113263385


In [None]:
# Persist the fine-tuned model and tokenizer into one directory, zip that
# directory and hand the archive to the browser.
output_dir = 'hs_strengthness_bert'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# root_dir must be the directory that was just written. The previous value
# ('hs_multilabel_bert') pointed at a different directory than the one saved to.
shutil.make_archive(output_dir, 'zip', output_dir)
files.download(f'{output_dir}.zip')