# INFORMASI
### IndoBERTweet  
### Model hasil MLM di Fine-tune ke data YouTube kembali

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import time
import itertools

import torch

import warnings
warnings.filterwarnings("ignore")

SEED = 11
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


In [13]:
df_yt = pd.read_csv('final_data_yt.csv')

df_yt.head()

Unnamed: 0,text,cyberbullying,clean_text_bert,clean_text_ML,clean_text_ML_2
0,Kalau cowok sudah sakit hati dan kecewa memang...,0,Kalau cowok sudah sakit hati dan kecewa memang...,kalau cowok sakit hati kecewa memang kayak mbak,kalau cowok sakit hati kecewa memang kayak mbak
1,Om ded kasih panggung ke cewek problematik bia...,1,Om ded kasih panggung ke cewek problematik bia...,om ded kasih panggung cewek problematik biaya ...,om ded kasih panggung cewek problematik biaya ...
2,om ded?? apa are kamu doing !!!!!!!!!!!!!,0,om ded? apa are kamu doing !,om ded apa are kamu doing,om ded apa are kamu doing
3,"Ya Allah,jauhkan anak anak kita dari pergaulan...",0,Ya Allah jauhkan anak anak kita dari pergaulan...,allah jauhkan anak anak pergaulan bebas dunia ...,jauh anak anak gaul bebas dunia malam hubung s...
4,"gue kira pintar people, tetapi hal kayak begin...",0,gue kira pintar people tetapi hal kayak begini...,gue kira pintar people kayak begini angkat pod...,gue kira pintar people kayak begini angkat pod...


In [14]:
df_yt = df_yt[['clean_text_bert', 'cyberbullying']]

In [15]:
df_yt['cyberbullying'].value_counts()

cyberbullying
0    405
1    245
Name: count, dtype: int64

In [16]:
df_yt

Unnamed: 0,clean_text_bert,cyberbullying
0,Kalau cowok sudah sakit hati dan kecewa memang...,0
1,Om ded kasih panggung ke cewek problematik bia...,1
2,om ded? apa are kamu doing !,0
3,Ya Allah jauhkan anak anak kita dari pergaulan...,0
4,gue kira pintar people tetapi hal kayak begini...,0
...,...,...
645,Padahal dia sadar lo nglakuin nya orang ya sam...,0
646,Tidak takut kena penyakit semoga ada hikmahnya,0
647,Kebanyakan main Uno sih akhirnya tumbangkan Ki...,1
648,Eeh Erica Berisik amat kamu Kalau memang kamu ...,0


In [17]:
df_yt = df_yt.rename(columns={'clean_text_bert': 'text', 'cyberbullying': 'label'})
df_yt

Unnamed: 0,text,label
0,Kalau cowok sudah sakit hati dan kecewa memang...,0
1,Om ded kasih panggung ke cewek problematik bia...,1
2,om ded? apa are kamu doing !,0
3,Ya Allah jauhkan anak anak kita dari pergaulan...,0
4,gue kira pintar people tetapi hal kayak begini...,0
...,...,...
645,Padahal dia sadar lo nglakuin nya orang ya sam...,0
646,Tidak takut kena penyakit semoga ada hikmahnya,0
647,Kebanyakan main Uno sih akhirnya tumbangkan Ki...,1
648,Eeh Erica Berisik amat kamu Kalau memang kamu ...,0


In [18]:
from sklearn.model_selection import train_test_split



# Split 70% Train, 30% Sisa
df_train, df_temp = train_test_split(df_yt, test_size=0.3, random_state=42, stratify=df_yt['label'])
# Split Sisa jadi 10% Val, 20% Test (1/3 dari sisa untuk Val)
df_val, df_test = train_test_split(df_temp, test_size=2/3, random_state=42, stratify=df_temp['label'])

print(f"Train: {len(df_train)}, Val: {len(df_val)}, Test: {len(df_test)}")


Train: 455, Val: 65, Test: 130


In [None]:
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
MODEL_NAME = './indobertweet-yt-dapt' # Model yang dipanggil ini model hasil MLM DAPT
MAX_LEN = 128

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class CyberBullyingDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.texts = df['text'].to_numpy()
        self.labels = df['label'].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [20]:
train_dataset = CyberBullyingDataset(df_train, tokenizer, MAX_LEN)
val_dataset = CyberBullyingDataset(df_val, tokenizer, MAX_LEN)
test_dataset = CyberBullyingDataset(df_test, tokenizer, MAX_LEN)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# IMBALANCE HANDLING (Calculate Weights)
class_labels = np.unique(df_train['label'])
class_weights = compute_class_weight('balanced', classes=class_labels, y=df_train['label'])
# Ubah ke FloatTensor (Nanti dipindah ke device di dalam Trainer)
weights_tensor = torch.tensor(class_weights, dtype=torch.float)

print(f"Class Weights: {class_weights}")

Class Weights: [0.80388693 1.32267442]


In [22]:
from transformers import Trainer
import torch.nn as nn

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # Pastikan weights ada di device yang sama dengan model
        loss_fct = nn.CrossEntropyLoss(weight=weights_tensor.to(model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

In [23]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

## Fine Tuning Model

In [None]:
from transformers import BertForSequenceClassification, TrainingArguments

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

BATCH_SIZE = 4      
GRAD_ACCUM = 2     
EPOCHS = 8
LR = 2e-5

# MODEL INITIALIZATION
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model = model.to(device)

# TRAINING ARGUMENTS
training_args = TrainingArguments(
    output_dir='./results',          # Folder output
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE, # Batch size fisik (4)
    per_device_eval_batch_size=8,          
    gradient_accumulation_steps=GRAD_ACCUM, # Akumulasi jadi (4x2=8)
    
    fp16=True,                       # MIXED PRECISION 
    
    learning_rate=LR,
    eval_strategy="epoch",     # Evaluasi tiap akhir epoch
    save_strategy="epoch",           # Save model tiap akhir epoch
    load_best_model_at_end=True,     # Load model terbaik di akhir training
    metric_for_best_model="f1",      # Patokan model terbaik adalah F1 Score
    greater_is_better=True,                                                             
    save_total_limit=2,              
    logging_dir='./logs',
    logging_steps=10,
    dataloader_num_workers=0         
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./indobertweet-yt-dapt and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Device: cuda


In [25]:
from transformers import EarlyStoppingCallback

# 9. INITIALIZE TRAINER
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] 
)

In [26]:
print("\n== TRAINING INDOBERTWEET ==")
trainer.train()


== TRAINING INDOBERTWEET ==


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5835,0.529415,0.738462,0.761583,0.775407,0.737467
2,0.4876,0.489959,0.784615,0.775,0.751524,0.759514
3,0.3395,0.660431,0.753846,0.752551,0.70122,0.711111
4,0.2377,0.645131,0.830769,0.8175,0.822663,0.819854
5,0.0853,0.864986,0.846154,0.839323,0.82622,0.831781
6,0.1167,0.985981,0.846154,0.839323,0.82622,0.831781
7,0.0061,0.989863,0.846154,0.839323,0.82622,0.831781
8,0.0654,1.00422,0.830769,0.819876,0.814024,0.816714


TrainOutput(global_step=456, training_loss=0.24590868861496187, metrics={'train_runtime': 106.9699, 'train_samples_per_second': 34.028, 'train_steps_per_second': 4.263, 'total_flos': 239431060377600.0, 'train_loss': 0.24590868861496187, 'epoch': 8.0})

## Hasil Evaluasi

In [27]:
from sklearn.metrics import classification_report

preds_output = trainer.predict(test_dataset)

y_preds = np.argmax(preds_output.predictions, axis=1)
y_true = preds_output.label_ids

print(classification_report(y_true, y_preds, target_names=['Biasa', 'Cyberbullying']))

               precision    recall  f1-score   support

        Biasa       0.80      0.95      0.87        81
Cyberbullying       0.88      0.61      0.72        49

     accuracy                           0.82       130
    macro avg       0.84      0.78      0.80       130
 weighted avg       0.83      0.82      0.81       130

