In [1]:
# Prepare the Google Colab environment and build the custom library
# !git clone https://github.com/kaenova/Headline_Detection.git
# %cd "/content/Headline_Detection"

# !make lib

In [2]:
# # Reset Google Colab Environment
# %cd ..
# !rm -fr Headline_Detection

# Load Dataset

In [3]:
import os
import pandas as pd
# Load every scenario's CSV files into memory.
# Resulting layout: scenario_data[<scenario number>][<csv file name>] -> DataFrame
data_path = "../../data/4. Processed/"
# Sorted so the scenario order does not depend on the file system's listing order.
folders = sorted(os.listdir(data_path))
scenario_data = {}

for folder in folders:
    current_data_path = os.path.join(data_path, folder)
    if not os.path.isdir(current_data_path):
        continue
    # Scenario folders are keyed by their integer name. Directories whose name
    # is not an integer (e.g. ".ipynb_checkpoints") are skipped instead of
    # crashing the whole load with a ValueError.
    try:
        scenario_id = int(folder)
    except ValueError:
        continue
    files = sorted(os.listdir(current_data_path))
    data_dict = {}
    for file in files:
        current_file = os.path.join(current_data_path, file)
        # Only regular CSV files are data; ignore sub-directories and other files.
        if not (os.path.isfile(current_file) and file.lower().endswith(".csv")):
            continue
        data_dict[file] = pd.read_csv(current_file)
    scenario_data[scenario_id] = data_dict

In [4]:
import torch
import pandas
import typing
from torch.utils.data import Dataset
from kaelib.processor import TextProcessingPipeline


class TextClassificationDataset(Dataset):
    """Map-style dataset pairing raw text with an integer class label.

    Texts are kept as Python strings and labels as a single integer tensor.
    Indexing always returns the text as a *list* of strings (a one-element
    list for a scalar index), optionally run through ``preprocessor``.

    Args:
        df: Source frame holding the text and label columns.
        x_column_name: Column with the text; values are cast to ``str``.
        y_column_name: Column with the labels; values are cast to ``int``.
        preprocessor: Optional object exposing ``process_corpus(list_of_str)``.
            When given, its return value replaces the text list.
    """

    def __init__(
        self,
        df: "pandas.DataFrame",
        x_column_name: "str" = "tweet",
        y_column_name: "str" = "labels",
        preprocessor: "typing.Optional[TextProcessingPipeline]" = None,
    ):
        self.x = df[x_column_name].astype(str).to_list()
        self.y = torch.tensor(df[y_column_name].astype(int).to_list())
        assert len(self.x) == len(self.y)
        self.preprocessor = preprocessor

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def _process_idx_text(self, idx):
        """Return the text(s) at ``idx`` as a list, preprocessed if configured."""
        # A slice already yields a list; a scalar index is wrapped so the
        # preprocessor always receives a list of strings.
        data = self.x[idx] if isinstance(idx, slice) else [self.x[idx]]
        if self.preprocessor is not None:
            data = self.preprocessor.process_corpus(data)
        return data

    def __getitem__(self, idx):
        """Return ``(texts, labels)`` for an integer index or a slice."""
        processed_corpus = self._process_idx_text(idx)
        return processed_corpus, self.y[idx]

    def __repr__(self) -> str:
        """Preview up to the first five ``text : label`` pairs, one per line."""
        # Clamp to the dataset size: a fixed range(5) raised IndexError on
        # datasets with fewer than five rows.
        preview_count = min(5, len(self))
        return "\n".join(f"{self.x[i]} : {self.y[i]}" for i in range(preview_count))


# Wrap every loaded DataFrame in a TextClassificationDataset while keeping the
# same two-level layout: scenario_datasets[<scenario>][<file name>] -> dataset.
scenario_datasets = {}
for scenario, frames_by_file in scenario_data.items():
    scenario_datasets[scenario] = {
        file_name: TextClassificationDataset(frame)
        for file_name, frame in frames_by_file.items()
    }


ModuleNotFoundError: No module named 'NDETCStemmerWraper'

# Modelling

In [14]:
# Hyperparameters shared by the model and training cells below.
# The quoted passage is the fine-tuning recipe from the BERT paper, kept for reference.
"""
https://arxiv.org/pdf/1810.04805.pdf

We use a batch size of 32 and fine-tune for 3
epochs over the data for all GLUE tasks. For each
task, we selected the best fine-tuning learning rate
(among 5e-5, 4e-5, 3e-5, and 2e-5)
"""

# NOTE(review): the values below differ from the quoted recipe (batch size 2
# instead of 32, learning rate 2e-6 instead of 2e-5..5e-5) - confirm this is
# intentional.
batch_size = 2          # samples per optimisation step
learning_rate = 2e-6    # optimizer step size
seq_length = 256        # token length every input is padded/truncated to

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import lightning.pytorch as pl
from transformers import BertForSequenceClassification, BertTokenizerFast
from torchmetrics.classification import F1Score, Accuracy, Recall, Precision


class BERTClassifier(pl.LightningModule):
    """LightningModule that fine-tunes a Hugging Face BERT classifier on raw strings.

    The module owns its tokenizer, so ``forward`` takes a batch of strings and
    returns one row of class logits per string. Loss is cross-entropy against
    integer class labels; F1, accuracy, precision and recall are tracked with
    multiclass torchmetrics.

    Args:
        huggingface_model_name: Hub id or path passed to ``from_pretrained`` for
            both the tokenizer and the model.
        seq_length: Token length every input is padded/truncated to.
        out_feature: Number of output classes.
        pad_sequence: Stored on the instance but not consulted by any method of
            this class; tokenization always pads to ``seq_length``.
        learning_rate: Adam learning rate. When ``None`` (the default) the
            notebook-level ``learning_rate`` variable is used, which is what
            this class did before the argument existed.
    """

    def __init__(
        self,
        huggingface_model_name: "str",
        seq_length: "int" = 256,
        out_feature: "int" = 2,
        pad_sequence: "bool" = False,
        learning_rate: "float | None" = None,
    ):
        super().__init__()
        self.seq_length = seq_length
        self.pad_sequence = pad_sequence
        self.learning_rate = learning_rate
        self.tokenizer = BertTokenizerFast.from_pretrained(huggingface_model_name)
        self.huggingface_model = BertForSequenceClassification.from_pretrained(
            huggingface_model_name,
            num_labels=out_feature,
            # Every sample has exactly one class (cross-entropy loss, multiclass
            # metrics), so the head is declared single-label. The logits are
            # unaffected: the loss is computed in this module, not by the model.
            problem_type="single_label_classification",
        )

        self.f1_scorer = F1Score(task="multiclass", num_classes=out_feature)
        self.accuracy_scorer = Accuracy(task="multiclass", num_classes=out_feature)
        self.precision_scorer = Precision(task="multiclass", num_classes=out_feature)
        self.recall_scorer = Recall(task="multiclass", num_classes=out_feature)

    def _forward_huggingface_tokenizers(self, x: "list[str]"):
        """Tokenize ``x`` into fixed-length PyTorch tensors."""
        bert_tokens = self.tokenizer(
            x,
            # Use the instance's own setting; reading the notebook global here
            # silently ignored the constructor argument.
            max_length=self.seq_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return bert_tokens

    def forward(self, x: "list[str]") -> "torch.Tensor":
        """Return the classification logits for a batch of raw strings."""
        tokens = self._forward_huggingface_tokenizers(x)
        # The tokenizer builds its tensors on the CPU; move them next to the
        # model weights so the forward pass also works on an accelerator.
        tokens = tokens.to(self.device)
        logits = self.huggingface_model(**tokens).logits  # type: ignore

        return logits

    def _shared_step(self, batch, stage: "str", extended_metrics: "bool"):
        """Run one batch, log ``<stage>_*`` metrics and return the loss.

        ``extended_metrics`` additionally logs precision and recall, which the
        validation and test loops report but the training loop does not.
        """
        x, y = batch
        # The texts are read from index 0 of the batch's first element; this
        # presumably matches how the DataLoader collates the one-element text
        # lists produced by the dataset - verify if the dataset changes.
        pred = self.forward(x[0])
        loss = F.cross_entropy(pred, y)
        self.log(f"{stage}_loss", loss)
        f1_score = self.f1_scorer(pred, y)
        self.log(f"{stage}_f1", f1_score, prog_bar=True)
        accuracy = self.accuracy_scorer(pred, y)
        self.log(f"{stage}_accuracy", accuracy, prog_bar=True)
        if extended_metrics:
            precision = self.precision_scorer(pred, y)
            self.log(f"{stage}_precision", precision, prog_bar=True)
            recall = self.recall_scorer(pred, y)
            self.log(f"{stage}_recall", recall, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Training loop body: log loss/F1/accuracy and return the loss."""
        return self._shared_step(batch, "training", extended_metrics=False)

    def validation_step(self, batch, batch_idx):
        """Validation loop body: log loss, F1, accuracy, precision and recall."""
        self._shared_step(batch, "validation", extended_metrics=True)

    def test_step(self, batch, batch_idx):
        """Test loop body: log loss, F1, accuracy, precision and recall."""
        self._shared_step(batch, "test", extended_metrics=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters of the module."""
        # Fall back to the notebook-level `learning_rate` when none was passed
        # to the constructor, preserving the original behaviour.
        lr = self.learning_rate if self.learning_rate is not None else learning_rate
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        return optimizer


In [18]:
# Smoke test: build the classifier and push two dummy strings through it.
# The displayed shape should be (number of inputs, number of classes).
model = BERTClassifier(
    huggingface_model_name="indobenchmark/indobert-base-p2",
    seq_length=seq_length,
    out_feature=2,
)
model(["test", "halo"]).shape

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([2, 2])

# Train, Val, Test

In [19]:
# Run configuration for the training cell below.
available_cpus = 0  # passed to DataLoader(num_workers=...); 0 loads data in the main process
model_module = BERTClassifier  # class instantiated once per scenario
# Used as the TensorBoard experiment name. This notebook trains BERT, so the
# previous value "fasttext" filed these runs under the wrong model's directory.
model_name = "bert"

In [20]:
import datetime
import random
import torch
import numpy as np

from torch.utils.data import DataLoader
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint

# One timestamp for the whole sweep so every scenario's run id shares a prefix.
now = datetime.datetime.now()
timestamp = now.strftime('%Y-%m-%d_%H-%M-%S')

for scenario in scenario_datasets:
    # Scenario 0 is excluded from training (the reason is not recorded in this
    # notebook); skip it before doing any per-scenario work.
    if scenario == 0:
        continue

    # Re-seed for every scenario so each run starts from the same random state.
    np.random.seed(2023)
    random.seed(2023)
    torch.manual_seed(2023)

    run_id = f"{timestamp}_scenario_{scenario}"

    # Fresh model per scenario; the last one stays bound to `model` afterwards.
    model = model_module("indobenchmark/indobert-base-p2", seq_length, 2)

    # Dataset
    train_dataset = scenario_datasets[scenario]["train.csv"]
    test_dataset = scenario_datasets[scenario]["test.csv"]
    validation_dataset = scenario_datasets[scenario]["validation.csv"]
    # Shuffle the training data each epoch so batches are not tied to the row
    # order of the CSV; evaluation loaders keep their order.
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=available_cpus,
    )
    validation_loader = DataLoader(validation_dataset, batch_size=batch_size, num_workers=available_cpus)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=available_cpus)

    tb_log = TensorBoardLogger("tensorboard", model_name, run_id)

    # Keep one checkpoint tracked by validation loss and one by training loss.
    checkpoint_callback_val = ModelCheckpoint(monitor="validation_loss", filename='val_{epoch}-{validation_loss:.2f}')
    checkpoint_callback_train = ModelCheckpoint(monitor="training_loss", filename='train_{epoch}-{training_loss:.2f}')
    # Stop once validation loss has not improved by at least 1e-3 for 5 checks.
    early_stop_callback = EarlyStopping(
        monitor="validation_loss", min_delta=1e-3, patience=5, verbose=True, mode="min"
    )

    trainer = pl.Trainer(
        logger=tb_log,
        callbacks=[early_stop_callback, checkpoint_callback_val, checkpoint_callback_train],
    )
    trainer.fit(
        model=model, train_dataloaders=train_loader, val_dataloaders=validation_loader
    )
    print(run_id)
    # Evaluates the weights held by `model` after fitting, not a saved checkpoint.
    trainer.test(model=model, dataloaders=test_loader)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

  | Name              | Type                          | Params
--------------------------------------------------------------------
0 | huggingface_model | BertForSequenceClassification | 124 M 
1 | f1_scorer         | MulticlassF1Score             | 0     
2 | accuracy_scorer   | MulticlassAccuracy            | 0     
3 | precision_scorer  | MulticlassPrecision           | 0     
4 | recall_scorer     | MulticlassRecall              | 0     
--------------------------------------------------------------------
124 M     Tr

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

2023-04-12_13-27-25_scenario_1


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

In [None]:
# Manual sanity check: classify a few hand-written sentences with the model
# left over from the training cell.
id2label = {
 0 : "Bukan headline",
 1 : "Headline"
}

test_input = [
    "aku suka aplikasi ini", 
    "tidak suka sama aplikasi ini", 
    "oke", 
    "keren aplikasi", 
    "pengungkapan itu ditandai dengan ditangkapnya 4 orang pria, dan ditemukan sabu dalam klip plastik bening saat digeledah"
]

# Switch to inference mode first: a model left in training mode keeps dropout
# active, which makes the predictions below non-deterministic.
model.eval()
with torch.no_grad():
    pred = model(test_input)
    # Softmax turns logits into class probabilities; argmax picks the class.
    pred = F.softmax(pred, dim=1)
    pred_np = pred.argmax(dim=1).cpu().detach().numpy()
    for text, label_id in zip(test_input, pred_np):
        print(f"'{text}' : {id2label[label_id]}")