In [1]:
# Prepare Google Colab Environement and build handmade library
!git clone https://github.com/kaenova/Headline_Detection.git
%cd "/content/Headline_Detection"

!make lib-install

Cloning into 'Headline_Detection'...
remote: Enumerating objects: 245, done.[K
remote: Counting objects: 100% (245/245), done.[K
remote: Compressing objects: 100% (181/181), done.[K
remote: Total 245 (delta 106), reused 185 (delta 46), pack-reused 0[K
Receiving objects: 100% (245/245), 20.87 MiB | 17.99 MiB/s, done.
Resolving deltas: 100% (106/106), done.
/content/Headline_Detection
pip install -r requirements.txt
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/kaenova/NDETCStemmer.git@master (from -r requirements.txt (line 8))
  Cloning https://github.com/kaenova/NDETCStemmer.git (to revision master) to /tmp/pip-req-build-pvkdrgg_
  Running command git clone --filter=blob:none --quiet https://github.com/kaenova/NDETCStemmer.git /tmp/pip-req-build-pvkdrgg_
  Resolved https://github.com/kaenova/NDETCStemmer.git to commit c1c8063b0d0725e97c6318ad4472953d1b0566bb
  Preparing metadata (setup.py) ... [

In [2]:
# # Reset Google Colab Environment
# %cd ..
# !rm -fr Headline_Detection

In [1]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizerFast
from datasets import load_dataset
from torchmetrics.classification import MulticlassAccuracy
from torch.utils.data import Dataset, DataLoader


# Custom handmade library
import kaelib.processor.preprocessing_func as prep_func
from kaelib.processor import TextProcessingPipeline,NDETCStemmerWraper

# Load Dataset

In [2]:
data = load_dataset("jakartaresearch/google-play-review")
data_train = data.get('train').to_pandas()[['text', 'label']]
data_test = data.get('validation').to_pandas()[['text', 'label']]



  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Train test split

text_train = data_train["text"].values.tolist()
text_test = data_test["text"].values.tolist()

label2id = {"pos": 1, "neg": 0}
id2label = {v: k for k, v in label2id.items()}

labels_train = data_train["label"].apply(lambda x: label2id[x]).values.tolist()
labels_test = data_test["label"].apply(lambda x: label2id[x]).values.tolist()

# Prep text
pipeline = TextProcessingPipeline([
    prep_func.lowercasing,
    prep_func.remove_html_tags,
    prep_func.remove_url,
    prep_func.remove_punctuation
])

X_train = pipeline.process_corpus(text_train)
X_test = pipeline.process_corpus(text_test)

y_train = labels_train[:]
y_test = labels_test[:]

In [4]:
class IndoBERTDataset(Dataset):
    def __init__(self, texts: 'lst[str]', targets: 'lst[str]'):
        assert len(texts) == len(targets)
        self.tokenizer = BertTokenizerFast.from_pretrained("indobenchmark/indobert-base-p2")
        self.texts = texts
        self.targets = targets

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        bert_tokens = self.tokenizer(self.texts[idx], max_length=256, 
                                     padding='max_length', truncation=True, 
                                     return_tensors='pt' )
        targets = torch.tensor(self.targets[idx], dtype=torch.int64)
        return bert_tokens, targets
    
    @staticmethod
    def squeeze_unwanted_batch(x, device):
        """
        This function handles an unexpected output of DataLoader that creates extra dimension on dim=1
        """
        new_x = {}
        for keys in x:
            new_x[keys] = x[keys].squeeze().to(device)
        return new_x

In [5]:
train_dataset = IndoBERTDataset(X_train, y_train)
test_dataset = IndoBERTDataset(X_test, y_test)

# Prepare Train

In [6]:
"""
https://arxiv.org/pdf/1810.04805.pdf

We use a batch size of 32 and fine-tune for 3
epochs over the data for all GLUE tasks. For each
task, we selected the best fine-tuning learning rate
(among 5e-5, 4e-5, 3e-5, and 2e-5)
"""

batch_size = 32
epochs = 3
device = "cuda"
lr = 5e-5 

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

num_mini_batch = math.ceil(len(X_train) / batch_size)

model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p2",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train Loop

In [7]:
optim = torch.optim.Adam(model.parameters(), lr=lr)

metrics = MulticlassAccuracy(2).to(device)

# Half precision
scaler = torch.cuda.amp.GradScaler()

for i in range(epochs):
    # Progress bar
    pbar = tqdm(total=num_mini_batch)
    pbar.set_description(f"EPOCH {i + 1} / {epochs}: Training")
    
    epoch_loss = []
    avg_loss = 0
    epoch_loss_test = []
    avg_loss_test = 0
    epoch_metrics_test = []
    avg_metric_test = 0

    for train_tokens, train_target in train_loader:
        model.zero_grad()
        train_target = train_target.to(device)
        # Forward
        with torch.cuda.amp.autocast():
            train_tokens = IndoBERTDataset.squeeze_unwanted_batch(train_tokens, device)
            pred = model(**train_tokens).logits
            loss = F.cross_entropy(pred, train_target)

        # Backprop
        scaler.scale(loss).backward()
        scaler.step(optim)
        scaler.update()

        # Metrics and logging
        epoch_loss.append(loss.item())
        avg_loss = sum(epoch_loss) / len(epoch_loss)
        pbar.set_postfix(
            {
                "loss": f"{avg_loss:.4f}",
                "loss_test": f"{avg_loss_test:.4f}",
                "metric_test": f"{avg_metric_test:.4f}",
            }
        )
        pbar.update(1)

    for test_tokens, test_target in test_loader:
        test_tokens = IndoBERTDataset.squeeze_unwanted_batch(test_tokens, device)
        test_target = test_target.to(device)
        # Forward test
        with torch.no_grad():
            pred_test = model(**test_tokens).logits
            loss_test = F.cross_entropy(pred_test, test_target)
            metrics_test = metrics(pred_test, test_target)

        epoch_loss_test.append(loss_test.item())
        avg_loss_test = sum(epoch_loss_test) / len(epoch_loss_test)
        epoch_metrics_test.append(metrics_test.item())
        avg_metric_test = sum(epoch_metrics_test) / len(epoch_metrics_test)

        pbar.set_description(f"EPOCH {i + 1} / {epochs}: Infering on Testing Data")
        pbar.set_postfix(
            {
                "loss": f"{avg_loss:.4f}",
                "loss_test": f"{avg_loss_test:.4f}",
                "metric_test": f"{avg_metric_test:.4f}",
            }
        )
    
    pbar.close()


EPOCH 1 / 3: Infering on Testing Data: 100%|██████████| 220/220 [02:26<00:00,  1.51it/s, loss=0.2240, loss_test=0.2013, metric_test=0.7821]
EPOCH 2 / 3: Infering on Testing Data: 100%|██████████| 220/220 [02:22<00:00,  1.54it/s, loss=0.1434, loss_test=0.2006, metric_test=0.8488]
EPOCH 3 / 3: Infering on Testing Data: 100%|██████████| 220/220 [02:25<00:00,  1.51it/s, loss=0.1051, loss_test=0.2548, metric_test=0.8019]
