In [1]:
# Prepare the Google Colab environment and build the handmade library.
# Clone the project repository and make it the working directory so that the
# Makefile and the local `kaelib` package are reachable from this notebook.
!git clone https://github.com/kaenova/Headline_Detection.git
%cd "/content/Headline_Detection"

# Install the project's dependencies (per the cell output this runs
# `pip install -r requirements.txt`).
!make lib-install

Cloning into 'Headline_Detection'...
remote: Enumerating objects: 250, done.[K
remote: Counting objects: 100% (250/250), done.[K
remote: Compressing objects: 100% (185/185), done.[K
remote: Total 250 (delta 109), reused 188 (delta 47), pack-reused 0[K
Receiving objects: 100% (250/250), 20.88 MiB | 12.70 MiB/s, done.
Resolving deltas: 100% (109/109), done.
/content/Headline_Detection
pip install -r requirements.txt
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/kaenova/NDETCStemmer.git@master (from -r requirements.txt (line 8))
  Cloning https://github.com/kaenova/NDETCStemmer.git (to revision master) to /tmp/pip-req-build-ua8r6q1w
  Running command git clone --filter=blob:none --quiet https://github.com/kaenova/NDETCStemmer.git /tmp/pip-req-build-ua8r6q1w
  Resolved https://github.com/kaenova/NDETCStemmer.git to commit c1c8063b0d0725e97c6318ad4472953d1b0566bb
  Preparing metadata (setup.py) ... [

In [2]:
# # Reset Google Colab Environment
# %cd ..
# !rm -fr Headline_Detection

In [1]:

import torch
import math
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizerFast
from datasets import load_dataset
from torchmetrics.classification import MulticlassAccuracy, MulticlassF1Score
from torch.utils.data import Dataset, DataLoader


# Custom handmade library
import kaelib.processor.preprocessing_func as prep_func
import kaelib.processor.custom_metrics as custom_metric
from kaelib.processor import TextProcessingPipeline,NDETCStemmerWraper, MetricsContainer

# Load Dataset

In [2]:
# Fetch the Google Play review corpus and keep only the text/label columns
# of the train and validation splits, each as a pandas DataFrame.
data = load_dataset("jakartaresearch/google-play-review")

train_frame = data.get('train').to_pandas()
data_train = train_frame[['text', 'label']]

validation_frame = data.get('validation').to_pandas()
data_test = validation_frame[['text', 'label']]

Found cached dataset google-play-review (C:/Users/kaeno/.cache/huggingface/datasets/jakartaresearch___google-play-review/default/1.0.0/df84e67f495cc6639ab0bbf74ff0190498a0b22294fdaca26a5b25e090671c29)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Label vocabulary shared by the datasets and the model configuration.
label2id = {"pos": 1, "neg": 0}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

# Raw review texts for each split.
text_train = data_train["text"].tolist()
text_test = data_test["text"].tolist()

# String labels encoded as integer class ids.
labels_train = [label2id[class_name] for class_name in data_train["label"]]
labels_test = [label2id[class_name] for class_name in data_test["label"]]

# Text clean-up applied identically to both splits.
pipeline = TextProcessingPipeline(
    [
        prep_func.lowercasing,
        prep_func.remove_html_tags,
        prep_func.remove_url,
        prep_func.remove_punctuation,
    ]
)

X_train = pipeline.process_corpus(text_train)
X_test = pipeline.process_corpus(text_test)

# Independent shallow copies of the encoded labels.
y_train = list(labels_train)
y_test = list(labels_test)

In [4]:
class IndoBERTDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for IndoBERT.

    Args:
        texts: Input sentences.
        targets: Integer class id for each sentence; must have the same
            length as ``texts``.
        tokenizer: Optional, already-loaded tokenizer to reuse (e.g. to share
            one instance between the train and test datasets). When ``None``
            the ``indobenchmark/indobert-base-p2`` tokenizer is loaded.
        max_length: Fixed length every sample is padded/truncated to.
    """

    def __init__(self, texts: 'list[str]', targets: 'list[int]',
                 tokenizer=None, max_length: int = 256):
        assert len(texts) == len(targets), "texts and targets must have the same length"
        if tokenizer is None:
            tokenizer = BertTokenizerFast.from_pretrained("indobenchmark/indobert-base-p2")
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = texts
        self.targets = targets

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(tokenizer_output, target)`` for the sample at ``idx``.

        The tokenizer is called with ``return_tensors='pt'`` on a single
        text, so every tensor in its output carries a leading axis of size 1;
        see ``squeeze_unwanted_batch`` for how that axis is removed after
        batching.
        """
        bert_tokens = self.tokenizer(self.texts[idx], max_length=self.max_length,
                                     padding='max_length', truncation=True,
                                     return_tensors='pt')
        targets = torch.tensor(self.targets[idx], dtype=torch.int64)
        return bert_tokens, targets

    @staticmethod
    def squeeze_unwanted_batch(x, device):
        """Drop the extra axis at dim=1 of every batched tensor and move it to ``device``.

        Batching the per-sample tokenizer outputs yields tensors with an
        extra singleton axis at dim=1. Only that axis is removed: a bare
        ``squeeze()`` would also drop the batch axis whenever a batch holds a
        single sample (for instance a final partial batch).

        Args:
            x: Mapping from input name to a batched tensor.
            device: Target device for the returned tensors.

        Returns:
            A plain ``dict`` with the same keys and the squeezed tensors.
        """
        new_x = {}
        for key in x:
            tensor = x[key]
            # Tensors that are already 2-D (or lower) are left untouched.
            if tensor.dim() > 2:
                tensor = tensor.squeeze(1)
            new_x[key] = tensor.to(device)
        return new_x

In [5]:
# Wrap the preprocessed texts and their integer labels of each split in a
# torch Dataset.
train_dataset = IndoBERTDataset(texts=X_train, targets=y_train)
test_dataset = IndoBERTDataset(texts=X_test, targets=y_test)

# Prepare Train

In [6]:
# Fine-tuning hyperparameters, following the BERT paper
# (https://arxiv.org/pdf/1810.04805.pdf):
#   "We use a batch size of 32 and fine-tune for 3 epochs over the data for
#    all GLUE tasks. For each task, we selected the best fine-tuning learning
#    rate (among 5e-5, 4e-5, 3e-5, and 2e-5)"
batch_size = 32
epochs = 3
lr = 2e-5
# Use the GPU when one is present; fall back to the CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Number of training mini-batches per epoch (used as the progress-bar total).
num_mini_batch = len(train_loader)

# Each review carries exactly one of two classes and the training loop uses
# cross-entropy, so the head is configured for single-label classification
# (not multi-label).
model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p2",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    problem_type="single_label_classification",
).to(device)

optim = torch.optim.Adam(model.parameters(), lr=lr)

# One accuracy / F1 / loss tracker per split.
metrics = MetricsContainer({
    "acc_train" : MulticlassAccuracy(num_classes=2).to(device),
    "acc_test" : MulticlassAccuracy(num_classes=2).to(device),
    "f1_train" : MulticlassF1Score(num_classes=2).to(device),
    "f1_test" : MulticlassF1Score(num_classes=2).to(device),
    "loss_train" : custom_metric.loss_metrics,
    "loss_test" : custom_metric.loss_metrics,
})

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train Loop

In [14]:
# Mixed precision (autocast + gradient scaling) only applies on CUDA. On any
# other device both are disabled so the loop runs in regular precision
# instead of emitting warnings.
device_type = torch.device(device).type
use_amp = device_type == "cuda"
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

for i in range(epochs):
    # The context manager closes the progress bar even when the epoch is
    # interrupted (e.g. KeyboardInterrupt).
    with tqdm(total=num_mini_batch) as pbar:
        pbar.set_description(f"EPOCH {i + 1} / {epochs}: Training")

        # Metrics initialize new batch
        metrics.new_batch()

        # Train
        model.train()
        for train_tokens, train_target in train_loader:
            model.zero_grad()
            train_tokens = IndoBERTDataset.squeeze_unwanted_batch(train_tokens, device)
            train_target = train_target.to(device)

            # Forward
            with torch.autocast(device_type=device_type, enabled=use_amp):
                pred = model(**train_tokens).logits
                loss = F.cross_entropy(pred, train_target)

            # Backprop (the scaler is a pass-through when AMP is disabled)
            scaler.scale(loss).backward()
            scaler.step(optim)
            scaler.update()

            # Metrics and logging. Tensors are detached so the metrics
            # container cannot keep the autograd graph alive.
            pred = pred.detach()
            metrics("loss_train", loss.detach())
            metrics("f1_train", pred, train_target)
            metrics("acc_train", pred, train_target)
            pbar.set_postfix(metrics.mean_metrics_batch())
            pbar.update(1)

        # Eval
        model.eval()
        pbar.set_description(f"EPOCH {i + 1} / {epochs}: Infering on Testing Data")

        for test_tokens, test_target in test_loader:
            test_tokens = IndoBERTDataset.squeeze_unwanted_batch(test_tokens, device)
            test_target = test_target.to(device)

            # Forward test (no gradients needed)
            with torch.no_grad():
                pred_test = model(**test_tokens).logits
                loss_test = F.cross_entropy(pred_test, test_target)

            # Metrics and logging
            metrics("loss_test", loss_test)
            metrics("f1_test", pred_test, test_target)
            metrics("acc_test", pred_test, test_target)
            pbar.set_postfix(metrics.mean_metrics_batch())

    metrics.process_batch()


EPOCH 1 / 3: Infering on Testing Data:   0%|          | 0/220 [01:22<?, ?it/s, acc_train=0.509, acc_test=0.537, f1_train=0.482, f1_test=0.51, loss_train=Empty, loss_test=nan]


{'acc_train': [], 'acc_test': [], 'f1_train': [], 'f1_test': [], 'loss_train': [], 'loss_test': []}




{'acc_train': [], 'acc_test': [0.7166666984558105], 'f1_train': [], 'f1_test': [0.6745762825012207], 'loss_train': [], 'loss_test': [0.5094448328018188]}




{'acc_train': [], 'acc_test': [0.7166666984558105, 0.4615384638309479], 'f1_train': [], 'f1_test': [0.6745762825012207, 0.4285714328289032], 'loss_train': [], 'loss_test': [0.5094448328018188, 0.5666851997375488]}




{'acc_train': [], 'acc_test': [0.7166666984558105, 0.4615384638309479, 0.4464285671710968], 'f1_train': [], 'f1_test': [0.6745762825012207, 0.4285714328289032, 0.4385964870452881], 'loss_train': [], 'loss_test': [0.5094448328018188, 0.5666851997375488, 0.5761623382568359]}


KeyboardInterrupt: 