In [56]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizerFast
from datasets import load_dataset
from torchmetrics.classification import MulticlassAccuracy


# Custom handmade library
import kaelib.processor.preprocessing_func as prep_func
from kaelib.processor import TextProcessingPipeline,NDETCStemmerWraper

# Load Dataset

In [57]:
data = load_dataset("jakartaresearch/google-play-review")
data_train = data.get('train').to_pandas()[['text', 'label']]
data_test = data.get('validation').to_pandas()[['text', 'label']]

Found cached dataset google-play-review (C:/Users/kaeno/.cache/huggingface/datasets/jakartaresearch___google-play-review/default/1.0.0/df84e67f495cc6639ab0bbf74ff0190498a0b22294fdaca26a5b25e090671c29)
100%|██████████| 2/2 [00:00<00:00, 666.13it/s]


In [58]:
# Train test split

text_train = data_train["text"].values.tolist()
text_test = data_test["text"].values.tolist()

label2id = {"pos": 1, "neg": 0}
id2label = {v: k for k, v in label2id.items()}

labels_train = data_train["label"].apply(lambda x: label2id[x]).values.tolist()
labels_test = data_test["label"].apply(lambda x: label2id[x]).values.tolist()

# Prep text
pipeline = TextProcessingPipeline([
    prep_func.lowercasing,
    prep_func.remove_html_tags,
    prep_func.remove_url,
    prep_func.remove_punctuation
])

X_train = pipeline.process_corpus(text_train)
X_test = pipeline.process_corpus(text_test)

y_train = labels_train[:]
y_test = labels_test[:]

In [59]:
# Batching function
# https://stackoverflow.com/questions/8290397/how-to-split-an-iterable-in-constant-size-chunks
from itertools import islice

def batcher(iterable, batch_size):
    iterator = iter(iterable)
    while batch := list(islice(iterator, batch_size)):
        yield batch

In [64]:
tokenizer = BertTokenizerFast.from_pretrained("indobenchmark/indobert-base-p2")

def tokenize_corpus(corpus: 'lst[str]') -> torch.Tensor:
    return tokenizer(corpus, max_length=256, padding='max_length',
                     truncation=True, return_tensors='pt')

In [66]:
"""
https://arxiv.org/pdf/1810.04805.pdf

We use a batch size of 32 and fine-tune for 3
epochs over the data for all GLUE tasks. For each
task, we selected the best fine-tuning learning rate
(among 5e-5, 4e-5, 3e-5, and 2e-5)
"""

batch_size = 32
epochs = 3
device = "cpu"
lr = 5e-5 

num_mini_batch = math.ceil(len(X_train) / batch_size)

model = BertForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p2",
                                        num_labels=2,
                                        id2label=id2label,
                                        label2id=label2id,
                                        problem_type="multi_label_classification"
                                      )


optim = torch.optim.Adam(model.parameters(), lr=lr)

metrics = MulticlassAccuracy(2).to(device)

# Allocate some test tensor
tokenized_x_test = tokenize_corpus(X_test)
target_test = torch.tensor(y_test, dtype=torch.int64, device=device)

for i in range(epochs):
    batch_generator_text = batcher(X_train, batch_size)
    batch_generator_label = batcher(y_train, batch_size)

    epoch_loss = []
    epoch_loss_test = []
    epoch_metrics_test = []

    with tqdm(total=num_mini_batch) as pbar:
        pbar.set_description(f"EPOCH {i + 1} / {epochs}")
        
        for j in range(num_mini_batch):
            model.zero_grad()
            # Prepare data
            mini_batch_text = next(batch_generator_text)
            mini_batch_labels = next(batch_generator_label)
            target = torch.tensor(mini_batch_labels, dtype=torch.int64, device=device)

            # Forward
            mini_batch_tokenized_text = tokenize_corpus(mini_batch_text)
            pred = model(**mini_batch_tokenized_text).logits
            loss = F.cross_entropy(pred, target)
            # Backprop
            loss.backward()
            optim.step()

            # Forward test
            with torch.no_grad():
                pred_test = model(**tokenized_x_test).logits
                loss_test = F.cross_entropy(pred_test, target_test)
                metrics_test = metrics(pred_test, target_test)

            # Metrics and logging
            epoch_loss.append(loss.item())
            avg_loss = sum(epoch_loss) / len(epoch_loss)

            epoch_loss_test.append(loss_test.item())
            avg_loss_test = sum(epoch_loss_test) / len(epoch_loss_test)
            epoch_metrics_test.append(metrics_test.item())
            avg_metric_test = sum(epoch_metrics_test) / len(epoch_metrics_test)

            pbar.set_postfix(
                {
                    "loss": f"{avg_loss:.4f}",
                    "loss_test": f"{avg_loss_test:.4f}",
                    "metric_test": f"{avg_metric_test:.4f}",
                }
            )
            pbar.update(1)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
EPOCH 1 / 3:   0%|          | 0/220 [00:25<?, ?it/s]


RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 2368733184 bytes.