In [None]:
# Prepare the Google Colab environment and build the handmade library.
# Steps: clone the project repository, change into it, then run its
# `lib-install` make target.
# NOTE(review): `lib-install` is defined by the repository's Makefile --
# presumably it installs the `kaelib` package imported below; confirm there.
!git clone https://github.com/kaenova/Headline_Detection.git
%cd "/content/Headline_Detection"

!make lib-install

In [None]:
# # Reset Google Colab Environment
# Intentionally disabled. Uncomment the two commands below to step out of the
# cloned repository and delete it. `rm -fr` is destructive: any uncommitted
# change inside Headline_Detection is lost.
# %cd ..
# !rm -fr Headline_Detection

# Load Dataset

In [3]:
import torch
import math
import pandas as pd
import numpy as np
import torch.nn.functional as F

from tqdm import tqdm
from torchmetrics.classification import MulticlassAccuracy
from datasets import load_dataset

# Custom handmade library
import kaelib.processor.preprocessing_func as prep_func
from kaelib.processor import TextProcessingPipeline
from kaelib.model import  FastTextClassifier

In [4]:
# Fetch the Google Play review dataset and keep only the review text and its
# label for the two splits used in this notebook. The 'validation' split is
# what the rest of the notebook treats as the test set.
data = load_dataset("jakartaresearch/google-play-review")
data_train, data_test = (
    data.get(split_name).to_pandas()[['text', 'label']]
    for split_name in ('train', 'validation')
)

Found cached dataset google-play-review (C:/Users/kaeno/.cache/huggingface/datasets/jakartaresearch___google-play-review/default/1.0.0/df84e67f495cc6639ab0bbf74ff0190498a0b22294fdaca26a5b25e090671c29)
100%|██████████| 2/2 [00:00<00:00, 667.40it/s]


In [5]:
# Preview the first rows of the training split (text + label columns)
data_train.head()

Unnamed: 0,text,label
0,Halo\n blibli. Sedikit saran untuk gratis ong...,pos
1,So far so good. Respon cepat.,pos
2,thank,neg
3,Aplikasi sering not responding di hp saya (as...,neg
4,Gak ada komentar.,pos


In [6]:
# Pull the raw texts out of both frames and encode the string labels as
# integer class ids. No splitting happens here: the dataset already ships
# separate train / validation splits.

# Label vocabulary and its inverse (used later to turn predictions back into names)
label2id = {"pos": 1, "neg": 0}
id2label = dict((class_id, name) for name, class_id in label2id.items())

text_train = data_train["text"].tolist()
text_test = data_test["text"].tolist()

# An unknown label raises KeyError instead of being silently mapped to NaN
labels_train = [label2id[name] for name in data_train["label"]]
labels_test = [label2id[name] for name in data_test["label"]]


In [7]:
# Prepare model inputs and targets.

# Targets: shallow copies, so later edits to y_* do not touch labels_*
y_train = labels_train.copy()
y_test = labels_test.copy()

# Text cleaning steps, listed in the order they are handed to the pipeline.
# NOTE(review): the step names suggest lowercasing and stripping HTML tags,
# URLs and punctuation -- confirm the exact behaviour in kaelib.
pipeline = TextProcessingPipeline(
    [
        prep_func.lowercasing,
        prep_func.remove_html_tags,
        prep_func.remove_url,
        prep_func.remove_punctuation,
    ]
)

# Inputs: the same pipeline is applied to the train and the test texts
X_train = pipeline.process_corpus(text_train)
X_test = pipeline.process_corpus(text_test)

In [8]:
# Batching function
# https://stackoverflow.com/questions/8290397/how-to-split-an-iterable-in-constant-size-chunks
from itertools import islice

def batcher(iterable, batch_size):
    """Yield consecutive chunks of ``iterable`` as lists.

    Args:
        iterable: Any iterable; it is consumed lazily, one chunk at a time.
        batch_size: Maximum number of items per chunk.

    Yields:
        Lists holding up to ``batch_size`` items each, in the original order.
        Only the final list can be shorter. Nothing is yielded for an empty
        iterable.
    """
    source = iter(iterable)
    while True:
        chunk = list(islice(source, batch_size))
        # An empty chunk means the source is exhausted
        if not chunk:
            return
        yield chunk


In [9]:
# Train the classifier with mini-batch Adam and, after every optimisation
# step, evaluate it on the complete test set.

# Hyperparameters
batch_size = 64
epochs = 10
device = "cpu"
lr = 0.05

# Optimisation steps per epoch (the last mini-batch may be smaller)
num_mini_batch = math.ceil(len(X_train) / batch_size)

model = FastTextClassifier().to(device)
optim = torch.optim.Adam(model.parameters(), lr=lr)

# Accuracy over the 2 classes (pos / neg)
metrics = MulticlassAccuracy(2).to(device)

# Allocate some test tensor
# (built once: the test targets never change between evaluations)
target_test = torch.tensor(y_test, dtype=torch.int64, device=device)

for i in range(epochs):
    # Fresh generators each epoch; texts and labels are consumed in lockstep,
    # so batch k of one always lines up with batch k of the other.
    batch_generator_text = batcher(X_train, batch_size)
    batch_generator_label = batcher(y_train, batch_size)

    # Per-step values collected during this epoch (reset every epoch)
    epoch_loss = []
    epoch_loss_test = []
    epoch_metrics_test = []

    with tqdm(total=num_mini_batch) as pbar:
        pbar.set_description(f"EPOCH {i + 1} / {epochs}")

        for j in range(num_mini_batch):
            # The evaluation below switches the model to eval mode, so
            # training mode has to be restored before every step.
            model.train()
            model.zero_grad()
            # Prepare data
            mini_batch_text = next(batch_generator_text)
            mini_batch_labels = next(batch_generator_label)
            target = torch.tensor(mini_batch_labels, dtype=torch.int64, device=device)

            # Forward
            pred = model(mini_batch_text)
            loss = F.cross_entropy(pred, target)
            # Backprop
            loss.backward()
            optim.step()

            # Forward test
            # Eval mode makes layers such as dropout / batch-norm behave
            # deterministically; it is a no-op if the model has none.
            # NOTE: the whole test set is forwarded after every step, which
            # dominates the running time of an epoch.
            model.eval()
            with torch.no_grad():
                pred_test = model(X_test)
                loss_test = F.cross_entropy(pred_test, target_test)
                metrics_test = metrics(pred_test, target_test)

            # Metrics and logging
            # The progress bar shows running means over the steps done so far
            # in this epoch, not the value of the latest step.
            epoch_loss.append(loss.item())
            avg_loss = sum(epoch_loss) / len(epoch_loss)

            epoch_loss_test.append(loss_test.item())
            avg_loss_test = sum(epoch_loss_test) / len(epoch_loss_test)
            epoch_metrics_test.append(metrics_test.item())
            avg_metric_test = sum(epoch_metrics_test) / len(epoch_metrics_test)

            pbar.set_postfix(
                {
                    "loss": f"{avg_loss:.4f}",
                    "loss_test": f"{avg_loss_test:.4f}",
                    "metric_test": f"{avg_metric_test:.4f}",
                }
            )
            pbar.update(1)


EPOCH 1 / 10: 100%|██████████| 110/110 [00:25<00:00,  4.26it/s, loss=0.3267, loss_test=0.3207, metric_test=0.6134]
EPOCH 2 / 10: 100%|██████████| 110/110 [00:25<00:00,  4.27it/s, loss=0.2664, loss_test=0.2707, metric_test=0.7766]
EPOCH 3 / 10: 100%|██████████| 110/110 [00:26<00:00,  4.19it/s, loss=0.2513, loss_test=0.2661, metric_test=0.7897]
EPOCH 4 / 10: 100%|██████████| 110/110 [00:27<00:00,  3.96it/s, loss=0.2434, loss_test=0.2636, metric_test=0.7957]
EPOCH 5 / 10: 100%|██████████| 110/110 [00:26<00:00,  4.20it/s, loss=0.2434, loss_test=0.2687, metric_test=0.7941]
EPOCH 6 / 10: 100%|██████████| 110/110 [00:25<00:00,  4.25it/s, loss=0.2393, loss_test=0.2694, metric_test=0.7959]
EPOCH 7 / 10: 100%|██████████| 110/110 [00:24<00:00,  4.44it/s, loss=0.2348, loss_test=0.2689, metric_test=0.7985]
EPOCH 8 / 10: 100%|██████████| 110/110 [00:24<00:00,  4.47it/s, loss=0.2318, loss_test=0.2696, metric_test=0.7996]
EPOCH 9 / 10: 100%|██████████| 110/110 [00:26<00:00,  4.14it/s, loss=0.2297, los

In [15]:
# Sanity-check the trained model on a handful of hand-written reviews
test_input = ["aku suka aplikasi ini", "tidak suka sama aplikasi ini", "oke", "keren aplikasi", "malu pake aplikasi ini"]

# Predict in eval mode so layers such as dropout / batch-norm (if the model
# has any) do not perturb the outputs; a no-op otherwise.
model.eval()
with torch.no_grad():
    pred = model(test_input)
    # Turn the raw scores into per-class probabilities (one row per sentence)
    pred = F.softmax(pred, dim=1)
    # Index of the most probable class for each sentence
    pred_np = pred.argmax(dim=1).cpu().detach().numpy()
    for sentence, class_id in zip(test_input, pred_np):
        # int() converts the numpy integer into a plain key for id2label
        print(f"'{sentence}' : {id2label[int(class_id)]}")

'aku suka aplikasi ini' : pos
'tidak suka sama aplikasi ini' : neg
'oke' : pos
'keren aplikasi' : pos
'malu pake aplikasi ini' : neg
