In [None]:
# Prepare the Google Colab environment and build the handmade library
# !git clone https://github.com/kaenova/Headline_Detection.git
# %cd "/content/Headline_Detection"

# !make lib

In [None]:
# # Reset Google Colab Environment
# %cd ..
# !rm -fr Headline_Detection

# Load Dataset

In [8]:
from datasets import load_dataset

# Custom handmade library
from kaelib.processor import TextProcessingPipeline

In [9]:
# Load the Google Play review dataset from the Hugging Face hub (or the local cache).
data = load_dataset("jakartaresearch/google-play-review")

# Index the splits directly: a missing split then fails immediately with a KeyError
# naming the split, instead of `.get()` handing back None and the failure surfacing
# later as an AttributeError on `.to_pandas()`.
# The 'validation' split is what the rest of this notebook treats as test data.
data_train = data["train"].to_pandas()[["text", "label"]]
data_test = data["validation"].to_pandas()[["text", "label"]]

Found cached dataset google-play-review (C:/Users/kaeno/.cache/huggingface/datasets/jakartaresearch___google-play-review/default/1.0.0/df84e67f495cc6639ab0bbf74ff0190498a0b22294fdaca26a5b25e090671c29)


  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Preview the first rows of the training frame to sanity-check the text/label columns.
data_train.head()

Unnamed: 0,text,label
0,Halo\n blibli. Sedikit saran untuk gratis ong...,pos
1,So far so good. Respon cepat.,pos
2,thank,neg
3,Aplikasi sering not responding di hp saya (as...,neg
4,Gak ada komentar.,pos


In [11]:
# Turn the train / test frames into plain Python lists of texts and integer labels.

# Mapping between the string labels stored in the dataset and class ids,
# plus the inverse mapping used to decode predictions.
label2id = {"pos": 1, "neg": 0}
id2label = dict((class_id, name) for name, class_id in label2id.items())

# Raw review texts.
text_train = data_train["text"].tolist()
text_test = data_test["text"].tolist()

# Integer class ids; an unexpected label string raises KeyError.
labels_train = [label2id[name] for name in data_train["label"]]
labels_test = [label2id[name] for name in data_test["label"]]


In [25]:
from torch.utils.data import Dataset
import torch
import typing

class TextClassificationDataset(Dataset):
    """Map-style dataset pairing raw texts with integer class labels.

    Indexing always returns the text part as a *list* of strings: a
    one-element list for a single index, or the selected sub-list for a slice.
    The label part is the matching element (or slice) of a label tensor.

    Args:
        x: The texts.
        y: One integer label per text; must have the same length as ``x``.
        preprocessor: Optional object whose ``process_corpus(list_of_texts)``
            result replaces the selected texts on every access.
    """

    def __init__(
        self,
        x: "list[str]",
        y: "list[int]",
        preprocessor: "typing.Optional[TextProcessingPipeline]" = None,
    ):
        assert len(x) == len(y)
        self.x = x
        self.y = torch.tensor(y)
        self.preprocessor = preprocessor

    def __len__(self):
        """Number of (text, label) pairs."""
        return len(self.x)

    def _process_idx_text(self, idx):
        """Select the text(s) at ``idx`` as a list and optionally preprocess them."""
        selected = self.x[idx]
        # A slice already yields a list; wrap a single text so both cases match.
        corpus = selected if isinstance(idx, slice) else [selected]
        if self.preprocessor is None:
            return corpus
        return self.preprocessor.process_corpus(corpus)

    def __getitem__(self, idx):
        """Return ``(list_of_texts, label_tensor)`` for an index or a slice."""
        return self._process_idx_text(idx), self.y[idx]

# Wrap the raw text / label lists as datasets; no preprocessor is attached.
train_dataset = TextClassificationDataset(x=text_train, y=labels_train)
test_dataset = TextClassificationDataset(x=text_test, y=labels_test)


In [None]:
# Classifier inspired by "Bag of Tricks for Efficient Text Classification" (fastText):
# https://arxiv.org/pdf/1607.01759.pdf

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchnlp.word_to_vector import FastText
import lightning.pytorch as pl


class FastTextClassifier(pl.LightningModule):
    """Sentence classifier: averaged FastText word vectors fed to a small MLP.

    Each sentence is split on whitespace, every token is looked up in the
    pretrained FastText table, and the token vectors are averaged into one
    sentence vector. Only the feed-forward head is an ``nn.Module``, so it is
    the only part the optimizer updates.

    Args:
        lr: Learning rate for the Adam optimizer. Defaults to ``0.00001``,
            the value that used to be hard-coded.
    """

    def __init__(self, lr: float = 0.00001):
        super().__init__()
        self.lr = lr
        # Pretrained vectors for language code "id"; the head below expects
        # 300-dimensional inputs.
        self.fasttext = FastText("id")
        self.feed_forward = nn.Sequential(
            nn.Linear(300, 150),
            nn.ReLU(),
            nn.Linear(150, 64),
            nn.ReLU(),
            nn.Linear(64, 2),
        )

    def _embed_sentence(self, sentence: str) -> "torch.Tensor":
        """Return the mean word vector of ``sentence`` (zeros if it has no tokens)."""
        # split() with no argument splits on any whitespace run, so newlines
        # and repeated spaces neither stick to tokens nor create empty tokens
        # that would dilute the average.
        tokens = sentence.split()
        if not tokens:
            # Nothing to average; fall back to a zero vector of the input width.
            return torch.zeros(self.feed_forward[0].in_features)
        return self.fasttext[tokens].mean(dim=0)

    def forward(self, x: "list[str]") -> "torch.Tensor":
        """Compute class logits for a batch of raw sentences.

        Args:
            x: The sentences to classify.

        Returns:
            A ``(len(x), 2)`` tensor of unnormalized class scores.
        """
        x_feed = torch.stack([self._embed_sentence(sentence) for sentence in x])
        # The embeddings are assembled outside the module's parameters, so they
        # must be moved explicitly to wherever the feed-forward head lives.
        x_feed = x_feed.to(self.device)
        return self.feed_forward(x_feed)

    def _shared_step(self, batch, log_name: str) -> "torch.Tensor":
        """Compute the cross-entropy loss for ``batch`` and log it as ``log_name``."""
        x, y = batch
        # The steps index x[0] because the notebook's dataset returns each text
        # wrapped in a one-element list, so the sentences of a batch sit in the
        # first (and only) slot of x.
        pred = self(x[0])
        loss = F.cross_entropy(pred, y)
        self.log(log_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Run one optimization step's forward pass; returns the loss to minimize."""
        return self._shared_step(batch, "training_loss")

    def validation_step(self, batch, batch_idx):
        """Evaluate one validation batch; the loss is logged, not returned."""
        self._shared_step(batch, "validation_loss")

    def test_step(self, batch, batch_idx):
        """Evaluate one test batch; the loss is logged, not returned."""
        self._shared_step(batch, "test_loss")

    def configure_optimizers(self):
        """Use Adam over the module's parameters with the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)


In [54]:
from torch.utils.data import DataLoader
from lightning.pytorch.loggers import TensorBoardLogger

# Seed the RNGs before the model is built so that weight initialization and
# the shuffled batch order are reproducible across runs.
pl.seed_everything(42, workers=True)

model = FastTextClassifier()

tb_log = TensorBoardLogger("tensorboard", "fasttext")

# Shuffle the training data each epoch; evaluation loaders keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=50)
# NOTE(review): validation reuses test_dataset, so the reported test loss is not
# measured on data unseen during model development. Consider carving a
# validation subset out of the training data instead.
validation_loader = DataLoader(test_dataset, batch_size=50)

trainer = pl.Trainer(max_epochs=500, logger=tb_log)
trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=validation_loader)

trainer.test(model=model, dataloaders=test_loader)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name         | Type       | Params
--------------------------------------------
0 | feed_forward | Sequential | 54.9 K
--------------------------------------------
54.9 K    Trainable params
0         Non-trainable params
54.9 K    Total params
0.220     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.4003424048423767}]

In [51]:
# Quick qualitative check of the trained model on a few hand-written sentences.
test_input = [
    "aku suka aplikasi ini",
    "tidak suka sama aplikasi ini",
    "oke",
    "keren aplikasi",
    "malu pake aplikasi ini",
]

# Inference only: no gradients are needed.
with torch.no_grad():
    pred = F.softmax(model(test_input), dim=1)
    pred_np = pred.argmax(dim=1).cpu().detach().numpy()

# Print each sentence next to its decoded class label.
for sentence, class_id in zip(test_input, pred_np):
    print(f"'{sentence}' : {id2label[class_id]}")

'aku suka aplikasi ini' : pos
'tidak suka sama aplikasi ini' : pos
'oke' : pos
'keren aplikasi' : pos
'malu pake aplikasi ini' : neg
