In [1]:
# Prepare Google Colab Environement and build handmade library
!git clone https://github.com/kaenova/Headline_Detection.git
%cd "/content/Headline_Detection"

!make lib

Cloning into 'Headline_Detection'...
remote: Enumerating objects: 399, done.[K
remote: Counting objects: 100% (399/399), done.[K
remote: Compressing objects: 100% (280/280), done.[K
remote: Total 399 (delta 200), reused 300 (delta 101), pack-reused 0[K
Receiving objects: 100% (399/399), 22.53 MiB | 32.82 MiB/s, done.
Resolving deltas: 100% (200/200), done.
/content/Headline_Detection
pip install -r requirements.txt
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/kaenova/NDETCStemmer.git@master (from -r requirements.txt (line 18))
  Cloning https://github.com/kaenova/NDETCStemmer.git (to revision master) to /tmp/pip-req-build-48gfhky_
  Running command git clone --filter=blob:none --quiet https://github.com/kaenova/NDETCStemmer.git /tmp/pip-req-build-48gfhky_
  Resolved https://github.com/kaenova/NDETCStemmer.git to commit c1c8063b0d0725e97c6318ad4472953d1b0566bb
  Preparing metadata (setup.py) ... 

In [None]:
# # Reset Google Colab Environment
# %cd ..
# !rm -fr Headline_Detection

In [2]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizerFast
from datasets import load_dataset
from torchmetrics.classification import MulticlassAccuracy, MulticlassF1Score
from torch.utils.data import Dataset, DataLoader


# Custom handmade library
import kaelib.processor.preprocessing_func as prep_func
import kaelib.processor.custom_metrics as custom_metric
from kaelib.processor import TextProcessingPipeline,NDETCStemmerWraper, MetricsContainer, HeadlineDataset

# Load Dataset

In [None]:
# data = load_dataset("jakartaresearch/google-play-review")
# data_train = data.get('train').to_pandas()[['text', 'label']]
# data_test = data.get('validation').to_pandas()[['text', 'label']]

In [None]:
# # Train test split

# text_train = data_train["text"].values.tolist()
# text_test = data_test["text"].values.tolist()

# label2id = {"pos": 1, "neg": 0}
# id2label = {v: k for k, v in label2id.items()}

# labels_train = data_train["label"].apply(lambda x: label2id[x]).values.tolist()
# labels_test = data_test["label"].apply(lambda x: label2id[x]).values.tolist()

# # Prep text
# pipeline = TextProcessingPipeline([
#     prep_func.lowercasing,
#     prep_func.remove_html_tags,
#     prep_func.remove_url,
#     prep_func.remove_punctuation
# ])

# X_train = pipeline.process_corpus(text_train)
# X_test = pipeline.process_corpus(text_test)

# y_train = labels_train[:]
# y_test = labels_test[:]

In [None]:
# class IndoBERTDataset(Dataset):
#     def __init__(self, texts: 'list[str]', targets: 'list[str]'):
#         assert len(texts) == len(targets)
#         self.tokenizer = BertTokenizerFast.from_pretrained("indobenchmark/indobert-base-p2")
#         self.texts = texts
#         self.targets = targets

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         bert_tokens = self.tokenizer(self.texts[idx], max_length=256, 
#                                      padding='max_length', truncation=True, 
#                                      return_tensors='pt' )
#         targets = torch.tensor(self.targets[idx], dtype=torch.int64)
#         return bert_tokens, targets
    
#     @staticmethod
#     def squeeze_unwanted_batch(x, device):
#         """
#         This function handles an unexpected output of DataLoader that creates extra dimension on dim=1
#         """
#         new_x = {}
#         for keys in x:
#             new_x[keys] = x[keys].squeeze().to(device)
#         return new_x

In [None]:
# train_dataset = IndoBERTDataset(X_train, y_train)
# test_dataset = IndoBERTDataset(X_test, y_test)

# Load Dataset v2

In [3]:
train_path = "/content/train.csv"
test_path = "/content/test.csv"

tokenizer = BertTokenizerFast.from_pretrained("indobenchmark/indobert-base-p2")
train_dataset = HeadlineDataset(train_path, tokenizer,
                                max_length=256,
                                padding="max_length",
                                truncation=True,
                                return_tensors="pt",
                                )
test_dataset = HeadlineDataset(test_path, tokenizer,
                                max_length=256,
                                padding="max_length",
                                truncation=True,
                                return_tensors="pt",
                                )

label2id = {"headline": 1, "non-headline": 0}
id2label = {v: k for k, v in label2id.items()}

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

# Prepare Train

In [4]:

"""
https://arxiv.org/pdf/1810.04805.pdf

We use a batch size of 32 and fine-tune for 3
epochs over the data for all GLUE tasks. For each
task, we selected the best fine-tuning learning rate
(among 5e-5, 4e-5, 3e-5, and 2e-5)
"""

batch_size = 32
epochs = 3
device = "cuda"
lr = 2e-6

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

num_mini_batch = math.ceil(len(train_dataset) / batch_size)

model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p2",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
).to(device)

optim = torch.optim.Adam(model.parameters(), lr=lr)

metrics = MetricsContainer({
    "acc_train" : MulticlassAccuracy(num_classes=2).to(device),
    "acc_test" : MulticlassAccuracy(num_classes=2).to(device),
    "f1_train" : MulticlassF1Score(num_classes=2).to(device),
    "f1_test" : MulticlassF1Score(num_classes=2).to(device),
    "loss_train" : custom_metric.loss_metrics,
    "loss_test" : custom_metric.loss_metrics,
})

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train Loop

In [7]:
# Half precision
scaler = torch.cuda.amp.GradScaler()

for i in range(epochs):
    # Progress bar
    pbar = tqdm(total=num_mini_batch)
    pbar.set_description(f"EPOCH {i + 1} / {epochs}: Training")
    
    # Metrics initialize new batch
    metrics.new_batch()

    # Train
    model.train()
    for train_tokens, train_target in train_loader:
        model.zero_grad()
        train_target = train_target.to(device)
        # Forward
        with torch.cuda.amp.autocast():
            # train_tokens = IndoBERTDataset.squeeze_unwanted_batch(train_tokens, device)
            train_tokens['input_ids'] = train_tokens['input_ids'].to(device)
            train_tokens['attention_mask'] = train_tokens['attention_mask'].to(device)
            pred = model(**train_tokens).logits
            loss = F.cross_entropy(pred, train_target)

        # Backprop
        scaler.scale(loss).backward()
        scaler.step(optim)
        scaler.update()

        # Metrics and logging
        metrics("loss_train", loss)
        metrics("f1_train", pred, train_target)
        metrics("acc_train", pred, train_target)
        pbar.set_postfix(metrics.mean_metrics_batch())
        pbar.update(1)

    # Eval
    model.eval()
    pbar.set_description(f"EPOCH {i + 1} / {epochs}: Infering on Testing Data")
    
    for test_tokens, test_target in test_loader:
        test_target = test_target.to(device)
        # Forward test
        with torch.no_grad():
            test_tokens['input_ids'] = test_tokens['input_ids'].to(device)
            test_tokens['attention_mask'] = test_tokens['attention_mask'].to(device)
            pred_test = model(**test_tokens).logits
            loss_test = F.cross_entropy(pred_test, test_target)

        # Metrics and logging
        metrics("loss_test", loss_test)
        metrics("f1_test", pred_test, test_target)
        metrics("acc_test", pred_test, test_target)
        pbar.set_postfix(metrics.mean_metrics_batch())

    pbar.close()
    metrics.process_batch()


EPOCH 1 / 3: Infering on Testing Data: 100%|██████████| 59/59 [00:36<00:00,  1.60it/s, acc_train=0.889, acc_test=0.956, f1_train=0.875, f1_test=0.953, loss_train=0.378, loss_test=0.176]
EPOCH 2 / 3: Infering on Testing Data: 100%|██████████| 59/59 [00:32<00:00,  1.80it/s, acc_train=0.964, acc_test=0.977, f1_train=0.965, f1_test=0.975, loss_train=0.135, loss_test=0.101]
EPOCH 3 / 3: Infering on Testing Data: 100%|██████████| 59/59 [00:33<00:00,  1.77it/s, acc_train=0.981, acc_test=0.973, f1_train=0.98, f1_test=0.97, loss_train=0.0784, loss_test=0.087]


In [10]:
model.save_pretrained("../indo-bert-twitter-headline")

In [11]:
tokenizer.save_pretrained("../indo-bert-twitter-headline")

('../indo-bert-twitter-headline/tokenizer_config.json',
 '../indo-bert-twitter-headline/special_tokens_map.json',
 '../indo-bert-twitter-headline/vocab.txt',
 '../indo-bert-twitter-headline/added_tokens.json',
 '../indo-bert-twitter-headline/tokenizer.json')

# Manual Testing

In [16]:
# Prepare
from transformers import pipeline
from kaelib.processor.TextProcessingPipeline import TextProcessingPipeline
import kaelib.processor.preprocessing_func as pf
preprocessor = TextProcessingPipeline([
    pf.lowercasing,
    pf.remove_username,
    pf.remove_url,
    pf.remove_punctuation,
])
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

In [30]:
text = ["Daftar Rumah dan Kendaraan Mewah Rafael Alun Trisambodo, Hartanya Rp 56 M", 
        "disaat-saat kaya gini malah harus melakukan debugging memang",
        "Curhat Penyesalan Kekasih Mario Dandy soal Penganiayaan David Viral, AG: Karena Kecerobohan Saya",
        "Potret Ayah David Bersimpuh Depan Istri Gus Dur, Tetap Menunduk Saat Sri Mulyani Minta Maaf",
        "hi",
        "apa lirik lagu Indonesia yang menurut kamu sangat buset bagus bener dah ampun.",
        "Aktif di komunitas eh tau-tau dapet pekerjaan."]
text = preprocessor.process_corpus(text)
pipe(text)



[{'label': 'headline', 'score': 0.8657898902893066},
 {'label': 'non-headline', 'score': 0.9008456468582153},
 {'label': 'headline', 'score': 0.8795629143714905},
 {'label': 'headline', 'score': 0.8916939496994019},
 {'label': 'non-headline', 'score': 0.8179709911346436},
 {'label': 'non-headline', 'score': 0.8258944153785706},
 {'label': 'non-headline', 'score': 0.8858309984207153}]

# Save the model to huggingface

In [32]:
from huggingface_hub import login
login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [43]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_folder(
    folder_path="/content/indo-bert-twitter-headline/",
    repo_id="kaenova/indo-bert-twitter-headline",
)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

'https://huggingface.co/kaenova/indo-bert-twitter-headline/tree/main/'