In [1]:
# Runtime environment flag. Starts as "local"; the Colab setup cell overwrites it
# with "colab", and later cells branch on it to choose file paths and the device.
machine = "local"

In [2]:
# Prepare the Google Colab environment and build the handmade library.
# Only meant to be run on Colab: it relies on /content paths and google.colab APIs.
!git clone https://github.com/kaenova/Headline_Detection.git
%cd "/content/Headline_Detection"

# Run the repository's `lib` make target (the captured output shows it running
# `pip install -r requirements.txt`).
!make lib

%cd "/content/"

# Ask for a manual upload of the external dataset into the current directory.
print("Please upload 'pfizer.csv'")
from google.colab import files
files.upload()

# Mount Google Drive; the Colab checkpoint path used later lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Mark this session as Colab so later cells select the Colab paths and device.
machine = "colab"

Cloning into 'Headline_Detection'...
remote: Enumerating objects: 891, done.[K
remote: Counting objects: 100% (351/351), done.[K
remote: Compressing objects: 100% (268/268), done.[K
remote: Total 891 (delta 115), reused 271 (delta 77), pack-reused 540[K
Receiving objects: 100% (891/891), 39.81 MiB | 11.55 MiB/s, done.
Resolving deltas: 100% (378/378), done.
/content/Headline_Detection
pip install -r requirements.txt
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/kaenova/NDETCStemmer.git@master (from -r requirements.txt (line 28))
  Cloning https://github.com/kaenova/NDETCStemmer.git (to revision master) to /tmp/pip-req-build-33ms91zu
  Running command git clone --filter=blob:none --quiet https://github.com/kaenova/NDETCStemmer.git /tmp/pip-req-build-33ms91zu
  Resolved https://github.com/kaenova/NDETCStemmer.git to commit 5f5e65552189ef7607a126666dca0d5104c256dc
  Preparing metadata (setup.py) ...

Saving pfizer.csv to pfizer.csv
Mounted at /content/drive


In [3]:
# # Reset Google Colab Environment
# %cd ..
# !rm -fr Headline_Detection

# Load Checkpoint Model

In [4]:
# Hyperparameters
#
# Reference: https://arxiv.org/pdf/1810.04805.pdf
#
#   "We use a batch size of 32 and fine-tune for 3
#    epochs over the data for all GLUE tasks. For each
#    task, we selected the best fine-tuning learning rate
#    (among 5e-5, 4e-5, 3e-5, and 2e-5)"
#
# Note that the batch size configured below is 16, not the 32 quoted above.
hyper_params = dict(
    model_name="indolem/indobertweet-base-uncased",
    seq_length=256,
    out_feature=2,
    learning_rate=2e-5,
    batch_size=16,
)

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import lightning.pytorch as pl
from transformers import BertForSequenceClassification, BertTokenizerFast
from torchmetrics.classification import F1Score, Accuracy, Recall, Precision


class BERTClassifier(pl.LightningModule):
    """Lightning wrapper around a Hugging Face BERT sequence classifier.

    The module owns its tokenizer, so ``forward`` accepts raw strings and
    returns the classifier logits.

    Args:
        huggingface_model_name: Model id handed to ``from_pretrained`` for both
            the tokenizer and the classifier.
        seq_length: Stored as ``self.seq_length``. It is NOT applied during
            tokenisation, which always pads/truncates to 512 tokens.
        out_feature: Passed as ``num_labels`` to the classifier and as
            ``num_classes`` to the metric objects.
        pad_sequence: Stored as ``self.pad_sequence``; not applied during
            tokenisation.
    """

    def __init__(
        self,
        huggingface_model_name: "str" = "indolem/indobertweet-base-uncased",
        seq_length: "int" = 256,
        out_feature: "int" = 2,
        pad_sequence: "bool" = True,
    ):
        super().__init__()
        self.seq_length = seq_length
        self.pad_sequence = pad_sequence
        self.tokenizer = BertTokenizerFast.from_pretrained(huggingface_model_name)
        # NOTE(review): problem_type is "multi_label_classification" while the
        # metrics below are configured as "multiclass". As far as I know this
        # setting only selects the loss used when `labels` are passed to the
        # Hugging Face model, which this class never does -- confirm against
        # the training code before changing it.
        self.huggingface_model = BertForSequenceClassification.from_pretrained(
            huggingface_model_name,
            num_labels=out_feature,
            problem_type="multi_label_classification",
        ).to(self.device)

        # Attribute names are kept as-is so existing checkpoints keep loading.
        self.f1_scorer = F1Score(task="multiclass", num_classes=out_feature)
        self.accuracy_scorer = Accuracy(task="multiclass", num_classes=out_feature)
        self.precision_scorer = Precision(task="multiclass", num_classes=out_feature)
        self.recall_scorer = Recall(task="multiclass", num_classes=out_feature)

    @staticmethod
    def _as_text_list(x) -> "list[str]":
        """Normalise ``x`` to a list of strings.

        A lone string is wrapped in a one-element list; calling ``list()`` on
        it would split it into single characters and build a bogus batch.
        """
        if isinstance(x, str):
            return [x]
        if isinstance(x, list):
            return x
        return list(x)

    def _forward_huggingface_tokenizers(self, x: "list[str]"):
        """Tokenise a batch of texts.

        Returns:
            ``(input_ids, attention_mask)`` tensors moved to ``self.device``.
        """
        # The previous implementation also split every sentence on spaces and
        # truncated/padded the word list to ``self.seq_length``, but never used
        # the result. That dead loop is removed; the tokenizer call below is
        # unchanged, so the produced tensors are the same as before.
        tokens = self.tokenizer(
            x,
            max_length=512,  # Max BERT tokens
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        input_ids = tokens["input_ids"].to(self.device)  # type: ignore
        attention_mask = tokens["attention_mask"].to(self.device)  # type: ignore
        return input_ids, attention_mask

    def forward(self, x: "list[str]") -> "torch.Tensor":
        """Return the classifier logits for a batch of raw texts.

        ``x`` may be a list, any other iterable of strings, or a single string
        (treated as a batch of one).
        """
        texts = self._as_text_list(x)
        input_ids, attention_mask = self._forward_huggingface_tokenizers(texts)
        logits = self.huggingface_model(input_ids=input_ids, attention_mask=attention_mask).logits  # type: ignore
        return logits

    def predict_step(self, batch, batch_idx):
        """Return logits for one ``(texts, labels)`` batch; the labels are ignored."""
        x, y = batch
        return self(self._as_text_list(x))

    def configure_optimizers(self):
        """Build an Adam optimizer over all parameters.

        The learning rate is read from the notebook-level ``hyper_params`` dict.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=hyper_params['learning_rate'])
        return optimizer


In [6]:
model_module = BERTClassifier


def create_model():
    """Instantiate ``model_module`` from the notebook-level ``hyper_params``.

    The three values are passed positionally, in the order
    model name, sequence length, number of output features.
    """
    init_args = tuple(
        hyper_params[key] for key in ('model_name', 'seq_length', 'out_feature')
    )
    return model_module(*init_args)

In [7]:
# Load IndoBERTweet Scenario 5
load_path = "tensorboard_checkpoint/indobert/2023-04-27_04-48-48_scenario_5/checkpoints/val_epoch=0-validation_loss=0.0691.ckpt"
device = "cpu"

if machine == "colab":
    load_path = "/content/drive/Shareddrives/_PercobaanKaenova/Tugas Akhir/tensorboard/2023_04_21/indobert/2023-04-27_04-48-48_scenario_5/checkpoints/val_epoch=0-validation_loss=0.0691.ckpt"
    # Colab runtimes are not guaranteed to have a GPU attached; mapping the
    # checkpoint to "cuda" on a CPU-only runtime makes torch.load fail.
    device = "cuda" if torch.cuda.is_available() else "cpu"

model = create_model()
# torch.load unpickles the file, so only load checkpoints from a trusted source.
# The checkpoint is a dict; its 'state_dict' entry holds the model weights.
checkpoint_weight = torch.load(load_path, map_location=device)
model.load_state_dict(checkpoint_weight['state_dict'])

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at indolem/indobertweet-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/ind

<All keys matched successfully>

# Preparing Data Preprocessor

In [8]:
from kaelib.processor.TextProcessingPipeline import TextProcessingPipeline
import kaelib.processor.preprocessing_func as pf

# Preprocessing pipeline configured for Scenario 5.
# The steps are given in the same order as in the original configuration.
preprocessor = TextProcessingPipeline(
    [pf.lowercasing, pf.remove_username, pf.remove_url, pf.remove_emoji]
)

# Load External Data

In [9]:
import pandas as pd

# On Colab read the uploaded copy; otherwise use the relative local path.
data_path = (
    "/content/pfizer.csv"
    if machine == "colab"
    else "../../data/5. External Data/pfizer.csv"
)

df = pd.read_csv(data_path)

In [10]:
import torch
import pandas
import typing
from torch.utils.data import Dataset, DataLoader
from kaelib.processor import TextProcessingPipeline

class TextClassificationTestDataset(Dataset):
    """Label-free text dataset used to run predictions on external data.

    Every item is paired with a dummy label ``0`` so the batches have the
    ``(x, y)`` shape that ``predict_step`` unpacks.

    Args:
        df: Frame holding the texts.
        x_column_name: Name of the text column; its values are cast to ``str``.
        preprocessor: Optional object exposing ``process_corpus(list_of_texts)``.
            When given, it is applied lazily on every item access.
    """

    def __init__(
        self,
        df: "pandas.DataFrame",
        x_column_name: "str" = "tweet",
        preprocessor: "typing.Optional[TextProcessingPipeline]" = None,
    ):
        self.x = df[x_column_name].astype(str).to_list()
        self.preprocessor = preprocessor

    def __len__(self):
        return len(self.x)

    def _process_idx_text(self, idx):
        """Return the (optionally preprocessed) texts selected by ``idx`` as a sequence."""
        # A slice already yields a list; a single index is wrapped so the
        # preprocessor always receives a corpus (list of texts).
        data = self.x[idx] if isinstance(idx, slice) else [self.x[idx]]
        if self.preprocessor is not None:
            # presumably returns an indexable sequence of the same length -- verify in kaelib
            data = self.preprocessor.process_corpus(data)
        return data

    def __getitem__(self, idx):
        """Return ``(text, 0)`` for an index, or ``(texts, [0, ...])`` for a slice."""
        processed_corpus = self._process_idx_text(idx)
        if isinstance(idx, slice):
            # Previously only the first text of the slice was returned next to
            # a list of dummy labels; return the whole slice so both line up.
            return processed_corpus, [0] * len(processed_corpus)
        return processed_corpus[0], 0

    def __repr__(self) -> str:
        # Preview at most five raw texts; slicing avoids an IndexError on
        # datasets with fewer than five rows.
        return "\n".join(self.x[:5])



In [11]:
# Wrap the external tweets in a dataset and iterate it in its original order
# (shuffle=False), so the predictions line up with the rows of `df`.
X_datasets = TextClassificationTestDataset(
    df=df,
    x_column_name='tweet',
    preprocessor=preprocessor,
)
X_loader = DataLoader(
    dataset=X_datasets,
    batch_size=hyper_params['batch_size'],
    shuffle=False,
)

In [12]:
import lightning.pytorch as pl
# Switch the model to evaluation mode, then run batched prediction with a
# default-configured Trainer; `pred` collects one output per batch.
model.eval()
trainer = pl.Trainer()
pred = trainer.predict(model, dataloaders=X_loader)

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [20]:
# Stitch the per-batch outputs together along the first axis.
pred_tensor = torch.cat(pred, dim=0)
# Show the stacked shape next to the dataset size for a quick sanity check.
print(pred_tensor.shape, len(X_datasets))

torch.Size([9972, 2]) 9972


# Process the prediction

In [21]:
import torch.nn.functional as F

In [22]:
# Turn the logits into per-class probabilities. `dim` is given explicitly:
# calling softmax without it is deprecated and emits a UserWarning. For this
# 2-D tensor the implicit choice was already dim=1, so the values are unchanged.
softmax_tensor = F.softmax(pred_tensor, dim=1)
# Index of the most probable class for every row.
argmax_tensor = torch.argmax(softmax_tensor, dim=1)
np_pred = argmax_tensor.cpu().numpy()

  softmax_tensor = F.softmax(pred_tensor)


In [23]:
# Build a new frame with the predicted class appended; `df` itself is left untouched.
df_final = df.assign(**{'prediction (0 non-headline / 1 headline)': np_pred})
df_final

Unnamed: 0,TweetID,TweetURL,tweet,prediction (0 non-headline / 1 headline)
0,1506983615460425733,https://twitter.com/purwa760443751/status/1506...,Kabar baik! Vaksin Pfizer-BioNTech akan masuk ...,1
1,1506979567310807050,https://twitter.com/kompascom/status/150697956...,Pemprov DKI menyediakan berbagai jenis merek v...,1
2,1506961906560483330,https://twitter.com/FaisalRasyidZ/status/15069...,@drpriono1 Saya mau divaksin booster asal vaks...,0
3,1506786297721274368,https://twitter.com/PKCTamansari/status/150678...,Berikut Jadwal Vaksin Covid19 Kamis 24 Maret 2...,0
4,1506592237102108673,https://twitter.com/MarsyaLiana5/status/150659...,"vaksin Coronavac, vaksin jadi buatan Sinovac a...",1
...,...,...,...,...
9967,1349190850241122304,https://twitter.com/11airbening/status/1349190...,"@fullmoonfolks Mau vaksin pfizer, dan pemerint...",0
9968,1349177154991312898,https://twitter.com/InayahRasyid/status/134917...,Nah Lho! Indonesia Diminta Tak Boleh Gugat Kal...,0
9969,1349165455349403648,https://twitter.com/beritaKBR/status/134916545...,"Banyak orang bertanya, kok efikasinya lebih re...",1
9970,1349157230831013891,https://twitter.com/KKMPutrajaya/status/134915...,Kita akan menerima 12.8 juta dos vaksin #Pfize...,0


In [24]:
# Count how many rows were assigned to each predicted class.
df_final['prediction (0 non-headline / 1 headline)'].value_counts()

1    7648
0    2324
Name: prediction (0 non-headline / 1 headline), dtype: int64

In [25]:
# Compute the share of rows predicted as headline (class 1) from the data
# itself, instead of retyping the counts printed by value_counts(); the figure
# then stays correct when the notebook is re-run on different data or weights.
headline_ratio = float((df_final['prediction (0 non-headline / 1 headline)'] == 1).mean())
print("Percentages of headline:", headline_ratio)

Percentages of headline: 0.7669474528680305


In [26]:
# Write the frame with predictions to a CSV file in the current working directory.
# NOTE(review): the index is written as well (pandas default). The displayed frame
# already contains an "Unnamed: 0" column, so re-reading this file would add a
# second one -- confirm whether index=False is wanted here.
df_final.to_csv("pfizer_pred.csv")