In [1]:
import os, sys

# Move to the project root (the parent of the notebook's folder) so that the
# `utils` package imported below and the relative `./dataset/...` paths resolve.
# The `utils` check makes this cell safe to re-run: an unconditional
# os.chdir('../') climbs one more directory level on every execution.
# NOTE(review): assumes `utils/` sits directly in the project root — confirm.
if not os.path.isdir('utils'):
    os.chdir('../')

# Register the root as an absolute path. A relative '../' entry would be
# re-interpreted against the new working directory after the chdir above.
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.append(project_root)

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

# from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, BertTokenizer
# from nltk.tokenize import TweetTokenizer
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification

from utils.forward_fn import forward_sequence_classification
from utils.metrics import document_sentiment_metrics_fn
# from utils.data_utils_kazee3 import DocumentSentimentDataset, DocumentSentimentDataLoader
from utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

In [2]:
# Report the CUDA environment visible to this kernel before picking a device.
cuda_available = torch.cuda.is_available()
print("Is cuda available?", cuda_available)

device_count = torch.cuda.device_count()
print("Device count?", device_count)

current_index = torch.cuda.current_device()
print("Current device?", current_index)

current_name = torch.cuda.get_device_name(torch.cuda.current_device())
print("Device name? ", current_name)

Is cuda available? True
Device count? 4
Current device? 0
Device name?  NVIDIA RTX A5000


In [3]:
# Index of the GPU to make the current CUDA device for the rest of the notebook.
GPU_INDEX = 2
torch.cuda.set_device(GPU_INDEX)

In [4]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    The same `seed` is applied, in order, to Python's `random`, NumPy's
    global generator, PyTorch's default generator and PyTorch's CUDA
    generator for the current device.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)


def count_param(module, trainable=False):
    """Return the total number of scalar parameters in `module`.

    Args:
        module: object exposing `.parameters()` (e.g. a `torch.nn.Module`).
        trainable: when True, count only parameters with `requires_grad` set.
    """
    params = module.parameters()
    if trainable:
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)


def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns None when the optimizer has no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']


def metrics_to_string(metric_dict):
    """Render a metrics mapping as space-separated `name:value` pairs.

    Each value is formatted with two decimal places; pairs follow the
    mapping's iteration order. An empty mapping yields an empty string.
    """
    return ' '.join(
        f'{name}:{score:.2f}' for name, score in metric_dict.items()
    )

In [5]:
# Fixed seed for this experiment; applied before the model is instantiated.
SEED = 25072024
set_seed(SEED)

## Load Model

In [6]:
# Checkpoint shared by the tokenizer, the config and the model weights.
PRETRAINED_NAME = 'indobenchmark/indobert-base-p1'

# Load tokenizer and config; size the classification head from the dataset's label count.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
config = BertConfig.from_pretrained(PRETRAINED_NAME)
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate the sequence-classification model from the same checkpoint.
model = BertForSequenceClassification.from_pretrained(PRETRAINED_NAME, config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Display the model's module tree (repr of the cell's last expression).
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Prepare Dataset

In [9]:
# Directory the training loop writes the best checkpoint (model, tokenizer, config) to.
model_save = './dataset/dataset_smsa_stok/save_model/smsa_save_model'

# TSV splits consumed by DocumentSentimentDataset; paths are relative to the
# working directory set in the first cell.
train_dataset_path = './dataset/dataset_smsa_stok/data_clean/data_train/data_smsa_train.tsv'
valid_dataset_path = './dataset/dataset_smsa_stok/data_clean/data_train/data_smsa_valid.tsv'
# Test split is currently disabled.
# test_dataset_path = './dataset/smsa_doc-sentiment-prosa/test_preprocess_masked_label.tsv'

In [10]:
# Build the train/validation datasets with the shared tokenizer (text lowercased).
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)
# test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)

# Settings common to both loaders; only the shuffle flag differs per split.
loader_kwargs = dict(max_seq_len=512, batch_size=32, num_workers=16)

train_loader = DocumentSentimentDataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, shuffle=False, **loader_kwargs)
# test_loader = DocumentSentimentDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)

In [11]:
# Label-name -> index and index -> label-name lookups defined on the dataset class.
w2i = DocumentSentimentDataset.LABEL2INDEX
i2w = DocumentSentimentDataset.INDEX2LABEL

for mapping in (w2i, i2w):
    print(mapping)

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


# Test model on sample sentences

In [12]:
# Sanity check: classify one sentence with the not-yet-fine-tuned model.
text = 'btw saham bbca dan bank mandiri on fire ya sejak januari demikian jg saham bbri dan bbni'

# Encode to token ids and shape them as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: make eval mode explicit and skip autograd bookkeeping so the
# cell does not depend on whatever state earlier cells left behind.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]

# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: btw saham bbca dan bank mandiri on fire ya sejak januari demikian jg saham bbri dan bbni | Label : positive (37.952%)


# Fine Tuning & Evaluation

In [13]:
# Move the model to the GPU first, then build the optimizer over its parameters:
# PyTorch documents that Module.cuda() should be called before constructing the
# optimizer when the module will be optimized on the GPU.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [16]:
from sklearn.metrics import classification_report

# Compute and display the classification report
def display_classification_report(list_hyp, list_label):
    """Print scikit-learn's per-class precision/recall/F1 report.

    Args:
        list_hyp: predicted label names; each must be a key of
            `DocumentSentimentDataset.LABEL2INDEX`.
        list_label: gold label names, same vocabulary as `list_hyp`.

    Raises:
        KeyError: if a label name is not present in `LABEL2INDEX`.
    """
    label2index = DocumentSentimentDataset.LABEL2INDEX
    list_hyp_idx = [label2index[hyp] for hyp in list_hyp]
    list_label_idx = [label2index[label] for label in list_label]

    # Pass the full label set explicitly. Without `labels=`, scikit-learn infers
    # the classes from the data and raises when a class is absent from both
    # lists, because the inferred classes no longer line up with `target_names`.
    label_ids = list(range(DocumentSentimentDataset.NUM_LABELS))
    target_names = [DocumentSentimentDataset.INDEX2LABEL[i] for i in label_ids]

    print("\nClassification Report:")
    print(classification_report(list_label_idx, list_hyp_idx, labels=label_ids, target_names=target_names))

# Train
# Fine-tune for a fixed number of epochs. After every epoch the model is scored
# on the validation split, and a checkpoint is written to `model_save` whenever
# the validation F1 strictly improves on the best value seen so far.
# Note: the in-memory `model` after this cell holds the LAST epoch's weights,
# which is not necessarily the checkpoint that was saved as best.
n_epochs = 5
best_f1 = 0

for epoch in range(n_epochs):
    # ---- Training pass ----
    model.train()
    # Make sure autograd is on even if an earlier cell/run left it disabled.
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model (the last element of each batch is not passed to the forward helper)
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss += tr_loss

        # Accumulate predictions and gold labels for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Progress bar shows the running mean loss over the batches seen so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch + 1),
                                                                                   total_train_loss / (i + 1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch + 1),
                                                             total_train_loss / (i + 1), metrics_to_string(metrics), get_lr(optimizer)))

    # ---- Validation pass ----
    model.eval()

    total_loss = 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    # Scoped no_grad instead of flipping the global autograd switch, so gradient
    # mode is not left disabled for the cells that run after this one.
    with torch.no_grad():
        for i, batch_data in enumerate(pbar):
            loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

            # Calculate total loss
            valid_loss = loss.item()
            total_loss += valid_loss

            # Running metrics over all validation batches seen so far (for the progress bar)
            list_hyp += batch_hyp
            list_label += batch_label
            metrics = document_sentiment_metrics_fn(list_hyp, list_label)

            pbar.set_description("VALID LOSS:{:.4f} {}".format(
                total_loss / (i + 1), metrics_to_string(metrics)))

    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch + 1),
                                                   total_loss / (i + 1), metrics_to_string(metrics)))

    # Checkpoint only on a strict F1 improvement; ties keep the earlier checkpoint.
    if metrics['F1'] > best_f1:
        best_f1 = metrics['F1']
        model.save_pretrained(model_save)
        tokenizer.save_pretrained(model_save)
        config.save_pretrained(model_save)

        print('current best')
        # Display classification report for the current best model
        display_classification_report(list_hyp, list_label)


(Epoch 1) TRAIN LOSS:0.2272 LR:0.00000300: 100%|██████████| 83/83 [00:15<00:00,  5.33it/s]


(Epoch 1) TRAIN LOSS:0.2272 ACC:0.93 F1:0.92 REC:0.91 PRE:0.92 LR:0.00000300


VALID LOSS:0.5020 ACC:0.81 F1:0.80 REC:0.78 PRE:0.82: 100%|██████████| 21/21 [00:03<00:00,  5.58it/s]


(Epoch 1) VALID LOSS:0.5020 ACC:0.81 F1:0.80 REC:0.78 PRE:0.82
current best

Classification Report:
              precision    recall  f1-score   support

    positive       0.80      0.91      0.85       340
     neutral       0.83      0.70      0.76       158
    negative       0.83      0.73      0.78       160

    accuracy                           0.81       658
   macro avg       0.82      0.78      0.80       658
weighted avg       0.82      0.81      0.81       658



(Epoch 2) TRAIN LOSS:0.1630 LR:0.00000300: 100%|██████████| 83/83 [00:15<00:00,  5.20it/s]


(Epoch 2) TRAIN LOSS:0.1630 ACC:0.95 F1:0.95 REC:0.95 PRE:0.95 LR:0.00000300


VALID LOSS:0.5047 ACC:0.82 F1:0.81 REC:0.81 PRE:0.81: 100%|██████████| 21/21 [00:03<00:00,  5.97it/s]


(Epoch 2) VALID LOSS:0.5047 ACC:0.82 F1:0.81 REC:0.81 PRE:0.81
current best

Classification Report:
              precision    recall  f1-score   support

    positive       0.87      0.85      0.86       340
     neutral       0.79      0.77      0.78       158
    negative       0.76      0.81      0.79       160

    accuracy                           0.82       658
   macro avg       0.81      0.81      0.81       658
weighted avg       0.82      0.82      0.82       658



(Epoch 3) TRAIN LOSS:0.1240 LR:0.00000300: 100%|██████████| 83/83 [00:16<00:00,  5.01it/s]


(Epoch 3) TRAIN LOSS:0.1240 ACC:0.96 F1:0.96 REC:0.96 PRE:0.96 LR:0.00000300


VALID LOSS:0.5314 ACC:0.82 F1:0.80 REC:0.81 PRE:0.80: 100%|██████████| 21/21 [00:03<00:00,  6.10it/s]


(Epoch 3) VALID LOSS:0.5314 ACC:0.82 F1:0.80 REC:0.81 PRE:0.80


(Epoch 4) TRAIN LOSS:0.0857 LR:0.00000300: 100%|██████████| 83/83 [00:15<00:00,  5.28it/s]


(Epoch 4) TRAIN LOSS:0.0857 ACC:0.98 F1:0.97 REC:0.97 PRE:0.98 LR:0.00000300


VALID LOSS:0.5806 ACC:0.82 F1:0.81 REC:0.82 PRE:0.80: 100%|██████████| 21/21 [00:03<00:00,  5.94it/s]


(Epoch 4) VALID LOSS:0.5806 ACC:0.82 F1:0.81 REC:0.82 PRE:0.80


(Epoch 5) TRAIN LOSS:0.0668 LR:0.00000300: 100%|██████████| 83/83 [00:15<00:00,  5.23it/s]


(Epoch 5) TRAIN LOSS:0.0668 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98 LR:0.00000300


VALID LOSS:0.6063 ACC:0.81 F1:0.80 REC:0.80 PRE:0.81: 100%|██████████| 21/21 [00:03<00:00,  5.86it/s]

(Epoch 5) VALID LOSS:0.6063 ACC:0.81 F1:0.80 REC:0.80 PRE:0.81





## Test the Fine-Tuned Model on Sample Sentences

In [18]:
# Classify the same sentence used before fine-tuning, now with the trained model.
text = 'btw saham bbca dan bank mandiri on fire ya sejak januari demikian jg saham bbri dan bbni'

# Encode to token ids and shape them as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Set eval mode and disable autograd here rather than relying on the state the
# training cell happened to leave behind.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]

# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(
    f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: btw saham bbca dan bank mandiri on fire ya sejak januari demikian jg saham bbri dan bbni | Label : positive (99.022%)


In [20]:
# Classify a second sample sentence with the trained model.
text = 'aku gatau yaa ga percaya gitu sama bmri'

# Encode to token ids and shape them as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Set eval mode and disable autograd here rather than relying on the state the
# training cell happened to leave behind.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]

# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(
    f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: aku gatau yaa ga percaya gitu sama bmri | Label : positive (83.946%)


## Summary