In [None]:
# Install PyTorch and torchvision into the environment of the running kernel.
%pip install torch torchvision

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# Install Hugging Face transformers into the environment of the running kernel.
%pip install transformers



In [None]:
# Fetch the IndoNLU repository: the notebook imports helpers from indonlu.utils
# and reads the dataset TSV files from indonlu/dataset.
!git clone https://github.com/indobenchmark/indonlu

Cloning into 'indonlu'...
remote: Enumerating objects: 509, done.[K
remote: Counting objects: 100% (193/193), done.[K
remote: Compressing objects: 100% (83/83), done.[K
remote: Total 509 (delta 119), reused 139 (delta 110), pack-reused 316 (from 1)[K
Receiving objects: 100% (509/509), 9.46 MiB | 27.51 MiB/s, done.
Resolving deltas: 100% (239/239), done.


In [None]:
import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from indonlu.utils.forward_fn import forward_sequence_classification
from indonlu.utils.metrics import document_sentiment_metrics_fn
from indonlu.utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader
from torch.utils.data import Dataset, DataLoader

In [None]:
###
# common functions
###
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    The same value is applied to Python's ``random`` module, NumPy's global
    RNG, PyTorch's default generator and the CUDA generators.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs rather than only the current device; per the PyTorch docs
    # this call is silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)

def count_param(module, trainable=False):
    """Count the scalar parameters held by ``module``.

    Args:
        module: Object exposing ``parameters()`` that yields tensors.
        trainable: When True, only parameters with ``requires_grad`` set are
            counted; otherwise every parameter is counted.

    Returns:
        The total number of elements across the selected parameters.
    """
    total = 0
    for param in module.parameters():
        # Frozen parameters are skipped only when the caller asked for trainable ones.
        if trainable and not param.requires_grad:
            continue
        total += param.numel()
    return total

In [None]:
# Set random seed
# This runs before the model is instantiated, so the classifier head that is
# newly initialized at load time is drawn from a seeded RNG.
set_seed(19072021)

In [None]:
# Name of the pretrained checkpoint shared by tokenizer, config and model
PRETRAINED_NAME = 'indobenchmark/indobert-base-p1'

# Load tokenizer and config; the classification head is sized to the number of sentiment labels
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
config = BertConfig.from_pretrained(PRETRAINED_NAME)
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate the sequence-classification model from the same checkpoint
model = BertForSequenceClassification.from_pretrained(PRETRAINED_NAME, config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Display the module tree of the loaded classifier (repr of the cell's last expression).
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Total number of parameters; trainable=False (the default) counts frozen parameters as well.
count_param(model)

124443651

In [None]:
# Dataset directory inside the cloned IndoNLU repository.
# Absolute path: assumes the repository was cloned into /content.
SMSA_DIR = '/content/indonlu/dataset/smsa_doc-sentiment-prosa'

train_dataset_path = SMSA_DIR + '/train_preprocess.tsv'
valid_dataset_path = SMSA_DIR + '/valid_preprocess.tsv'
test_dataset_path = SMSA_DIR + '/test_preprocess_masked_label.tsv'

In [None]:
class DocumentSentimentDataset(Dataset):
    """Map-style dataset over a headerless, tab-separated (text, label) file.

    Each item is a tuple ``(subword_ids, label_index, raw_text)`` where the
    first two entries are NumPy arrays and the last is the original string.
    """
    # Static constant variables
    LABEL2INDEX = {'positive': 0, 'neutral': 1, 'negative': 2} # Map from label string to index
    INDEX2LABEL = {0: 'positive', 1: 'neutral', 2: 'negative'} # Map from index to label string
    NUM_LABELS = 3 # Number of labels

    def load_dataset(self, path):
        """Read the TSV file at ``path`` into a DataFrame with integer labels.

        Raises:
            KeyError: If a label in the file is not a key of ``LABEL2INDEX``.
        """
        df = pd.read_csv(path, sep='\t', header=None) # Read the tsv file with pandas
        df.columns = ['text','sentiment'] # Name the two columns
        df['sentiment'] = df['sentiment'].apply(lambda lab: self.LABEL2INDEX[lab]) # Convert string label to index
        return df

    def __init__(self, dataset_path, tokenizer, *args, **kwargs):
        """Load the file and keep the tokenizer.

        Args:
            dataset_path: Path of the TSV file to load.
            tokenizer: Object whose ``encode(text)`` returns a sequence of subword ids.
            *args, **kwargs: Accepted for call compatibility and not used here.
        """
        self.data = self.load_dataset(dataset_path) # Load the tsv file

        # Keep the tokenizer used to split text into subwords
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        """Return ``(subword_ids, label_index, raw_text)`` for the row at position ``index``.

        Raises:
            IndexError: If ``index`` is outside the dataset.
        """
        # Positional lookup: out-of-range positions raise IndexError, which plain
        # iteration over the dataset relies on to stop, and negative positions
        # count from the end like any Python sequence.
        row = self.data.iloc[index]
        text, sentiment = row['text'], row['sentiment'] # Text and label index of that row
        subwords = self.tokenizer.encode(text) # Tokenize the text into subword ids

        # Return NumPy arrays of the subwords and the label, plus the raw text
        return np.array(subwords), np.array(sentiment), text

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.data)

In [None]:
class DocumentSentimentDataLoader(DataLoader):
    """DataLoader that pads variable-length subword sequences into dense NumPy batches."""

    def __init__(self, max_seq_len=512, *args, **kwargs):
        """Build the loader.

        Args:
            max_seq_len: Upper bound on the padded sequence length of a batch.
            *args, **kwargs: Forwarded unchanged to ``DataLoader``.
        """
        super().__init__(*args, **kwargs)
        # Install the collate function after the base initialiser has run,
        # replacing whatever collate function the base class configured.
        self.collate_fn = self._collate_fn
        self.max_seq_len = max_seq_len # Maximum number of subwords kept per sequence

    def _collate_fn(self, batch):
        """Pad and stack a list of ``(subwords, sentiment, raw_text)`` samples.

        Returns:
            ``(subword_batch, mask_batch, sentiment_batch)``: an int64 id matrix,
            a float32 mask with 1 on real tokens, and an int64 label column.
        """
        n_samples = len(batch)
        # Batch width: longest sequence in the batch, capped by the configured limit
        longest = max(len(sample[0]) for sample in batch)
        width = min(self.max_seq_len, longest)

        # Zero-initialised buffers; zeros double as padding id and masked-out flag
        subword_batch = np.zeros((n_samples, width), dtype=np.int64)
        mask_batch = np.zeros((n_samples, width), dtype=np.float32)
        sentiment_batch = np.zeros((n_samples, 1), dtype=np.int64)

        # Copy each (possibly truncated) sequence into its row
        for row, (token_ids, label, _raw_text) in enumerate(batch):
            clipped = token_ids[:width]
            used = len(clipped)
            subword_batch[row, :used] = clipped
            mask_batch[row, :used] = 1
            sentiment_batch[row, 0] = label

        # The raw text is not part of the returned batch
        return subword_batch, mask_batch, sentiment_batch

In [None]:
# Datasets for the three splits. The dataset class defined in this notebook
# accepts extra keyword arguments such as `lowercase` but does not read them.
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)

# Settings shared by every loader; only the training split is shuffled
loader_kwargs = dict(max_seq_len=512, batch_size=32, num_workers=16)
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, shuffle=False, **loader_kwargs)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, shuffle=False, **loader_kwargs)



In [None]:
# Peek at the first training sample: (subword id array, label index array, raw text).
print(train_dataset[0])

(array([    2,  6540,    92,  2970,   213,  4259,  3553,   899,    34,
         259,  5590,   262,  2558,   386,   899,  1687,    26,  1574,
       30470,   899,  3310, 30468, 22130, 30360,  6123,  6368, 30468,
       22130, 30360,  2652,  1746, 30468,  8869,  6540,    34,  6315,
        1622,  1256,  8949,   899, 30468,  4222,  1622,   752,   245,
         295,  2083, 30470,  2346,  7107,   300, 30470,   405,   724,
        5189, 30470,   843, 17464,   899,   540, 10989,  3331,  1107,
       30468,   119,  3221,    79,    34,  2170,    98,  9167, 30457,
           3]), array(0), 'warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung . tahu berkualitas , dipadu keahlian memasak , dipadu kretivitas , jadilah warung yang menyajikan menu utama berbahan tahu , ditambah menu umum lain seperti ayam . semuanya selera indonesia . harga cukup terjangkau . jangan lewatkan tahu bletoka nya , tidak kalah dengan yang asli dari tegal !')


In [None]:
# Short aliases for the label <-> index lookup tables defined on the dataset class
w2i = DocumentSentimentDataset.LABEL2INDEX
i2w = DocumentSentimentDataset.INDEX2LABEL
for mapping in (w2i, i2w):
    print(mapping)

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


## Pengujian Model dengan Contoh Kalimat

In [None]:
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
# Encode the sentence and shape it as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: put the model in evaluation mode explicitly and skip autograd
# bookkeeping, so the cell does not depend on state left behind by other cells.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : positive (52.480%)


### Fine Tuning dan Evaluasi

In [None]:
# Move the model to the GPU first, then build the optimizer over its parameters.
# This is the order the PyTorch optimizer documentation recommends.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [None]:
# Train
n_epochs = 5

# Define helper functions
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns None when the optimizer has no parameter groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)

def metrics_to_string(metrics):
    """Render a metrics mapping as ``"NAME: 0.1234 | NAME: 0.5678"``.

    Values are formatted with four decimal places, in the mapping's own order.
    """
    parts = []
    for name, value in metrics.items():
        parts.append(f"{name}: {value:.4f}")
    return " | ".join(parts)

for epoch in range(n_epochs):
    # ---- Training phase ----
    model.train()
    # Re-enable autograd explicitly: an evaluation cell may have switched it off globally.
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward pass: the whole batch tuple goes to the helper, which returns
        # the loss together with predicted and reference labels.
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data, i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Accumulate predictions and references for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Running mean of the loss over the batches seen so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Epoch-level training metrics; the average uses the loader length rather
    # than the leaked loop index.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/max(len(train_loader), 1), metrics_to_string(metrics), get_lr(optimizer)))

    # ---- Validation phase ----
    model.eval()

    total_loss = 0
    list_hyp, list_label = [], []

    # Autograd is disabled only inside this block, so no global state leaks
    # into the next epoch or into later cells.
    with torch.no_grad():
        pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
        for i, batch_data in enumerate(pbar):
            loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data, i2w=i2w, device='cuda')

            # Accumulate the validation loss
            total_loss += loss.item()

            # Metrics are recomputed over every batch seen so far so the
            # progress bar shows a running score.
            list_hyp += batch_hyp
            list_label += batch_label
            metrics = document_sentiment_metrics_fn(list_hyp, list_label)

            pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/max(len(valid_loader), 1), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:0.3320 LR:0.00000300: 100%|██████████| 344/344 [02:42<00:00,  2.12it/s]


(Epoch 1) TRAIN LOSS:0.3320 ACC: 0.8766 | F1: 0.8292 | REC: 0.7993 | PRE: 0.8771 LR:0.00000300


VALID LOSS:0.1827 ACC: 0.9325 | F1: 0.9021 | REC: 0.8986 | PRE: 0.9062: 100%|██████████| 40/40 [00:08<00:00,  4.88it/s]


(Epoch 1) VALID LOSS:0.1827 ACC: 0.9325 | F1: 0.9021 | REC: 0.8986 | PRE: 0.9062


(Epoch 2) TRAIN LOSS:0.1572 LR:0.00000300: 100%|██████████| 344/344 [02:41<00:00,  2.13it/s]


(Epoch 2) TRAIN LOSS:0.1572 ACC: 0.9469 | F1: 0.9292 | REC: 0.9242 | PRE: 0.9345 LR:0.00000300


VALID LOSS:0.1772 ACC: 0.9405 | F1: 0.9087 | REC: 0.8883 | PRE: 0.9367: 100%|██████████| 40/40 [00:08<00:00,  4.92it/s]


(Epoch 2) VALID LOSS:0.1772 ACC: 0.9405 | F1: 0.9087 | REC: 0.8883 | PRE: 0.9367


(Epoch 3) TRAIN LOSS:0.1200 LR:0.00000300: 100%|██████████| 344/344 [02:42<00:00,  2.12it/s]


(Epoch 3) TRAIN LOSS:0.1200 ACC: 0.9615 | F1: 0.9509 | REC: 0.9479 | PRE: 0.9540 LR:0.00000300


VALID LOSS:0.1650 ACC: 0.9405 | F1: 0.9141 | REC: 0.9074 | PRE: 0.9231: 100%|██████████| 40/40 [00:08<00:00,  4.62it/s]


(Epoch 3) VALID LOSS:0.1650 ACC: 0.9405 | F1: 0.9141 | REC: 0.9074 | PRE: 0.9231


(Epoch 4) TRAIN LOSS:0.0894 LR:0.00000300: 100%|██████████| 344/344 [02:42<00:00,  2.12it/s]


(Epoch 4) TRAIN LOSS:0.0894 ACC: 0.9719 | F1: 0.9650 | REC: 0.9608 | PRE: 0.9694 LR:0.00000300


VALID LOSS:0.1816 ACC: 0.9333 | F1: 0.9075 | REC: 0.9030 | PRE: 0.9145: 100%|██████████| 40/40 [00:08<00:00,  4.52it/s]


(Epoch 4) VALID LOSS:0.1816 ACC: 0.9333 | F1: 0.9075 | REC: 0.9030 | PRE: 0.9145


(Epoch 5) TRAIN LOSS:0.0656 LR:0.00000300: 100%|██████████| 344/344 [02:43<00:00,  2.11it/s]


(Epoch 5) TRAIN LOSS:0.0656 ACC: 0.9807 | F1: 0.9751 | REC: 0.9726 | PRE: 0.9778 LR:0.00000300


VALID LOSS:0.1971 ACC: 0.9310 | F1: 0.9067 | REC: 0.9073 | PRE: 0.9088: 100%|██████████| 40/40 [00:09<00:00,  4.41it/s]

(Epoch 5) VALID LOSS:0.1971 ACC: 0.9310 | F1: 0.9067 | REC: 0.9073 | PRE: 0.9088





In [None]:
# Evaluate on test
model.eval()

list_hyp = []

# Autograd is disabled only for the prediction loop instead of globally,
# so cells run afterwards are not affected.
with torch.no_grad():
    pbar = tqdm(test_loader, leave=True, total=len(test_loader))
    for batch_data in pbar:
        # Only the predicted labels are kept; the returned loss and reference
        # labels are discarded (the file name suggests the test labels are
        # masked; confirm against the dataset).
        _, batch_hyp, _ = forward_sequence_classification(model, batch_data, i2w=i2w, device='cuda')
        list_hyp += batch_hyp

# Save prediction: one row per test sample, with the row position as an `index` column
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)

print(df)

100%|██████████| 16/16 [00:02<00:00,  5.47it/s]

     index     label
0        0  negative
1        1  negative
2        2  negative
3        3  negative
4        4  negative
..     ...       ...
495    495   neutral
496    496   neutral
497    497   neutral
498    498  positive
499    499  positive

[500 rows x 2 columns]





### Prediksi Sentimen

In [None]:
text = 'Merasa kagum dengan toko ini tapi berubah menjadi kecewa setelah transaksi'
# Encode the sentence and shape it as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: put the model in evaluation mode explicitly and skip autograd
# bookkeeping, so the cell does not depend on state left behind by other cells.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Merasa kagum dengan toko ini tapi berubah menjadi kecewa setelah transaksi | Label : negative (99.578%)
