In [1]:
!pip install transformers
!git clone https://github.com/indobenchmark/indonlu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 9.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.8 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 88.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.1
Cloning into 'indonlu'...
remote: Enumerating objects: 466, done.[K
remote: Counting objects: 100% (150/150), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 466 (delta 107), reused 106 (delta 103), p

In [2]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from nltk.tokenize import TweetTokenizer
from torch import optim
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer

from indonlu.utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader
from indonlu.utils.forward_fn import forward_sequence_classification
from indonlu.utils.metrics import document_sentiment_metrics_fn

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [3]:
###
# common functions
###
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            global RNG, and PyTorch (CPU generator and all CUDA devices).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every GPU, not only the current device, and is
    # silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    
def count_param(module, trainable=False):
    """Return the number of scalar parameters held by ``module``.

    Args:
        module: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        trainable: When truthy, count only parameters whose ``requires_grad``
            is set; otherwise count every parameter.

    Returns:
        The summed ``numel()`` of the selected parameters.
    """
    total = 0
    for param in module.parameters():
        # Frozen parameters are skipped only when the caller asked for trainable ones.
        if trainable and not param.requires_grad:
            continue
        total += param.numel()
    return total
    
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Args:
        optimizer: Object exposing ``param_groups``, an iterable of dicts that
            each carry an ``'lr'`` entry.

    Returns:
        The ``'lr'`` value of the first group, or ``None`` when there are no groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    return None if first_group is None else first_group['lr']
  
def metrics_to_string(metric_dict):
    """Format a metrics mapping as one space-separated ``name:value`` string.

    Args:
        metric_dict: Mapping from metric name to a numeric value.

    Returns:
        A string such as ``'ACC:0.93 F1:0.89'``; values are rendered with two
        decimals, in the mapping's iteration order. Empty input gives ``''``.
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [4]:
# Set random seed
# Seeds Python, NumPy and PyTorch RNGs (see set_seed) before the model and
# data loaders are created, so repeated runs start from the same random state.
set_seed(19072021)

In [5]:
# Load Tokenizer and Config
# Single identifier for the pretrained checkpoint, so that tokenizer, config and
# weights are guaranteed to come from the same model.
PRETRAINED_MODEL_NAME = 'indobenchmark/indobert-base-p1'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
config = BertConfig.from_pretrained(PRETRAINED_MODEL_NAME)
# Size the classification head to the number of sentiment labels in the dataset.
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate model
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, config=config)

Downloading:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Display the model's module structure (embeddings, encoder layers, ...).
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [7]:
# Inspect the total number of parameters in the model
# (trainable defaults to False, so every parameter is counted).
count_param(model)

124443651

In [8]:
# Directory of the SmSA document-sentiment splits inside the cloned indonlu repo.
# Kept in one place so that a different checkout location needs a single edit.
DATASET_DIR = '/content/indonlu/dataset/smsa_doc-sentiment-prosa'

train_dataset_path = f'{DATASET_DIR}/train_preprocess.tsv'
valid_dataset_path = f'{DATASET_DIR}/valid_preprocess.tsv'
# The file name indicates that the test labels are masked.
test_dataset_path = f'{DATASET_DIR}/test_preprocess_masked_label.tsv'

**Dua sel kode berikut (kelas DocumentSentimentDataset dan DocumentSentimentDataLoader) hanya ditampilkan untuk menunjukkan bagaimana kedua kelas tersebut dibangun. Keduanya sudah diimpor dari paket `indonlu`, sehingga kodenya tidak perlu kita tulis ulang.**

In [9]:
# Reference sketch of DocumentSentimentDataset, left commented out on purpose:
# the class used in this notebook is imported from indonlu.utils.data_utils.
# class DocumentSentimentDataset(Dataset):
#     # Static constant variable
#     LABEL2INDEX = {'positive': 0, 'neutral': 1, 'negative': 2} # Map from label string to index
#     INDEX2LABEL = {0: 'positive', 1: 'neutral', 2: 'negative'} # Map from index to label string
#     NUM_LABELS = 3 # Number of labels

#     def load_dataset(self, path):
#         df = pd.read_csv(path, sep="\t", header=None) # Read the tsv file with pandas
#         df.columns = ['text','sentiment'] # Name the table columns
#         df['sentiment'] = df['sentiment'].apply(lambda lab: self.LABEL2INDEX[lab]) # Convert string labels to indices
#         return df

#     def __init__(self, dataset_path, tokenizer, *args, **kwargs):
#         self.data = self.load_dataset(dataset_path) # Load the tsv file

#         # Assign the tokenizer; here we use the subword tokenizer from HuggingFace
#         self.tokenizer = tokenizer

#     def __getitem__(self, index):
#         data = self.data.loc[index,:] # Take the row at the given index from the table
#         text, sentiment = data['text'], data['sentiment'] # Get the text and sentiment values
#         subwords = self.tokenizer.encode(text) # Tokenize the text into subwords

#         # Return numpy arrays of the subwords and the label, plus the raw text
#         return np.array(subwords), np.array(sentiment), data['text']

#     def __len__(self):
#         return len(self.data)  # Return the length of the dataset

In [10]:
# Reference sketch of DocumentSentimentDataLoader, left commented out on purpose:
# the class used in this notebook is imported from indonlu.utils.data_utils.
# class DocumentSentimentDataLoader(DataLoader):
#     def __init__(self, max_seq_len=512, *args, **kwargs):
#         super(DocumentSentimentDataLoader, self).__init__(*args, **kwargs)
#         self.max_seq_len = max_seq_len # Upper bound on the number of subwords per example
#         self.collate_fn = self._collate_fn # Use the collate function defined below

#     def _collate_fn(self, batch):
#         batch_size = len(batch) # Number of examples in the batch
#         max_seq_len = max(map(lambda x: len(x[0]), batch)) # Longest subword sequence in the batch
#         max_seq_len = min(self.max_seq_len, max_seq_len) # Clamp to the limit configured in __init__

#         # Create buffers for subwords, mask and sentiment labels, all initialised to 0
#         subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
#         mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
#         sentiment_batch = np.zeros((batch_size, 1), dtype=np.int64)

#         # Fill all buffers
#         for i, (subwords, sentiment, raw_seq) in enumerate(batch):
#             subwords = subwords[:max_seq_len]
#             subword_batch[i,:len(subwords)] = subwords
#             mask_batch[i,:len(subwords)] = 1
#             sentiment_batch[i,0] = sentiment

#         # Return the subword, mask and sentiment data
#         return subword_batch, mask_batch, sentiment_batch

In [11]:
# Loader settings shared by all three splits.
MAX_SEQ_LEN = 512
BATCH_SIZE = 32
# Never request more workers than the machine has CPUs: PyTorch warns that
# doing so may make the DataLoader slow or even freeze.
NUM_WORKERS = min(16, os.cpu_count() or 1)

train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)

# shuffle=True draws the training examples in a new random order every epoch;
# shuffle=False keeps the dataset order, so validation/test predictions stay
# aligned with the rows of their files.
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, max_seq_len=MAX_SEQ_LEN, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, max_seq_len=MAX_SEQ_LEN, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=False)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, max_seq_len=MAX_SEQ_LEN, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=False)

  cpuset_checked))


In [12]:
# Show the first training example: (subword ids, label index, raw text).
print(train_dataset[0])

(array([    2,  6540,    92,  2970,   213,  4259,  3553,   899,    34,
         259,  5590,   262,  2558,   386,   899,  1687,    26,  1574,
       30470,   899,  3310, 30468, 22130, 30360,  6123,  6368, 30468,
       22130, 30360,  2652,  1746, 30468,  8869,  6540,    34,  6315,
        1622,  1256,  8949,   899, 30468,  4222,  1622,   752,   245,
         295,  2083, 30470,  2346,  7107,   300, 30470,   405,   724,
        5189, 30470,   843, 17464,   899,   540, 10989,  3331,  1107,
       30468,   119,  3221,    79,    34,  2170,    98,  9167, 30457,
           3]), array(0), 'warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung . tahu berkualitas , dipadu keahlian memasak , dipadu kretivitas , jadilah warung yang menyajikan menu utama berbahan tahu , ditambah menu umum lain seperti ayam . semuanya selera indonesia . harga cukup terjangkau . jangan lewatkan tahu bletoka nya , tidak kalah dengan yang asli dari tegal !')


In [13]:
# w2i maps a label string to its class index; i2w is the inverse mapping,
# used later to turn predicted indices back into label strings.
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


**PREDIKSI (SEBELUM FINE-TUNING)**

In [14]:
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
# Encode the sentence and shape it as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: switch dropout off explicitly and skip building the autograd
# graph, instead of relying on whatever mode earlier cells left behind.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : positive (39.380%)


**PELATIHAN (FINE-TUNING) DAN EVALUASI**

In [15]:
# Move the model to the GPU first and only then build the optimizer, as the
# torch.optim documentation recommends, so the optimizer is constructed over
# the parameters that will actually be trained.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [16]:
# Train
# Each epoch runs one optimisation pass over train_loader followed by a
# gradient-free evaluation pass over valid_loader.
n_epochs = 5
for epoch in range(n_epochs):
    # ---- Training pass ----
    model.train()
    # Re-enable autograd explicitly in case an earlier cell disabled it globally.
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model (the last element of the batch is not passed on)
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Accumulate predictions and labels for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Running mean of the loss over the batches seen so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/len(train_loader), metrics_to_string(metrics), get_lr(optimizer)))

    # ---- Evaluate on validation ----
    model.eval()

    total_loss = 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    # Scoped no_grad: gradients are off only for this pass, so no global
    # autograd state leaks out of the cell.
    with torch.no_grad():
        for i, batch_data in enumerate(pbar):
            loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

            # Calculate total loss
            total_loss += loss.item()

            # Calculate evaluation metrics on everything seen so far (for the progress bar)
            list_hyp += batch_hyp
            list_label += batch_label
            metrics = document_sentiment_metrics_fn(list_hyp, list_label)

            pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/len(valid_loader), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:0.3466 LR:0.00000300: 100%|██████████| 344/344 [01:39<00:00,  3.46it/s]


(Epoch 1) TRAIN LOSS:0.3466 ACC:0.87 F1:0.82 REC:0.79 PRE:0.86 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.1905 ACC:0.93 F1:0.89 REC:0.89 PRE:0.90: 100%|██████████| 40/40 [00:04<00:00,  8.44it/s]


(Epoch 1) VALID LOSS:0.1905 ACC:0.93 F1:0.89 REC:0.89 PRE:0.90


  cpuset_checked))
(Epoch 2) TRAIN LOSS:0.1558 LR:0.00000300: 100%|██████████| 344/344 [01:36<00:00,  3.56it/s]


(Epoch 2) TRAIN LOSS:0.1558 ACC:0.95 F1:0.93 REC:0.92 PRE:0.94 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.1719 ACC:0.93 F1:0.90 REC:0.90 PRE:0.91: 100%|██████████| 40/40 [00:04<00:00,  8.58it/s]


(Epoch 2) VALID LOSS:0.1719 ACC:0.93 F1:0.90 REC:0.90 PRE:0.91


  cpuset_checked))
(Epoch 3) TRAIN LOSS:0.1202 LR:0.00000300: 100%|██████████| 344/344 [01:37<00:00,  3.55it/s]


(Epoch 3) TRAIN LOSS:0.1202 ACC:0.96 F1:0.95 REC:0.95 PRE:0.95 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.1676 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92: 100%|██████████| 40/40 [00:04<00:00,  8.49it/s]


(Epoch 3) VALID LOSS:0.1676 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92


  cpuset_checked))
(Epoch 4) TRAIN LOSS:0.0877 LR:0.00000300: 100%|██████████| 344/344 [01:36<00:00,  3.55it/s]


(Epoch 4) TRAIN LOSS:0.0877 ACC:0.97 F1:0.96 REC:0.96 PRE:0.97 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.1838 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92: 100%|██████████| 40/40 [00:04<00:00,  8.40it/s]


(Epoch 4) VALID LOSS:0.1838 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92


  cpuset_checked))
(Epoch 5) TRAIN LOSS:0.0660 LR:0.00000300: 100%|██████████| 344/344 [01:37<00:00,  3.54it/s]


(Epoch 5) TRAIN LOSS:0.0660 ACC:0.98 F1:0.97 REC:0.97 PRE:0.98 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.2037 ACC:0.94 F1:0.91 REC:0.90 PRE:0.93: 100%|██████████| 40/40 [00:04<00:00,  8.44it/s]

(Epoch 5) VALID LOSS:0.2037 ACC:0.94 F1:0.91 REC:0.90 PRE:0.93





In [17]:
# Predict labels for the test split. Only the predictions are kept: the loss
# and label outputs of the forward helper are discarded.
# Put the model in eval mode explicitly rather than relying on the state left
# behind by the training cell.
model.eval()

list_hyp = []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
# Scoped no_grad keeps autograd off for inference without changing global state.
with torch.no_grad():
    for batch_data in pbar:
        _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        list_hyp += batch_hyp

# Save prediction
# reset_index() adds an explicit 'index' column holding each row's position.
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)

print(df)

  cpuset_checked))
100%|██████████| 16/16 [00:01<00:00, 10.28it/s]

     index     label
0        0  negative
1        1  negative
2        2  negative
3        3  negative
4        4  negative
..     ...       ...
495    495   neutral
496    496   neutral
497    497   neutral
498    498  positive
499    499  positive

[500 rows x 2 columns]





**PREDIKSI SETELAH FINE-TUNING**

---



In [18]:
text = 'Merasa kagum dengan toko ini tapi berubah menjadi kecewa setelah transaksi'
# Encode the sentence and shape it as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: set eval mode and disable autograd here, so the cell does not
# depend on state left behind by the training/test cells.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Merasa kagum dengan toko ini tapi berubah menjadi kecewa setelah transaksi | Label : negative (99.765%)
