In [1]:
!pip install emoji
!pip install transformers



In [2]:
import torch
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm

import emoji
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import torch.optim as optimazer
import torch.nn.functional as F
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load Data

In [3]:
train_dataset_path = './sample_data/train_preprocess.tsv'
test_dataset_path = './sample_data/test_preprocess.tsv'
valid_dataset_path = './sample_data/valid_preprocess.tsv'

In [4]:
def load_dataset(path):
    return pd.read_csv(path, sep='\t', header=None, names=['text', 'category'])

train_df = load_dataset(train_dataset_path)
test_df = load_dataset(test_dataset_path)
valid_df = load_dataset(valid_dataset_path)

In [5]:
train_df

Unnamed: 0,text,category
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative
...,...,...
10995,tidak kecewa,positive
10996,enak rasa masakan nya apalagi kepiting yang me...,positive
10997,hormati partai-partai yang telah berkoalisi,neutral
10998,"pagi pagi di tol pasteur sudah macet parah , b...",negative


In [6]:
train_labels = train_df['category'].unique()
test_labels = test_df['category'].unique()
valid_labels = valid_df['category'].unique()

possible_labels = set(train_labels).union(test_labels).union(valid_labels)

In [7]:
label_dict = {label: idx for idx, label in enumerate(possible_labels)}

train_df['label'] = train_df.category.map(label_dict)
test_df['label'] = test_df.category.map(label_dict)
valid_df['label'] = valid_df.category.map(label_dict)

In [8]:
train_df

Unnamed: 0,text,category,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,2
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,0
2,lokasi strategis di jalan sumatera bandung . t...,positive,2
3,betapa bahagia nya diri ini saat unboxing pake...,positive,2
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,1
...,...,...,...
10995,tidak kecewa,positive,2
10996,enak rasa masakan nya apalagi kepiting yang me...,positive,2
10997,hormati partai-partai yang telah berkoalisi,neutral,0
10998,"pagi pagi di tol pasteur sudah macet parah , b...",negative,1


# Data Cleaning & Preprocessing

In [9]:
# preprocessing tahap 1
character = ['.',',',';',':','-,','...','?','!','(',')','[',']','{','}','<','>','"','/','\'','#','-','@',
             'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',
             'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']

def repeatcharClean(text):
  for i in range(len(character)):
    charac_long = 5
    while charac_long > 2:
      char = character[i]*charac_long
      text = text.replace(char,character[i])
      charac_long -= 1
  return text

def clean_review(text):
  # ubah text menjadi huruf kecil
  text = text.lower()
  # ubah enter menjadi spasi
  text = re.sub(r'\n', ' ', text)
  # hapus emoji
  text = emoji.demojize(text)
  text = re.sub(':[A-Za-z_-]+:', ' ', text) # delete emoji
  # hapus emoticon
  text = re.sub(r"([xX;:]'?[dDpPvVoO3)(])", ' ', text)
  # hapus link
  text = re.sub(r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})", "", text)
  # hapus usename
  text = re.sub(r"@[^\s]+[\s]?", ' ', text)
  # hapus hashtag
  text = re.sub(r'#(\S+)', r'\1', text)
  # hapus angka dan beberapa simbol
  text = re.sub('[^a-zA-Z,.?!]+',' ',text)
  # hapus karakter berulang
  text = repeatcharClean(text)
  # clear spasi
  text = re.sub(r'\s+', ' ', text)
  text = text.strip()
  return text

def clean_df(df):
  df['text'] = df['text'].apply(clean_review)
  return df

def preprocess_v1(df):
    df_pp = df.copy()
    df_pp['text'] = df_pp['text'].map(clean_review)

    # delete empty row
    df_pp['text'].replace('', np.nan)
    df_pp['text'].replace(' ', np.nan)
    df_pp.dropna(subset=['text'], inplace=True)
    return df_pp

In [10]:
train_df_processed = preprocess_v1(train_df)
test_df_processed = preprocess_v1(test_df)
valid_df_processed = preprocess_v1(valid_df)

In [11]:
train_df_processed

Unnamed: 0,text,category,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,2
1,mohon ulama lurus dan k mmbri hujjah partai ap...,neutral,0
2,lokasi strategis di jalan sumatera bandung . t...,positive,2
3,betapa bahagia nya diri ini saat unboxing pake...,positive,2
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,1
...,...,...,...
10995,tidak kecewa,positive,2
10996,enak rasa masakan nya apalagi kepiting yang me...,positive,2
10997,hormati partai partai yang telah berkoalisi,neutral,0
10998,"pagi pagi di tol pasteur sudah macet parah , b...",negative,1


In [12]:
unstandard_word_path = './sample_data/kamus_alay.csv'

unstandard_word = pd.read_csv(unstandard_word_path, sep=';')

unstandard_word

Unnamed: 0,slang,formal
0,woww,wow
1,aminn,amin
2,met,selamat
3,netaas,menetas
4,keberpa,keberapa
...,...,...
3260,ribed,ribet
3261,ntapz,mantap
3262,ntaps,mantap
3263,mmbri,memberi


In [13]:
# preprocessing tahap 2
normalize_word_dict = {}
for index, row in unstandard_word.iterrows():
    if row.iloc[0] not in normalize_word_dict:
        normalize_word_dict[row.iloc[0]] = row.iloc[1]

def normalize_review(text):
  # tokenize
  list_text = word_tokenize(text)
  # ubah bahasa tidak baku
  list_text = [normalize_word_dict[term] if term in normalize_word_dict else term for term in list_text]
  # gabung kembali kalimat
  text = " ".join(list_text)
  return text

def preprocess_v2(df):
  df_pp = df.copy()
  df_pp['text'] = df_pp['text'].map(normalize_review)  # Menggunakan kolom 'text' untuk normalisasi

  # Menghapus baris kosong
  df_pp['text'].replace('', np.nan)
  df_pp['text'].replace(' ', np.nan)
  df_pp.dropna(subset=['text'], inplace=True)
  return df_pp

In [14]:
train_df_processed = preprocess_v2(train_df)
test_df_processed = preprocess_v2(test_df)
valid_df_processed = preprocess_v2(valid_df)

In [76]:
preprocess_v2(test_df)

Unnamed: 0,text,category,label
0,kemarin gue datang ke tempat makan baru yang a...,negative,1
1,kayak nya sih gue tidak akan mau balik lagi ke...,negative,1
2,"kalau dipikir-pikir , sebenarnya tidak ada yan...",negative,1
3,ini pertama kalinya gua ke bank buat mengurusi...,negative,1
4,waktu sampai dengan gue pernah disuruh ibu lat...,negative,1
...,...,...,...
495,kata nya tidur yang baik itu minimal enam jam ...,neutral,0
496,indonesia itu ada di benua asia .,neutral,0
497,salah satu kegemaran anak remaja indonesia sek...,neutral,0
498,melihat warna hijau bisa bikin mata jadi lebih...,positive,2


# Loading Tokenizer and Encoding Dataset

In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [17]:
encoded_data_train = tokenizer.batch_encode_plus(
    train_df_processed['text'].values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_test = tokenizer.batch_encode_plus(
    test_df_processed['text'].values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    valid_df_processed['text'].values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

In [18]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_df_processed['label'].values)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(test_df_processed['label'].values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(valid_df_processed['label'].values)

In [19]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [20]:
len(dataset_train), len(dataset_test), len(dataset_val)

(11000, 500, 1260)

# Setting up BERT Model

In [21]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Test Model on Sample

In [23]:
batch_size = 8

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_test = DataLoader(dataset_test,
                             sampler=SequentialSampler(dataset_test),
                             batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=batch_size)

In [24]:
text = 'terima kasih , pelayanan mantap nih . pengembalian dana cair kurang dari 12 jam ! rekomendasi banget traveloka !'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text}')
print(f'Label: {label}')
print(f'Score: {torch.softmax(logits, dim=-1).tolist()[0][label]}')

Text: terima kasih , pelayanan mantap nih . pengembalian dana cair kurang dari 12 jam ! rekomendasi banget traveloka !
Label: 0
Score: 0.3630008101463318


In [25]:
text = 'anak sekarang sulit untuk dinasehati'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text}')
print(f'Label: {label}')
print(f'Score: {torch.softmax(logits, dim=-1).tolist()[0][label]}')

Text: anak sekarang sulit untuk dinasehati
Label: 0
Score: 0.40268945693969727


# Fine-Tune & Evaluation

In [26]:
optimizer = optimazer.AdamW(model.parameters(),
                            lr=1e-5,
                            eps=1e-8)

In [27]:
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [28]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def precision_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return precision_score(labels_flat, preds_flat, average='macro')

def recall_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return recall_score(labels_flat, preds_flat, average='macro')

def accuracy_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, preds_flat)

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [30]:
def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [31]:
for epoch_i in tqdm(range(0, epochs)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch_i+1, epochs),
                        leave=False,
                        disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.2f}'.format(loss.item()/len(batch))})

    torch.save(model.state_dict(), f'finetuned_BERT{epoch_i}.model')

    tqdm.write(f'\nEpoch {epoch_i}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    val_precision = precision_score_func(predictions, true_vals)
    val_recall = recall_score_func(predictions, true_vals)
    val_accuracy = accuracy_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Precision Score (Macro): {val_precision}')
    tqdm.write(f'Recall Score (Macro): {val_recall}')
    tqdm.write(f'Accuracy Score: {val_accuracy}')

    training_stats = []
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': loss_train_avg,
            'Valid. Loss': val_loss,
            'F1': val_f1,
            'Precision': val_precision,
            'Recall': val_recall,
            'Accuracy': val_accuracy
        }
    )

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/1375 [00:00<?, ?it/s]


Epoch 0
Training loss: 0.5533035690025849
Validation loss: 0.4857281344129315
F1 Score (Weighted): 0.8195713274173326
Precision Score (Macro): 0.7922148632543573
Recall Score (Macro): 0.7462605770534406
Accuracy Score: 0.8206349206349206


Epoch 2:   0%|          | 0/1375 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.3859018749256026
Validation loss: 0.5214602022514313
F1 Score (Weighted): 0.8409328592105996
Precision Score (Macro): 0.8231177333938158
Recall Score (Macro): 0.7670661082919108
Accuracy Score: 0.8428571428571429


Epoch 3:   0%|          | 0/1375 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.31243388101119884
Validation loss: 0.48995345038703725
F1 Score (Weighted): 0.8710867340749383
Precision Score (Macro): 0.8277687503173716
Recall Score (Macro): 0.8414336246375173
Accuracy Score: 0.8698412698412699


Epoch 4:   0%|          | 0/1375 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.2485271608622914
Validation loss: 0.5693688083858571
F1 Score (Weighted): 0.8672215203810547
Precision Score (Macro): 0.8365515822728318
Recall Score (Macro): 0.8127407644061487
Accuracy Score: 0.8690476190476191


Epoch 5:   0%|          | 0/1375 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.20280874462679707
Validation loss: 0.604876760837675
F1 Score (Weighted): 0.8692073089407055
Precision Score (Macro): 0.8378817614551309
Recall Score (Macro): 0.8129983453838001
Accuracy Score: 0.8706349206349207


In [35]:
torch.save(model.state_dict(), 'finetuned_BERT.pth')

In [36]:
# from google.colab import files
# files.download('finetuned_BERT.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Test finetuned model on sample sentences (using test dataset)

In [72]:
text = 'top deh hari ini lion air cengkareng balikpapan on time sekali . mantap harus nya sepeti ini terus . sangat puas'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text}')
print(f'Label: {label}')
print(f'Score: {torch.softmax(logits, dim=-1).tolist()[0][label]}')

Text: top deh hari ini lion air cengkareng balikpapan on time sekali . mantap harus nya sepeti ini terus . sangat puas
Label: 2
Score: 0.9980873465538025


In [74]:
text = 'kecewa banget sama id pusat servis tapi . kan kita spent waktu untuk ke sini , tidak dikasih solusi . masa harus bolak balik .'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text}')
print(f'Label: {label}')
print(f'Score: {torch.softmax(logits, dim=-1).tolist()[0][label]}')

Text: kecewa banget sama id pusat servis tapi . kan kita spent waktu untuk ke sini , tidak dikasih solusi . masa harus bolak balik .
Label: 1
Score: 0.997861921787262


In [75]:
text = 'hai mas cepy . saya sudah ke bank bca terdekat untuk langsung diproses pergantian kartu , ya'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text}')
print(f'Label: {label}')
print(f'Score: {torch.softmax(logits, dim=-1).tolist()[0][label]}')

Text: hai mas cepy . saya sudah ke bank bca terdekat untuk langsung diproses pergantian kartu , ya
Label: 0
Score: 0.9969627261161804


Ket:
*   0 = netral
*   1 = negatif
*   2 = positif