In [1]:
import time
import numpy as np
import pandas as pd
import warnings
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings('ignore', category=UndefinedMetricWarning) 

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from bpemb import BPEmb
from torch import tensor
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence

## Prepare dataset and model

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cpu


In [3]:
def load_data(path):
    data = pd.read_csv(path, keep_default_na=False)
    return data

def prepare_dataset(X_train_pth, y_train_pth, X_test_pth, y_test_pth):
    X_test = load_data(X_test_pth)
    y_test = load_data(y_test_pth)
    X_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)
    test = (X_test, y_test)

    X_tmp = load_data(X_train_pth)
    y_tmp = load_data(y_train_pth)
    X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=42)
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    X_val.reset_index(drop=True, inplace=True)
    y_val.reset_index(drop=True, inplace=True)

    train = (X_train, y_train)
    val = (X_val, y_val)
    return train, val, test

In [4]:
class DatasetBuzzer(Dataset):
    def __init__(self, data):
        self.data = data
        self.bpe = BPEmb(lang="id", vs=100000, dim=300)
        
    def __getitem__(self, idx):
        x_raw = self.data[0].loc[idx, 'text_used']
        y  = self.data[1].loc[idx, 'buzzer']
        
        x = self.bpe.encode_ids(x_raw)
        x = tensor(x)
        y = torch.LongTensor([y])
        
        return (x, y)
    
    def __len__(self):
        return len(self.data[0])

In [5]:
def padding(data):
    x_list = []
    y_list = []
    for x, y in data:
        x_list.append(x)
        y_list.append(y)
        
    x_pad = pad_sequence(x_list, batch_first=True)
    y_pad = pad_sequence(y_list, batch_first=True)
    
    return x_pad, y_pad

In [6]:
class LSTM(nn.Module):
    def __init__(self, hidden_size, num_class, weights):
        super(LSTM, self).__init__()
        self.word_embeddings = nn.Embedding.from_pretrained(weights)
        self.lstm = nn.LSTM(weights.shape[1], hidden_size, num_layers=2, dropout=0.5, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size, num_class)
        
    def forward(self, input_):
        input_ = self.word_embeddings(input_)
        out, (hidden_state, cell_state) = self.lstm(input_)
        out = self.fc(hidden_state[-1])
        
        return out

## Setup DataLoader

In [7]:
X_train_pth = '../data/dataset/RAW_X_train.csv'
y_train_pth = '../data/dataset/RAW_y_train.csv'
X_test_pth = '../data/dataset/RAW_X_test.csv'
y_test_pth = '../data/dataset/RAW_y_test.csv'

In [8]:
raw_train, raw_val, raw_test = prepare_dataset(X_train_pth, y_train_pth, X_test_pth, y_test_pth)

In [9]:
train_dataset = DatasetBuzzer(raw_train)
val_dataset = DatasetBuzzer(raw_val)
test_dataset = DatasetBuzzer(raw_test)

In [10]:
b_size = 32

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size = b_size, collate_fn=padding)
val_dataloader = DataLoader(val_dataset, shuffle=True, batch_size = b_size, collate_fn=padding)
test_dataloader = DataLoader(test_dataset, batch_size = b_size, collate_fn=padding)

In [11]:
print(f'total train batch: {len(train_dataloader)}')
print(f'total val batch: {len(val_dataloader)}')
print(f'total test batch: {len(test_dataloader)}')

total train batch: 60
total val batch: 15
total test batch: 19


## Hyperparameter and BPE Embeddings

In [12]:
weights = tensor(train_dataset.bpe.vectors)
hidden_size = 128
num_class = 2
epochs = 5

## Setup Model

In [13]:
model = LSTM(hidden_size, num_class, weights)
model.to(device)

LSTM(
  (word_embeddings): Embedding(100000, 300)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

In [14]:
parameters = sum(p.numel() for p in model.parameters())
print(f'model has {parameters:,} trainable parameters')

model has 30,835,842 trainable parameters


## Setup Optimizer and Loss function

In [15]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

## Training and Validation model

In [16]:
def scoring(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return acc, pre, rec, f1

In [17]:
def train_model(model, dataloader):
    running_train_loss = 0
    running_train_acc = 0
    running_train_pre = 0
    running_train_rec = 0
    running_train_f1 = 0
    
    ## Training mode
    n_batch = len(dataloader)
    t0 = time.time()
    model.train()
    for idx, (x, y) in tqdm(enumerate(dataloader, 1)):
        optimizer.zero_grad()
        x = x.to(device)
        y = y.squeeze().to(device)
        
        y_pred = model(x)
        loss = criterion(y_pred, y)
        
        y = y.numpy()
        y_pred = torch.max(y_pred, 1)[1].detach().numpy()
        acc, pre, rec, f1 = scoring(y, y_pred)

        running_train_loss += loss.item() * x.size(0)
        running_train_acc += acc
        running_train_pre += pre
        running_train_rec += rec
        running_train_f1 += f1

        loss.backward()
        optimizer.step()

    epoch_train_loss = running_train_loss / len(dataloader.dataset)
    epoch_train_acc = running_train_acc / n_batch
    epoch_train_pre = running_train_pre / n_batch
    epoch_train_rec = running_train_rec / n_batch
    epoch_train_f1 = running_train_f1 / n_batch
    
    train_time = time.time() - t0
    return epoch_train_loss, epoch_train_acc, epoch_train_pre, epoch_train_rec, epoch_train_f1, train_time

In [18]:
def eval_model(model, dataloader):
    running_val_loss = 0
    running_val_acc = 0
    running_val_pre = 0
    running_val_rec = 0
    running_val_f1 = 0
    
    ## Validation mode
    n_batch = len(dataloader)
    t0 = time.time()
    model.eval()
    with torch.no_grad():
        for idx, (x, y) in tqdm(enumerate(dataloader, 1)):
            x = x.to(device)
            y = y.squeeze().to(device)

            y_pred = model(x)
            loss = criterion(y_pred, y)
            
            y = y.numpy()
            y_pred = torch.max(y_pred, 1)[1].detach().numpy()
            acc, pre, rec, f1 = scoring(y, y_pred)

            running_val_loss += loss.item() * x.size(0)
            running_val_acc += acc
            running_val_pre += pre
            running_val_rec += rec
            running_val_f1 += f1
            
    epoch_val_loss = running_val_loss / len(dataloader.dataset)
    epoch_val_acc = running_val_acc / n_batch
    epoch_val_pre = running_val_pre / n_batch
    epoch_val_rec = running_val_rec / n_batch
    epoch_val_f1 = running_val_f1 / n_batch
    
    val_time = time.time() - t0
    return epoch_val_loss, epoch_val_acc, epoch_val_pre, epoch_val_rec, epoch_val_f1, val_time

In [19]:
for epoch_i in range(0, epochs):
    print(f'==== Epoch {epoch_i+1} / {epochs} ====')
    epoch_train_loss, epoch_train_acc, epoch_train_pre, epoch_train_rec, epoch_train_f1, train_time = train_model(model, train_dataloader)
    epoch_val_loss, epoch_val_acc, epoch_val_pre, epoch_val_rec, epoch_val_f1, val_time = eval_model(model, val_dataloader)
    
    print("\t TRAINING")
    print(f"\tLoss: {epoch_train_loss:.2f}  |  Accuracy: {epoch_train_acc:.2f}%")
    print(f"\tPrecision: {epoch_train_pre:.2f}  |  Recall: {epoch_train_rec:.2f}  |  F1: {epoch_train_f1:.2f}")
    print(f"\tExec Time: {round(train_time)}s")
    print("\n")
    print("\t EVALUATING")
    print(f"\tLoss: {epoch_val_loss:.2f}  |  Accuracy: {epoch_val_acc:.2f}%")
    print(f"\tPrecision: {epoch_val_pre:.2f}  |  Recall: {epoch_val_rec:.2f}  |  F1: {epoch_val_f1:.2f}")
    print(f"\tExec Time: {round(val_time)}s")
    print("\n")

0it [00:00, ?it/s]

==== Epoch 1 / 5 ====


60it [15:02, 15.04s/it]
15it [01:18,  5.20s/it]
0it [00:00, ?it/s]

	 TRAINING
	Loss: 0.51  |  Accuracy: 0.78%
	Precision: 0.30  |  Recall: 0.09  |  F1: 0.12
	Exec Time: 902s


	 EVALUATING
	Loss: 0.48  |  Accuracy: 0.81%
	Precision: 0.70  |  Recall: 0.27  |  F1: 0.37
	Exec Time: 78s


==== Epoch 2 / 5 ====


60it [14:46, 14.78s/it]
15it [01:22,  5.48s/it]
0it [00:00, ?it/s]

	 TRAINING
	Loss: 0.43  |  Accuracy: 0.82%
	Precision: 0.58  |  Recall: 0.34  |  F1: 0.41
	Exec Time: 887s


	 EVALUATING
	Loss: 0.44  |  Accuracy: 0.81%
	Precision: 0.75  |  Recall: 0.32  |  F1: 0.44
	Exec Time: 82s


==== Epoch 3 / 5 ====


60it [15:04, 15.07s/it]
15it [01:14,  5.00s/it]
0it [00:00, ?it/s]

	 TRAINING
	Loss: 0.38  |  Accuracy: 0.85%
	Precision: 0.69  |  Recall: 0.52  |  F1: 0.57
	Exec Time: 904s


	 EVALUATING
	Loss: 0.42  |  Accuracy: 0.81%
	Precision: 0.64  |  Recall: 0.44  |  F1: 0.51
	Exec Time: 75s


==== Epoch 4 / 5 ====


60it [13:49, 13.83s/it]
15it [01:05,  4.39s/it]
0it [00:00, ?it/s]

	 TRAINING
	Loss: 0.32  |  Accuracy: 0.87%
	Precision: 0.75  |  Recall: 0.61  |  F1: 0.65
	Exec Time: 830s


	 EVALUATING
	Loss: 0.45  |  Accuracy: 0.82%
	Precision: 0.65  |  Recall: 0.39  |  F1: 0.47
	Exec Time: 66s


==== Epoch 5 / 5 ====


60it [15:31, 15.52s/it]
15it [01:18,  5.21s/it]

	 TRAINING
	Loss: 0.28  |  Accuracy: 0.89%
	Precision: 0.82  |  Recall: 0.68  |  F1: 0.72
	Exec Time: 931s


	 EVALUATING
	Loss: 0.48  |  Accuracy: 0.80%
	Precision: 0.56  |  Recall: 0.49  |  F1: 0.52
	Exec Time: 78s





