In [1]:
import time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from torch import tensor
from sklearn.model_selection import train_test_split
from bpemb import BPEmb
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence

## Prepare dataset and model

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
def load_data(path):
    data = pd.read_csv(path, keep_default_na=False)
    return data

def prepare_dataset(X_train_pth, y_train_pth, X_test_pth, y_test_pth):
    X_test = load_data(X_test_pth)
    y_test = load_data(y_test_pth)
    X_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)
    test = (X_test, y_test)

    X_tmp = load_data(X_train_pth)
    y_tmp = load_data(y_train_pth)
    X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=42)
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    X_val.reset_index(drop=True, inplace=True)
    y_val.reset_index(drop=True, inplace=True)

    train = (X_train, y_train)
    val = (X_val, y_val)
    return train, val, test

In [4]:
class DatasetBuzzer(Dataset):
    def __init__(self, data):
        self.data = data
        self.bpe = BPEmb(lang="id", vs=100000, dim=300)
        
    def __getitem__(self, idx):
        x_raw = self.data[0].loc[idx, 'text_used']
        y  = self.data[1].loc[idx, 'buzzer']
        
        x = self.bpe.encode_ids(x_raw)
        x = tensor(x)
        y = torch.LongTensor([y])
        
        return (x, y)
    
    def __len__(self):
        return len(self.data[0])

In [5]:
def padding(data):
    x_list = []
    y_list = []
    for x, y in data:
        x_list.append(x)
        y_list.append(y)
        
    x_pad = pad_sequence(x_list, batch_first=True)
    y_pad = pad_sequence(y_list, batch_first=True)
    
    return x_pad, y_pad

In [6]:
class LSTM(nn.Module):
    def __init__(self, hidden_size, num_class, weights):
        super(LSTM, self).__init__()
        self.word_embeddings = nn.Embedding.from_pretrained(weights)
        self.lstm = nn.LSTM(weights.shape[1], hidden_size, num_layers=2, dropout=0.5, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_class)
        
    def forward(self, input_):
        input_ = self.word_embeddings(input_)
        out, (hidden_state, cell_state) = self.lstm(input_)
        out = self.fc(hidden_state[-1])
        
        return out

## Setup DataLoader

In [7]:
X_train_pth = '../data/dataset/RAW_X_train.csv'
y_train_pth = '../data/dataset/RAW_y_train.csv'
X_test_pth = '../data/dataset/RAW_X_test.csv'
y_test_pth = '../data/dataset/RAW_y_test.csv'

In [8]:
raw_train, raw_val, raw_test = prepare_dataset(X_train_pth, y_train_pth, X_test_pth, y_test_pth)

In [9]:
train_dataset = DatasetBuzzer(raw_train)
val_dataset = DatasetBuzzer(raw_val)
test_dataset = DatasetBuzzer(raw_test)

In [10]:
b_size = 64

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size = b_size, collate_fn=padding)
val_dataloader = DataLoader(val_dataset, shuffle=True, batch_size = b_size, collate_fn=padding)
test_dataloader = DataLoader(test_dataset, batch_size = b_size, collate_fn=padding)

In [13]:
print(f'total train batch: {len(train_dataloader)}')
print(f'total val batch: {len(val_dataloader)}')
print(f'total test batch: {len(test_dataloader)}')

total train batch: 30
total val batch: 8
total test batch: 10


## Hyperparameter and BPE Embeddings

In [15]:
weights = tensor(train_dataset.bpe.vectors)
hidden_size = 128
num_class = 2
epochs = 3

## Setup Model

In [16]:
model = LSTM(hidden_size, num_class, weights)
model.to(device)

LSTM(
  (word_embeddings): Embedding(100000, 300)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

In [17]:
parameters = sum(p.numel() for p in model.parameters())
print(f'model has {parameters:,} trainable parameters')

model has 30,352,514 trainable parameters


## Setup Optimizer and Loss function

In [18]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [None]:
total_time = time.time()

for epoch_i in range(0, epochs):
    total_train_loss = 0
    total_train_acc = 0
    total_val_loss = 0
    total_val_acc = 0
    
    print(f'==== Epoch {epoch_i+1} / {epochs} ====')
    ## Training mode
    t0 = time.time()
    model.train()
    for idx, (x, y) in tqdm(enumerate(train_dataloader, 1)):
        optimizer.zero_grad()
        x = x.to(device)
        y = y.squeeze().to(device)

        model.zero_grad()
        y_pred = model(x)

        loss = criterion(y_pred, y)
        num_corrects = (torch.max(y_pred, 1)[1] == y).float().sum()
        acc = 100.0 * num_corrects/len(x)


        total_train_loss += loss.item()
        total_train_acc += acc

        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_train_acc = total_train_acc / len(train_dataloader)
    train_time = time.time() - t0

    ## Validation mode
    t0 = time.time()
    model.eval()
    for idx, (x, y) in tqdm(enumerate(val_dataloader, 1)):
        x = x.to(device)
        y = y.squeeze().to(device)

        y_pred = model(x)

        loss = criterion(y_pred, y)
        num_corrects = (torch.max(y_pred, 1)[1] == y).float().sum()
        acc = 100.0 * num_corrects/len(x)

        total_train_loss += loss.item()
        total_train_acc += acc

    avg_val_loss = total_val_loss / len(val_dataloader)
    avg_val_acc = total_val_acc / len(val_dataloader)
    val_time = time.time() - t0

    print(f"\tTrain loss: {avg_train_loss:.2f}  |  Train accuracy: {avg_train_acc:.2f}")
    print(f"\tTrain time: {train_time}")
    print(f"\tVal loss: {avg_val_loss:.2f}  |  Val accuracy: {avg_val_acc:.2f}")
    print(f"\tVal time: {val_time}")