In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import seaborn as sns
import torch
from torch.utils.data import Dataset
import torch.nn.utils.rnn as rnn_utils

In [35]:
device = torch.device('cuda')

In [16]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-syllable")


In [17]:
class Custom_Text_Dataset(Dataset):
    def __init__(self, df_dir, tokenizer):
        self.df = pd.read_csv(df_dir)
        self.tokenizer = tokenizer
        
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        text = self.df['post_message'][idx]
        
        label = self.df['label'][idx]
        label = torch.tensor(label, dtype=torch.long)
        
        tokens = self.tokenizer.tokenize(text)
        
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        ids = torch.tensor(ids, dtype=torch.long)
        
        length = ids.shape
        length = torch.tensor(length, dtype=torch.long)
        # ids = torch.tensor(ids, dtype=torch.long)
        # Add 0 so that length of all ids is 7180
        # ids = torch.cat((ids, torch.zeros(7180 - len(ids), dtype=torch.long)))    
        
        return ids, length, label

In [18]:
train_ds = Custom_Text_Dataset('train.csv', tokenizer)
test_ds = Custom_Text_Dataset('test.csv', tokenizer)

In [19]:
from torch.utils.data import DataLoader

def custom_collate_fn(batch):
    sorted_batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
    return sorted_batch

train_dl = DataLoader(train_ds, batch_size=64, collate_fn=custom_collate_fn)
test_dl = DataLoader(test_ds, batch_size=64, collate_fn=lambda x: x)


In [20]:
training_dl = [ ]
for batch in train_dl:
    X = [torch.tensor(member[0]) for member in batch]
    padded_X = rnn_utils.pad_sequence(X, batch_first=True) 
    training_dl.append([padded_X, torch.tensor([member[1] for member in batch]), torch.tensor([member[2] for member in batch])])

  X = [torch.tensor(member[0]) for member in batch]


In [21]:
testing_dl = [ ]
for batch in test_dl:
    X = [torch.tensor(member[0]) for member in batch]
    padded_X = rnn_utils.pad_sequence(X, batch_first=True)
    testing_dl.append([padded_X, torch.tensor([member[1] for member in batch]), torch.tensor([member[2] for member in batch])])

  X = [torch.tensor(member[0]) for member in batch]


## Model LSTM

In [30]:
# Pytorch's nn module has lots of useful feature
import torch.nn as nn

class LSTMNet(nn.Module):
    
    def __init__(self,vocab_size,embedding_dim,hidden_dim,output_dim,n_layers,bidirectional,dropout):
        
        super(LSTMNet,self).__init__()
        
        # Embedding layer converts integer sequences to vector sequences
        self.embedding = nn.Embedding(vocab_size,embedding_dim)
        
        # LSTM layer process the vector sequences 
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers = n_layers,
                            bidirectional = bidirectional,
                            dropout = dropout,
                            batch_first = True
                           )
        
        # Dense layer to predict 
        self.fc = nn.Linear(hidden_dim * 2,output_dim)
        # Prediction activation function
        self.sigmoid = nn.Sigmoid()
        
    
    def forward(self,text,text_lengths):
        embedded = self.embedding(text)
        
        # Thanks to packing, LSTM don't see padding tokens 
        # and this makes our model better
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(),batch_first=True)
        
        packed_output,(hidden_state,cell_state) = self.lstm(packed_embedded)
        
        # Concatenating the final forward and backward hidden states
        hidden = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)
        
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.sigmoid(dense_outputs)
        
        return outputs

In [24]:
tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-syllable")
vocab_size = tokenizer.vocab_size - 1
embedding_dim = 100
hidden_dim = 64
output_dim = 1
n_layers = 2
bidirectional = True
dropout = 0.2

model = LSTMNet(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout)


In [34]:
import torch.optim as optim
model = model.to(device)
optimizer = optim.Adam(model.parameters(),lr=1e-4)
criterion = nn.BCELoss()
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc

In [None]:
def train(model,iterator,optimizer,criterion):
    
    epoch_loss = 0.0
    epoch_acc = 0.0
    
    model.train()
    
    for batch in iterator:
        
        # cleaning the cache of optimizer
        optimizer.zero_grad()
        
        text,text_lengths = batch.text
        
        # forward propagation and squeezing
        predictions = model(text,text_lengths).squeeze()
        
        # computing loss / backward propagation
        loss = criterion(predictions,batch.type)
        loss.backward()
        
        # accuracy
        acc = binary_accuracy(predictions,batch.type)
        
        # updating params
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    # It'll return the means of loss and accuracy
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model,iterator,criterion):
    
    epoch_loss = 0.0
    epoch_acc = 0.0
    
    # deactivate the dropouts
    model.eval()
    
    # Sets require_grad flat False
    with torch.no_grad():
        for batch in iterator:
            text,text_lengths = batch.text
            
            predictions = model(text,text_lengths).squeeze()
              
            #compute loss and accuracy
            loss = criterion(predictions, batch.type)
            acc = binary_accuracy(predictions, batch.type)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
EPOCH_NUMBER = 15
for epoch in range(1,EPOCH_NUMBER+1):
    
    train_loss,train_acc = train(model,train_iterator,optimizer,criterion)
    
    valid_loss,valid_acc = evaluate(model,validation_iterator,criterion)
    
    # Showing statistics
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print()