In [None]:
import sys
import collections
sys.path.insert(0, '../')

import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

from utils import load_dataset, train, test

In [None]:
# Pick the compute device by priority: CUDA GPU first, then Apple's MPS
# backend, and finally the CPU when no accelerator is available.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [None]:
# Load the training split through the project helper `utils.load_dataset`.
# Later cells iterate it as (query, label) pairs and pass its length to `train`.
dataset = load_dataset("../dataset/train.csv")
dataset_size = len(dataset)

In [None]:
# Lower-casing, punctuation-splitting tokenizer shipped with torchtext.
tokenizer = torchtext.data.get_tokenizer("basic_english")

# Reserved vocabulary entries. Index 0 is kept for padding because the batch
# collation pads with the value 0; index 1 is the fallback for unseen tokens.
PAD_TOKEN = "<pad>"
UNK_TOKEN = "<unk>"

def create_vocab(dataset):
    """Build a token -> index vocabulary from every query in ``dataset``.

    Args:
        dataset: iterable of ``(query, label)`` pairs; only the query text is
            used here.

    Returns:
        A ``torchtext.vocab.Vocab`` in which ``<pad>`` has index 0, ``<unk>``
        has index 1, and every other token seen in ``dataset`` follows.
        Looking up a token that was never seen returns the ``<unk>`` index
        instead of raising, so data outside the training split can be encoded.
    """
    counter = collections.Counter()
    for query, _ in dataset:
        counter.update(tokenizer(query))

    # A literal "<pad>"/"<unk>" in the data would collide with the reserved
    # entries inserted below, so drop them from the counts first.
    for special in (PAD_TOKEN, UNK_TOKEN):
        counter.pop(special, None)

    built = torchtext.vocab.vocab(counter, min_freq=1)
    built.insert_token(PAD_TOKEN, 0)
    built.insert_token(UNK_TOKEN, 1)
    # Without a default index, lookup of an out-of-vocabulary token raises.
    built.set_default_index(built[UNK_TOKEN])
    return built

vocab = create_vocab(dataset)
vocab_size = len(vocab)

In [None]:
# Number of (query, label) examples per mini-batch for the DataLoaders.
batch_size = 16
# NOTE(review): `query_len` is not referenced by any code visible in this
# notebook (batches are padded to their own longest query) — confirm whether
# it is still needed.
query_len = 100

def pad_tensor(tensor, length):
    """Right-pad ``tensor`` with zeros up to ``length`` elements.

    Intended for the 1-D index tensors built during batch collation: the
    number of zeros appended is ``length - len(tensor)``.
    """
    missing = length - len(tensor)
    # The pad spec is (left, right) for the last dimension; only the right
    # side is extended.
    return torch.nn.functional.pad(tensor, (0, missing), mode='constant', value=0)

def process_batch(batch):
    """Collate a list of ``(query, label)`` pairs into two tensors.

    Each query is tokenized, mapped to vocabulary indices and right-padded
    with zeros to the length of the longest query in this batch.

    Returns:
        A tuple ``(inputs, labels)`` where ``inputs`` is a LongTensor of shape
        ``(len(batch), longest_query)`` and ``labels`` is a LongTensor of
        shape ``(len(batch),)``.
    """
    token_lists = []
    for query, _ in batch:
        token_lists.append(tokenizer(query))

    index_lists = list(map(vocab.lookup_indices, token_lists))
    labels = [int(label) for _, label in batch]

    # Pad only as far as this batch needs, not to a global maximum.
    longest = max(len(toks) for toks in token_lists)
    rows = [pad_tensor(torch.LongTensor(ids), longest) for ids in index_lists]

    return torch.stack(rows), torch.LongTensor(labels)

# Shuffled mini-batches over the training set; `process_batch` tokenizes,
# encodes and pads each batch as it is drawn.
train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=process_batch, shuffle=True)

In [None]:
class BiLSTMClassifier(nn.Module):
    """Embedding -> single-layer bidirectional LSTM -> linear classifier.

    Args:
        num_embeddings: number of rows in the embedding table. When omitted,
            the notebook-level ``vocab_size`` is used, so the zero-argument
            construction ``BiLSTMClassifier()`` keeps working.
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of the LSTM, per direction.
        num_classes: number of output logits.
    """

    def __init__(self, num_embeddings=None, embedding_dim=128, hidden_dim=64, num_classes=2):
        super().__init__()
        if num_embeddings is None:
            # Fall back to the vocabulary size computed earlier in the notebook.
            num_embeddings = vocab_size
        self.embedding = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
        self.bilstm = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            x: LongTensor of token indices with shape ``(batch, seq_len)``.
        """
        x = self.embedding(x)
        _, (h_n, _) = self.bilstm(x)
        # Use the final hidden state of each direction. Slicing the output
        # sequence at the last timestep would give the backward direction
        # after it has read only one token; its final state (h_n[-1]) has
        # read the whole sequence.
        features = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.fc(features)

# Instantiate the classifier and move its parameters to the selected device.
network = BiLSTMClassifier().to(device)

In [None]:
# Single source of truth for the learning rate, so the value recorded in
# `hyperparameters` cannot drift from the one the optimizer actually uses.
learning_rate = 0.01

# Training configuration handed to `utils.train`.
hyperparameters = {
    "learning_rate": learning_rate,
    "epoch": 30,
    "optimizer": optim.SGD(network.parameters(), lr=learning_rate),
    "lr_scheduler": None,
    "loss_fn": nn.CrossEntropyLoss(),
}

# NOTE(review): the positional argument 20 is forwarded to `utils.train`
# unchanged; its meaning is not visible from this notebook, and it differs
# from "epoch": 30 above — confirm which value controls the run length.
loss, accurancy = train(network, train_loader, device, dataset_size, 20, hyperparameters)
print(f"loss={loss}, accurancy={accurancy}")

# Save the model
torch.save(network.state_dict(), 'model.pth')

In [None]:
# Evaluate the saved weights on the held-out test split.
test_dataset = load_dataset("../dataset/test.csv")
# Size of the split being evaluated. This presumably plays the same role as
# `dataset_size` does for `train`, so the training-set size must not be reused.
test_dataset_size = len(test_dataset)

# Rebuild the architecture on the evaluation device and restore the weights.
# `map_location` lets a checkpoint saved on one device load on another.
model = BiLSTMClassifier().to(device)
model.load_state_dict(torch.load('model.pth', map_location=device))
model.eval()

# Order does not matter for evaluation, so keep it deterministic.
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=process_batch, shuffle=False)
# NOTE(review): the meaning of the positional 130 is defined by `utils.test`
# and is not visible from this notebook — confirm.
test(model, test_loader, device, test_dataset_size, 130)