In [None]:
import collections
import sys
sys.path.insert(0, '../')

import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from utils import load_dataset, count_model_parameters, train_model, test_model

In [None]:
# Choose the compute device once for the whole notebook.
# Priority: CUDA GPU, then Apple-silicon MPS, then plain CPU.
has_mps = torch.backends.mps.is_available()
backend = 'cuda' if torch.cuda.is_available() else ('mps' if has_mps else 'cpu')
device = torch.device(backend)

In [None]:
# Load every CSV split through the project's `load_dataset` helper.
# Elsewhere in this notebook each split is iterated as (query, label) pairs
# and is passed straight to a DataLoader.
split_paths = (
    ('all', '../dataset/dataset.csv'),
    ('train', '../dataset/train.csv'),
    ('val', '../dataset/validation.csv'),
    ('test', '../dataset/test.csv'),
)
dataset = {split: load_dataset(path) for split, path in split_paths}

# Number of training examples; forwarded to train_model further down.
train_size = len(dataset['train'])

In [None]:
# Shared tokenizer (torchtext's 'basic_english' preset). The same callable is
# used to build the vocabulary and to tokenize queries when batches are collated,
# so both stages split text identically.
tokenizer = torchtext.data.get_tokenizer('basic_english')

def create_vocab(dataset):
    """Build a torchtext vocabulary from every query in ``dataset``.

    Args:
        dataset: Iterable of ``(query, label)`` pairs; only the query text is
            used, the label is ignored.

    Returns:
        The object produced by ``torchtext.vocab.vocab`` for the token
        frequencies, keeping every token that occurs at least once
        (``min_freq=1``). No special tokens are registered.
    """
    # Flatten all queries into a single token stream; the Counter keeps tokens
    # in order of first appearance, exactly as repeated update() calls would.
    token_stream = (token for query, _ in dataset for token in tokenizer(query))
    frequencies = collections.Counter(token_stream)
    return torchtext.vocab.vocab(frequencies, min_freq=1)

# The vocabulary is built from the 'all' split rather than from 'train' only.
# NOTE(review): presumably 'all' is a superset of train/val/test so that every
# token looked up during collation is known — confirm against the dataset files.
vocab = create_vocab(dataset['all'])
# Number of distinct tokens; sizes the embedding table in BiLSTMClassifier.
vocab_size = len(vocab)

In [None]:
# Mini-batch size used by the training and validation DataLoaders below.
batch_size = 16

def pad_tensor(tensor, length):
    """Right-pad ``tensor`` with zeros along its last dimension.

    Args:
        tensor: Tensor to pad; ``len(tensor)`` is taken as its current length.
        length: Target length after padding.

    Returns:
        A tensor with ``length - len(tensor)`` zeros appended on the right.
    """
    missing = length - len(tensor)
    # (left, right) padding amounts for the last dimension.
    pad_widths = (0, missing)
    return torch.nn.functional.pad(tensor, pad_widths, mode='constant', value=0)

def process_batch(batch):
    """Collate ``(query, label)`` pairs into padded tensors on ``device``.

    Each query is tokenized, mapped to vocabulary indices, and right-padded
    with 0 to the length of the longest query in the batch.

    NOTE(review): the vocabulary is created without a dedicated padding token,
    so id 0 likely also denotes a real token and padding cannot be told apart
    from it — confirm whether that is intended.

    Args:
        batch: Sequence of ``(query, label)`` pairs, as handed over by a
            DataLoader.

    Returns:
        Tuple ``(inputs, targets)``: a LongTensor of shape
        ``(len(batch), longest_query)`` and a LongTensor of integer labels,
        both moved to ``device``.
    """
    token_lists = list(map(tokenizer, (query for query, _ in batch)))
    id_lists = list(map(vocab.lookup_indices, token_lists))
    target_len = max(len(toks) for toks in token_lists)

    targets = [int(label) for _, label in batch]

    # Pad every sequence to the batch maximum so they can be stacked row-wise.
    rows = [pad_tensor(torch.LongTensor(ids), target_len) for ids in id_lists]
    inputs = torch.stack(rows)

    return (inputs.to(device), torch.LongTensor(targets).to(device))

# Training and validation loaders share the same batching configuration:
# mini-batches of `batch_size`, collated by `process_batch`, reshuffled each pass.
loader_options = {
    'batch_size': batch_size,
    'collate_fn': process_batch,
    'shuffle': True,
}
train_loader = DataLoader(dataset['train'], **loader_options)
val_loader = DataLoader(dataset['val'], **loader_options)

In [None]:
class BiLSTMClassifier(nn.Module):
    """Two-layer bidirectional LSTM classifier over token-id sequences.

    Pipeline: 128-dim embedding -> 2-layer BiLSTM (64 hidden units per
    direction) -> average of the last layer's two directional final hidden
    states -> linear layer producing 2 logits.
    """

    def __init__(self):
        super().__init__()
        # The embedding table is sized by the notebook-level `vocab_size`.
        self.embedding = nn.Embedding(vocab_size, 128)
        self.bilstm = nn.LSTM(
            input_size=128,
            hidden_size=64,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )
        self.fc = nn.Linear(in_features=64, out_features=2)

    def forward(self, x):
        """Return one pair of class logits per input sequence.

        Args:
            x: Integer tensor of token ids, batch dimension first
               (the LSTM is configured with ``batch_first=True``).

        Returns:
            Tensor of logits with 2 columns, one row per sequence.
        """
        embedded = self.embedding(x)
        _, (hidden, _) = self.bilstm(embedded)
        # `hidden` stacks final states as (layers * directions, batch, 64);
        # with 2 layers, rows 2 and 3 are the top layer's forward and backward
        # states. Averaging them keeps the feature size at 64 for `fc`.
        pooled = hidden[2:4].mean(dim=0)
        return self.fc(pooled)

# Instantiate the classifier and move its parameters to the selected device.
model = BiLSTMClassifier().to(device)

In [None]:
# Training configuration consumed by the project's `train_model` helper.
sgd = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

hyperparameters = {
    'epoch': 5,
    'optimizer': sgd,
    'lr_scheduler': None,  # no learning-rate schedule
    'loss_fn': criterion,
}

# NOTE(review): the meaning of the literal 180 is defined by train_model in
# utils and is not visible here — confirm and consider naming it.
train_model(model, train_loader, val_loader, train_size, 180, hyperparameters)

In [None]:
# Load the saved weights from 'model.pth' into a fresh model instance.
#
# map_location='cpu' deserializes every tensor onto the CPU regardless of the
# device it was saved from, so a checkpoint written on a CUDA/MPS machine also
# loads on a host without that backend. The freshly built model lives on the
# CPU at this point anyway; it is moved to the selected device afterwards.
#
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
model = BiLSTMClassifier()
model.load_state_dict(torch.load('model.pth', map_location='cpu'))
model = model.to(device)

In [None]:
# Project helper from utils; presumably reports the number of parameters in
# the model (its implementation and output format are not visible here).
count_model_parameters(model)

In [None]:
def process_test_batch(batch):
    """Collate a test batch, additionally returning the raw query strings.

    Tokenization, index lookup, padding and device placement are delegated to
    ``process_batch`` so that test batches are encoded exactly like training
    and validation batches.

    Args:
        batch: Sequence of ``(query, label)`` pairs, as handed over by a
            DataLoader.

    Returns:
        Tuple ``(inputs, labels, raw_queries)``: the two tensors produced by
        ``process_batch`` (already on ``device``) and the list of original
        query strings in batch order.
    """
    inputs, labels = process_batch(batch)
    # The raw queries are plain Python strings and stay on the host: a list
    # has no `.to()` method, so trying to move it to the device raises
    # AttributeError.
    raw_queries = [query for query, _ in batch]
    return (inputs, labels, raw_queries)

# Evaluate on the 'test' split. Batches of 128 are collated by
# `process_test_batch`, which also carries the raw query text alongside the
# tensors.
test_loader = DataLoader(
    dataset['test'],
    collate_fn=process_test_batch,
    batch_size=128,
    shuffle=True,
)
test_model(model, test_loader)