In [None]:
import sys
sys.path.insert(0, './..')

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import gensim.models

from utils import load_dataset, count_model_parameters, train_model, test_model

In [None]:
# Accelerator preference: CUDA first, then Apple-silicon MPS, then plain CPU.
# The MPS probe runs first and the CUDA probe second; CUDA wins when both exist.
device_name = "mps" if torch.backends.mps.is_available() else "cpu"
if torch.cuda.is_available():
    device_name = "cuda"
device = torch.device(device_name)

In [None]:
# Load the full corpus and the two pre-made splits, in that order.
dataset, train_dataset, test_dataset = (
    load_dataset(csv_path)
    for csv_path in (
        "../dataset/dataset.csv",
        "../dataset/train.csv",
        "../dataset/test.csv",
    )
)
# Number of training samples; passed to train_model further down.
train_size = len(train_dataset)

In [None]:
class TrainDataset:
    """Re-iterable view over the first field of every training sample.

    Each call to ``iter()`` starts a fresh pass over the notebook-level
    ``train_dataset``, so the object can be consumed more than once.
    """

    def __iter__(self):
        # Only element 0 of each sample is exposed; the rest is ignored here.
        yield from (sample[0] for sample in train_dataset)

# Train 32-dimensional Word2Vec embeddings on the training queries and keep
# only the `.wv` lookup table, which is all the rest of the notebook uses.
word_model = gensim.models.Word2Vec(
    vector_size=32,
    sentences=TrainDataset(),
).wv

In [None]:
batch_size = 16   # samples per training mini-batch
max_length = 256  # number of token vectors each query is cut or padded to
p = 128           # divisor used by EPCNNClassifier._elastic_pool
# Constant 32-element rows (same width as the Word2Vec vectors).
fill_zero = [0] * 32
fill_one = [1] * 32

def get_char_vector(query):
    """Turn a query into exactly ``max_length`` embedding rows.

    Every element of ``query`` (a character when ``query`` is a string, a
    token when it is a list) is looked up in ``word_model``. Elements the
    model does not know are replaced by ``fill_one``. Queries longer than
    ``max_length`` are truncated; shorter ones, including empty ones, are
    padded at the end with ``fill_one``.

    The former special case for one-element queries indexed ``word_model``
    with the whole query instead of its single element. That is equivalent
    for a one-character string but yields a 2-D result (or a non-``KeyError``
    failure) for a one-element list, so all lengths now share one code path.

    Args:
        query: Sized iterable of tokens, typically a string.

    Returns:
        list: ``max_length`` rows, each a ``word_model`` vector or ``fill_one``.
    """
    char_vec = []

    for char in query:
        # Stop before appending so the result never exceeds max_length.
        if len(char_vec) == max_length:
            break
        try:
            char_vec.append(word_model[char])
        except KeyError:
            # Out-of-vocabulary token.
            char_vec.append(fill_one)

    # NOTE(review): padding uses fill_one, the same row as unknown tokens;
    # confirm this is intended rather than the otherwise unused fill_zero.
    char_vec.extend(fill_one for _ in range(max_length - len(char_vec)))

    return char_vec


def process_batch(batch):
    """Collate ``(query, label)`` pairs into training tensors on ``device``.

    Args:
        batch: Sequence of ``(query, label)`` pairs as delivered by the
            ``DataLoader``; ``label`` must be convertible with ``int()``.

    Returns:
        tuple: ``(features, labels)`` where ``features`` is a float32 tensor
        of shape ``(len(batch), 1, max_length, 32)`` and ``labels`` is an
        int64 tensor of shape ``(len(batch),)``.
    """
    query_vec = []
    labels = []

    for query, label in batch:
        query_vec.append(get_char_vector(query))
        labels.append(int(label))

    # Force float32: the rows mix embedding vectors with Python-int padding,
    # so NumPy's inferred dtype can be integer (all-padding batch) or wider
    # than float32, and torch.from_numpy keeps whatever dtype it is given.
    query_vec = np.array(query_vec, dtype=np.float32)
    labels = np.array(labels, dtype=np.int64)

    features = torch.from_numpy(query_vec).view(len(batch), 1, max_length, 32)
    return (features.to(device), torch.from_numpy(labels).long().to(device))

# Shuffled training mini-batches; process_batch converts the raw
# (query, label) pairs of each batch into tensors.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=process_batch,
)

In [None]:
class EPCNNClassifier(nn.Module):
    """Three-branch CNN that classifies an embedded query into two classes.

    The input is a ``(batch, 1, max_length, 32)`` tensor. It is sent through
    three parallel branches whose first two convolutions use 1x1, 3x3 and 5x5
    kernels respectively. All branches then share ``conv3``-``conv5``, and
    their flattened outputs are concatenated and fed to two linear layers.
    ``forward`` returns log-probabilities.
    """

    def __init__(self):
        super().__init__()
        # Branch-specific stems. Layers are created in this exact order so
        # that weight initialisation and state_dict keys stay unchanged.
        self.conv1_1 = nn.Conv2d(1, 32, kernel_size=1, padding=1)
        self.conv1_3 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv1_5 = nn.Conv2d(1, 32, kernel_size=5, padding=1)
        self.conv2_1 = nn.Conv2d(32, 64, kernel_size=1, padding=1)
        self.conv2_3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv2_5 = nn.Conv2d(32, 64, kernel_size=5, padding=1)
        # Trunk shared by all three branches.
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # 40448 is the concatenated, flattened width of the three branches
        # for a (1, 256, 32) input; changing the input size or any pooling
        # step requires updating it.
        self.fc1 = nn.Linear(40448, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        """Return ``(batch, 2)`` log-probabilities for input batch ``x``."""
        stems = (
            (self.conv1_1, self.conv2_1),
            (self.conv1_3, self.conv2_3),
            (self.conv1_5, self.conv2_5),
        )

        flattened = []
        for first_conv, second_conv in stems:
            branch = F.relu(second_conv(F.relu(first_conv(x))))
            branch = F.max_pool2d(branch, 2)
            flattened.append(self._common_forward(branch))

        merged = torch.cat(flattened, dim=1)
        hidden = self.dropout2(F.relu(self.fc1(merged)))
        return F.log_softmax(self.fc2(hidden), dim=1)

    def _common_forward(self, x):
        """Run the shared conv3-conv5 trunk and flatten per sample."""
        x = F.max_pool2d(F.relu(self.conv3(x)), 2)
        x = self._elastic_pool(F.relu(self.conv4(x)))
        x = F.max_pool2d(F.relu(self.conv5(x)), 2)
        return torch.flatten(self.dropout1(x), start_dim=1)

    def _elastic_pool(self, x):
        """Max-pool ``x`` with a window of ``(x.shape[1] // p, 2)``.

        NOTE(review): dimension 1 of the conv4 output is the channel axis
        (256), so with the notebook-level ``p = 128`` the window is 2x2.
        Confirm the channel count, not the row count, is the intended
        numerator; ``fc1``'s input width relies on the current behaviour.
        """
        window_rows = x.size(1) // p
        return F.max_pool2d(x, kernel_size=(window_rows, 2))

# Build the network and move its parameters to the selected device
# (nn.Module.to moves the module in place and returns the same object).
model = EPCNNClassifier()
model.to(device)

In [None]:
# Training configuration consumed by utils.train_model.
hyperparameters = dict(
    epoch=4,
    optimizer=optim.Adadelta(model.parameters(), lr=0.01),
    lr_scheduler=None,
    loss_fn=nn.CrossEntropyLoss(),
)

# NOTE(review): the meaning of the positional 180 is defined by
# utils.train_model and is not visible in this notebook; confirm it there.
train_model(model, train_loader, train_size, 180, hyperparameters)

# Persist only the learned weights (state_dict), not the whole module.
torch.save(model.state_dict(), 'model.pth')

In [None]:
# Rebuild the network and restore the weights saved by the training cell.
model = EPCNNClassifier()
# map_location="cpu": the freshly built model lives on the CPU, and this lets
# a checkpoint written on an accelerator be read on a machine without it.
model.load_state_dict(torch.load('model.pth', map_location="cpu"))
model = model.to(device)

In [None]:
# Report the size of the reloaded model (helper from utils); being the last
# expression of the cell, any value it returns is shown as the cell output.
count_model_parameters(model)

In [None]:
def process_test_batch(batch):
    """Collate ``(query, label)`` pairs for evaluation, keeping the raw text.

    Args:
        batch: Sequence of ``(query, label)`` pairs as delivered by the
            ``DataLoader``; ``label`` must be convertible with ``int()``.

    Returns:
        tuple: ``(features, labels, raw_queries)`` where ``features`` is a
        float32 tensor of shape ``(len(batch), 1, max_length, 32)`` on
        ``device``, ``labels`` is an int64 tensor on ``device`` and
        ``raw_queries`` is the plain Python list of the original queries.
    """
    raw_queries = []
    query_vec = []
    labels = []

    for query, label in batch:
        raw_queries.append(query)
        query_vec.append(get_char_vector(query))
        labels.append(int(label))

    # One contiguous float32 array first: avoids building the tensor from a
    # nested list of NumPy rows and pins the dtype the model expects.
    features = torch.from_numpy(np.array(query_vec, dtype=np.float32))

    # raw_queries is a plain list and has no .to(); it is returned unchanged
    # (the previous raw_queries.to(device) raised AttributeError).
    return (features.view(len(batch), 1, max_length, 32).to(device),
            torch.LongTensor(labels).to(device),
            raw_queries)

# Evaluation batches; process_test_batch also returns the raw query strings.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=128,
    shuffle=True,
    collate_fn=process_test_batch,
)
test_model(model, test_loader)