### Modeling Quora using Doc2Vec & LSTM

In [1]:
import os
import glob
import time
import pickle
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [2]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Directory whose contents are listed below as a sanity check.
transformed_path = '../data/transformed'
# Root of the cross-validation files; it is globbed two levels deep below.
cross_val_path = '../data/cross_validation_data'

In [4]:
# Collect every file two levels below the cross-validation root.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them
# to make the list identical from run to run.
file_cross_val_path = sorted(glob.glob(os.path.join(cross_val_path, '*', '*')))

In [5]:
# Show every entry directly under the transformed-data directory.
glob.glob(os.path.join(transformed_path, '*'))

[]

In [None]:
# Load the pickled data set. The context manager closes the file handle even
# if unpickling fails.
# NOTE(security): unpickling can execute arbitrary code; only load files that
# come from a trusted source.
with open('../data/transformed/data.pkl', 'rb') as pkl_file:
    d_data = pickle.load(pkl_file)

In [None]:
# Preview the first rows of the loaded data.
d_data.head()

In [None]:
# Display the cross-validation file paths collected earlier.
file_cross_val_path

In [None]:
class DatasetPairs(Dataset):
    """Torch ``Dataset`` over question pairs for one cross-validation fold.

    Every item is a pair ``(x, y)``: ``x`` is the row's ``q1_vector``
    followed by its ``q2_vector`` and ``y`` is the row's ``is_duplicate``
    label. The active split is switched with :meth:`set_split`.
    """

    def __init__(self, d_data, cross_val_paths):
        """Index the folds, then select fold 1 and its 'train' split.

        Args:
            d_data: DataFrame with the columns ``id``, ``q1_vector``,
                ``q2_vector`` and ``is_duplicate``.
            cross_val_paths: File paths shaped ``<root>/<fold>/<name>``; each
                fold directory must contain ``train.csv`` and ``test.csv``.
        """
        self.dataset = d_data
        self.split_dict = self.get_id_cross_val(cross_val_paths)

        self.splited_data(k=1)
        self.set_split(split='train')

    def read_csv(self, path):
        """Read a tab-separated file into a DataFrame."""
        return pd.read_csv(path, sep='\t')

    def get_id_cross_val(self, paths):
        """Map each fold number to its ``(train_ids, test_ids)`` lists.

        The fold number is the name of the directory holding the file. The
        train and test files are resolved from that directory explicitly, so
        the result does not depend on the order (or the subset) of files in
        ``paths``.

        Args:
            paths: Iterable of file paths located inside fold directories.

        Returns:
            dict mapping ``int`` fold number to ``(id_train, id_test)``.

        Raises:
            ValueError: If a fold directory name is not an integer.
        """
        data_dict = {}
        # De-duplicate: train.csv and test.csv of a fold share one directory.
        fold_dirs = sorted({os.path.dirname(path) for path in paths})
        for fold_dir in fold_dirs:
            fold = int(os.path.basename(fold_dir))

            train = self.read_csv(os.path.join(fold_dir, 'train.csv'))
            test = self.read_csv(os.path.join(fold_dir, 'test.csv'))

            data_dict[fold] = (train['id'].tolist(), test['id'].tolist())

        return data_dict

    def splited_data(self, k):
        """Materialise the train and test frames of fold ``k``.

        Args:
            k: Fold number; must be a key of ``self.split_dict``.

        Raises:
            KeyError: If fold ``k`` is unknown.
        """
        id_train, id_test = self.split_dict[k]
        ids = self.dataset['id']

        # reset_index returns a fresh frame, so positional lookups with
        # ``.loc[idx]`` work and the filtered slice is never mutated in place.
        train = self.dataset[ids.isin(id_train)].reset_index(drop=True)
        test = self.dataset[ids.isin(id_test)].reset_index(drop=True)

        self.data_dict = {'train': (train, len(train)), 'test': (test, len(test))}

    def set_split(self, split='train'):
        """Make ``split`` ('train' or 'test') the active split."""
        self.data, self.length = self.data_dict[split]

    def __getitem__(self, idx):
        """Return ``(x, y)`` for row ``idx`` of the active split.

        Returns:
            x: float tensor holding ``q1_vector`` followed by ``q2_vector``.
            y: long tensor of shape ``(1,)`` holding ``is_duplicate``.
        """
        q1 = self.data.loc[idx, "q1_vector"]
        q2 = self.data.loc[idx, "q2_vector"]
        # Use this row's own two vectors (not a global object, not row 0).
        x = np.concatenate((q1, q2))
        y = self.data.loc[idx, "is_duplicate"]

        x = torch.as_tensor(x, dtype=torch.float32)
        y = torch.tensor([int(y)], dtype=torch.long)

        return (x, y)

    def __len__(self):
        """Number of rows in the active split."""
        return self.length

In [None]:
class Classifier(nn.Module):
    """Feed-forward classifier over a concatenated pair of question vectors.

    Four fully connected layers with ReLU between them. Without the
    non-linearities the stack of ``nn.Linear`` layers would be equivalent to
    a single affine map, so the extra layers would add no capacity.
    """

    def __init__(self, vec_size, l1, l2, num_class):
        """Build the layers.

        Args:
            vec_size: Size of ONE question vector; the input is two of them
                concatenated, hence ``vec_size * 2`` input features.
            l1: Width of the first two hidden layers.
            l2: Width of the third hidden layer.
            num_class: Number of output classes.
        """
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(vec_size * 2, l1)
        self.fc2 = nn.Linear(l1, l1)
        self.fc3 = nn.Linear(l1, l2)
        self.fc4 = nn.Linear(l2, num_class)

    def forward(self, x):
        """Return unnormalised class scores (logits) for ``x``.

        Args:
            x: Float tensor whose last dimension is ``vec_size * 2``.
        """
        # Functional ReLU keeps the module's state_dict keys unchanged.
        out = torch.relu(self.fc1(x))
        out = torch.relu(self.fc2(out))
        out = torch.relu(self.fc3(out))
        # No activation on the last layer: the loss expects raw logits.
        out = self.fc4(out)

        return out

In [None]:
# Build the dataset; the constructor selects fold 1 and the 'train' split.
dataset = DatasetPairs(d_data, file_cross_val_path)

In [None]:
# Model dimensions: size of one question vector, the two hidden widths and
# the number of output classes.
vec_q = 128
l1, l2 = 1024, 512
num_class = 2

# Samples per mini-batch.
batchsize = 512

In [None]:
# Instantiate the classifier and place its parameters on the chosen device.
model = Classifier(vec_q, l1, l2, num_class).to(device)

In [None]:
# Total number of scalar weights that receive gradients.
parameters = sum(
    tensor.numel()
    for tensor in filter(lambda tensor: tensor.requires_grad, model.parameters())
)

print(f'the model has {parameters:,} trainable parameters')

In [None]:
# Cross-entropy objective, and Adam over all model parameters (lr = 1e-4).
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
def compute_accuracy(y, y_pred):
    """Return the percentage of correctly classified samples.

    Args:
        y: Integer class targets. Any shape holding one label per sample is
            accepted, e.g. ``(N,)``, ``(N, 1)`` or a 0-dim tensor for N == 1.
        y_pred: Class scores of shape ``(N, num_class)``.

    Returns:
        Accuracy in percent (0-100) as a float; ``0.0`` for an empty batch.
    """
    y_label = y_pred.argmax(dim=1)
    n_samples = len(y_label)
    if n_samples == 0:
        # Avoid ZeroDivisionError on an empty batch.
        return 0.0

    # Align the targets with the predictions. Comparing a (N, 1) target with
    # (N,) predictions would broadcast to (N, N) and over-count matches.
    n_correct = torch.eq(y.reshape(y_label.shape), y_label).sum().item()
    accuracy = (n_correct / n_samples) * 100

    return accuracy

In [None]:
def compute_time(start, end):
    """Split the elapsed time between two timestamps into minutes and seconds.

    Args:
        start: Start timestamp in seconds.
        end: End timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, both truncated.
    """
    elapsed = end - start
    return int(elapsed / 60), int(elapsed % 60)

In [None]:
# Train for 50 epochs; after each epoch evaluate on the held-out split and
# report the running means of loss and accuracy over the batches.
for epoch in range(1, 51):

    start = time.time()

    running_loss = 0
    running_accu = 0
    running_loss_v = 0
    running_accu_v = 0

    # ---- training pass ----
    # NOTE(review): batches are drawn in dataset order (no shuffle=True);
    # confirm that this is intended.
    dataset.set_split("train")
    data_gen = DataLoader(dataset, batch_size=batchsize)
    model.train()
    for batch_index, (x, y) in enumerate(data_gen, 1):
        optimizer.zero_grad()

        x = x.to(device)
        # Flatten (B, 1) -> (B,). A bare squeeze() would yield a 0-dim target
        # for a batch of one sample, which the loss rejects. The targets go
        # to the model's device so the logits need not be copied to the CPU.
        y = y.reshape(-1).to(device)

        out = model(x)

        loss = criterion(out, y)
        loss.backward()
        optimizer.step()

        # Incremental mean over the batches seen so far.
        running_loss += (loss.item() - running_loss) / batch_index

        accu = compute_accuracy(y, out)
        running_accu += (accu - running_accu) / batch_index

    # ---- validation pass ----
    dataset.set_split("test")
    data_gen = DataLoader(dataset, batch_size=batchsize)
    model.eval()
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for batch_index, (x, y) in enumerate(data_gen, 1):
            x = x.to(device)
            y = y.reshape(-1).to(device)

            out = model(x)

            loss = criterion(out, y)
            running_loss_v += (loss.item() - running_loss_v) / batch_index

            accu = compute_accuracy(y, out)
            running_accu_v += (accu - running_accu_v) / batch_index

    end = time.time()
    m, s = compute_time(start, end)

    print(f'epoch {epoch} | {m}m {s}s')
    print(f'\ttrain loss: {running_loss:.2f} | train accuracy {running_accu:.2f}')
    print(f'\tval loss: {running_loss_v:.2f} | val accuracy {running_accu_v:.2f}')