In [2]:
import os
import glob
import time
import pickle
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
transformed_path = '../data/transformed'
cross_val_path = '../data/cross_validation_data'

In [4]:
file_cross_val_path = glob.glob(os.path.join(cross_val_path, '*', '*'))

In [5]:
glob.glob(os.path.join(transformed_path, '*'))

['../data/transformed/data.pkl']

In [3]:
d_data = pickle.load(open('../data/transformed/data.pkl', 'rb'))

In [4]:
d_data.head()

Unnamed: 0,index,id,qid1,qid2,question1,question2,is_duplicate,q1_clean,q2_clean,q1_token,q2_token,q1_vector,q2_vector
0,224417,305276,428643,428644,Which kind of tea should we drink if we want t...,I don't like milk but I like drinking tea a lo...,0,kind tea drink want mix milk,like milk like drinking tea lot milk good vari...,"[kind, tea, drink, want, mix, milk]","[like, milk, like, drinking, tea, lot, milk, g...","[0.0036476282, -0.014833373, -0.040485434, -0....","[-0.020281216, -0.076978564, 0.01739972, -0.06..."
1,247342,328201,454706,454707,What are the opportunities after doing an MBA ...,What are the opportunities for an MBA in finance?,0,opportunities mba finance,opportunities mba finance,"[opportunities, mba, finance]","[opportunities, mba, finance]","[0.013195766, -0.017697409, -0.015577622, 0.03...","[0.0044767293, -0.007943029, -0.015119746, 0.0..."
2,220551,301410,423210,424307,How do I have to learn english?,What is the easiest way to learn English?,0,learn english,easiest way learn english,"[learn, english]","[easiest, way, learn, english]","[0.0041956515, -0.020153206, -0.0086065605, 0....","[0.059110086, 0.0052670497, -0.028890401, 0.02..."
3,285439,366298,318327,496480,How is a Singapore GPA converted to Australian...,How do I convert my IIT CGPA to the USA GPA?,0,singapore gpa converted australian university ...,convert iit cgpa usa gpa,"[singapore, gpa, converted, australian, univer...","[convert, iit, cgpa, usa, gpa]","[0.012895356, 0.018507749, 0.065207005, 0.0409...","[-0.012635684, 0.05562323, 0.08929469, 0.00817..."
4,207397,288256,409186,409187,What is the salary after gate?,Are Google's salaries on Glassdoor after taxes?,0,salary gate,google salaries glassdoor taxes,"[salary, gate]","[google, salaries, glassdoor, taxes]","[-0.024693614, -0.008471563, 0.06587361, -0.02...","[-0.016694807, 0.012628836, 0.023681683, 0.045..."


In [7]:
file_cross_val_path

['../data/cross_validation_data/1/test.csv',
 '../data/cross_validation_data/1/train.csv',
 '../data/cross_validation_data/4/test.csv',
 '../data/cross_validation_data/4/train.csv',
 '../data/cross_validation_data/2/test.csv',
 '../data/cross_validation_data/2/train.csv',
 '../data/cross_validation_data/3/test.csv',
 '../data/cross_validation_data/3/train.csv',
 '../data/cross_validation_data/5/test.csv',
 '../data/cross_validation_data/5/train.csv']

In [8]:
class DatasetPairs(Dataset):
    def __init__(self, d_data, cross_val_paths):
        self.dataset = d_data
        self.split_dict = self.get_id_cross_val(cross_val_paths)
        
        self.splited_data(k=1)
        self.set_split(split='train')
    
    def read_csv(self, path):
        d_data = pd.read_csv(path, sep='\t')
        return d_data
    
    def get_id_cross_val(self, paths):
        data_dict = {}
        path_dict = dict((file.split('/')[-2], file) for file in paths)
        for k, path in path_dict.items():
            train = self.read_csv(path)
            id_train = train.id.tolist()
            
            path = path.replace('train.csv', 'test.csv')
            test = self.read_csv(path)
            id_test = test.id.tolist()
            
            data_dict[int(k)] = (id_train, id_test)
            
        return data_dict

    def splited_data(self, k):
        id_train, id_test = self.split_dict[k]
        train = self.dataset[self.dataset.id.isin(id_train)]
        test = self.dataset[self.dataset.id.isin(id_test)]
        
        train.reset_index(drop=True, inplace=True)
        test.reset_index(drop=True, inplace=True)
        
        self.data_dict = {'train': (train, len(train)), 'test': (test, len(test))}
        
    def set_split(self, split='train'):
        self.data, self.length = self.data_dict[split]
    
    def __getitem__(self, idx):
        q1 = self.data.loc[idx, "q1_vector"]
        q2 = self.data.loc[idx, "q2_vector"]
        x  = np.concatenate((dataset.data.loc[0, "q1_vector"], 
                             dataset.data.loc[0, "q1_vector"]))
        y  = self.data.loc[idx, "is_duplicate"]
        
        x = torch.Tensor(x)
        y = torch.LongTensor([y])
        
        return (x, y)
    
    def __len__(self):
        return self.length

In [9]:
class Classifier(nn.Module):
    def __init__(self, vec_size, l1, l2, num_class):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(vec_size*2, l1)
        self.fc2 = nn.Linear(l1, l1)
        self.fc3 = nn.Linear(l1, l2)
        self.fc4 = nn.Linear(l2, num_class)
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.fc2(out)
        out = self.fc3(out)
        out = self.fc4(out)
        
        return out

In [10]:
dataset = DatasetPairs(d_data, file_cross_val_path)

In [11]:
vec_q = 128
l1 = 1024
l2 =512
num_class = 2

batchsize = 512

In [12]:
model = Classifier(vec_q, l1, l2, num_class)
model = model.to(device)

In [13]:
parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'the model has {parameters:,} trainable parameters')

the model has 1,838,594 trainable parameters


In [14]:
optimizer = optim.Adam(model.parameters(), lr = 1e-4)
criterion = nn.CrossEntropyLoss()

In [15]:
def compute_accuracy(y, y_pred):
    y_label = y_pred.argmax(dim=1)
    n_correct = torch.eq(y, y_label).sum().item()
    accuracy = (n_correct / len(y_label)) * 100
    
    return accuracy

In [16]:
def compute_time(start, end):
    duration = end - start
    m = int(duration / 60)
    s = int(duration % 60)
    
    return m, s

In [17]:
for epoch in range(1, 51):
    
    start = time.time()
    
    running_loss = 0
    running_accu = 0
    running_loss_v = 0
    running_accu_v = 0
    
    dataset.set_split("train")
    data_gen = DataLoader(dataset, batch_size=batchsize)
    model.train()
    for batch_index, (x, y) in enumerate(data_gen, 1):
        optimizer.zero_grad()
        
        x = x.to(device)
        y = y.squeeze()
        
        out = model(x)
        out = out.to("cpu")
        
        loss = criterion(out, y)
        loss_ = loss.item()
        running_loss += (loss_ - running_loss) / batch_index
        
        accu = compute_accuracy(y, out)
        running_accu += (accu - running_accu) / batch_index
        
        loss.backward()
        optimizer.step()
        
    dataset.set_split("test")
    data_gen = DataLoader(dataset, batch_size=batchsize)
    model.eval()
    for batch_index, (x, y) in enumerate(data_gen, 1):
        optimizer.zero_grad()
        
        x = x.to(device)
        y = y.squeeze()
        
        out = model(x)
        out = out.to("cpu")
        
        loss = criterion(out, y)
        loss_ = loss.item()
        running_loss_v += (loss_ - running_loss_v) / batch_index
        
        accu = compute_accuracy(y, out)
        running_accu_v += (accu - running_accu_v) / batch_index
        
    end = time.time()
    m, s = compute_time(start, end)
    
    print(f'epoch {epoch} | {m}m {s}s')
    print(f'\ttrain loss: {running_loss:.2f} | train accuracy {running_accu:.2f}')
    print(f'\tval loss: {running_loss_v:.2f} | val accuracy {running_accu_v:.2f}')

epoch 1 | 0m 32s
	train loss: 0.66 | train accuracy 63.08
	val loss: 0.66 | val accuracy 63.08
epoch 2 | 0m 33s
	train loss: 0.66 | train accuracy 63.08
	val loss: 0.66 | val accuracy 63.08
epoch 3 | 0m 32s
	train loss: 0.66 | train accuracy 63.08
	val loss: 0.66 | val accuracy 63.08
epoch 4 | 0m 32s
	train loss: 0.66 | train accuracy 63.08
	val loss: 0.66 | val accuracy 63.08
epoch 5 | 0m 32s
	train loss: 0.66 | train accuracy 63.08
	val loss: 0.66 | val accuracy 63.08
epoch 6 | 0m 32s
	train loss: 0.66 | train accuracy 63.08
	val loss: 0.66 | val accuracy 63.08
epoch 7 | 0m 33s
	train loss: 0.66 | train accuracy 63.08
	val loss: 0.66 | val accuracy 63.08
epoch 8 | 0m 32s
	train loss: 0.66 | train accuracy 63.08
	val loss: 0.66 | val accuracy 63.08
epoch 9 | 0m 31s
	train loss: 0.66 | train accuracy 63.08
	val loss: 0.66 | val accuracy 63.08
epoch 10 | 0m 32s
	train loss: 0.66 | train accuracy 63.08
	val loss: 0.66 | val accuracy 63.08
epoch 11 | 0m 31s
	train loss: 0.66 | train accur

KeyboardInterrupt: 