In [1]:
import os
import time
import tqdm
import datetime
import pickle as pkl
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

In [2]:
CURR_PATH = os.getcwd()
DATA_PATH = '/data/'
VEC_PATH = '/wiki-news-300d-1M.vec'


In [3]:
import load_data
from load_data import create_weights, create_emb_layer

In [6]:
## added three cat_mode into model function
from models import LogisticRegression, NeuralNetwork

In [7]:
# added training accuracy, need to add training and validation loss later
from training import acc, train_model

In [8]:
## Load raw data sets
snli_train = pd.read_csv(CURR_PATH + DATA_PATH + "snli_train.tsv", sep='\t')
snli_val = pd.read_csv(CURR_PATH + DATA_PATH + "snli_val.tsv", sep='\t')

In [9]:
## Preprocess raw datasets
train_data = load_data.prepare_data(snli_train)
val_data = load_data.prepare_data(snli_val)

In [10]:
class LRClassifier(nn.Module):
    def __init__(self, n_in, n_out):
        """
        n_in: Number of features
        n_out: Number of output classes
        """
        super().__init__()
        
        # Set up out linear layer. This initializes the weights
        self.linear = nn.Linear(n_in, n_out)
        
        # Explicitly initialize the weights with the initialization
        self.init_weights()
    
    def forward(self, x):
        """
        x: Input data [N, k]
        ---
        Returns: log probabilities of each class [N, c]
        """
        # Apply the linear function to get our logit (real numbers)
        logit = self.linear(x)
        
        # Apply log_softmax to get logs of normalized probabilities
        return F.log_softmax(logit, dim=1)
    
    def init_weights(self):
        # Use some specific initialization schemes
        nn.init.xavier_normal_(self.linear.weight)
        nn.init.uniform_(self.linear.bias)

class NNClassifier(nn.Module):
    def __init__(self, n_in, h_s, n_out):
        super().__init__()
        self.linear1 = nn.Linear(n_in,h_s)
        self.linear2 = nn.Linear(h_s,h_s)
        self.linear3 = nn.Linear(h_s,n_out)
    
    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = self.linear2(x)
        x = F.relu(x)
        x = self.linear3(x)
        return F.log_softmax(x)
        

class BOWEncoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, class_in):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(BOWEncoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,class_in)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0],1).expand_as(out).float()
     
        out = self.linear(out.float())
        return out


In [11]:
class NNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, n_in, h_s, n_out, combine_mode):
        super().__init__()
        self.encoder = BOWEncoder(vocab_size, embed_dim, n_in)
        if combine_mode == 'DIRECT':
            n_in = n_in * 2;
        self.classifier = NNClassifier(n_in,h_s, n_out)
    
    def forward(self, premise, len_premise, hypothesis, len_hypo, combine_mode):
        premise = self.encoder(premise, len_premise)
        hypothesis = self.encoder(hypothesis, len_hypo)
        if combine_mode == 'DIRECT':
            x = torch.cat((premise, hypothesis),1)
        elif combine_mode == 'MUL':
            x = torch.mul(premise, hypothesis)
        elif combine_mode == 'SUB':
            x = torch.sub(premise, hypothesis)
        x = self.classifier(x)
        return x

In [12]:
class LRModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, n_in, n_out, combine_mode):
        super().__init__()
        self.encoder = BOWEncoder(vocab_size, embed_dim, n_in)
        if combine_mode == 'DIRECT':
            n_in = n_in * 2;
        self.classifier = LRClassifier(n_in, n_out)
    
    def forward(self, premise, len_premise, hypothesis, len_hypo, combine_mode):
        premise = self.encoder(premise, len_premise)
        hypothesis = self.encoder(hypothesis, len_hypo)
        if combine_mode == 'DIRECT':
            x = torch.cat((premise, hypothesis),1)
        elif combine_mode == 'MUL':
            x = torch.mul(premise, hypothesis)
        elif combine_mode == 'SUB':
            x = torch.sub(premise, hypothesis)
        x = self.classifier(x)
        return x

In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
criterion = torch.nn.CrossEntropyLoss()
num_class = 3
batch_size = 32
h_s = 100

In [17]:
MAX_SENTENCE_LENGTH = 30

VOCAB_SIZES = [5000, 10000, 20000, 40000,50000]
EMB_DIMS = [50, 100,200,300,500]
CAT_MODES = ["DIRECT","MUL","SUB"]
MODEL_TYPES = { 'log-reg': LogisticRegression, 'neural-net': NeuralNetwork}
SAVE_FOLDER = os.path.join('models', 'snli')
if not os.path.exists(SAVE_FOLDER):
    os.makedirs(SAVE_FOLDER)

In [None]:
for vocab_size in VOCAB_SIZES:
    # Load datasets
    vectors = pkl.load(open('pickle/'+str(vocab_size)+'_vectors.pkl', 'rb'))
    id2token = pkl.load(open('pickle/'+str(vocab_size)+'_id2token.pkl', 'rb'))
    token2id = pkl.load(open('pickle/'+str(vocab_size)+'_token2id.pkl', 'rb'))
    ## Convert to token lists to lists of corresponding indices
    indiced_train_data, train_target = load_data.token2index_dataset(train_data, token2id, MAX_SENTENCE_LENGTH)
    indiced_val_data, val_target = load_data.token2index_dataset(val_data, token2id, MAX_SENTENCE_LENGTH)
    train_dataset = load_data.SNLIDataset(indiced_train_data, train_target)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size,
                                           collate_fn=load_data.SNLI_collate_func,
                                           shuffle=True)
    val_dataset = load_data.SNLIDataset(indiced_val_data, val_target)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=batch_size,
                                           collate_fn=load_data.SNLI_collate_func,
                                           shuffle=True)
    
    for embed_dim in EMB_DIMS: 
        n_in = len(id2token)
        
        for cat_mode in CAT_MODES:
            
            for model_str, model_class in MODEL_TYPES.items():
                print('Vocab_size:{}, Embed_dim:{}, cat_mode:{}, Classifier:{}'.format(vocab_size, embed_dim, cat_mode, model_str))
                filename = '{}_{}_{}_{}.pt'.format(vocab_size, embed_dim, cat_mode, model_str)
                save_path = os.path.join(SAVE_FOLDER, filename)
                if model_class is NeuralNetwork:
                    model = NNModel(vocab_size, embed_dim, n_in , h_s, num_class, cat_mode)
                elif model_class is LogisticRegression:
                    model = LRModel(vocab_size, embed_dim, n_in, num_class, cat_mode)                

                model.to(device)
                optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
                num_epochs = 10
                history_train_acc, history_val_acc, history_train_loss, history_val_loss = [], [], [], []
                best_accuracy = 0
                
                for epoch in range(num_epochs):
                    for i, (premise, len_premise, hypothesis, len_hypo, labels) in enumerate(train_loader):
                        model.train()
                        # Load samples
                        premise = premise.to(device)
                        hypothesis = hypothesis.to(device)
                        labels = labels.to(device)
                        optimizer.zero_grad()
                        outputs = model(premise, len_premise, hypothesis, len_hypo, cat_mode )
                        loss = criterion(outputs, labels)
                        loss.backward()
                        optimizer.step()
                        
                        if i > 0 and i % 500 == 0:
                            train_loss = loss.data.item()

                            model.eval()
                            correct = 0
                            total = 0

                            for premise_val, len_premise_val, hypothesis_val, len_hypo_val, labels_val in val_loader:
                                # Load samples
                                premise_val = premise_val.to(device)
                                hypothesis_val = hypothesis_val.to(device)
                                labels_val = labels_val.to(device)

                                outputs_val = model(premise_val, len_premise_val, hypothesis_val, len_hypo_val, cat_mode)
                
                                val_loss = criterion(outputs_val, labels_val)
                                predicted = outputs_val.max(1, keepdim=True)[1]
                                total += labels_val.size(0)
                                # Total correct predictions
                                correct += predicted.eq(labels_val.view_as(predicted)).sum().item()
            
                            accuracy = 100. * correct / total
#                             print('Iter: {} | Train Loss: {} | Val Loss: {} | Val Accuracy: {}'.format(i, train_loss, val_loss.item(), round(accuracy, 2)))
                            # Append to history
                            history_val_loss.append(val_loss.data.item())
                            history_val_acc.append(round(accuracy, 2))
                            history_train_loss.append(train_loss)                
                            # Save model when accuracy beats best accuracy
                            if accuracy > best_accuracy:
                                best_accuracy = accuracy
                                torch.save(model.state_dict(), save_path)
                    print('Epoch: {} | Train Loss: {} | Val Loss: {} | Val Accuracy: {}'.format((epoch+1), train_loss, val_loss.item(), round(accuracy, 2)))


Vocab_size:5000, Embed_dim:50, cat_mode:DIRECT, Classifier:log-reg
Epoch: 1 | Train Loss: 0.8465526700019836 | Val Loss: 0.8521768450737 | Val Accuracy: 55.1
Epoch: 2 | Train Loss: 0.576822817325592 | Val Loss: 0.6295715570449829 | Val Accuracy: 59.6
Epoch: 3 | Train Loss: 0.7618913650512695 | Val Loss: 1.0731502771377563 | Val Accuracy: 59.9
Epoch: 4 | Train Loss: 0.8178608417510986 | Val Loss: 0.5459192991256714 | Val Accuracy: 60.3
Epoch: 5 | Train Loss: 0.8814212679862976 | Val Loss: 1.366952657699585 | Val Accuracy: 60.8
Epoch: 6 | Train Loss: 0.7352291941642761 | Val Loss: 1.39240562915802 | Val Accuracy: 60.9
Epoch: 7 | Train Loss: 0.8335384130477905 | Val Loss: 0.8920820355415344 | Val Accuracy: 60.7
Epoch: 8 | Train Loss: 0.5597620606422424 | Val Loss: 1.1012122631072998 | Val Accuracy: 60.7
Epoch: 9 | Train Loss: 0.7669667601585388 | Val Loss: 0.7370402812957764 | Val Accuracy: 60.8
Epoch: 10 | Train Loss: 0.5742509365081787 | Val Loss: 0.8449488282203674 | Val Accuracy: 62.0



Epoch: 1 | Train Loss: 0.8974243402481079 | Val Loss: 1.4534313678741455 | Val Accuracy: 58.6
Epoch: 2 | Train Loss: 0.7620750069618225 | Val Loss: 0.8860758543014526 | Val Accuracy: 62.1
Epoch: 3 | Train Loss: 0.8816721439361572 | Val Loss: 0.31522682309150696 | Val Accuracy: 62.9
Epoch: 4 | Train Loss: 0.5966107845306396 | Val Loss: 0.5474328398704529 | Val Accuracy: 64.2
Epoch: 5 | Train Loss: 0.5737042427062988 | Val Loss: 0.9308916926383972 | Val Accuracy: 65.4
Epoch: 6 | Train Loss: 0.6611303091049194 | Val Loss: 1.0396567583084106 | Val Accuracy: 65.9
Epoch: 7 | Train Loss: 0.5473144054412842 | Val Loss: 0.8822601437568665 | Val Accuracy: 66.9
Epoch: 8 | Train Loss: 0.5910141468048096 | Val Loss: 0.9317623376846313 | Val Accuracy: 66.9
Epoch: 9 | Train Loss: 0.5002008676528931 | Val Loss: 0.9600455164909363 | Val Accuracy: 65.4
Epoch: 10 | Train Loss: 0.6141411066055298 | Val Loss: 0.7764148116111755 | Val Accuracy: 64.5
Vocab_size:5000, Embed_dim:50, cat_mode:MUL, Classifier:lo

Epoch: 3 | Train Loss: 0.7173831462860107 | Val Loss: 0.8232154250144958 | Val Accuracy: 65.3
Epoch: 4 | Train Loss: 0.5890381932258606 | Val Loss: 1.1422160863876343 | Val Accuracy: 64.2
Epoch: 5 | Train Loss: 0.6097489595413208 | Val Loss: 0.41057902574539185 | Val Accuracy: 64.1
Epoch: 6 | Train Loss: 0.47436511516571045 | Val Loss: 1.1054563522338867 | Val Accuracy: 63.8
Epoch: 7 | Train Loss: 0.3876751959323883 | Val Loss: 2.8702638149261475 | Val Accuracy: 61.6
Epoch: 8 | Train Loss: 0.2953074872493744 | Val Loss: 1.401910424232483 | Val Accuracy: 62.5
Epoch: 9 | Train Loss: 0.3150230050086975 | Val Loss: 3.0116751194000244 | Val Accuracy: 61.7
Epoch: 10 | Train Loss: 0.2150649130344391 | Val Loss: 2.045009136199951 | Val Accuracy: 58.9
Vocab_size:5000, Embed_dim:100, cat_mode:SUB, Classifier:log-reg
Epoch: 1 | Train Loss: 0.8101513981819153 | Val Loss: 0.9155451655387878 | Val Accuracy: 55.2
Epoch: 2 | Train Loss: 0.986501932144165 | Val Loss: 1.1256775856018066 | Val Accuracy: 

Epoch: 5 | Train Loss: 0.6774144768714905 | Val Loss: 1.252835988998413 | Val Accuracy: 65.3
Epoch: 6 | Train Loss: 0.7716633677482605 | Val Loss: 0.711990475654602 | Val Accuracy: 64.4
Epoch: 7 | Train Loss: 0.4778229594230652 | Val Loss: 0.8367134928703308 | Val Accuracy: 65.0
Epoch: 8 | Train Loss: 0.8464012742042542 | Val Loss: 0.9007281064987183 | Val Accuracy: 64.7
Epoch: 9 | Train Loss: 0.7415516376495361 | Val Loss: 0.6638323664665222 | Val Accuracy: 64.9
Epoch: 10 | Train Loss: 0.4785460829734802 | Val Loss: 0.9063190221786499 | Val Accuracy: 65.7
Vocab_size:5000, Embed_dim:300, cat_mode:DIRECT, Classifier:log-reg
Epoch: 1 | Train Loss: 0.9514626264572144 | Val Loss: 0.8456024527549744 | Val Accuracy: 56.8
Epoch: 2 | Train Loss: 0.8788555264472961 | Val Loss: 0.7891958951950073 | Val Accuracy: 59.9
Epoch: 3 | Train Loss: 0.8271233439445496 | Val Loss: 1.2123138904571533 | Val Accuracy: 58.6
Epoch: 4 | Train Loss: 1.249463438987732 | Val Loss: 0.840334415435791 | Val Accuracy: 