In [None]:
import os
import time
import tqdm
import datetime
import pickle as pkl
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

In [None]:
# Current working directory of the kernel process; used as the root for data files.
CURR_PATH = os.getcwd()
# Data sub-directory. It carries leading and trailing slashes because the file
# paths below are built by plain string concatenation:
# CURR_PATH + DATA_PATH + "<file name>".
DATA_PATH = '/data/'

In [None]:
import load_data

In [None]:
## Load raw data sets
# Both splits are tab-separated files stored under <cwd>/data/; the training
# split is read first, then the validation split.
snli_train, snli_val = (
    pd.read_csv(CURR_PATH + DATA_PATH + split_file, sep='\t')
    for split_file in ("snli_train.tsv", "snli_val.tsv")
)

In [None]:
## Preprocess raw datasets
# Apply the same load_data.prepare_data step to each split (train first, then val).
train_data, val_data = map(load_data.prepare_data, (snli_train, snli_val))

In [None]:
## CLASSIFIER CLASSES

class LRClassifier(nn.Module):
    """Multinomial logistic regression: one linear layer followed by log-softmax."""

    def __init__(self, n_in, n_out):
        """
        n_in: Number of features
        n_out: Number of output classes
        """
        super().__init__()

        # Creating the layer already gives it PyTorch's default initialization ...
        self.linear = nn.Linear(n_in, n_out)

        # ... which is then replaced by the scheme defined in init_weights().
        self.init_weights()

    def init_weights(self):
        """Re-initialize the linear layer: Xavier-normal weights, uniform bias."""
        layer = self.linear
        nn.init.xavier_normal_(layer.weight)
        nn.init.uniform_(layer.bias)

    def forward(self, x):
        """
        x: Input data [N, k]
        ---
        Returns: log probabilities of each class [N, c]
        """
        # Linear scores (logits), normalized across the class axis in log space.
        return F.log_softmax(self.linear(x), dim=1)

class NNClassifier(nn.Module):
    """Feed-forward classifier with two ReLU hidden layers of equal width."""

    def __init__(self, n_in, h_s, n_out):
        """
        n_in: Number of input features
        h_s: Width of both hidden layers
        n_out: Number of output classes
        """
        super().__init__()
        self.linear1 = nn.Linear(n_in, h_s)
        self.linear2 = nn.Linear(h_s, h_s)
        self.linear3 = nn.Linear(h_s, n_out)

    def forward(self, x):
        """
        x: Input data [N, n_in]
        ---
        Returns: log probabilities of each class [N, n_out]
        """
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        logits = self.linear3(x)
        # Normalize over the class axis explicitly. Omitting `dim` is deprecated
        # and lets PyTorch guess the axis from the tensor rank. For [N, n_out]
        # input, dim=-1 is the same axis as the dim=1 used by LRClassifier.
        return F.log_softmax(logits, dim=-1)
        



In [None]:
## ENCODER CLASS
class BOWEncoder(nn.Module):
    """Bag-of-words sentence encoder: mean token embedding followed by a linear projection."""

    def __init__(self, vocab_size, emb_dim, class_in, is_pretrained, vectors=None, id2token=None):
        """
        @param vocab_size: size of the vocabulary (only used when is_pretrained is False).
        @param emb_dim: size of the word embedding (only used when is_pretrained is False;
            with pretrained vectors the width is taken from the vectors themselves).
        @param class_in: output size of the encoder, i.e. the width fed to the classifier.
        @param is_pretrained: if True, build a frozen embedding layer from `vectors`.
        @param vectors: optional mapping token -> embedding vector. When omitted, the
            module-level name `vectors` is used (the original behaviour).
        @param id2token: optional sequence mapping token id -> token. When omitted, the
            module-level name `id2token` is used (the original behaviour).
        """
        super(BOWEncoder, self).__init__()
        if is_pretrained:
            # Existing callers do not pass the lookup tables, so fall back to the
            # notebook-level globals they relied on before.
            module_globals = globals()
            if vectors is None:
                vectors = module_globals.get('vectors')
            if id2token is None:
                id2token = module_globals.get('id2token')
            if vectors is None or id2token is None:
                raise NameError(
                    "BOWEncoder(is_pretrained=True) needs `vectors` and `id2token`, "
                    "either as arguments or as module-level names"
                )
            weights_matrix = self.create_weights(vectors, id2token)
            self.embed = self.create_emb_layer(weights_matrix, is_pretrained)
        else:
            self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        # Size the projection from the embedding layer that was actually built, so a
        # pretrained matrix whose width differs from `emb_dim` cannot cause a mismatch.
        self.linear = nn.Linear(self.embed.embedding_dim, class_in)

    def forward(self, data, length):
        """
        Encode a batch of padded token-id sequences.

        @param data: matrix of size (batch_size, max_sentence_length). Each row is one
            sentence given as token indices, padded to a common length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial
            (excludes padding) length of each sentence in the data.
        @return: float tensor of size (batch_size, class_in).
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        # The lengths may live on a different device than the embeddings (e.g. CPU
        # lengths with CUDA data), so move them first. Clamping to 1 keeps an
        # empty sentence from producing 0/0 = NaN.
        lengths = length.to(out.device).view(-1, 1).float().clamp(min=1)
        out = out / lengths

        out = self.linear(out.float())
        return out

    def create_weights(self, vectors, id2token):
        '''Create the weights matrix from vectors and id2token.

        Ids 0 and 1 get all-zero rows; vectors are looked up only for ids >= 2
        (presumably ids 0/1 are the padding and unknown tokens -- confirm against
        the vocabulary builder).

        Returns:
        weights_matrix: torch.Tensor, dimension of (vocab size x embedding dim)
        '''
        weights_matrix = torch.from_numpy(np.array([vectors[id2token[i]] for i in range(2, len(id2token))]))
        # Match the dtype of the loaded vectors instead of hard-coding float64, so
        # the concatenation never mixes dtypes.
        zero = torch.zeros(2, weights_matrix.size()[1], dtype=weights_matrix.dtype)
        weights_matrix = torch.cat([zero, weights_matrix])
        return weights_matrix

    def create_emb_layer(self, weights_matrix, non_trainable=False):
        '''Create an embedding layer initialized from weights_matrix.

        weights_matrix: torch.Tensor of size (num_embeddings, embedding_dim)
        non_trainable: if True, the embedding weights are frozen

        Returns:
        emb_layer: nn.Embedding()
        '''
        num_embeddings, embedding_dim = weights_matrix.size()
        # padding_idx=0 mirrors the non-pretrained branch, so row 0 receives no
        # gradient even when the layer is left trainable.
        emb_layer = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        emb_layer.load_state_dict({'weight': weights_matrix})
        if non_trainable:
            emb_layer.weight.requires_grad = False
        return emb_layer

In [None]:
# Encoder + NN Classifier model
class NNModel(nn.Module):
    """Sentence-pair model: a shared BOWEncoder followed by an NNClassifier."""

    def __init__(self, vocab_size, embed_dim, n_in, h_s, n_out, combine_mode, is_pretrained):
        """
        vocab_size, embed_dim, is_pretrained: forwarded to BOWEncoder
        n_in: Output width of the encoder for a single sentence
        h_s: Hidden layer width of the classifier
        n_out: Number of output classes
        combine_mode: How the two sentence encodings are merged:
            'DIRECT' (concatenate), 'MUL' (element-wise product) or
            'SUB' (element-wise difference)
        """
        super().__init__()
        self.encoder = BOWEncoder(vocab_size, embed_dim, n_in, is_pretrained)
        self.combine_mode = combine_mode
        # Concatenation doubles the width seen by the classifier; the element-wise
        # modes keep it at n_in.
        classifier_in = n_in * 2 if combine_mode == 'DIRECT' else n_in
        self.classifier = NNClassifier(classifier_in, h_s, n_out)

    def forward(self, premise, len_premise, hypothesis, len_hypo):
        """
        Encode both sentences with the shared encoder, merge them according to
        combine_mode and classify the result.

        Raises: ValueError if combine_mode is not 'DIRECT', 'MUL' or 'SUB'.
        """
        premise = self.encoder(premise, len_premise)
        hypothesis = self.encoder(hypothesis, len_hypo)
        if self.combine_mode == 'DIRECT':
            x = torch.cat((premise, hypothesis), 1)
        elif self.combine_mode == 'MUL':
            x = torch.mul(premise, hypothesis)
        elif self.combine_mode == 'SUB':
            x = torch.sub(premise, hypothesis)
        else:
            # Without this branch an unknown mode surfaced as an UnboundLocalError on `x`.
            raise ValueError(
                "Unsupported combine_mode {!r}; expected 'DIRECT', 'MUL' or 'SUB'".format(self.combine_mode)
            )
        x = self.classifier(x)
        return x

In [None]:
# Encoder + LR Classifier model
class LRModel(nn.Module):
    """Sentence-pair model: a shared BOWEncoder followed by an LRClassifier."""

    def __init__(self, vocab_size, embed_dim, n_in, n_out, combine_mode, is_pretrained):
        """
        vocab_size, embed_dim, is_pretrained: forwarded to BOWEncoder
        n_in: Output width of the encoder for a single sentence
        n_out: Number of output classes
        combine_mode: How the two sentence encodings are merged:
            'DIRECT' (concatenate), 'MUL' (element-wise product) or
            'SUB' (element-wise difference)
        """
        super().__init__()
        self.encoder = BOWEncoder(vocab_size, embed_dim, n_in, is_pretrained)
        self.combine_mode = combine_mode
        # Concatenation doubles the width seen by the classifier; the element-wise
        # modes keep it at n_in.
        classifier_in = n_in * 2 if combine_mode == 'DIRECT' else n_in
        self.classifier = LRClassifier(classifier_in, n_out)

    def forward(self, premise, len_premise, hypothesis, len_hypo):
        """
        Encode both sentences with the shared encoder, merge them according to
        combine_mode and classify the result.

        Raises: ValueError if combine_mode is not 'DIRECT', 'MUL' or 'SUB'.
        """
        premise = self.encoder(premise, len_premise)
        hypothesis = self.encoder(hypothesis, len_hypo)
        if self.combine_mode == 'DIRECT':
            x = torch.cat((premise, hypothesis), 1)
        elif self.combine_mode == 'MUL':
            x = torch.mul(premise, hypothesis)
        elif self.combine_mode == 'SUB':
            # This branch used to read `cself.ombine_mode`, a typo that raised
            # NameError for every mode other than 'DIRECT' and 'MUL'.
            x = torch.sub(premise, hypothesis)
        else:
            raise ValueError(
                "Unsupported combine_mode {!r}; expected 'DIRECT', 'MUL' or 'SUB'".format(self.combine_mode)
            )
        x = self.classifier(x)
        return x

In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Loss function shared by training and validation.
criterion = torch.nn.CrossEntropyLoss()

# Fixed settings for every run below.
num_class, batch_size = 3, 32
h_s = 100  # size of hidden layer

In [None]:
# Passed to load_data.token2index_dataset when the datasets are indexed
# (presumably the per-sentence token cap -- confirm in load_data).
MAX_SENTENCE_LENGTH = 30

# Folder that receives the saved model checkpoints. exist_ok=True makes the
# call idempotent and avoids the check-then-create race of testing
# os.path.exists() first.
SAVE_FOLDER = os.path.join('models', 'snli')
os.makedirs(SAVE_FOLDER, exist_ok=True)

# # For hyperparameter tuning
# VOCAB_SIZES = [5000, 10000, 20000, 40000,50000]
# EMB_DIMS = [50, 100,200,300,500]
# CAT_MODES = ["DIRECT","MUL","SUB"]
# MODEL_TYPES = ['log-reg', 'neural-net']
# is_pretrained = False

# For frozen embeddings
VOCAB_SIZES = [100000]
EMB_DIMS = [300]
CAT_MODES = ["DIRECT"]
MODEL_TYPES = ['neural-net']
is_pretrained = True

In [None]:
# TRAINING LOOP FOR SNLI DATA

def _load_pickle(path):
    """Read one pickled object from `path`, closing the file afterwards."""
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that were produced by this project.
    with open(path, 'rb') as handle:
        return pkl.load(handle)


def _evaluate(model, loader, criterion, device):
    """Return (mean loss, accuracy in percent) of `model` over every batch in `loader`."""
    model.eval()
    correct = 0
    total = 0
    loss_sum = 0.0
    # No gradients are needed for validation; skipping them saves memory and time.
    with torch.no_grad():
        for premise_val, len_premise_val, hypothesis_val, len_hypo_val, labels_val in loader:
            # Load samples (lengths included, so every tensor shares one device)
            premise_val = premise_val.to(device)
            len_premise_val = len_premise_val.to(device)
            hypothesis_val = hypothesis_val.to(device)
            len_hypo_val = len_hypo_val.to(device)
            labels_val = labels_val.to(device)

            outputs_val = model(premise_val, len_premise_val, hypothesis_val, len_hypo_val)

            batch_count = labels_val.size(0)
            # The criterion returns a per-batch mean; weight it by the batch size
            # so the final figure is the mean over all validation samples.
            loss_sum += criterion(outputs_val, labels_val).item() * batch_count
            predicted = outputs_val.max(1, keepdim=True)[1]
            total += batch_count
            correct += predicted.eq(labels_val.view_as(predicted)).sum().item()

    if total == 0:
        return float('nan'), float('nan')
    return loss_sum / total, 100. * correct / total


for vocab_size in VOCAB_SIZES:
    # Load the vocabulary artefacts for this vocabulary size.
    # `vectors` and `id2token` are deliberately module-level names: BOWEncoder
    # reads them as globals when is_pretrained is True.
    vectors = _load_pickle('pickle/'+str(vocab_size)+'_vectors.pkl')
    id2token = _load_pickle('pickle/'+str(vocab_size)+'_id2token.pkl')
    token2id = _load_pickle('pickle/'+str(vocab_size)+'_token2id.pkl')

    ## Convert token lists to lists of corresponding indices
    indiced_train_data, train_target = load_data.token2index_dataset(train_data, token2id, MAX_SENTENCE_LENGTH)
    indiced_val_data, val_target = load_data.token2index_dataset(val_data, token2id, MAX_SENTENCE_LENGTH)
    train_dataset = load_data.SNLIDataset(indiced_train_data, train_target)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               collate_fn=load_data.SNLI_collate_func,
                                               shuffle=True)
    val_dataset = load_data.SNLIDataset(indiced_val_data, val_target)
    # Validation metrics are aggregated over the whole set, so order is irrelevant.
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=batch_size,
                                             collate_fn=load_data.SNLI_collate_func,
                                             shuffle=False)

    # Depends only on the vocabulary, so it is computed once per vocab size.
    # NOTE(review): this makes the encoder output width equal to the vocabulary
    # size, which yields very large linear layers -- confirm this is intended.
    n_in = len(id2token)

    for embed_dim in EMB_DIMS:

        for cat_mode in CAT_MODES:

            for model_str in MODEL_TYPES:
                print('Vocab_size:{}, Embed_dim:{}, cat_mode:{}, Classifier:{}'.format(vocab_size, embed_dim, cat_mode, model_str))
                filename = '{}_{}_{}_{}.pt'.format(vocab_size, embed_dim, cat_mode, model_str)
                save_path = os.path.join(SAVE_FOLDER, filename)
                if model_str == 'neural-net':
                    model = NNModel(vocab_size, embed_dim, n_in, h_s, num_class, cat_mode, is_pretrained)
                elif model_str == 'log-reg':
                    model = LRModel(vocab_size, embed_dim, n_in, num_class, cat_mode, is_pretrained)
                else:
                    # Previously an unknown name silently reused the model of the prior run.
                    raise ValueError("Unknown model type: {!r}".format(model_str))

                model.to(device)
                optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
                num_epochs = 10
                best_accuracy = 0
                # Defined up front so the epoch summary also works when a loader is
                # too short to ever reach a validation step (500 batches or fewer).
                train_loss = val_loss = accuracy = float('nan')

                for epoch in range(num_epochs):
                    for i, (premise, len_premise, hypothesis, len_hypo, labels) in enumerate(train_loader):
                        # Re-enable training mode; a validation pass may have switched to eval().
                        model.train()
                        # Load samples (lengths included, so every tensor shares one device)
                        premise = premise.to(device)
                        len_premise = len_premise.to(device)
                        hypothesis = hypothesis.to(device)
                        len_hypo = len_hypo.to(device)
                        labels = labels.to(device)
                        optimizer.zero_grad()
                        outputs = model(premise, len_premise, hypothesis, len_hypo)
                        loss = criterion(outputs, labels)
                        loss.backward()
                        optimizer.step()

                        # Validate every 500 training batches.
                        if i > 0 and i % 500 == 0:
                            train_loss = loss.item()
                            val_loss, accuracy = _evaluate(model, val_loader, criterion, device)
#                             print('Iter: {} | Train Loss: {} | Val Loss: {} | Val Accuracy: {}'.format(i, train_loss, val_loss, round(accuracy, 2)))
                            # Save model when accuracy beats best accuracy
                            if accuracy > best_accuracy:
                                best_accuracy = accuracy
                                torch.save(model.state_dict(), save_path)
                    print('Epoch: {} | Train Loss: {} | Val Loss: {} | Val Accuracy: {}'.format((epoch+1), train_loss, val_loss, round(accuracy, 2)))

In [None]:
# Settings and vocabulary artefacts for the pretrained-embedding model built below.
EMB_DIMS = [300]
VOCAB_SIZES = [100000]
CAT_MODES = ["DIRECT"]
vocab_size = 100000

# NOTE(security): unpickling can execute arbitrary code; only load files that
# were produced by this project. `with` closes each file once it has been read.
with open('pickle/'+str(vocab_size)+'_vectors.pkl', 'rb') as vectors_file:
    vectors = pkl.load(vectors_file)
with open('pickle/'+str(vocab_size)+'_id2token.pkl', 'rb') as id2token_file:
    id2token = pkl.load(id2token_file)
with open('pickle/'+str(vocab_size)+'_token2id.pkl', 'rb') as token2id_file:
    token2id = pkl.load(token2id_file)

In [None]:
# Build the neural-net model on top of the frozen pretrained embeddings;
# keyword arguments spell out what each value configures.
pretrained_model = NNModel(
    vocab_size=100000,
    embed_dim=300,
    n_in=100000,
    h_s=100,
    n_out=3,
    combine_mode="DIRECT",
    is_pretrained=True,
)