In [None]:
# First, let's import the libraries that we are going to use in this lab session
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
# Get top 10k words for each language
#from wordfreq import top_n_list
#from transliterate import translit
import pprint
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from operator import itemgetter

import os, tqdm
import string
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np

def unpickle_from_file(file_name):
    """Read one pickled object from `file_name` and return it.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that were produced by a trusted source.
    """
    with open(file_name, 'rb') as pickled_file:
        restored = pkl.load(pickled_file)
    return restored

# Seed Python's `random` module (this line does not seed numpy or torch).
random.seed(134)
pp = pprint.PrettyPrinter(indent=4)

# Reserved vocabulary ids: padding and out-of-vocabulary tokens.
PAD_IDX, UNK_IDX = 0, 1

# Class names, positioned at their integer id, and the reverse lookup.
idx_to_label = ['contradiction', 'entailment', 'neutral']
label_to_idx = {name: position for position, name in enumerate(idx_to_label)}
    
def token2index_dataset(tokens_data, token2id):
    """Convert space-separated sentences into lists of vocabulary ids.

    @param tokens_data: iterable of strings; each string is split on a single space
    @param token2id: mapping from token to integer id
    @return: list with one list of ids per sentence; tokens missing from
             `token2id` are replaced by UNK_IDX
    """
    def lookup(token):
        if token in token2id:
            return token2id[token]
        return UNK_IDX

    return [[lookup(token) for token in sentence.split(" ")] for sentence in tokens_data]

def label2index_dataset(labels_data, label2id):
    """Map every label in `labels_data` to its integer id.

    @param labels_data: iterable of label names
    @param label2id: mapping from label name to integer id
    @return: list of ids, in input order (a missing label raises KeyError)
    """
    return [label2id[name] for name in labels_data]

def plot_train_validation(loss_lists, val_acc_lists, param, string_xticks=False):
    """Plot training curves, validation curves and the best accuracy per setting.

    @param loss_lists: dict mapping a hyperparameter value to its list of training losses
    @param val_acc_lists: dict mapping a hyperparameter value to its list of validation accuracies
    @param param: hyperparameter name, used in the panel titles
    @param string_xticks: if True, the keys are used as tick labels on evenly
                          spaced positions instead of as x coordinates
    @return: (key with the highest validation accuracy, that accuracy);
             (0, 0) when no accuracy is above 0
    """
    fig = plt.figure(figsize=(20, 5))
    fig.subplots_adjust(wspace=0.05, hspace=0.05)

    # Panel 1: one training-loss curve per setting.
    fig.add_subplot(1, 3, 1)
    for setting, losses in loss_lists.items():
        steps = list(range(len(losses)))
        plt.plot(steps, losses, alpha=0.5, label=str(setting))
    plt.legend()
    plt.title("{}, training".format(param))

    # Panel 2: one validation-accuracy curve per setting, tracking the best one.
    fig.add_subplot(1, 3, 2)
    val_max, val_max_key = 0, 0
    val_key_list, val_max_list = [], []
    for setting, accuracies in val_acc_lists.items():
        steps = list(range(len(accuracies)))
        plt.plot(steps, accuracies, alpha=0.5, label=str(setting))
        best_for_setting = max(accuracies)
        val_key_list.append(setting)
        val_max_list.append(best_for_setting)
        # Strict comparison: on a tie the earlier setting is kept.
        if best_for_setting > val_max:
            val_max, val_max_key = best_for_setting, setting
    plt.legend()
    plt.title("{}, validation".format(param))

    # Panel 3: best validation accuracy as a function of the setting.
    fig.add_subplot(1, 3, 3)
    if not string_xticks:
        plt.plot(val_key_list, val_max_list)
    else:
        positions = list(range(len(val_key_list)))
        plt.xticks(positions, val_key_list)
        plt.plot(positions, val_max_list)
    plt.title("best validation acc for each {}".format(param))
    print(val_key_list, val_max_list)

    plt.show()

    return val_max_key, val_max

def plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, param, string_xticks=False, log=False, bar=False):
    """Plot a 2x2 summary: training loss, validation loss, validation accuracy, best accuracy.

    @param loss_lists: dict mapping a hyperparameter value to its list of training losses
    @param val_loss_lists: dict mapping a hyperparameter value to its list of validation losses
    @param val_acc_lists: dict mapping a hyperparameter value to its list of validation accuracies
    @param param: hyperparameter name, used in the panel titles
    @param string_xticks: if True, keys become tick labels on evenly spaced positions
                          (takes precedence over `bar` and `log`)
    @param log: if True (and `bar` is False), use a logarithmic x axis in the last panel
    @param bar: if True, draw the last panel as a bar chart
    @return: (key with the highest validation accuracy, that accuracy);
             (0, 0) when no accuracy is above 0
    """
    fig = plt.figure(figsize=(15, 10))
    fig.subplots_adjust(wspace=0.05, hspace=0.05)

    # Panels 1 and 2 are drawn the same way: one curve per setting.
    loss_panels = (
        (1, loss_lists, "{}, training loss"),
        (2, val_loss_lists, "{}, validation loss"),
    )
    for panel_index, curves, title_template in loss_panels:
        fig.add_subplot(2, 2, panel_index)
        for setting, losses in curves.items():
            plt.plot(list(range(len(losses))), losses, alpha=0.5, label=str(setting))
        plt.legend()
        plt.title(title_template.format(param))

    # Panel 3: validation accuracy curves, tracking the best setting.
    fig.add_subplot(2, 2, 3)
    val_max, val_max_key = 0, 0
    val_key_list, val_max_list = [], []
    for setting, accuracies in val_acc_lists.items():
        plt.plot(list(range(len(accuracies))), accuracies, alpha=0.5, label=str(setting))
        best_for_setting = max(accuracies)
        val_key_list.append(setting)
        val_max_list.append(best_for_setting)
        # Strict comparison: on a tie the earlier setting is kept.
        if best_for_setting > val_max:
            val_max, val_max_key = best_for_setting, setting
    plt.legend()
    plt.title("{}, validation acc".format(param))

    # Panel 4: best validation accuracy per setting.
    fig.add_subplot(2, 2, 4)
    if string_xticks:
        positions = list(range(len(val_key_list)))
        plt.xticks(positions, val_key_list)
        plt.plot(positions, val_max_list)
    elif bar:
        plt.bar(val_key_list, val_max_list)
    elif log:
        plt.semilogx(val_key_list, val_max_list)
    else:
        plt.plot(val_key_list, val_max_list)
    plt.title("best validation acc for each {}".format(param))
    print(val_key_list, val_max_list)

    plt.show()

    return val_max_key, val_max


def set_words_data(words_to_load, ft_home = './'):
    """Load the first words of the fastText `wiki-news-300d-1M.vec` file.

    Ids 0 and 1 are reserved for '<pad>' and '<unk>'; their embedding rows keep
    the random initialisation. Words read from the file get ids 2, 3, ... in
    file order.

    @param words_to_load: total vocabulary size, including the two reserved ids
    @param ft_home: directory containing the vector file (a trailing path
                    separator is optional)
    @return: (words_ft, idx2words_ft, ordered_words_ft, loaded_embeddings_ft)
             - words_ft: dict token -> id
             - idx2words_ft: dict id -> token
             - ordered_words_ft: list of the loaded words in file order
               (without the reserved tokens)
             - loaded_embeddings_ft: float array of shape (rows, 300); `rows`
               is `words_to_load`, or fewer when the file holds fewer words
    """
    vec_path = os.path.join(ft_home, 'wiki-news-300d-1M.vec')

    loaded_embeddings_ft = np.random.random((words_to_load, 300))
    words_ft = {'<pad>': 0, '<unk>': 1}
    idx2words_ft = {0: '<pad>', 1: '<unk>'}
    ordered_words_ft = []

    next_idx = 2
    # Stream the file: only the first `words_to_load` lines are needed, so
    # reading all of it into memory with readlines() is wasteful.
    with open(vec_path, encoding='utf-8') as f:
        next(f, None)  # the first line is skipped, as before (header line)
        for line in f:
            if next_idx >= words_to_load:
                break
            s = line.split()
            if not s:
                continue  # ignore blank lines instead of failing on s[0]
            # don't skip just because punctuation, too much complication
            loaded_embeddings_ft[next_idx, :] = np.asarray(s[1:], dtype=float)
            words_ft[s[0]] = next_idx
            idx2words_ft[next_idx] = s[0]
            ordered_words_ft.append(s[0])
            next_idx += 1

    # If the file ran out early, drop the rows that were never filled so the
    # matrix height matches len(idx2words_ft).
    if next_idx < words_to_load:
        loaded_embeddings_ft = loaded_embeddings_ft[:next_idx]
    return words_ft, idx2words_ft, ordered_words_ft, loaded_embeddings_ft



In [None]:
# Scratch check of Tensor.squeeze: `test` has shape (3, 4, 5), so dimension 1
# has size 4 and squeeze(1) leaves the shape unchanged (squeeze only removes
# dimensions of size 1). Only the last expression is shown as the cell output.
test = torch.rand((3,4,5))
test.shape
test.squeeze(1).shape

In [None]:
# TODO: Test with one RNN with shared weights, rather than using separate two
class RNN(nn.Module):
    """Sentence-pair classifier built from two bidirectional GRU encoders.

    Each sentence is embedded with a shared embedding table and encoded by its
    own GRU (`rnn1` for sentence 1, `rnn2` for sentence 2). The final forward
    and backward hidden states of the last GRU layer are concatenated into a
    sentence vector; the two sentence vectors are combined according to
    `interaction` and passed through two linear layers.
    """

    def __init__(self, loaded_embeddings_ft, emb_size, hidden_size, num_layers, num_classes, vocab_size, shuffle=False, interaction='concat'):
        """
        @param loaded_embeddings_ft: optional (vocab_size, emb_size) array/tensor of
                                     initial embedding weights, or None
        @param emb_size: embedding size
        @param hidden_size: hidden size of each GRU direction
        @param num_layers: number of GRU layers
        @param num_classes: number of output classes
        @param vocab_size: vocabulary size
        @param shuffle: if True, the sentence-2 encodings are NOT re-aligned with
                        `reorder_sent_2` in forward()
        @param interaction: 'concat' or 'featurewise_multiplication'
        @raise NotImplementedError: for any other `interaction`
        """
        super(RNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        if loaded_embeddings_ft is not None:
            # Build the weights as a plain float tensor; load_state_dict copies
            # them onto whatever device the embedding lives on, so no GPU is
            # required at construction time.
            pretrained = torch.as_tensor(loaded_embeddings_ft, dtype=torch.float)
            self.embedding.load_state_dict({'weight': pretrained})

        self.shuffle = shuffle

        self.rnn1 = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.rnn2 = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.interaction = interaction

        sentence_dim = hidden_size * 2  # forward + backward final states
        if interaction == 'concat':
            self.linear1 = nn.Linear(sentence_dim * 2, hidden_size)
        elif interaction == 'featurewise_multiplication':
            self.linear1 = nn.Linear(sentence_dim, hidden_size)
        else:
            raise NotImplementedError()
        self.linear2 = nn.Linear(hidden_size, num_classes)

    def init_hidden(self, batch_size):
        """Return a random initial hidden state of shape
        (num_layers * 2, batch_size, hidden_size) on the model's device."""
        hidden = torch.randn(self.num_layers*2, batch_size, self.hidden_size)
        # Follow the model's own device instead of assuming CUDA.
        return hidden.to(self.embedding.weight.device)

    def _encode(self, rnn, x, lengths, hidden0):
        """Embed `x`, pack it with `lengths` and return the GRU's final hidden state."""
        embedded = self.embedding(x)
        # pack_padded_sequence expects the lengths on the CPU, sorted in
        # decreasing order (the collate function takes care of the sorting).
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True)
        _, hidden_n = rnn(packed, hidden0)
        return hidden_n

    def forward(self, x1, x2, lengths1, length2, reorder_sent_2):
        """
        @param x1: (batch, seq_len) token ids of sentence 1, sorted by decreasing length
        @param x2: (batch, seq_len) token ids of sentence 2, sorted by decreasing length
        @param lengths1: (batch,) true lengths of the rows of `x1`
        @param length2: (batch,) true lengths of the rows of `x2`
        @param reorder_sent_2: (batch,) index that puts the sentence-2 encodings
                               back in the row order of `x1`
        @return: (batch, num_classes) logits
        """
        batch_size, _ = x1.size()

        # Fresh (random) initial hidden state for every batch.
        self.hidden1 = self._encode(self.rnn1, x1, lengths1, self.init_hidden(batch_size))
        self.hidden2 = self._encode(self.rnn2, x2, length2, self.init_hidden(batch_size))

        # reorder to match batches between two rnn outputs
        if not self.shuffle:
            self.hidden2 = self.hidden2[:, reorder_sent_2, :]

        # The hidden state is laid out layer by layer, forward then backward, so
        # the last layer's two directions sit at [-2] and [-1]. Indexing [0] and
        # [1] would read the first layer whenever num_layers > 1.
        sent1 = torch.cat([self.hidden1[-2], self.hidden1[-1]], 1)
        sent2 = torch.cat([self.hidden2[-2], self.hidden2[-1]], 1)

        if self.interaction == 'concat':
            combined_representation = torch.cat([sent1, sent2], 1)
        elif self.interaction == 'featurewise_multiplication':
            combined_representation = torch.mul(sent1, sent2)
        else:
            raise NotImplementedError()

        # NOTE(review): there is no non-linearity between linear1 and linear2;
        # left as is so that previously saved checkpoints behave the same.
        combined_representation = self.linear1(combined_representation)
        logits = self.linear2(combined_representation)
        return logits

    
class CNN(nn.Module):
    """Sentence-pair classifier built from a shared two-layer 1-D CNN encoder.

    Both sentences go through the same embedding table and the same two
    convolution + ReLU layers, followed by max-pooling over the sequence
    dimension. The two sentence vectors are combined according to
    `interaction` and passed through two linear layers.
    """

    def __init__(self, loaded_embeddings_ft, emb_size, hidden_size, kernel_size, num_layers, num_classes, vocab_size, shuffle=False, interaction='concat'):
        """
        @param loaded_embeddings_ft: optional (vocab_size, emb_size) array/tensor of
                                     initial embedding weights, or None
        @param emb_size: embedding size
        @param hidden_size: number of convolution channels
        @param kernel_size: convolution width; must be odd so that the padding
                            keeps the sequence length unchanged
        @param num_layers: stored on the instance only; the encoder always has
                           two convolution layers
        @param num_classes: number of output classes
        @param vocab_size: vocabulary size
        @param shuffle: if True, the sentence-2 encodings are NOT re-aligned with
                        `reorder_sent_2` in forward()
        @param interaction: 'concat' or 'featurewise_multiplication'
        @raise NotImplementedError: for any other `interaction`
        """
        assert kernel_size % 2 == 1
        padding_size = kernel_size // 2
        super(CNN, self).__init__()

        self.num_layers, self.hidden_size, self.kernel_size = num_layers, hidden_size, kernel_size
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        if loaded_embeddings_ft is not None:
            # Plain float tensor; load_state_dict copies it to the embedding's
            # device, so no GPU is required at construction time.
            pretrained = torch.as_tensor(loaded_embeddings_ft, dtype=torch.float)
            self.embedding.load_state_dict({'weight': pretrained})

        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=kernel_size, padding=padding_size)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=kernel_size, padding=padding_size)

        self.interaction = interaction

        if interaction == 'concat':
            self.linear1 = nn.Linear(hidden_size*2, hidden_size)
        elif interaction == 'featurewise_multiplication':
            self.linear1 = nn.Linear(hidden_size, hidden_size)
        else:
            raise NotImplementedError()
        self.linear2 = nn.Linear(hidden_size, num_classes)

        self.shuffle = shuffle

    def _encode(self, x):
        """Encode (batch, seq_len) token ids into (batch, hidden_size) sentence vectors."""
        # Conv1d works on (batch, channels, seq_len).
        hidden = self.embedding(x).transpose(1, 2)
        # ReLU is elementwise, so it can be applied without reshaping; this also
        # removes the dependence on a separately tracked sequence length.
        hidden = F.relu(self.conv1(hidden))
        hidden = F.relu(self.conv2(hidden))
        # Max over the whole sequence (padded positions included, as before).
        return F.max_pool1d(hidden, hidden.size(2)).squeeze(2)

    def forward(self, x1, x2, lengths1, length2, reorder_sent_2):
        """
        @param x1: (batch, seq_len1) token ids of sentence 1
        @param x2: (batch, seq_len2) token ids of sentence 2; seq_len2 may differ
                   from seq_len1
        @param lengths1: unused, kept for interface parity with RNN
        @param length2: unused, kept for interface parity with RNN
        @param reorder_sent_2: (batch,) index that puts the sentence-2 encodings
                               back in the row order of `x1`
        @return: (batch, num_classes) logits
        """
        hidden1 = self._encode(x1)
        hidden2 = self._encode(x2)

        # reorder to match batches between the two encodings
        if not self.shuffle:
            hidden2 = hidden2[reorder_sent_2]

        if self.interaction == 'concat':
            combined_representation = torch.cat([hidden1, hidden2], 1)
        elif self.interaction == 'featurewise_multiplication':
            combined_representation = torch.mul(hidden1, hidden2)
        else:
            raise NotImplementedError()

        # NOTE(review): there is no non-linearity between linear1 and linear2;
        # left as is so that previously saved checkpoints behave the same.
        combined_representation = self.linear1(combined_representation)
        logits = self.linear2(combined_representation)
        return logits
        



In [None]:
# Tab-separated NLI splits, read from the working directory:
# mnli_train.tsv  mnli_val.tsv  snli_train.tsv  snli_val.tsv
snil_train = pd.read_csv('snli_train.tsv', sep='\t')
snil_val = pd.read_csv('snli_val.tsv', sep='\t')
mnil_train = pd.read_csv('mnli_train.tsv', sep='\t')
mnil_val = pd.read_csv('mnli_val.tsv', sep='\t')

In [None]:
# Sentences longer than this many tokens are truncated by the dataset.
MAX_SENTENCE_LENGTH = 200

class SNILDataset(Dataset):
    """
    Sentence-pair dataset (sentence 1, sentence 2, label) readable by a
    PyTorch DataLoader. Inherits torch.utils.data.Dataset.
    """

    def __init__(self, sent1_list, sent2_list, target_list):
        """
        @param sent1_list: list of token-id lists, one per first sentence
        @param sent2_list: list of token-id lists, one per second sentence
        @param target_list: list of label ids, one per sentence pair

        The three lists must have the same length.
        """
        expected_size = len(target_list)
        assert (len(sent1_list) == expected_size) and (len(sent2_list) == expected_size)
        self.sent1_list = sent1_list
        self.sent2_list = sent2_list
        self.target_list = target_list

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.target_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i].

        @return: [sentence-1 ids, sentence-2 ids, sentence-1 length,
                  sentence-2 length, label, key], where both id lists are
                  truncated to MAX_SENTENCE_LENGTH tokens
        """
        first = self.sent1_list[key][:MAX_SENTENCE_LENGTH]
        second = self.sent2_list[key][:MAX_SENTENCE_LENGTH]
        return [first, second, len(first), len(second), self.target_list[key], key]

def snil_collate_func(batch):
    """
    Customized function for DataLoader that pads every sentence to
    MAX_SENTENCE_LENGTH and sorts the batch for packed-sequence processing.

    @param batch: list of items as returned by SNILDataset.__getitem__:
                  [sent1 ids, sent2 ids, sent1 length, sent2 length, label, key]
    @return: list of LongTensors, on the GPU when one is available, else on the CPU:
             [sent1 data, sent2 data, sent1 lengths, sent2 lengths, labels,
              keys, reorder index]
             - sent1 data/lengths, labels and keys are sorted by decreasing
               sentence-1 length
             - sent2 data/lengths are sorted by decreasing sentence-2 length
             - indexing sentence-2 rows (or their encodings) with the reorder
               index puts them back in the sentence-1 order
    """
    # Use CUDA when it exists, but do not crash on CPU-only machines.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def pad_to_max(token_ids, length):
        # Right-pad with 0 (the padding id). The explicit integer dtype keeps
        # the array integral even for an empty token list.
        return np.pad(np.array(token_ids, dtype=np.int64),
                      pad_width=(0, MAX_SENTENCE_LENGTH - length),
                      mode="constant", constant_values=0)

    def to_long(values):
        return torch.as_tensor(np.ascontiguousarray(values), dtype=torch.long, device=device)

    sent1_data = np.array([pad_to_max(datum[0], datum[2]) for datum in batch])
    sent2_data = np.array([pad_to_max(datum[1], datum[3]) for datum in batch])
    sent1_lengths = np.array([datum[2] for datum in batch])
    sent2_lengths = np.array([datum[3] for datum in batch])
    labels = np.array([datum[4] for datum in batch])
    indices = np.array([datum[5] for datum in batch])

    # First sort everything by decreasing sentence-1 length.
    order_sent1 = np.argsort(sent1_lengths)[::-1]
    sent1_data, sent1_lengths = sent1_data[order_sent1], sent1_lengths[order_sent1]
    sent2_data, sent2_lengths = sent2_data[order_sent1], sent2_lengths[order_sent1]
    labels, indices = labels[order_sent1], indices[order_sent1]

    # Then sort sentence 2 on its own by decreasing length.
    order_sent2 = np.argsort(sent2_lengths)[::-1]
    sent2_data, sent2_lengths = sent2_data[order_sent2], sent2_lengths[order_sent2]

    # order_sent2[i] is the sentence-1-order row now stored at position i, so
    # the inverse permutation (argsort of a permutation) maps each
    # sentence-1-order row to its position in the sorted sentence-2 arrays.
    reorder_sent_2 = np.argsort(order_sent2)

    return [to_long(sent1_data),
            to_long(sent2_data),
            to_long(sent1_lengths), to_long(sent2_lengths),
            to_long(labels), to_long(indices),
            to_long(reorder_sent_2)]


In [None]:
# Maximum number of tokens kept per sentence and the width every sentence is
# padded to. This re-assigns the same value (200) already set in the dataset
# cell; SNILDataset and snil_collate_func read this global at call time.
MAX_SENTENCE_LENGTH = 200

def show_result(model_path, hidden_size, l2_penalty, num_epochs = 10, max_vocab_size=50000, batch_size=32, kernel_size=3, learning_rate=3e-4, phase="test", model_type="rnn", interaction="concat"):
    """Load a saved model and evaluate, fine-tune or inspect it.

    The model is rebuilt with the given hyperparameters, its weights are loaded
    from `model_path`, and the SNLI validation accuracy is printed. Then:

    - phase == "test": print the accuracy on every MNLI validation genre.
    - phase in {"fiction", "telephone", "slate", "government", "travel"}:
      fine-tune on the MNLI training data of that genre, validating on the same
      genre. Only embedding rows 0 and 1 receive gradient updates. Checkpoints
      and loss/accuracy pickles are written to a new experiment directory; if
      the directory already exists the function returns without training.
    - phase == "val": print up to 10 correctly and 10 wrongly classified SNLI
      validation examples.

    @param model_path: path of the state dict to load
    @param hidden_size: hidden size the saved model was built with
    @param l2_penalty: weight decay used for fine-tuning
    @param num_epochs: number of fine-tuning epochs
    @param max_vocab_size: vocabulary size passed to set_words_data
    @param batch_size: batch size of all data loaders
    @param kernel_size: convolution width (CNN only)
    @param learning_rate: Adam learning rate for fine-tuning
    @param phase: see above
    @param model_type: "rnn" or "cnn"
    @param interaction: interaction type passed to the model
    @raise NotImplementedError: for an unknown `model_type` or `phase`
    """
    words_ft, idx2words_ft, ordered_words_ft, loaded_embeddings_ft = set_words_data(max_vocab_size)

    def make_loader(frame):
        # Index one data frame and wrap it in a non-shuffling loader.
        dataset = SNILDataset(token2index_dataset(frame['sentence1'], words_ft),
                              token2index_dataset(frame['sentence2'], words_ft),
                              label2index_dataset(frame['label'], label_to_idx))
        return torch.utils.data.DataLoader(dataset=dataset,
                                           batch_size=batch_size,
                                           collate_fn=snil_collate_func,
                                           shuffle=False)

    genres = mnil_val["genre"].unique()
    mnil_val_loaders = {genre: make_loader(mnil_val[mnil_val.genre==genre]) for genre in genres}
    val_loader = make_loader(snil_val)

    if model_type=="rnn":
        model = RNN(None,
                    emb_size=300,
                    hidden_size=hidden_size,
                    num_layers=1,
                    num_classes=3,
                    vocab_size=len(idx2words_ft),
                    interaction=interaction
                   ).cuda()
    elif model_type=="cnn":
        model = CNN(None,
                    emb_size=300,
                    hidden_size=hidden_size,
                    num_layers=1,
                    num_classes=3,
                    kernel_size=kernel_size,
                    vocab_size=len(idx2words_ft),
                    interaction=interaction
                   ).cuda()
    else:
        raise NotImplementedError()

    total_params = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            print(parameter.shape, parameter.numel())
            total_params += parameter.numel()
    print("total params", total_params)

    model.load_state_dict(torch.load(model_path))
    model.eval()
    test_acc, _ = test_model(val_loader, model)
    print("val acc:", test_acc)
    if phase=="test":
        for genre in genres:
            test_acc, _ = test_model(mnil_val_loaders[genre], model)
            print(genre, "test acc:", test_acc)
    elif phase in ["fiction", "telephone", "slate", "government", "travel"]:
        experiment_name = "{}_mnli_train_fixed_{}_epochs_{}_maxvocab_{}_hid_{}_batch_{}_kernel_{}_lr_{}_l2_{}_interaction_{}".format(
            model_type,
            phase,
            num_epochs,
            max_vocab_size,
            hidden_size,
            batch_size,
            kernel_size,
            learning_rate,
            l2_penalty,
            interaction
        )
        try:
            os.mkdir(experiment_name)
        except FileExistsError:
            # Only an existing directory means "already done"; other OS errors
            # (permissions, bad path, ...) are real failures and propagate.
            print("this experiment is already done.")
            return

        # NOTE(review): the training loader is not shuffled (unchanged).
        mnil_train_loader = make_loader(mnil_train[mnil_train.genre==phase])
        # Validate on the genre being fine-tuned. The earlier code indexed the
        # loaders with the leftover loop variable `genre`, i.e. whichever genre
        # happened to come last in the validation set.
        phase_val_loader = mnil_val_loaders[phase]

        # Criterion and Optimizer
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=l2_penalty)

        loss_lists = []
        val_acc_list = []
        val_loss_lists = []
        print("fixed, three objects will be saved in three separate files")

        for epoch in range(num_epochs):
            loss_list = []
            for i, (data1, data2, lengths1, lengths2, labels, keys, reorder_sent2_list) in enumerate(mnil_train_loader):
                model.train()  # test_model() switches to eval mode, so switch back
                optimizer.zero_grad()
                # Forward pass
                outputs = model(data1, data2, lengths1, lengths2, reorder_sent2_list)
                loss = criterion(outputs, labels)

                # Backward and optimize
                loss.backward()
                # Freeze all embedding rows except <pad> (0) and <unk> (1) by
                # zeroing their gradients.
                # NOTE(review): with l2_penalty > 0 the optimizer's weight decay
                # still changes those rows — confirm this is intended.
                model.embedding.weight.grad.data[2:,:].fill_(0)
                loss_list.append(loss.item())
                optimizer.step()

                # validate every 100 iterations
                if i > 0 and i % 100 == 0:
                    val_acc, _ = test_model(phase_val_loader, model)
                    if len(val_acc_list) == 0 or val_acc > max(val_acc_list):
                        save_path = "{}/epoch_{}_step_{}.p".format(experiment_name, epoch, i)
                        torch.save(model.state_dict(), save_path)
                        print("saved", save_path)
                    val_acc_list.append(val_acc)
                    print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                               epoch+1, num_epochs, i+1, len(mnil_train_loader), val_acc))

            loss_lists.append(np.mean(np.array(loss_list)))

            # validate at the end of the epoch
            val_acc, val_loss_list = test_model(phase_val_loader, model)
            val_loss_lists.append(np.mean(np.array(val_loss_list)))
            if len(val_acc_list) == 0 or val_acc > max(val_acc_list):
                save_path = "{}/epoch_{}_step_epoch_done.p".format(experiment_name, epoch)
                torch.save(model.state_dict(), save_path)
                print("saved", save_path)
            val_acc_list.append(val_acc)
            print('Epoch: [{}/{}], Validation Acc: {}, train mean loss: {}, val mean loss: {}'.format(
                        epoch+1, num_epochs, val_acc, loss_lists[-1], val_loss_lists[-1]))

            # Rewrite the curves after every epoch; `with` closes each file.
            for file_name, history in (("loss.p", loss_lists),
                                       ("acc.p", val_acc_list),
                                       ("val_loss.p", val_loss_lists)):
                with open("{}/{}".format(experiment_name, file_name), "wb") as handle:
                    pkl.dump(history, handle)
    elif phase=="val":
        k=10  # number of correct / wrong examples to show

        model.eval()
        correct_examples_list, wrong_examples_list = [], []
        with torch.no_grad():
            for data1, data2, lengths1, lengths2, labels, keys, reorder_sent2_list in val_loader:
                if len(correct_examples_list) >= k and len(wrong_examples_list) >= k:
                    break
                outputs = F.softmax(model(data1, data2, lengths1, lengths2, reorder_sent2_list), dim=1)
                predicted = outputs.max(1, keepdim=True)[1]

                correct_list = predicted.eq(labels.view_as(predicted)).flatten().data.cpu().numpy()
                labels = labels.flatten().data.cpu().numpy()
                prediction = predicted.flatten().data.cpu().numpy()

                for i in range(labels.shape[0]):
                    is_correct = correct_list[i]
                    if len(correct_examples_list) >= k and len(wrong_examples_list) >= k:
                        break
                    if is_correct:
                        correct_examples_list.append( (labels[i], prediction[i], keys[i]) )
                    else:
                        wrong_examples_list.append( (labels[i], prediction[i], keys[i]) )

        print("Correct examples:")
        for label, prediction, index in correct_examples_list[:k]:
            print("true label is", idx_to_label[label], ", prediction is", idx_to_label[prediction])
            print("original sentense")
            print(snil_val.iloc[int(index)])
        print("\nWrong examples:")
        for label, prediction, index in wrong_examples_list[:k]:
            print("true label is", idx_to_label[label], ", prediction is", idx_to_label[prediction])
            print("original sentense")
            print(snil_val.iloc[int(index)])
    else:
        raise NotImplementedError()

def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    val_loss_list = []
    for data1, data2, lengths1, lengths2, labels, keys, reorder_sent2_list in loader:
        #data_batch, lengths_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data1, data2, lengths1, lengths2, reorder_sent2_list), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
        
        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(outputs, labels)
        val_loss_list.append(loss.item())
    #print()
    return (100 * correct / total), val_loss_list

def train(name, hidden_size, max_vocab_size=50000, batch_size=32, kernel_size=3, learning_rate=3e-4, num_epochs=30, l2_penalty=0, model_type="rnn", shuffle=False, interaction="concat"):
    """
    Train an RNN or CNN model on the SNLI data and store checkpoints and metrics.

    Everything is written to a directory whose name encodes the hyper-parameters.
    If that directory already exists the experiment is considered done and the
    function returns immediately without training.

    @param: name - free-form tag that becomes part of the experiment directory name
    @param: hidden_size - hidden size passed to the model
    @param: max_vocab_size - value passed to set_words_data to build the vocabulary
    @param: batch_size - batch size of the train and validation loaders
    @param: kernel_size - kernel size passed to the CNN (unused by the RNN, but
            still part of the directory name)
    @param: learning_rate - Adam learning rate
    @param: num_epochs - number of passes over the training data
    @param: l2_penalty - passed to Adam as weight_decay
    @param: model_type - "rnn" or "cnn"; anything else raises NotImplementedError
    @param: shuffle - forwarded to the model constructor
    @param: interaction - forwarded to the model constructor
    @return: None
    """
    experiment_name = "{}_snli_{}_maxvocab_{}_hid_{}_batch_{}_kernel_{}_lr_{}_l2_{}_interaction_{}".format(
        model_type,
        name, 
        max_vocab_size,
        hidden_size, 
        batch_size, 
        kernel_size,
        learning_rate,
        l2_penalty,
        interaction
    )
    try:
        os.mkdir(experiment_name)
    except FileExistsError:
        # Only an existing directory means "already done"; any other OS error
        # (missing parent, permissions, ...) must surface to the caller.
        print("this experiment is already done.")
        return
    
    words_ft, idx2words_ft, ordered_words_ft, loaded_embeddings_ft = set_words_data(max_vocab_size)
    
    snil_train_sent1_indices = token2index_dataset(snil_train['sentence1'], words_ft)
    snil_train_sent2_indices = token2index_dataset(snil_train['sentence2'], words_ft)
    snil_val_sent1_indices = token2index_dataset(snil_val['sentence1'], words_ft)
    snil_val_sent2_indices = token2index_dataset(snil_val['sentence2'], words_ft)
    snil_train_label_indices = label2index_dataset(snil_train['label'], label_to_idx)
    snil_val_label_indices = label2index_dataset(snil_val['label'], label_to_idx)
    
    train_dataset = SNILDataset(snil_train_sent1_indices, snil_train_sent2_indices, snil_train_label_indices)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                               batch_size=batch_size,
                                               collate_fn=snil_collate_func,
                                               shuffle=True)    
    val_dataset = SNILDataset(snil_val_sent1_indices, snil_val_sent2_indices, snil_val_label_indices)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                               batch_size=batch_size,
                                               collate_fn=snil_collate_func,
                                               shuffle=False)
    

    if model_type == "rnn":
        model = RNN(loaded_embeddings_ft, 
                    emb_size=300, 
                    hidden_size=hidden_size,
                    num_layers=1, 
                    num_classes=3, 
                    shuffle=shuffle,
                    vocab_size=len(idx2words_ft),
                    interaction=interaction
                   ).cuda()
    elif model_type == "cnn":
        model = CNN(loaded_embeddings_ft, 
                    emb_size=300, 
                    hidden_size=hidden_size,
                    num_layers=1, 
                    kernel_size=kernel_size,
                    num_classes=3, 
                    shuffle=shuffle,
                    vocab_size=len(idx2words_ft),
                    interaction=interaction
                   ).cuda()
    else:
        raise NotImplementedError()

    # Report the size of every trainable tensor and the overall total.
    total_params = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            print(parameter.shape, parameter.numel())
            total_params += parameter.numel()
    print("total params", total_params)

    # Criterion and Optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=l2_penalty)

    # Train the model
    loss_lists = []      # mean training loss, one entry per epoch
    val_acc_list = []    # validation accuracy, one entry per validation run
    val_loss_lists = []  # mean validation loss, one entry per epoch
    print("fixed, three objects will be saved in three separate files")
    for epoch in range(num_epochs):
        loss_list = []
        for i, (data1, data2, lengths1, lengths2, labels, keys, reorder_sent2_list) in enumerate(train_loader):
            # test_model switches to eval mode, so training mode is restored every step
            model.train()
            optimizer.zero_grad()
            # Forward pass
            outputs = model(data1, data2, lengths1, lengths2, reorder_sent2_list)
            loss = criterion(outputs, labels)

            # Backward and optimize
            loss.backward()
            # Zero the gradient of every embedding row except the first two
            # (PAD_IDX and UNK_IDX), so only those two rows are updated.
            model.embedding.weight.grad.data[2:,:].fill_(0)
            loss_list.append(loss.item())
            optimizer.step()

            # validate every 100 iterations
            if i > 0 and i % 100 == 0:
                val_acc, _ = test_model(val_loader, model)
                # checkpoint only when this is the best accuracy seen so far
                if len(val_acc_list) == 0 or val_acc > max(val_acc_list):
                    save_path = "{}/epoch_{}_step_{}.p".format(experiment_name, epoch, i)
                    torch.save(model.state_dict(), save_path)
                    print("saved", save_path)
                val_acc_list.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                           epoch+1, num_epochs, i+1, len(train_loader), val_acc))

        loss_lists.append(np.mean(np.array(loss_list)))

        # validate at the end of the epoch
        val_acc, val_loss_list = test_model(val_loader, model)
        val_loss_lists.append(np.mean(np.array(val_loss_list)))
        if len(val_acc_list) == 0 or val_acc > max(val_acc_list):
            save_path = "{}/epoch_{}_step_epoch_done.p".format(experiment_name, epoch)
            torch.save(model.state_dict(), save_path)
            print("saved", save_path)
        val_acc_list.append(val_acc)
        print('Epoch: [{}/{}], Validation Acc: {}, train mean loss: {}, val mean loss: {}'.format(
                    epoch+1, num_epochs, val_acc, loss_lists[-1], val_loss_lists[-1]))

        # Rewrite the three metric files after every epoch so that an
        # interrupted run still leaves usable curves; each handle is closed.
        for file_name, history in (("loss.p", loss_lists),
                                   ("acc.p", val_acc_list),
                                   ("val_loss.p", val_loss_lists)):
            with open("{}/{}".format(experiment_name, file_name), "wb") as handle:
                pkl.dump(history, handle)
    

# max seq length

In [None]:
# Length of every training sentence, per column.
# NOTE(review): the sentences look like plain strings (token2index_dataset splits
# them on " "), so len() here counts characters, not tokens - confirm that this
# is the unit intended for the maximum sequence length.
len_list_1 = [len(sentence) for sentence in snil_train['sentence1']]
len_list_2 = [len(sentence) for sentence in snil_train['sentence2']]


In [None]:
plt.hist(len_list_1)

In [None]:
plt.hist(len_list_2)

In [None]:
# How many sentence1 lengths fall above / below / exactly on the 200 cut-off.
gt = sum(1 for length in len_list_1 if length > 200)
lt = sum(1 for length in len_list_1 if length < 200)
# whatever is neither greater nor smaller lands in the last bucket
eq = len(len_list_1) - gt - lt
gt, lt, eq

In [None]:
# How many sentence2 lengths fall above / below / exactly on the 200 cut-off.
gt = sum(1 for length in len_list_2 if length > 200)
lt = sum(1 for length in len_list_2 if length < 200)
# whatever is neither greater nor smaller lands in the last bucket
eq = len(len_list_2) - gt - lt
gt, lt, eq

In [None]:
max(len_list_1), max(len_list_2)

In [None]:
np.mean(len_list_1), np.mean(len_list_2)

The maximum sequence length of sentence1 is 406, and the maximum sequence length of sentence2 is 227.

However, only 302 of the 100000 sentence1 examples (0.3%) are longer than 200, and only 2 of the 100000 sentence2 examples are longer than 200. Therefore, we use 200 as the maximum sequence length for both sentence1 and sentence2. This keeps the padded inputs at a reasonable size while truncating very few examples. Note that the lengths above are computed with `len` on the raw sentence strings, so they count characters rather than tokens.

## Verify batch

In [None]:
    # Rebuild the vocabulary and a validation loader so batches can be inspected by hand.
    words_ft, idx2words_ft, ordered_words_ft, loaded_embeddings_ft = set_words_data(50000)

    # Only the validation split is needed here, so the training split is not indexed.
    snil_val_label_indices = label2index_dataset(snil_val['label'], label_to_idx)
    snil_val_sent1_indices, snil_val_sent2_indices = (
        token2index_dataset(snil_val[column], words_ft)
        for column in ('sentence1', 'sentence2')
    )
    val_dataset = SNILDataset(snil_val_sent1_indices, snil_val_sent2_indices, snil_val_label_indices)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=32,
        shuffle=False,
        collate_fn=snil_collate_func,
    )

In [None]:
# Decode the first validation batch back to text so that it can be compared
# by eye with the raw rows of snil_val.
def _decode_until_pad(token_ids):
    """Return the words of `token_ids` up to the first '<pad>', each followed by a space."""
    words = []
    for token in token_ids:
        word = idx2words_ft[token]
        if word == '<pad>':
            break
        words.append(word)
    return "".join(word + " " for word in words)

for data1, data2, lengths1, lengths2, labels, keys, reorder_sent2_list in val_loader:
    keys_np = keys.cpu().numpy()
    # Hoisted out of the row loop. Presumably this indexing puts the sentence2
    # rows back in the order of sentence1 - confirm against snil_collate_func.
    data2_reordered = data2[reorder_sent2_list, :]
    for index_in_batch in range(len(data1)):
        print(snil_val.iloc[keys_np[index_in_batch]])
        # The decoded text is printed even when the row holds no '<pad>' at all;
        # previously such full-length sentences were silently skipped.
        print(_decode_until_pad(data1[index_in_batch].cpu().numpy()))
        print(_decode_until_pad(data2_reordered[index_in_batch].cpu().numpy()))
        print(idx_to_label[labels[index_in_batch].cpu().item()])
    print(keys)
    print(reorder_sent2_list)
    # only the first batch is inspected
    break

In [None]:
    model = RNN(loaded_embeddings_ft, 
                emb_size=300, 
                hidden_size=8,
                num_layers=1, 
                num_classes=3, 
                vocab_size=len(idx2words_ft)).cuda()

In [None]:
    # Sanity check: cross-entropy loss of the current `model` on the first validation batch.
    model.eval()
    criterion = torch.nn.CrossEntropyLoss()
    # evaluation only, so there is no need to build the autograd graph
    with torch.no_grad():
        for data1, data2, lengths1, lengths2, labels, keys, reorder_sent2_list in val_loader:
            outputs = model(data1, data2, lengths1, lengths2, reorder_sent2_list)
            print(criterion(outputs, labels).item())
            # One batch is enough. The CNN construction that used to follow this
            # `break` inside the loop body could never run and was removed.
            break

In [None]:
for max_vocab_size in [20000,40000,60000,80000,100000,120000,140000,160000,200000]:
    train("fixed_kernel_size", 256, max_vocab_size, 32, model_type="cnn")

In [None]:
# Collect the stored curves of the max-vocab-size sweep and plot them.
loss_lists, val_acc_lists = {}, {}
for n in [20000,40000,50000,60000,70000,80000,100000,120000]:
    run_dir = 'cnn_snli_fixed_kernel_size_maxvocab_{}_hid_256_batch_32_lr_0.0003_l2_0'.format(n)
    loss_lists[n] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[n] = unpickle_from_file(run_dir + '/acc.p')
best_hyperparam, best_val = plot_train_validation(loss_lists, val_acc_lists, "max vocab size")
best_hyperparam, best_val

In [None]:
# definitely a bug, probably mixing batch order
show_result('cnn_snli_fixed_kernel_size_maxvocab_60000_hid_256_batch_32_lr_0.0003_l2_0/epoch_4_step_3000.p', 256, 60000, model_type="cnn")

In [None]:
show_result('cnn_snli_fixed_kernel_size_maxvocab_70000_hid_256_batch_32_lr_0.0003_l2_0/epoch_2_step_2200.p', 256, 70000, model_type="cnn")

In [None]:
train("all_kernel_size_fixed", 256, 70000, batch_size=32, num_epochs=15, kernel_size=1, model_type="cnn")

In [None]:
for kernel_size in [3,5,7,9,11]:
    train("all_kernel_size_fixed", 256, 70000, batch_size=32, num_epochs=15, kernel_size=kernel_size, model_type="cnn")

In [None]:
# Collect the stored curves of the kernel-size sweep and plot them.
loss_lists, val_acc_lists = {}, {}
for n in [1,3,5,7,9,11]:
    run_dir = 'cnn_snli_all_kernel_size_fixed_maxvocab_70000_hid_256_batch_32_kernel_{}_lr_0.0003_l2_0'.format(n)
    loss_lists[n] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[n] = unpickle_from_file(run_dir + '/acc.p')
best_hyperparam, best_val = plot_train_validation(loss_lists, val_acc_lists, "kernel size")
best_hyperparam, best_val

In [None]:
# Kernel-size sweep: training loss, validation loss and validation accuracy.
# TODO: align the y axes of the two loss graphs.
loss_lists, val_acc_lists, val_loss_lists = {}, {}, {}
for n in [1,3,5,7,9,11]:
    run_dir = 'cnn_snli_all_kernel_size_fixed_maxvocab_70000_hid_256_batch_32_kernel_{}_lr_0.0003_l2_0'.format(n)
    loss_lists[n] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[n] = unpickle_from_file(run_dir + '/acc.p')
    val_loss_lists[n] = unpickle_from_file(run_dir + '/val_loss.p')
best_hyperparam, best_val = plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, "kernel size")
best_hyperparam, best_val

In [None]:
def count_parameters(model):
    """Return the total number of elements over all trainable parameters of `model`."""
    total = 0
    for param in model.parameters():
        # frozen tensors (requires_grad == False) are not counted
        if param.requires_grad:
            total += param.numel()
    return total

# Number of trainable CNN parameters for every kernel size of the sweep.
for i in [1,3,5,7,9,11]:
    model = CNN(
        None,
        emb_size=300,
        hidden_size=256,
        num_layers=1,
        kernel_size=i,
        num_classes=3,
        vocab_size=70000,
    ).cuda()
    # list every trainable tensor first, then report the total
    for parameter in model.parameters():
        if parameter.requires_grad:
            print(parameter.shape, parameter.numel())
    total_params = count_parameters(model)
    print("kernel size:", i, "total params", total_params)
    

In [None]:
# Number of trainable CNN parameters for every hidden size (kernel size fixed to 1).
for i in [1,2,4,8,16,32,64,128,512,1024]:
    model = CNN(None, 
                emb_size=300, 
                hidden_size=i,
                num_layers=1,
                kernel_size=1,
                num_classes=3, 
                vocab_size=70000).cuda()
    total_params = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            print(parameter.shape, parameter.numel())
            total_params += parameter.numel()
    # `i` is the hidden size here; the label used to say "kernel size:"
    print("hidden size:", i, "total params", total_params)

In [None]:
for hidden_size in [2,4,8,16,32,64,128,512,1024]:
    train("all_kernel_size_fixed", hidden_size, 70000, batch_size=32, num_epochs=30, kernel_size=1, model_type="cnn")

In [None]:
# CNN hidden-size sweep: training loss, validation loss and validation accuracy.
# TODO: align the y axes of the two loss graphs.
loss_lists, val_acc_lists, val_loss_lists = {}, {}, {}
for n in [2,4,8,16,32,64,128,256,512,1024]:
    run_dir = 'cnn_snli_all_kernel_size_fixed_maxvocab_70000_hid_{}_batch_32_kernel_1_lr_0.0003_l2_0'.format(n)
    loss_lists[n] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[n] = unpickle_from_file(run_dir + '/acc.p')
    val_loss_lists[n] = unpickle_from_file(run_dir + '/val_loss.p')
best_hyperparam, best_val = plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, "hidden size")
best_hyperparam, best_val

In [None]:
# Same CNN hidden-size plot, restricted to hidden sizes of 8 and above.
# TODO: align the y axes of the two loss graphs.
loss_lists, val_acc_lists, val_loss_lists = {}, {}, {}
for n in [8,16,32,64,128,256,512,1024]:
    run_dir = 'cnn_snli_all_kernel_size_fixed_maxvocab_70000_hid_{}_batch_32_kernel_1_lr_0.0003_l2_0'.format(n)
    loss_lists[n] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[n] = unpickle_from_file(run_dir + '/acc.p')
    val_loss_lists[n] = unpickle_from_file(run_dir + '/val_loss.p')
best_hyperparam, best_val = plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, "hidden size")
best_hyperparam, best_val

In [None]:
np.linspace(1e-06,1e-02,6)

In [None]:
np.logspace(-6,1,8)

In [None]:
np.logspace(-12,1,14)

In [None]:
for l2 in np.logspace(-12,1,14):
    train("l2", 256, 70000, batch_size=32, num_epochs=30, kernel_size=1, l2_penalty=l2, model_type="cnn")

In [None]:
# CNN l2-penalty sweep: training loss, validation loss and validation accuracy.
# TODO: align the y axes of the two loss graphs.
loss_lists, val_acc_lists, val_loss_lists = dict(), dict(), dict()
for n in np.logspace(-12,1,14):
    run_dir = 'cnn_snli_l2_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_{}'.format(n)
    loss_lists[n] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[n] = unpickle_from_file(run_dir + '/acc.p')
    val_loss_lists[n] = unpickle_from_file(run_dir + '/val_loss.p')
# The swept value is the l2 penalty (hidden size is fixed at 256), so the plot
# is labelled "l2"; it used to be labelled "hidden size".
best_hyperparam, best_val = plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, "l2", log=True)
best_hyperparam, best_val

In [None]:
# CNN l2-penalty sweep: training loss, validation loss and validation accuracy.
# TODO: align the y axes of the two loss graphs.
loss_lists, val_acc_lists, val_loss_lists = {}, {}, {}
for n in np.logspace(-12,1,14):
    run_dir = 'cnn_snli_l2_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_{}'.format(n)
    loss_lists[n] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[n] = unpickle_from_file(run_dir + '/acc.p')
    val_loss_lists[n] = unpickle_from_file(run_dir + '/val_loss.p')
best_hyperparam, best_val = plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, "l2", log=True)
best_hyperparam, best_val

In [None]:
#verified validation acc stays around 33.1
#train("test_shuffle", 256, 70000, batch_size=32, num_epochs=30, kernel_size=1, l2_penalty=1e-09, model_type="cnn", shuffle=True)

In [None]:
# Try featurewise multiplication

In [None]:
train("interaction_fixed", 256, 70000, batch_size=32, num_epochs=30, kernel_size=1, model_type="cnn", interaction='featurewise_multiplication')

In [None]:
# Less performance around 67.9
# train("interaction_fixed", 256, 70000, batch_size=32, num_epochs=30, kernel_size=3, model_type="cnn", interaction='featurewise_multiplication')

In [None]:
for l2 in np.logspace(-12,1,14):
    train("interaction_l2", 256, 70000, batch_size=32, num_epochs=30, kernel_size=1, l2_penalty=l2, model_type="cnn", interaction='featurewise_multiplication')

In [None]:
# CNN: concatenation vs. feature-wise multiplication of the two sentence encodings.
# TODO: align the y axes of the two loss graphs.
run_dirs = {
    'concat': 'cnn_snli_all_kernel_size_fixed_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_0',
    'multiply': 'cnn_snli_interaction_fixed_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_0_interaction_featurewise_multiplication',
}
loss_lists, val_acc_lists, val_loss_lists = {}, {}, {}
for label, run_dir in run_dirs.items():
    loss_lists[label] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[label] = unpickle_from_file(run_dir + '/acc.p')
    val_loss_lists[label] = unpickle_from_file(run_dir + '/val_loss.p')
best_hyperparam, best_val = plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, "combining two sentences")
best_hyperparam, best_val

In [None]:
# CNN with feature-wise multiplication: l2-penalty sweep.
# TODO: align the y axes of the two loss graphs.
loss_lists, val_acc_lists, val_loss_lists = {}, {}, {}
for n in np.logspace(-12,-1,12):
    run_dir = 'cnn_snli_interaction_l2_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_{}_interaction_featurewise_multiplication'.format(n)
    loss_lists[n] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[n] = unpickle_from_file(run_dir + '/acc.p')
    val_loss_lists[n] = unpickle_from_file(run_dir + '/val_loss.p')
best_hyperparam, best_val = plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, "l2, multiply", log=True)
best_hyperparam, best_val

In [None]:
# Try weight sharing

In [None]:
show_result('rnn_snli_ftgru_maxvocab_50000_hid_256_batch_64_lr_0.0003/epoch_8_step_3000.p',256)

In [None]:
show_result('cnn_snli_interaction_l2_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_1e-10_interaction_featurewise_multiplication/epoch_16_step_2500.p',256,max_vocab_size=70000,kernel_size=1,model_type="cnn",interaction="featurewise_multiplication")

In [None]:
for hidden_size in [2,4,8,16,32,64,128,512,1024]:
    train("step2", hidden_size, 70000, batch_size=32, num_epochs=30, model_type="rnn",interaction="featurewise_multiplication")

In [None]:
train("step2", 256, 70000, batch_size=32, num_epochs=30, model_type="rnn",interaction="featurewise_multiplication")

In [None]:
# RNN hidden-size sweep: training loss, validation loss and validation accuracy.
# TODO: align the y axes of the two loss graphs.
loss_lists, val_acc_lists, val_loss_lists = {}, {}, {}
for n in [2,4,8,16,32,64,128,256,512,1024]:
    run_dir = 'rnn_snli_step2_maxvocab_70000_hid_{}_batch_32_kernel_3_lr_0.0003_l2_0_interaction_featurewise_multiplication'.format(n)
    loss_lists[n] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[n] = unpickle_from_file(run_dir + '/acc.p')
    val_loss_lists[n] = unpickle_from_file(run_dir + '/val_loss.p')
best_hyperparam, best_val = plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, "hidden size")
best_hyperparam, best_val

In [None]:
# Number of trainable RNN parameters for every hidden size of the sweep.
for i in [2,4,8,16,32,64,128,256,512,1024]:
    model = RNN(None, 
                emb_size=300, 
                hidden_size=i,
                num_layers=1,
                num_classes=3, 
                vocab_size=70000,
               interaction="featurewise_multiplication")
    total_params = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            print(parameter.shape, parameter.numel())
            total_params += parameter.numel()
    # `i` is the hidden size here; the label used to say "kernel size:"
    print("hidden size:", i, "total params", total_params)

In [None]:
# RNN (hidden size 512): intermediate look at the first l2-penalty runs.
# TODO: align the y axes of the two loss graphs.
loss_lists, val_acc_lists, val_loss_lists = {}, {}, {}
for n in np.logspace(-12,-10,3):
    run_dir = 'rnn_snli_step2_maxvocab_70000_hid_512_batch_32_kernel_3_lr_0.0003_l2_{}_interaction_featurewise_multiplication'.format(n)
    loss_lists[n] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[n] = unpickle_from_file(run_dir + '/acc.p')
    val_loss_lists[n] = unpickle_from_file(run_dir + '/val_loss.p')
best_hyperparam, best_val = plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, "l2 penalty (temp)")
best_hyperparam, best_val

In [None]:
np.logspace(-16,-13,4)

In [None]:
for each in reversed(np.logspace(-12,-2,11)):
    print(each)

In [None]:
train("step2", 512, 70000, batch_size=32, num_epochs=30, model_type="rnn",interaction="concat")
for l2 in np.logspace(-12,1,14):
    train("step2", 512, 70000, batch_size=32, num_epochs=30, l2_penalty=l2, model_type="rnn",interaction="featurewise_multiplication")

In [None]:
for l2 in reversed(np.logspace(-16,-13,4)):
    train("step2", 512, 70000, batch_size=32, num_epochs=30, l2_penalty=l2, model_type="rnn",interaction="featurewise_multiplication")

In [None]:
# RNN: concatenation vs. feature-wise multiplication of the two sentence encodings.
# TODO: align the y axes of the two loss graphs.
run_dirs = {
    'concat': 'rnn_snli_step2_maxvocab_70000_hid_512_batch_32_kernel_3_lr_0.0003_l2_0_interaction_concat',
    'multiply': 'rnn_snli_step2_maxvocab_70000_hid_512_batch_32_kernel_3_lr_0.0003_l2_0_interaction_featurewise_multiplication',
}
loss_lists, val_acc_lists, val_loss_lists = {}, {}, {}
for label, run_dir in run_dirs.items():
    loss_lists[label] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[label] = unpickle_from_file(run_dir + '/acc.p')
    val_loss_lists[label] = unpickle_from_file(run_dir + '/val_loss.p')
best_hyperparam, best_val = plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, "combining two sentences")
best_hyperparam, best_val

In [None]:
np.logspace(-16,-2,15)

In [None]:
# RNN (hidden size 512): l2-penalty sweep, with the un-penalised run (l2 = 0) first.
# TODO: align the y axes of the two loss graphs.
loss_lists, val_acc_lists, val_loss_lists = {}, {}, {}
for n in [0, *np.logspace(-16,-3,14)]:
    run_dir = 'rnn_snli_step2_maxvocab_70000_hid_512_batch_32_kernel_3_lr_0.0003_l2_{}_interaction_featurewise_multiplication'.format(n)
    loss_lists[n] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[n] = unpickle_from_file(run_dir + '/acc.p')
    val_loss_lists[n] = unpickle_from_file(run_dir + '/val_loss.p')
best_hyperparam, best_val = plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, "l2 penalty (temp)", log=True)
best_hyperparam, best_val

# Bonus

In [None]:
show_result('cnn_snli_interaction_l2_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_1e-10_interaction_featurewise_multiplication/epoch_16_step_2500.p',256,l2_penalty=1e-10,phase="fiction",max_vocab_size=70000,kernel_size=1,model_type="cnn",interaction="featurewise_multiplication")

In [None]:
show_result('cnn_snli_interaction_l2_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_1e-10_interaction_featurewise_multiplication/epoch_16_step_2500.p',256,l2_penalty=1e-10,phase="telephone",max_vocab_size=70000,kernel_size=1,model_type="cnn",interaction="featurewise_multiplication")

In [None]:
show_result('cnn_snli_interaction_l2_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_1e-10_interaction_featurewise_multiplication/epoch_16_step_2500.p',256,l2_penalty=1e-10,phase="slate",max_vocab_size=70000,kernel_size=1,model_type="cnn",interaction="featurewise_multiplication")

In [None]:
show_result('cnn_snli_interaction_l2_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_1e-10_interaction_featurewise_multiplication/epoch_16_step_2500.p',256,l2_penalty=1e-10,phase="government",max_vocab_size=70000,kernel_size=1,model_type="cnn",interaction="featurewise_multiplication")

In [None]:
show_result('cnn_snli_interaction_l2_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_1e-10_interaction_featurewise_multiplication/epoch_16_step_2500.p',256,l2_penalty=1e-10,phase="travel",max_vocab_size=70000,kernel_size=1,model_type="cnn",interaction="featurewise_multiplication")

In [None]:
# CNN fine-tuned on each MNLI genre: training loss, validation loss and accuracy.
# TODO: align the y axes of the two loss graphs.
loss_lists, val_acc_lists, val_loss_lists = {}, {}, {}
for n in ["fiction", "telephone", "slate", "government", "travel"]:
    run_dir = 'cnn_mnli_train_fixed_{}_epochs_10_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_1e-10_interaction_featurewise_multiplication'.format(n)
    loss_lists[n] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[n] = unpickle_from_file(run_dir + '/acc.p')
    val_loss_lists[n] = unpickle_from_file(run_dir + '/val_loss.p')
best_hyperparam, best_val = plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, "finetuning category")
best_hyperparam, best_val

In [None]:
show_result('rnn_snli_step2_maxvocab_70000_hid_512_batch_32_kernel_3_lr_0.0003_l2_0_interaction_featurewise_multiplication/epoch_7_step_1500.p',512,l2_penalty=0,phase="fiction",max_vocab_size=70000,model_type="rnn",interaction="featurewise_multiplication")

In [None]:
show_result('rnn_snli_step2_maxvocab_70000_hid_512_batch_32_kernel_3_lr_0.0003_l2_0_interaction_featurewise_multiplication/epoch_7_step_1500.p',512,l2_penalty=0,phase="telephone",max_vocab_size=70000,model_type="rnn",interaction="featurewise_multiplication")

In [None]:
show_result('rnn_snli_step2_maxvocab_70000_hid_512_batch_32_kernel_3_lr_0.0003_l2_0_interaction_featurewise_multiplication/epoch_7_step_1500.p',512,l2_penalty=0,phase="slate",max_vocab_size=70000,model_type="rnn",interaction="featurewise_multiplication")

In [None]:
show_result('rnn_snli_step2_maxvocab_70000_hid_512_batch_32_kernel_3_lr_0.0003_l2_0_interaction_featurewise_multiplication/epoch_7_step_1500.p',512,l2_penalty=0,phase="government",max_vocab_size=70000,model_type="rnn",interaction="featurewise_multiplication")

In [None]:
show_result('rnn_snli_step2_maxvocab_70000_hid_512_batch_32_kernel_3_lr_0.0003_l2_0_interaction_featurewise_multiplication/epoch_7_step_1500.p',512,l2_penalty=0,phase="travel",max_vocab_size=70000,model_type="rnn",interaction="featurewise_multiplication")

In [None]:
show_result('rnn_snli_step2_maxvocab_70000_hid_512_batch_32_kernel_3_lr_0.0003_l2_0_interaction_featurewise_multiplication/epoch_7_step_1500.p',512,l2_penalty=0,phase="test",max_vocab_size=70000,model_type="rnn",interaction="featurewise_multiplication")

In [None]:
# RNN fine-tuned on each MNLI genre: training loss, validation loss and accuracy.
# TODO: align the y axes of the two loss graphs.
loss_lists, val_acc_lists, val_loss_lists = {}, {}, {}
for n in ["fiction", "telephone", "slate", "government", "travel"]:
    run_dir = 'rnn_mnli_train_fixed_{}_epochs_10_maxvocab_70000_hid_512_batch_32_kernel_3_lr_0.0003_l2_0_interaction_featurewise_multiplication'.format(n)
    loss_lists[n] = unpickle_from_file(run_dir + '/loss.p')
    val_acc_lists[n] = unpickle_from_file(run_dir + '/acc.p')
    val_loss_lists[n] = unpickle_from_file(run_dir + '/val_loss.p')
best_hyperparam, best_val = plot_train_validation_new(loss_lists, val_loss_lists, val_acc_lists, "finetuning category", bar=False)
best_hyperparam, best_val

In [None]:
show_result('rnn_snli_step2_maxvocab_70000_hid_512_batch_32_kernel_3_lr_0.0003_l2_0_interaction_featurewise_multiplication/epoch_7_step_1500.p',512,l2_penalty=0,phase="val",max_vocab_size=70000,model_type="rnn",interaction="featurewise_multiplication")

In [None]:
show_result('cnn_snli_interaction_l2_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_1e-10_interaction_featurewise_multiplication/epoch_16_step_2500.p',256,l2_penalty=1e-10,phase="val",max_vocab_size=70000,kernel_size=1,model_type="cnn",interaction="featurewise_multiplication")

In [None]:
show_result('cnn_snli_interaction_l2_maxvocab_70000_hid_256_batch_32_kernel_1_lr_0.0003_l2_1e-10_interaction_featurewise_multiplication/epoch_16_step_2500.p',256,l2_penalty=1e-10,phase="val",max_vocab_size=70000,kernel_size=1,model_type="cnn",interaction="featurewise_multiplication")

In [None]:
words_ft, idx2words_ft, _, _ = set_words_data(70000)

In [None]:
words_ft['mural']

In [None]:
# Number of MNLI training rows in every genre.
for phase in ["fiction", "telephone", "slate", "government", "travel"]:
    rows_in_genre = mnil_train[mnil_train.genre == phase]
    print(len(rows_in_genre))