In [3]:
import numpy as np, pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
import nltk
from nltk.util import ngrams
from torch.utils.data.sampler import SubsetRandomSampler
random.seed(15)

import string
#PAD_IDX = 0
#UNK_IDX = 1

PAD_IDX = int(52948)
UNK_IDX = int(77808)
BATCH_SIZE = 32

In [4]:
#convert a sentence into a sequnce of tokens
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [word2idx[token] if token in word2idx else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data


In [5]:
def tokenize_dataset(dataset):
    token_dataset = []
    all_tokens = []
    
    for sample in dataset:
        #don't remove punctuation
        #sample = sample.translate(str.maketrans('','',string.punctuation))
        tokens = nltk.word_tokenize(sample.lower())
        token_dataset.append(tokens)
        all_tokens += tokens

    return token_dataset, all_tokens

In [6]:
def test_model(model, train=False):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total_sample = 0
    total_loss = 0
    
    model.eval()
    # get a random sample
    if train:
        loader = torch.utils.data.DataLoader(
            SNLIDataset(train_sent1_indices, train_sent2_indices, train_label),batch_size=BATCH_SIZE,collate_fn=SNLIvocab_collate_func,
            sampler=SubsetRandomSampler(range(10*BATCH_SIZE)))
    else:
        loader = torch.utils.data.DataLoader(
            SNLIDataset(val_sent1_indices, val_sent2_indices, val_label),batch_size=BATCH_SIZE,collate_fn=SNLIvocab_collate_func,
            sampler=SubsetRandomSampler(range(10*BATCH_SIZE)))

    for i, sample in enumerate(loader):
            size = sample[0].shape[0]
            #print(sample[0].shape)
            outputs = F.softmax(model(sample[0], sample[1]), dim=1)
            minibatch_loss = criterion(outputs, sample[2])
            total_loss += minibatch_loss.item()
            predicted = outputs.max(1, keepdim=True)[1].view(-1)
            
            total_sample += size
            label = sample[2]
            correct += predicted.eq(label.view_as(predicted)).sum().item()

    total_batch = i + 1
    acc = 100 * correct / total_sample
    los = total_loss / total_batch
    return acc, los



In [7]:
torch.__version__

'0.4.1'

In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/zh1087/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Data Preparation

### Load dataset

In [5]:
with open("hw2_data/snli_train.tsv") as f:
    train = f.read().split('\n')
    #print(train)
    train_data = [row.split('\t') for row in train[1:-1]]
    train_sent1 = [row[0] for row in train_data]
    train_sent2 = [row[1] for row in train_data]
    train_label = [int(0) if row[2] == 'entailment' else int(1) if row[2] == 'contradiction' else int(2) for row in train_data]
    
with open("hw2_data/snli_val.tsv") as f:
    val = f.read().split('\n')
    #print(train)
    val_data = [row.split('\t') for row in val[1:-1]]
    val_sent1 = [row[0] for row in val_data]
    val_sent2 = [row[1] for row in val_data]
    val_label = [int(0) if row[2] == 'entailment' else int(1) if row[2] == 'contradiction' else int(2) for row in val_data]
       

In [6]:
pkl.dump(train_label, open("train_label.p", "wb"))
pkl.dump(val_label, open("val_label.p", "wb"))

In [7]:
#train_sent1

### Tokenize dataset

In [7]:
# train set tokens
print ("Tokenizing train data")
train_sent1_tokens, all_train_sent1_tokens = tokenize_dataset(train_sent1)
train_sent2_tokens, all_train_sent2_tokens = tokenize_dataset(train_sent2)

pkl.dump(train_sent1_tokens, open("train_sent1_tokens.p", "wb"))
pkl.dump(train_sent2_tokens, open("train_sent2_tokens.p", "wb"))

# val set tokens
print ("Tokenizing val data")
val_sent1_tokens, _ = tokenize_dataset(val_sent1)
val_sent2_tokens, _ = tokenize_dataset(val_sent2)

pkl.dump(val_sent1_tokens, open("val_sent1_tokens.p", "wb"))
pkl.dump(val_sent2_tokens, open("val_sent2_tokens.p", "wb"))

Tokenizing train data
Tokenizing val data


In [8]:
#load tokenized data:
train_sent1_tokens = pkl.load(open('train_sent1_tokens.p','rb'))
train_sent2_tokens = pkl.load(open('train_sent2_tokens.p','rb'))

val_sent1_tokens = pkl.load(open('val_sent1_tokens.p','rb'))
val_sent2_tokens = pkl.load(open('val_sent2_tokens.p','rb'))

In [9]:
len(list(set(all_train_sent1_tokens+all_train_sent2_tokens)))

21015

### Build vocabulary from pretrained embedding matrix, build word2id, id2word, word2vec

In [11]:
#does the dataset contain 'PAD' and 'UNK'
all_word_pretrained = [] 
with open('./wiki-news-300d-1M.vec') as f:
    next(f)
    all_words_pretrained = []
    for i, line in enumerate(f):
        line = line.split()
        all_word_pretrained.append(line[0])
        
        

In [11]:
#check if 'PAD' 'UNK' already have pretrained word embedding
print('PAD in pretrained:','PAD' in all_word_pretrained)
print('UNK in pretrained:','PAD' in all_word_pretrained)

PAD in pretrained: True
UNK in pretrained: True


In [10]:
words_to_load = 500000
all_words = [] #all words for our vocabulary
word2idx = {}
idx2word = {}
word2vec = {}
weights_matrix = np.zeros((words_to_load, 300))
    
with open('./wiki-news-300d-1M.vec') as f:
    next(f)
    #'PAD', 'UNK' already in pretrained
    #randomized a vector representation for PAD
    #weights_matrix[0,:] = np.random.normal(scale=0.6, size=(300, ))
    #randomized a vector representation for UNK
    #weights_matrix[1,:] = np.random.normal(scale=0.6, size=(300, ))
    for i, line in enumerate(f):
        if i >= words_to_load: 
            break
        line = line.split()
        if line[0] == 'PAD':
            print('find')
            print(i)
            PAD_IDX = i
        if line[0] == 'UNK':
            print('find')
            print(i)
            UNK_IDX = i
        all_words.append(line[0])
        weights_matrix[i, :] = np.array(line[1:])
        word2idx[line[0]] = i
        idx2word[i] = line[0]
print('PAD_IDX is {}'.format(PAD_IDX))
print('UNK_IDX is {}'.format(UNK_IDX))
word2vec = {w: weights_matrix[word2idx[w],:] for w in all_words}

find
52948
find
77808
PAD_IDX is 52948
UNK_IDX is 77808


In [13]:
word2idx['PAD']

52948

In [13]:
word2vec['UNK']

array([ 0.4084,  0.1814,  0.0993, -0.0427,  0.0475,  0.1457, -0.1938,
        0.083 , -0.0065,  0.1158,  0.1674, -0.1335,  0.0726, -0.0866,
        0.0229,  0.1958,  0.2004,  0.1319,  0.0562, -0.1416, -0.5682,
       -0.2068,  0.1826,  0.0426,  0.2019, -0.0824,  0.0769,  0.0712,
        0.1564,  0.0922, -0.1993,  0.2104, -0.0782, -0.0809, -0.2806,
       -0.2862,  0.1417, -0.022 , -0.002 , -0.1562,  0.1064,  0.0134,
        0.0584, -0.0326,  0.0428, -0.0953,  0.087 ,  0.0858, -0.1559,
       -0.0927,  0.1273,  0.1616, -0.8534, -0.1217,  0.0891, -0.1072,
       -0.1505, -0.0263, -0.0411,  0.0292,  0.089 ,  0.0444, -0.1071,
       -0.168 , -0.413 ,  0.1468,  0.1117, -0.0966,  0.0731, -0.1675,
       -0.0708,  0.0626,  0.2237, -0.009 , -0.003 ,  0.111 , -0.211 ,
        0.1294, -0.0849,  0.0745, -0.0402,  0.0142, -0.0717, -0.1138,
       -0.0544, -0.0934,  0.159 , -0.0286,  0.1799,  0.1802, -0.0691,
        0.0164, -0.1306, -0.0316,  0.07  ,  0.1684,  0.2396, -0.0592,
        0.1818,  0.3

In [14]:
weights_matrix.shape

(500000, 300)

In [14]:
pkl.dump(all_words, open("all_words.p", "wb"))
pkl.dump(word2idx, open("word2idx.p", "wb"))
pkl.dump(idx2word, open("idx2word.p", "wb"))
pkl.dump(word2vec, open("word2vec.p", "wb"))
pkl.dump(weights_matrix, open('weights_matrix.p','wb'))

In [15]:
#load back data:
all_words = pkl.load(open('all_words.p','rb'))
word2idx = pkl.load(open('word2idx.p','rb'))
idx2word = pkl.load(open('idx2word.p','rb'))
word2vec = pkl.load(open('word2vec.p','rb'))
weights_matrix = pkl.load(open('weights_matrix.p','rb'))

In [16]:
# Lets check the dictionary by loading random token from it

random_token_id = random.randint(0, len(idx2word)-1)
random_token = idx2word[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, idx2word[random_token_id]))
print ("Token {}; token id {}".format(random_token, word2idx[random_token]))

Token id 109565 ; token Auriemma
Token Auriemma; token id 109565


### convert token to id in the dataset

In [6]:
train_sent1_indices = token2index_dataset(train_sent1_tokens)
train_sent2_indices = token2index_dataset(train_sent2_tokens)

val_sent2_indices = token2index_dataset(val_sent2_tokens)
val_sent1_indices = token2index_dataset(val_sent1_tokens)

# double checking
print ("Train sent1 size is {}".format(len(train_sent1_indices)))
print ("Train sent2 size is {}".format(len(train_sent2_indices)))
print ("Val dataset size is {}".format(len(val_sent1_indices)))
print ("Val dataset size is {}".format(len(val_sent2_indices)))



In [20]:
pkl.dump(train_sent1_indices, open("train_sent1_indices.p", "wb"))
pkl.dump(train_sent2_indices, open("train_sent2_indices.p", "wb"))
pkl.dump(val_sent2_indices, open("val_sent2_indices.p", "wb"))
pkl.dump(val_sent1_indices, open("val_sent1_indices.p", "wb"))

## Create Dataset and Dataloader

In [11]:
##load back all necessary data:
all_words = pkl.load(open('all_words.p','rb'))
word2idx = pkl.load(open('word2idx.p','rb'))
idx2word = pkl.load(open('idx2word.p','rb'))
word2vec = pkl.load(open('word2vec.p','rb'))
weights_matrix = pkl.load(open('weights_matrix.p','rb'))

train_sent1_indices = pkl.load(open("train_sent1_indices.p", "rb"))
train_sent2_indices = pkl.load(open("train_sent2_indices.p", "rb"))
val_sent2_indices = pkl.load(open("val_sent2_indices.p", "rb"))
val_sent1_indices = pkl.load(open("val_sent1_indices.p", "rb"))

train_label = pkl.load(open("train_label.p", "rb"))
val_label = pkl.load(open("val_label.p", "rb"))



In [12]:
MAX_SENTENCE_LENGTH = 25
class SNLIDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, sent1_data, sent2_data, target_list):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.sent1_data = sent1_data
        self.sent2_data = sent2_data
        self.target_list = target_list
        assert (len(self.sent1_data) == len(self.target_list))
        assert (len(self.sent2_data) == len(self.target_list))

    def __len__(self):
        return len(self.target_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        item = dict()
        
        sent1_index_list = self.sent1_data[key][:MAX_SENTENCE_LENGTH]
        sent2_index_list = self.sent2_data[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [sent1_index_list, sent2_index_list, label]

In [13]:
PAD_IDX

52948

In [14]:
#note since PAD is already in dataset, here we need to pad with PAD_IDX not 0
def SNLIvocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    sent1_list = []
    sent2_list = []
    label_list = []

    for datum in batch:
        label_list.append(datum[2])
        
    # padding
    for datum in batch:
        padded_vec0 = np.pad(np.array(datum[0]),
                                pad_width=((0 ,MAX_SENTENCE_LENGTH-len(datum[0]))),
                                mode="constant", constant_values=PAD_IDX)
        sent1_list.append(list(padded_vec0))
    
        padded_vec1 = np.pad(np.array(datum[1]),
                                pad_width=((0 ,MAX_SENTENCE_LENGTH-len(datum[1]))),
                                mode="constant", constant_values=PAD_IDX)
        sent2_list.append(list(padded_vec1))

    return [torch.from_numpy(np.array(sent1_list)),torch.from_numpy(np.array(sent2_list)), torch.LongTensor(label_list)]



In [15]:
BATCH_SIZE = 32
train_dataset = SNLIDataset(train_sent1_indices, train_sent2_indices, train_label)
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                          batch_size = BATCH_SIZE,
                                          collate_fn = SNLIvocab_collate_func,
                                          shuffle = True)

val_dataset = SNLIDataset(val_sent1_indices, val_sent2_indices, val_label)
val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                        batch_size = BATCH_SIZE,
                                        collate_fn = SNLIvocab_collate_func,
                                        shuffle = False)


## RNN model:

In [9]:
class RNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, num_layers, num_classes):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        # vocab_size: vocabulary size
        super().__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(weights_matrix),freeze=True)
        emb_size = weights_matrix.shape[1]
        
        self.rnn1 = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, dropout=0.5, bidirectional=True)
        self.rnn2 = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, dropout=0.5, bidirectional=True)
        
        self.fc1 = nn.Linear(hidden_size * 2 * 2, 300)
        self.dropout1 = nn.Dropout(0.3) 
        self.out = nn.Linear(300, num_classes)
        
    
        
    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        hidden = torch.randn(self.num_layers*2, batch_size, self.hidden_size)

        return hidden

    def forward(self, sent1, sent2):
        # reset hidden state

        batch_size = BATCH_SIZE

        self.hidden1 = self.init_hidden(batch_size)
        self.hidden2 = self.init_hidden(batch_size)

        # get embedding of characters
        sent1_embed = self.embedding(sent1)
        sent2_embed = self.embedding(sent2)
        
        # fprop though RNN
        sent1_rnn_out, self.hidden1 = self.rnn1(sent1_embed, self.hidden1)
        sent2_rnn_out, self.hidden2 = self.rnn2(sent2_embed, self.hidden2)
       
    
        # sum hidden activations of RNN across time
        sent1_rnn_out = torch.sum(sent1_rnn_out, dim=1)
        sent2_rnn_out = torch.sum(sent2_rnn_out, dim=1)
        
        rnn_out = torch.cat([sent1_rnn_out, sent2_rnn_out], 1)
        #print(rnn_out.shape)
        
        x = self.fc1(rnn_out)
        x = F.relu(x)
        x = self.dropout1(x)
        
        logits = self.out(x)
        return logits



In [12]:
model = RNN(weights_matrix=weights_matrix, hidden_size=150, num_layers=1, num_classes=3)
print(model)


RNN(
  (embedding): Embedding(500000, 300)
  (rnn1): GRU(300, 150, batch_first=True, dropout=0.5, bidirectional=True)
  (rnn2): GRU(300, 150, batch_first=True, dropout=0.5, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=300, bias=True)
  (dropout1): Dropout(p=0.3)
  (out): Linear(in_features=300, out_features=3, bias=True)
)


  "num_layers={}".format(dropout, num_layers))


In [13]:
#all trainable parameters:
params = sum([np.prod(p.size()) for p in model.parameters()if p.requires_grad ])
params

994803

### Only run 2 epoch for demo purpose

In [14]:

model = RNN(weights_matrix=weights_matrix, hidden_size=150, num_layers=1, num_classes=3)

learning_rate = 3e-4
num_epochs = 1 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


RNNval_accuracy = []
RNNval_loss = []
RNNtrain_accuracy = []
RNNtrain_loss = []
best_val_acc = 0
for epoch in range(num_epochs):
    for i, sample in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        # Forward pass
        output = model(sample[0], sample[1])
        #print(output)
        label = sample[2]
        loss = criterion(output, label)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 300 == 0:
            #validate
            val_acc, val_los = test_model(model, train=False)
            tra_acc, tra_los = test_model(model, train=True)
            RNNval_accuracy.append(val_acc)
            RNNval_loss.append(val_los)
            RNNtrain_accuracy.append(tra_acc)
            RNNtrain_loss.append(tra_los)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))
            
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                print('find new record, save the model!')
                torch.save(model.state_dict(), 'RNN150_best_model.pkl')

OUT_DICT = {'val_accuracy':RNNval_accuracy, 'val_loss': RNNval_loss, 'train_accuracy':RNNtrain_accuracy, 'train_loss':RNNtrain_loss, 'trainable parameters': sum([np.prod(p.size()) for p in model.parameters()if p.requires_grad ])}

pkl.dump(OUT_DICT, open("SNL_RNN_HIDDEN150.p", "wb"))               

Epoch: [1/1], Step: [301/3125], Validation Acc: 39.375
find new record, save the model!
Epoch: [1/1], Step: [601/3125], Validation Acc: 41.5625
find new record, save the model!
Epoch: [1/1], Step: [901/3125], Validation Acc: 40.625
Epoch: [1/1], Step: [1201/3125], Validation Acc: 38.75
Epoch: [1/1], Step: [1501/3125], Validation Acc: 38.4375
Epoch: [1/1], Step: [1801/3125], Validation Acc: 43.125
find new record, save the model!
Epoch: [1/1], Step: [2101/3125], Validation Acc: 50.3125
find new record, save the model!
Epoch: [1/1], Step: [2401/3125], Validation Acc: 52.1875
find new record, save the model!
Epoch: [1/1], Step: [2701/3125], Validation Acc: 52.1875
Epoch: [1/1], Step: [3001/3125], Validation Acc: 50.625


## CNN model

In [7]:
class CNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, num_classes,):

        super(CNN, self).__init__()

        self.hidden_size =  hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(weights_matrix),freeze=True)
        emb_size = weights_matrix.shape[1]
        
    
        self.sent1_conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        self.sent1_conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)

        self.sent2_conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        self.sent2_conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)


        self.fc1 = nn.Linear(hidden_size * 2 , 300)
        self.dropout1 = nn.Dropout(0.3) 
        self.out = nn.Linear(300, num_classes)

    def forward(self, sent1, sent2):
        batch_size, seq_len = sent1.size()
        #print('batch_size1 is {}, batch_size2 is {}'.format(batch_size1, batch_size2))
        #print('seq_len1 is {}, seq_len2 is {}'.format(seq_len1,seq_len2))

        sent1_embed = self.embedding(sent1)
        sent2_embed = self.embedding(sent2)
        
        sent1_hidden = self.sent1_conv1(sent1_embed.transpose(1,2)).transpose(1,2)
        sent1_hidden = F.relu(sent1_hidden.contiguous().view(-1, sent1_hidden.size(-1))).view(batch_size, seq_len, sent1_hidden.size(-1))

        sent1_hidden = self.sent1_conv2(sent1_hidden.transpose(1,2)).transpose(1,2)
        sent1_hidden = F.relu(sent1_hidden.contiguous().view(-1, sent1_hidden.size(-1))).view(batch_size, seq_len, sent1_hidden.size(-1))

        sent1_hidden = torch.sum(sent1_hidden, dim=1)
        #print('sent1_hidden shape is {}'.format(sent1_hidden.shape)) #[32, 150]
        
        sent2_hidden = self.sent2_conv1(sent2_embed.transpose(1,2)).transpose(1,2)
        sent2_hidden = F.relu(sent2_hidden.contiguous().view(-1, sent2_hidden.size(-1))).view(batch_size, seq_len, sent2_hidden.size(-1))

        sent2_hidden = self.sent2_conv2(sent2_hidden.transpose(1,2)).transpose(1,2)
        sent2_hidden = F.relu(sent2_hidden.contiguous().view(-1, sent2_hidden.size(-1))).view(batch_size, seq_len, sent2_hidden.size(-1))

        sent2_hidden = torch.sum(sent2_hidden, dim=1)
        #print('sent2_hidden shape is {}'.format(sent2_hidden.shape)) #[32, 150]
        
        cnn_out = torch.cat([sent1_hidden, sent2_hidden], 1)
        #print('cnn out shape is {}'.format(cnn_out.shape)) #[32, 300]
        
        x = self.fc1(cnn_out)
        x = F.relu(x)
        x = self.dropout1(x)
        
        logits = self.out(x)

        return logits

In [13]:
model = CNN(weights_matrix=weights_matrix, hidden_size=150,  num_classes=3)
print(model)

CNN(
  (embedding): Embedding(500000, 300)
  (sent1_conv1): Conv1d(300, 150, kernel_size=(3,), stride=(1,), padding=(1,))
  (sent1_conv2): Conv1d(150, 150, kernel_size=(3,), stride=(1,), padding=(1,))
  (sent2_conv1): Conv1d(300, 150, kernel_size=(3,), stride=(1,), padding=(1,))
  (sent2_conv2): Conv1d(150, 150, kernel_size=(3,), stride=(1,), padding=(1,))
  (fc1): Linear(in_features=300, out_features=300, bias=True)
  (dropout1): Dropout(p=0.3)
  (out): Linear(in_features=300, out_features=3, bias=True)
)


### Only train 2 epoch for demo purpose

In [12]:
### CNN:
model = CNN(weights_matrix=weights_matrix, hidden_size=150,  num_classes=3)
print(type(model))
learning_rate = 3e-4
num_epochs = 2 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

CNNval_accuracy = []
CNNval_loss = []
CNNtrain_accuracy = []
CNNtrain_loss = []
best_val_acc = 0
for epoch in range(num_epochs):
    for i, sample in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(sample[0],sample[1])
        labels = sample[2]
        loss = criterion(outputs, labels)
       
        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            val_acc, val_los = test_model(model, train=False)
            tra_acc, tra_los = test_model(model, train=True)
            CNNval_accuracy.append(val_acc)
            CNNval_loss.append(val_los)
            CNNtrain_accuracy.append(tra_acc)
            CNNtrain_loss.append(tra_los)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))
            
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                print('find new record, save the model!')
                torch.save(model.state_dict(), 'CNN150_best_model.pkl')

OUT_DICT = {'val_accuracy':CNNval_accuracy, 'val_loss': CNNval_loss, 'train_accuracy':CNNtrain_accuracy, 'train_loss':CNNtrain_loss, 'trainable parameters': sum([np.prod(p.size()) for p in model.parameters()if p.requires_grad ])}

pkl.dump(OUT_DICT, open("SNL_CNN_HIDDEN150.p", "wb"))

<class '__main__.CNN'>
Epoch: [1/2], Step: [101/3125], Validation Acc: 41.5625
find new record, save the model!
Epoch: [1/2], Step: [201/3125], Validation Acc: 46.875
find new record, save the model!
Epoch: [1/2], Step: [301/3125], Validation Acc: 50.9375
find new record, save the model!
Epoch: [1/2], Step: [401/3125], Validation Acc: 55.3125
find new record, save the model!
Epoch: [1/2], Step: [501/3125], Validation Acc: 50.9375
Epoch: [1/2], Step: [601/3125], Validation Acc: 53.4375
Epoch: [1/2], Step: [701/3125], Validation Acc: 54.6875
Epoch: [1/2], Step: [801/3125], Validation Acc: 51.5625
Epoch: [1/2], Step: [901/3125], Validation Acc: 58.125
find new record, save the model!
Epoch: [1/2], Step: [1001/3125], Validation Acc: 58.75
find new record, save the model!
Epoch: [1/2], Step: [1101/3125], Validation Acc: 52.5
Epoch: [1/2], Step: [1201/3125], Validation Acc: 56.875
Epoch: [1/2], Step: [1301/3125], Validation Acc: 53.75
Epoch: [1/2], Step: [1401/3125], Validation Acc: 54.0625


### Hyper parameter tunning1: Different hidden size for RNN and CNN

### Hyper parameter tunning2: Varying CNN kernel size

In [10]:
kernel_size_list = [5,7,9]
hidden_size = 150

In [11]:
class CNN_kernel(nn.Module):
    def __init__(self, weights_matrix, hidden_size, kernel_size, num_classes,):

        super(CNN_kernel, self).__init__()

        self.hidden_size =  hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(weights_matrix),freeze=True)
        emb_size = weights_matrix.shape[1]
        
    
        self.sent1_conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size= kernel_size, padding= int((kernel_size - 1)/2))
        self.sent1_conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size= kernel_size, padding= int((kernel_size - 1)/2))

        self.sent2_conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size= kernel_size, padding= int((kernel_size - 1)/2))
        self.sent2_conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size= kernel_size, padding= int((kernel_size - 1)/2))


        self.fc1 = nn.Linear(hidden_size * 2 , 300)
        self.dropout1 = nn.Dropout(0.3) 
        self.out = nn.Linear(300, num_classes)

    def forward(self, sent1, sent2):
        batch_size, seq_len = sent1.size()
        #print('batch_size1 is {}, batch_size2 is {}'.format(batch_size1, batch_size2))
        #print('seq_len1 is {}, seq_len2 is {}'.format(seq_len1,seq_len2))

        sent1_embed = self.embedding(sent1) #[32, 25, 300]
        sent2_embed = self.embedding(sent2)
        #print('sent1_embed shape {}'.format(sent1_embed.size()))
        #print('sent2_embed shape {}'.format(sent2_embed.size()))
        
        sent1_hidden = self.sent1_conv1(sent1_embed.transpose(1,2)).transpose(1,2) #[32, 25, 150]
        #print('shape is {}'.format(sent1_hidden.size()))
        sent1_hidden = F.relu(sent1_hidden.contiguous().view(-1, sent1_hidden.size(-1))).view(batch_size, seq_len, -1) #[32, 25, 150]
        #print('shape is {}'.format(sent1_hidden.size()))

        sent1_hidden = self.sent1_conv2(sent1_hidden.transpose(1,2)).transpose(1,2)#[32, 25, 150]
        #print('shape is {}'.format(sent1_hidden.size()))

        sent1_hidden = F.relu(sent1_hidden.contiguous().view(-1, sent1_hidden.size(-1))).view(batch_size, seq_len, -1)#[32, 25, 150]
        #print('shape is {}'.format(sent1_hidden.size()))

        sent1_hidden = torch.sum(sent1_hidden, dim=1)
        #print('sent1_hidden shape is {}'.format(sent1_hidden.shape)) #[32, 150]
        
        sent2_hidden = self.sent2_conv1(sent2_embed.transpose(1,2)).transpose(1,2)
        sent2_hidden = F.relu(sent2_hidden.contiguous().view(-1, sent2_hidden.size(-1))).view(batch_size, seq_len, -1)

        sent2_hidden = self.sent2_conv2(sent2_hidden.transpose(1,2)).transpose(1,2)
        sent2_hidden = F.relu(sent2_hidden.contiguous().view(-1, sent2_hidden.size(-1))).view(batch_size, seq_len, -1)

        sent2_hidden = torch.sum(sent2_hidden, dim=1)
        #print('sent2_hidden shape is {}'.format(sent2_hidden.shape)) #[32, 150]
        
        cnn_out = torch.cat([sent1_hidden, sent2_hidden], 1)
        #print('cnn out shape is {}'.format(cnn_out.shape)) #[32, 300]
        
        x = self.fc1(cnn_out)
        x = F.relu(x)
        x = self.dropout1(x)
        
        logits = self.out(x)

        return logits

### each run 2 epoch, for demo purpose

In [12]:
for kernel_size in kernel_size_list:     
    ### CNN:
    model = CNN_kernel(weights_matrix=weights_matrix, hidden_size=150, kernel_size= kernel_size, num_classes=3)
    learning_rate = 3e-4
    num_epochs = 1 # number epoch to train

    # Criterion and Optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    CNNval_accuracy = []
    CNNval_loss = []
    CNNtrain_accuracy = []
    CNNtrain_loss = []
    best_val_acc = 0
    for epoch in range(num_epochs):
        for i, sample in enumerate(train_loader):
            model.train()
            optimizer.zero_grad()
            # Forward pass
            outputs = model(sample[0],sample[1])
            labels = sample[2]
            loss = criterion(outputs, labels)

            # Backward and optimize
            loss.backward()
            optimizer.step()
            # validate every 100 iterations
            if i > 0 and i % 300 == 0:
                val_acc, val_los = test_model(model, train=False)
                tra_acc, tra_los = test_model(model, train=True)
                CNNval_accuracy.append(val_acc)
                CNNval_loss.append(val_los)
                CNNtrain_accuracy.append(tra_acc)
                CNNtrain_loss.append(tra_los)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                           epoch+1, num_epochs, i+1, len(train_loader), val_acc))

                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    print('find new record, save the model!')
                    model_name = 'CNN150_KERNEL' + str(kernel_size) + '_best_model.pkl' 
                    torch.save(model.state_dict(), model_name)

    OUT_DICT = {'val_accuracy':CNNval_accuracy, 'val_loss': CNNval_loss, 'train_accuracy':CNNtrain_accuracy, 'train_loss':CNNtrain_loss, 'trainable parameters': sum([np.prod(p.size()) for p in model.parameters()if p.requires_grad ])}
    OUT_DICT_NAME = 'SNL_CNN_HIDDEN150_KERNEL' + str(kernel_size) + '.p'
    pkl.dump(OUT_DICT, open(OUT_DICT_NAME, "wb"))

Epoch: [1/1], Step: [301/3125], Validation Acc: 52.1875
find new record, save the model!
Epoch: [1/1], Step: [601/3125], Validation Acc: 53.75
find new record, save the model!
Epoch: [1/1], Step: [901/3125], Validation Acc: 55.0
find new record, save the model!
Epoch: [1/1], Step: [1201/3125], Validation Acc: 58.75
find new record, save the model!
Epoch: [1/1], Step: [1501/3125], Validation Acc: 55.625
Epoch: [1/1], Step: [1801/3125], Validation Acc: 55.625
Epoch: [1/1], Step: [2101/3125], Validation Acc: 56.5625
Epoch: [1/1], Step: [2401/3125], Validation Acc: 60.3125
find new record, save the model!
Epoch: [1/1], Step: [2701/3125], Validation Acc: 62.1875
find new record, save the model!
Epoch: [1/1], Step: [3001/3125], Validation Acc: 62.8125
find new record, save the model!
Epoch: [1/1], Step: [301/3125], Validation Acc: 51.25
find new record, save the model!
Epoch: [1/1], Step: [601/3125], Validation Acc: 51.875
find new record, save the model!
Epoch: [1/1], Step: [901/3125], Vali

## Report 3 correct and 3 Incorrect example for the best model

### Load best model

In [16]:
#load the model for testing
#example:

best_model = RNN(weights_matrix=weights_matrix, hidden_size=300, num_layers=1, num_classes=3)
best_model.load_state_dict(torch.load('./parameters_tunning/RNN300_best_model.pkl'))


  "num_layers={}".format(dropout, num_layers))


In [17]:
#print 3 correct and 3 incorrect prediction of the final model on validation set
count = 0
for sample in val_loader:
    if count == 1:
        break
    sent1_batch, sent2_batch, label_batch = sample[0], sample[1], sample[2]
    #print('sent1_batch is {}'.format(sent1_batch[0]))
    #print('sent2_batch is {}'.format(sent2_batch[0]))
    print('true label_batch is {}'.format(label_batch))
    
    outputs = F.softmax(best_model(sent1_batch, sent2_batch), dim=1)
    predicted = outputs.max(1, keepdim=True)[1]
    print('predicted labels is {}'.format(predicted.view_as(label_batch)))
    
    print((predicted.view_as(label_batch)!=label_batch))
    
    count+=1

true label_batch is tensor([1, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 2, 1, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0,
        1, 2, 1, 0, 0, 2, 2, 2])
predicted labels is tensor([0, 0, 0, 0, 0, 1, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 2, 1, 2, 0, 0, 2, 0, 1,
        1, 2, 0, 0, 0, 2, 2, 2])
tensor([1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 0, 0, 0], dtype=torch.uint8)


In [20]:
incorrect_example = [5, 7, 9]
correct_example = [2, 3, 4]

In [21]:
# 0: entailmewnt, 1: contradiction, 2: Neutral
for i in incorrect_example:
    sent1_word = ' '.join([idx2word[j.item()] for j in sent1_batch[i]])
    sent2_word = ' '.join([idx2word[j.item()] for j in sent2_batch[i]])
    print('example:')
    print('true label is {} predicted to be {}'.format(label_batch[i],predicted[i].data[0]))
    print('\n')
    print('sent1 is: {}'.format(sent1_word))
    print('sent2 is: {}'.format(sent2_word))
    print('')
    #print(' '.join(sent1_word))
    print('\n')

example:
true label is 0 predicted to be 1


sent1 is: two people are in a green forest . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD
sent2 is: the forest is not dead . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD



example:
true label is 1 predicted to be 2


sent1 is: two women , one walking her dog the other pushing a stroller . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD
sent2 is: there is a snowstorm . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD



example:
true label is 1 predicted to be 0


sent1 is: three people and a white dog are sitting in the sand on a beach . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD
sent2 is: three dogs and a person are sitting in the snow . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD





In [22]:
# 0: entailmewnt, 1: contradiction, 2: Neutral
for i in correct_example:
    sent1_word = ' '.join([idx2word[j.item()] for j in sent1_batch[i]])
    sent2_word = ' '.join([idx2word[j.item()] for j in sent2_batch[i]])
    print('example:')
    print('true label is {} predicted to be {}'.format(label_batch[i],predicted[i].data[0]))
    print('\n')
    print('sent1 is: {}'.format(sent1_word))
    print('sent2 is: {}'.format(sent2_word))
    print('')
    #print(' '.join(sent1_word))
    print('\n')

example:
true label is 0 predicted to be 0


sent1 is: bicycles stationed while a group of people socialize . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD
sent2 is: people get together near a stand of bicycles . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD



example:
true label is 0 predicted to be 0


sent1 is: man in overalls with two horses . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD
sent2 is: a man in overalls with two horses PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD



example:
true label is 0 predicted to be 0


sent1 is: man observes a wavelength given off by an electronic device . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD
sent2 is: the man is examining what wavelength is given off by the device . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD





## PART2: MultiNLI

In [12]:
#genre_data = {}
with open("hw2_data/mnli_val.tsv") as f:
    val = f.read().split('\n')
    #print(train)
    MNLval_data = [row.split('\t') for row in val[1:-1]]
    MNLval_sent1 = [row[0] for row in MNLval_data]
    MNLval_sent2 = [row[1] for row in MNLval_data]
    MNLval_label = [int(0) if row[2] == 'entailment' else int(1) if row[2] == 'contradiction' else int(2) for row in MNLval_data]
    MNLval_genre = [row[3] for row in MNLval_data]

In [85]:
pkl.dump(MNLval_label, open("MNLval_label.p", "wb"))
pkl.dump(MNLval_genre, open("MNLval_genre.p", "wb"))

In [86]:
set(MNLval_genre)

{'fiction', 'government', 'slate', 'telephone', 'travel'}

### tokenize dataset:

In [87]:
# val set tokens
print ("Tokenizing MNLIval data")
MNLval_sent1_tokens, _ = tokenize_dataset(MNLval_sent1)
MNLval_sent2_tokens, _ = tokenize_dataset(MNLval_sent2)

pkl.dump(MNLval_sent1_tokens, open("MNLval_sent1_tokens.p", "wb"))
pkl.dump(MNLval_sent2_tokens, open("MNLval_sent2_tokens.p", "wb"))

Tokenizing MNLIval data


### convert token to id in the dataset

In [88]:
MNLval_sent2_indices = token2index_dataset(MNLval_sent2_tokens)
MNLval_sent1_indices = token2index_dataset(MNLval_sent1_tokens)

# double checking
print ("MNLVal dataset size is {}".format(len(MNLval_sent1_indices)))
print ("MNLVal dataset size is {}".format(len(MNLval_sent2_indices)))

pkl.dump(MNLval_sent2_indices, open("MNLval_sent2_indices.p", "wb"))
pkl.dump(MNLval_sent1_indices, open("MNLval_sent1_indices.p", "wb"))

MNLVal dataset size is 5000
MNLVal dataset size is 5000


### load back all data

In [8]:
##load back all necessary data:
all_words = pkl.load(open('all_words.p','rb'))
word2idx = pkl.load(open('word2idx.p','rb'))
idx2word = pkl.load(open('idx2word.p','rb'))
word2vec = pkl.load(open('word2vec.p','rb'))
weights_matrix = pkl.load(open('weights_matrix.p','rb'))

MNLval_sent2_indices = pkl.load(open("MNLval_sent2_indices.p", "rb"))
MNLval_sent1_indices = pkl.load(open("MNLval_sent1_indices.p", "rb"))

MNLval_label = pkl.load(open("MNLval_label.p", "rb"))
MNLval_genre = pkl.load(open("MNLval_genre.p", "rb"))



### Dataset and Data loader

In [9]:
MAX_SENTENCE_LENGTH = 25
class MNLDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, sent1_data, sent2_data, target_list):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.sent1_data = sent1_data
        self.sent2_data = sent2_data
        self.target_list = target_list
        assert (len(self.sent1_data) == len(self.target_list))
        assert (len(self.sent2_data) == len(self.target_list))

    def __len__(self):
        return len(self.target_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        item = dict()
        
        sent1_index_list = self.sent1_data[key][:MAX_SENTENCE_LENGTH]
        sent2_index_list = self.sent2_data[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [sent1_index_list, sent2_index_list, label]

In [10]:
#note since PAD is already in dataset, here we need to pad with PAD_IDX not 0
def MNLvocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    sent1_list = []
    sent2_list = []
    label_list = []

    for datum in batch:
        label_list.append(datum[2])
        
    # padding
    for datum in batch:
        padded_vec0 = np.pad(np.array(datum[0]),
                                pad_width=((0 ,MAX_SENTENCE_LENGTH-len(datum[0]))),
                                mode="constant", constant_values=PAD_IDX)
        sent1_list.append(list(padded_vec0))
    
        padded_vec1 = np.pad(np.array(datum[1]),
                                pad_width=((0 ,MAX_SENTENCE_LENGTH-len(datum[1]))),
                                mode="constant", constant_values=PAD_IDX)
        sent2_list.append(list(padded_vec1))

    return [torch.from_numpy(np.array(sent1_list)),torch.from_numpy(np.array(sent2_list)), torch.LongTensor(label_list)]



In [10]:
list(set(MNLval_genre))

['government', 'travel', 'slate', 'fiction', 'telephone']

In [11]:
mask = np.array(MNLval_genre)=='fiction'
mask

array([ True, False, False, ..., False, False,  True])

## Evaluate on Best CNN

### Load best model for CNN

In [13]:
best_model = CNN_kernel(weights_matrix=weights_matrix, hidden_size=300, kernel_size= 3, num_classes=3)
best_model.load_state_dict(torch.load('./parameters_tunning/CNN300_best_model.pkl'))


### Evaluation

In [15]:
result_dict = {}
for i in list(set(MNLval_genre)):
    print('current genre is {}'.format(i))
    mask = np.array(MNLval_genre)== i
    
    genre_sent1_indices = np.array(MNLval_sent2_indices)[mask]
    genre_sent2_indices = np.array(MNLval_sent2_indices)[mask]
    genre_label = np.array(MNLval_label)[mask]
    
    BATCH_SIZE = 32

    MNLval_dataset = MNLDataset(genre_sent1_indices, genre_sent2_indices, genre_label)
    MNLval_loader = torch.utils.data.DataLoader(dataset = MNLval_dataset,
                                        batch_size = BATCH_SIZE,
                                        collate_fn = MNLvocab_collate_func,
                                        shuffle = False)
    
    total_correct = 0
    total_sample = 0 
    
    for sample in MNLval_loader:
        sent1_batch, sent2_batch, label_batch = sample[0], sample[1], sample[2]
        #print('sent1_batch is {}'.format(sent1_batch[0]))
        #print('sent2_batch is {}'.format(sent2_batch[0]))
        #print('label_batch is {}'.format(label_batch))
        sample_size = label_batch.shape[0]
        #print(sample_size)
        
        outputs = F.softmax(best_model(sent1_batch, sent2_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        #print('predicted labels is {}'.format(predicted.view_as(label_batch)))
        
        #print((predicted.view_as(label_batch)!=label_batch))
        correct = np.sum((predicted.view_as(label_batch)!=label_batch).data.numpy(),axis = 0)
        #print(correct)

        total_correct += correct
        #print(total_correct)
        total_sample += sample_size
        #print(total_sample)
    
    MNLval_acc = 100 * total_correct/total_sample
    result_dict[i] = MNLval_acc
    

current genre is slate
current genre is travel
current genre is telephone
current genre is fiction
current genre is government


In [16]:
print('BEST CNN validation accuracy for {} is {}'.format('fiction',result_dict['fiction']))
print('BEST CNN validation accuracy for {} is {}'.format('government',result_dict['government']))
print('BEST CNN validation accuracy for {} is {}'.format('slate',result_dict['slate']))
print('BEST CNN validation accuracy for {} is {}'.format('telephone',result_dict['telephone']))
print('BEST CNN validation accuracy for {} is {}'.format('travel',result_dict['travel']))



BEST CNN validation accuracy for fiction is 58.19095477386934
BEST CNN validation accuracy for government is 53.74015748031496
BEST CNN validation accuracy for slate is 59.98003992015968
BEST CNN validation accuracy for telephone is 60.09950248756219
BEST CNN validation accuracy for travel is 54.58248472505092


## Evaluate on Best RNN

### Load best model for RNN

In [22]:
best_model  = RNN(weights_matrix=weights_matrix, hidden_size=300, num_layers=1, num_classes=3)
best_model.load_state_dict(torch.load('./parameters_tunning/RNN300_best_model.pkl'))


### Evaluation

In [23]:
result_dict = {}
for i in list(set(MNLval_genre)):
    print('current genre is {}'.format(i))
    mask = np.array(MNLval_genre)== i
    
    genre_sent1_indices = np.array(MNLval_sent2_indices)[mask]
    genre_sent2_indices = np.array(MNLval_sent2_indices)[mask]
    genre_label = np.array(MNLval_label)[mask]
    
    BATCH_SIZE = 32

    MNLval_dataset = MNLDataset(genre_sent1_indices, genre_sent2_indices, genre_label)
    MNLval_loader = torch.utils.data.DataLoader(dataset = MNLval_dataset,
                                        batch_size = BATCH_SIZE,
                                        collate_fn = MNLvocab_collate_func,
                                        shuffle = False,sampler=SubsetRandomSampler(range(10*BATCH_SIZE)))
    
    total_correct = 0
    total_sample = 0 
    
    for sample in MNLval_loader:
        sent1_batch, sent2_batch, label_batch = sample[0], sample[1], sample[2]
        #print('sent1_batch is {}'.format(sent1_batch[0]))
        #print('sent2_batch is {}'.format(sent2_batch[0]))
        #print('label_batch is {}'.format(label_batch))
        sample_size = label_batch.shape[0]
        #print(sample_size)
        #output = model(sample[0], sample[1])
        outputs = F.softmax(best_model(sent1_batch, sent2_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        #print('predicted labels is {}'.format(predicted.view_as(label_batch)))
        
        #print((predicted.view_as(label_batch)!=label_batch))
        correct = np.sum((predicted.view_as(label_batch)!=label_batch).data.numpy(),axis = 0)
        #print(correct)

        total_correct += correct
        #print(total_correct)
        total_sample += sample_size
        #print(total_sample)
    
    MNLval_acc = 100 * total_correct/total_sample
    result_dict[i] = MNLval_acc
    

current genre is telephone
current genre is travel
current genre is fiction
current genre is government
current genre is slate


In [24]:
print('BEST RNN validation accuracy for {} is {}'.format('fiction',result_dict['fiction']))
print('BEST RNN validation accuracy for {} is {}'.format('government',result_dict['government']))
print('BEST RNN validation accuracy for {} is {}'.format('slate',result_dict['slate']))
print('BEST RNN validation accuracy for {} is {}'.format('telephone',result_dict['telephone']))
print('BEST RNN validation accuracy for {} is {}'.format('travel',result_dict['travel']))



BEST RNN validation accuracy for fiction is 60.0
BEST RNN validation accuracy for government is 56.875
BEST RNN validation accuracy for slate is 56.875
BEST RNN validation accuracy for telephone is 53.4375
BEST RNN validation accuracy for travel is 59.6875
