In [1]:
# First lets improve libraries that we are going to be used in this lab session
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
random.seed(134)
import pandas as pd
import io

PAD_IDX = 0
UNK_IDX = 1
BATCH_SIZE = 32

## Help functions

In [2]:
def build_vocab(all_tokens, embedding, max_vocab_size = 10000):

    # save index 1 for unk and 0 for pad
    PAD_IDX = 0
    UNK_IDX = 1
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    all_tokens = [item for sublist in all_tokens for item in sublist]
    max_len = max([len(word) for word in all_tokens])
    
    unique_words = list(embedding.keys())
    
    id2token =  unique_words #list of words available in embedding
    id2token = ['<pad>', '<unk>'] + id2token #add pad and unknown to the beginning
    
    token2id = dict(zip(unique_words, range(2,2+len(unique_words)))) # dictionary of words and indices 
    token2id['<pad>'] = PAD_IDX  #add pad symbol to the dictionary
    token2id['<unk>'] = UNK_IDX  #add unkown symbol to the dictionary
    
    return token2id, id2token, max_len

# convert token to id in the dataset
def token2index_dataset(tokens_data):
    PAD_IDX = 0
    UNK_IDX = 1
    indices_data = []
    for tokens in tokens_data:
        index_list = [list(token2id[token]) if token in token2id else UNK_IDX for token in tokens] #tokenizes 10k words
        indices_data.append(index_list) #list of lists: indices of tokens for each sentence
    return indices_data


In [3]:
def load_embedding(fname, max_count=None):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    counter=0
    for line in fin:
        counter+=1
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = map(float, tokens[1:])
        if counter==max_count:
            break
    return data

In [4]:
def read_data(file_loc, sep="\t"):
    #Read in data subsets
    data = pd.read_csv(file_loc, sep=sep, encoding='latin-1')
    return data

def tokenize(data):
    data['input1'] = data.sentence1.str.split()

    data['input2'] = data.sentence2.str.split()
    return data

def assign_target(name):
    if name == 'contradiction':
        return 0
    elif name == 'neutral':
        return 1
    else:
        return 2

In [5]:
class VocabDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data_tuple, word2id):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.data_list1, self.data_list2, self.target_list = zip(*data_tuple)
        assert (len(self.data_list1) == len(self.target_list) == len(self.data_list2))
        self.word2id = word2id

    def __len__(self):
        return len(self.data_list1)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        word_idx1 = [self.word2id[c] if c in self.word2id.keys() 
                    else UNK_IDX  for c in self.data_list1[key][:MAX_WORD_LENGTH]]
                                                                   
        word_idx2 = [self.word2id[c] if c in self.word2id.keys() 
                    else UNK_IDX  for c in self.data_list2[key][:MAX_WORD_LENGTH]]                                                                   
                                                                   
        label = self.target_list[key]
        return [word_idx1, word_idx2, len(word_idx1), len(word_idx2), label]

def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    data_list1 = []
    data_list2 = []
    label_list = []
    length_list1 = []
    length_list2 = []

        
    # padding
    for datum in batch:
        x1 = datum[0]
        x2 = datum[1]
        len1 = datum[2]
        len2 = datum[3]
        label = datum[4]
        
        label_list.append(label)
        length_list1.append(len1)
        length_list2.append(len2)
        #Pad first sentences
        padded_vec1 = np.pad(np.array(x1),
                                pad_width=((0,MAX_WORD_LENGTH-len1)),
                                mode="constant", constant_values=0)
        data_list1.append(padded_vec1)
        
        #Pad second sentences
        padded_vec2 = np.pad(np.array(x2),
                        pad_width=((0,MAX_WORD_LENGTH-len2)),
                        mode="constant", constant_values=0)
        data_list2.append(padded_vec2)
        
    data_list1 = np.array(data_list1)
    data_list2 = np.array(data_list2)
    length_list1 = np.array(length_list1)
    lenth_list2 = np.array(length_list2)
    label_list = np.array(label_list)
    
    return [torch.from_numpy(np.array(data_list1)), 
            torch.from_numpy(np.array(data_list2)),
            torch.LongTensor(length_list1), 
            torch.LongTensor(length_list2),
            torch.LongTensor(label_list)]


## Read and process data

In [93]:
#Read in data subsets
train_data = read_data('../hw2_data.nosync/snli_train.tsv', sep='\t')[0:1000]
val_data = read_data('../hw2_data.nosync/snli_val.tsv', sep='\t')[0:100]

#Tokenize
train_data = tokenize(train_data)
val_data = tokenize(val_data)


#Assign label
train_data['target'] = train_data.label.apply(lambda x: assign_target(x))
val_data['target'] = val_data.label.apply(lambda x: assign_target(x))

In [94]:
#Read in pretrained embedding vectors - subset for now
embeddings_map = load_embedding('../hw2_data.nosync/wiki-news-300d-1M.vec', max_count=50000)

#Convert embedding values to lists
embeddings = {}

for key, value in embeddings_map.items():
    embeddings[key] = list(value)

In [95]:
#Build vocabulary on train set
token2id, id2token, max_len = build_vocab(train_data['input1'] + train_data['input2'],
                              embeddings)

all_tokens = [item for sublist in train_data['input1'] + train_data['input2'] for item in sublist]
max_len = max([len(word) for word in all_tokens])

In [96]:
# Build train, valid and test dataloaders

#Embed each input and create loaders

MAX_WORD_LENGTH = max_len

train_dataset = VocabDataset(zip(train_data.input1,train_data.input2, 
                                           train_data.target), token2id)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

val_dataset = VocabDataset(zip(val_data.input1[:1000],val_data.input2, 
                                           val_data.target), token2id)

val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

#test_dataset = VocabDataset(test_data, char2id)
#test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
 #                                          batch_size=BATCH_SIZE,
 #                                          collate_fn=vocab_collate_func,
 #                                          shuffle=False)

In [97]:
#Convert embedding to tensor
import numpy as np

y=np.array([np.array(list(xi)) for xi in embeddings.values()])
padding = np.zeros((1, y.shape[1]))
unknown = np.random.rand(1, y.shape[1]) # to account for Padding and Unknown
full_size = np.concatenate([padding, unknown, y], axis=0)
emb_weights = torch.from_numpy(full_size)

In [98]:
emb_weights.shape

torch.Size([50002, 300])

## Modeling

### Now lets implement bidirectional GRU Recurrent Neural Net model

In [126]:
class GRU(nn.Module):
    def __init__(self, emb_weights, emb_size, hidden_size, num_layers, num_classes, vocab_size, dropout=0.5):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        # vocab_size: vocabulary size
        super(GRU, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding =  nn.Embedding(vocab_size, emb_size, 
                                       padding_idx=PAD_IDX).from_pretrained(emb_weights, 
                                                                freeze=True) #load preset

        self.gru = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True,
                         bidirectional=True, dropout=dropout) #creates bidirectional GRU
        self.linear1 = nn.Linear(hidden_size*2*2, hidden_size*2*2) #2 for bidirectional, 2 for concatenated
        self.linear2 = nn.Linear(hidden_size*2*2, num_classes) #2 for bidirectional, 2 for concatenated

    
    def forward(self, x1, x2, len1, len2):
        
        batch_size, seq_len = x1.size()        
        
        # get embedding of characters - make sure pretrained weights do not get updated
        embed1 = self.embedding(x1)
        embed2 = self.embedding(x2)
        
      
        # fprop though RNN
        rnn_out1, h1 = self.gru(embed1.float())
        rnn_out2, h2 = self.gru(embed2.float())
        
        # [num_dir, batch_size, dim] => [batch_size, dim x num_dir]
        num_dir, batch_size, dim = h1.shape
        h1 = h1.transpose(0, 1).contiguous().view(batch_size, -1)
        h2 = h2.transpose(0, 1).contiguous().view(batch_size, -1)
 
        #Concatenate two vectors
        combined_vector = torch.cat([h1, h2], dim=1)
        
        logits1 = self.linear1(combined_vector) #FC layer
        logits2 = self.linear2(logits1) #second FC layer
        
        return logits2



In [72]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for sentence1, sentence2, lengths1, lengths2, labels in loader:

        outputs = F.softmax(model(sentence1, sentence2, lengths1, lengths2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [88]:
def calculate_loss(loader, model, criterion):
    
    model.eval()
    loss_hist = []
    for sentence1, sentence2, lengths1, lengths2, labels in loader:
        y_hat = model(sentence1, sentence2, lengths1, lengths2)
        loss = criterion(y_hat, labels)
        loss_hist.append(loss.item())
    average_loss = np.mean(loss_hist)
    return average_loss

In [85]:
def make_plots(train_acc_hist, val_acc_hist, train_loss_epoch, train_loss_hist, val_loss_hist):
    
    #Accuracy
    plt.plot(train_acc_hist, label='Train Set Accuracy')
    plt.plot(val_acc_hist, label='Val Set Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Iteration')
    plt.title('Training Curves')
    plt.legend()
    plt.show()
    
    #Loss
    plt.plot(train_loss_hist)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training Loss per Iteration')
    plt.show()
    
    plt.plot(val_loss_hist)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Validation Loss per Iteration')
    plt.show()
    
    print ("Val Acc Last Epoch {}".format(val_acc_hist[-1]))
    print( "Max Val Acc {}".format(max(val_acc_hist)))
    print( "Avg Val Acc {}".format(np.average(val_acc_hist)))

In [116]:

def train(train_loader, val_loader, model, num_epochs, learning_rate):
    
    # Criterion and Optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    for epoch in range(num_epochs):
        for i, (sentence1, sentence2, lengths1, lengths2, labels) in enumerate(train_loader):
            model.train()
            optimizer.zero_grad()

            # Forward pass
            outputs = model(sentence1, sentence2, lengths1, lengths2)
            #print(outputs)
            loss = criterion(outputs, labels)

            # Backward and optimize
            loss.backward()

            optimizer.step()

            # validate every 100 iterations
            if i > 0 and i % 1 == 0:
                # validate
                val_acc = test_model(val_loader, model)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}, Training Loss: {}'.format(
                           epoch+1, num_epochs, i+1, len(train_loader), val_acc, loss.item()))
                
                train_acc = test_model(train_loader, model)
                train_accs.append(train_acc)
                val_accs.append(val_acc)
                
                train_losses.append(calculate_loss(train_loader, model, criterion))
                val_losses.append(calculate_loss(val_loader, model, criterion))  
        
             
    return train_losses, train_accs, val_losses, val_accs 

In [125]:
#Single layer bidirectional GRU
gru_model = GRU(emb_weights, emb_size=emb_weights.shape[1], 
            hidden_size=200, num_layers=1, num_classes=3, 
            vocab_size=len(id2token), dropout=0.5)

gru_train_losses, gru_train_acc, gru_val_losses, gru_val_acc  = train(train_loader, 
                                                                      val_loader, gru_model, 
                                                                      num_epochs=1, learning_rate = 3e-4)

  "num_layers={}".format(dropout, num_layers))


Epoch: [1/1], Step: [2/32], Validation Acc: 40.0, Training Loss: 1.081879734992981
Epoch: [1/1], Step: [3/32], Validation Acc: 40.0, Training Loss: 1.1594635248184204


KeyboardInterrupt: 

In [123]:
gru_model = GRU(emb_weights, emb_size=emb_weights.shape[1], 
            hidden_size=200, num_layers=1, num_classes=3, 
            vocab_size=len(id2token), dropout=0)

gru_train_losses, gru_train_acc, gru_val_losses, gru_val_acc  = train(train_loader, 
                                                                      val_loader, gru_model, 
                                                                      num_epochs=1, learning_rate = 3e-4)

Epoch: [1/1], Step: [2/32], Validation Acc: 40.0, Training Loss: 1.0764132738113403
Epoch: [1/1], Step: [3/32], Validation Acc: 40.0, Training Loss: 1.0964765548706055
Epoch: [1/1], Step: [4/32], Validation Acc: 40.0, Training Loss: 1.1579071283340454
Epoch: [1/1], Step: [5/32], Validation Acc: 40.0, Training Loss: 1.144288182258606
Epoch: [1/1], Step: [6/32], Validation Acc: 40.0, Training Loss: 1.0825755596160889
Epoch: [1/1], Step: [7/32], Validation Acc: 36.0, Training Loss: 1.0659681558609009
Epoch: [1/1], Step: [8/32], Validation Acc: 37.0, Training Loss: 1.0935747623443604
Epoch: [1/1], Step: [9/32], Validation Acc: 33.0, Training Loss: 1.1189136505126953
Epoch: [1/1], Step: [10/32], Validation Acc: 32.0, Training Loss: 1.1239734888076782
Epoch: [1/1], Step: [11/32], Validation Acc: 31.0, Training Loss: 1.1283109188079834
Epoch: [1/1], Step: [12/32], Validation Acc: 30.0, Training Loss: 1.094236969947815
Epoch: [1/1], Step: [13/32], Validation Acc: 30.0, Training Loss: 1.0931586

In [86]:
make_plots(gru_train_losses, gru_train_acc, gru_val_losses, gru_val_acc)

NameError: name 'gru_train_losses' is not defined

## Hyperparameter tuning

### Hidden Size dimensions

In [111]:
hidden_sizes = [50, 100, 200, 300, 500]

hidden_results = {}

for h in hidden_sizes:
    
    gru_model = GRU(emb_weights, emb_size=emb_weights.shape[1], 
                hidden_size=h, num_layers=1, num_classes=3, 
                vocab_size=len(id2token))

    gru_train_losses, gru_train_acc, gru_val_losses, gru_val_acc  = train(train_loader, 
                                                                          val_loader, gru_model, 
                                                                          num_epochs=1, learning_rate = 3e-4)
    
    hidden_results[h] = [gru_train_losses, gru_train_acc, gru_val_losses, gru_val_acc]
    



Epoch: [1/1], Step: [2/32], Validation Acc: 34.0, Training Loss: 1.0942107439041138
Epoch: [1/1], Step: [3/32], Validation Acc: 34.0, Training Loss: 1.0986782312393188
Epoch: [1/1], Step: [4/32], Validation Acc: 36.0, Training Loss: 1.0928051471710205
Epoch: [1/1], Step: [5/32], Validation Acc: 37.0, Training Loss: 1.1051435470581055
Epoch: [1/1], Step: [6/32], Validation Acc: 41.0, Training Loss: 1.0966055393218994
Epoch: [1/1], Step: [7/32], Validation Acc: 37.0, Training Loss: 1.0905410051345825
Epoch: [1/1], Step: [8/32], Validation Acc: 38.0, Training Loss: 1.080612063407898
Epoch: [1/1], Step: [9/32], Validation Acc: 38.0, Training Loss: 1.1300971508026123
Epoch: [1/1], Step: [10/32], Validation Acc: 35.0, Training Loss: 1.1146891117095947
Epoch: [1/1], Step: [11/32], Validation Acc: 35.0, Training Loss: 1.1196895837783813
Epoch: [1/1], Step: [12/32], Validation Acc: 35.0, Training Loss: 1.1068370342254639
Epoch: [1/1], Step: [13/32], Validation Acc: 38.0, Training Loss: 1.119123

Epoch: [1/1], Step: [7/32], Validation Acc: 42.0, Training Loss: 1.11130952835083
Epoch: [1/1], Step: [8/32], Validation Acc: 33.0, Training Loss: 1.0682414770126343
Epoch: [1/1], Step: [9/32], Validation Acc: 30.0, Training Loss: 1.0564875602722168
Epoch: [1/1], Step: [10/32], Validation Acc: 30.0, Training Loss: 1.0630210638046265
Epoch: [1/1], Step: [11/32], Validation Acc: 30.0, Training Loss: 1.1325324773788452
Epoch: [1/1], Step: [12/32], Validation Acc: 31.0, Training Loss: 1.090908408164978
Epoch: [1/1], Step: [13/32], Validation Acc: 30.0, Training Loss: 1.173345685005188
Epoch: [1/1], Step: [14/32], Validation Acc: 29.0, Training Loss: 1.1883749961853027
Epoch: [1/1], Step: [15/32], Validation Acc: 33.0, Training Loss: 1.1041111946105957
Epoch: [1/1], Step: [16/32], Validation Acc: 39.0, Training Loss: 1.1010730266571045
Epoch: [1/1], Step: [17/32], Validation Acc: 42.0, Training Loss: 1.0474096536636353
Epoch: [1/1], Step: [18/32], Validation Acc: 40.0, Training Loss: 1.0658

In [118]:
hidden_results[50][3]

[34.0,
 34.0,
 36.0,
 37.0,
 41.0,
 37.0,
 38.0,
 38.0,
 35.0,
 35.0,
 35.0,
 38.0,
 42.0,
 39.0,
 39.0,
 36.0,
 36.0,
 33.0,
 34.0,
 33.0,
 32.0,
 34.0,
 33.0,
 34.0,
 33.0,
 31.0,
 33.0,
 33.0,
 37.0,
 32.0,
 34.0]

### Regularization

### Now lets implement basic Convolutional Neural Net model for text


In [75]:
class CNN(nn.Module):
    def __init__(self, emb_weights, emb_size, hidden_size, num_layers, num_classes, vocab_size):

        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding =  nn.Embedding(vocab_size, emb_size, 
                                       padding_idx=PAD_IDX).from_pretrained(emb_weights, 
                                                                freeze=True) #load preset
    
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)

        self.linear1 = nn.Linear(hidden_size*2, hidden_size) #2 for concatenated
        self.linear2 = nn.Linear(hidden_size, num_classes) #2 for bidirectional, 2 for concatenated
    
     

    def forward(self, x1, x2, len1, len2):
        
        batch_size, seq_len = x1.size()
            
        embed1 = self.embedding(x1)
        hidden = self.conv1(embed1.float().transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(batch_size, seq_len, hidden.size(-1))

        hidden = self.conv2(hidden.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(batch_size, seq_len, hidden.size(-1))
        #max pool over time
        hidden1 = torch.max(hidden, dim=1)[0]
        
        #Second sentence pass
        
        embed2 = self.embedding(x2)
        hidden2 = self.conv1(embed2.float().transpose(1,2)).transpose(1,2)
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size, seq_len, hidden.size(-1))

        hidden2 = self.conv2(hidden2.transpose(1,2)).transpose(1,2)
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size, seq_len, hidden.size(-1))
        #max pool over time
        hidden2 = torch.max(hidden2, dim=1)[0]
        
        #Concatenate two vectors
        combined_vector = torch.cat([hidden1, hidden2], dim=1)
        
        logits1 = self.linear1(combined_vector)
        logits2 = self.linear2(logits1)
        
        return logits2

In [76]:
#Single layer bidirectional GRU
cnn_model = CNN(emb_weights, emb_size=emb_weights.shape[1], 
            hidden_size=200, num_layers=2, num_classes=3, 
            vocab_size=len(id2token))

cnn_losses, cnn_accuracies = train(train_loader, val_loader, gru_model, num_epochs=50, learning_rate = 3e-4)

Epoch: [1/50], Step: [101/3125], Validation Acc: 54.9, Training Loss: 0.9760991930961609
Epoch: [1/50], Step: [201/3125], Validation Acc: 57.5, Training Loss: 0.9982364177703857
Epoch: [1/50], Step: [301/3125], Validation Acc: 57.1, Training Loss: 0.9656170606613159
Epoch: [1/50], Step: [401/3125], Validation Acc: 55.5, Training Loss: 0.9312114119529724
Epoch: [1/50], Step: [501/3125], Validation Acc: 56.5, Training Loss: 0.8374489545822144
Epoch: [1/50], Step: [601/3125], Validation Acc: 56.8, Training Loss: 0.8368998765945435


KeyboardInterrupt: 