In [1]:
!pip3 install torchtext==0.4

Collecting torchtext==0.4
  Downloading torchtext-0.4.0-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.3 MB/s eta 0:00:01
Collecting torch
  Downloading torch-1.7.0-cp38-none-macosx_10_9_x86_64.whl (108.1 MB)
[K     |████████████████████████████████| 108.1 MB 23.8 MB/s eta 0:00:01   |███                             | 10.5 MB 3.0 MB/s eta 0:00:33
Collecting dataclasses
  Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Installing collected packages: dataclasses, torch, torchtext
Successfully installed dataclasses-0.6 torch-1.7.0 torchtext-0.4.0


In [2]:
import math
import numpy as np
import pandas as pd
import pdb

import torch
import torch.nn.functional as F
import torch.distributions
import torch.optim as optim
from torch import nn 
from torch.autograd import Variable

import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe

In [63]:
class Model(torch.nn.Module):
    def __init__(self, batch_size, num_classes, mlp_out_size, vocab_size, embedding_length, weights, hidden_size = 100, biDirectional = False):
        super(Model, self).__init__() 
        """
        Arguments
        ---------
        batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
        output_size : 28 = (For full classification)
        hidden_sie : Size of the hidden_state of the LSTM   (// Later BiLSTM)
        vocab_size : Size of the vocabulary containing unique words
        embedding_length : Embeddding dimension of GloVe word embeddings
        weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 
        --------

        """

        self.batch_size = batch_size
        self.num_classes = num_classes
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length
        self.hidden_size = hidden_size

        self.mlp_out_size = mlp_out_size
        self.biDirectional = biDirectional

        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
        self.word_embeddings.weights = nn.Parameter(weights, requires_grad=False)
        
        self.lstm_layer = LSTM(self.batch_size, self.hidden_size, self.embedding_length, self.biDirectional)

        if(self.biDirectional):
            self.mlp = MLP(self.hidden_size*2, self.mlp_out_size)
            self.FF = nn.Linear(self.hidden_size*2, num_classes)
        else:
            self.mlp = MLP(self.hidden_size, self.mlp_out_size)
            self.FF = nn.Linear(self.hidden_size, self.num_classes)

    def forward(self, input_sequence):
        input_ = self.word_embeddings(input_sequence)
        out_lstm, final_hidden_state = self.lstm_layer(input_)
        if self.biDirectional:
            final_hidden_state = final_hidden_state.view(2, 2, input_.shape[0], self.hidden_size) # num_layer x num_dir x batch x hidden
            final_hidden_state = final_hidden_state[-1]
            final_hidden_state = final_hidden_state.transpose(0,1).reshape(input_.shape[0], self.hidden_size*2)
        else:
            final_hidden_state = final_hidden_state[-1]
        
        ff_output = self.FF(final_hidden_state)
#         print("FF out size: ", ff_output.shape)
#         predictions = torch.softmax(ff_output, dim = -1)
        return ff_output

In [46]:
class LSTM(torch.nn.Module):
    """
        Arguments
        ---------
        batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
        hidden_size : Size of the hidden_state of the LSTM   (* Later BiLSTM, check dims for BiLSTM *)
        embedding_length : Embeddding dimension of GloVe word embeddings
        --------
    """
    def __init__(self, batch_size, hidden_size, embedding_length, biDirectional = False, num_layers = 2):

        super(LSTM, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.embedding_length = embedding_length
        self.biDirectional	= biDirectional
        self.num_layers = num_layers

        self.lstm = nn.LSTM(self.embedding_length, self.hidden_size, bidirectional = self.biDirectional, batch_first = True, num_layers = self.num_layers)   # Dropout  

    def forward(self, input_sequence, batch_size=None):
        out_lstm, (final_hidden_state, final_cell_state) = self.lstm(input_sequence)   # ouput dim: ( batch_size x seq_len x hidden_size )
        return out_lstm, final_hidden_state

In [5]:
# If want to add extra MLP Layer
class MLP(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MLP, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        self.ff_1 = nn.Linear(self.input_dim, self.output_dim)
        self.relu = nn.ReLU()
        self.ff_2 = nn.Linear(self.output_dim,1)
        self.sigmoid = nn.Sigmoid()

    def forward(self,x):
        out_1 = self.ff_1(x)
        out_relu = self.relu(out_1)
        out_2 = self.ff_2(out_relu)
        out_sigmoid = self.sigmoid(out_2)

        return out_sigmoid

In [6]:
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)

In [83]:
def train_model(model, optim, train_iter, epoch, batch_size, num_classes):
    total_epoch_loss = 0
    total_epoch_acc = 0
    
    steps = 0
    model.train()
    for idx, batch in enumerate(train_iter):
        text = batch.text[0]
        target = batch.labels
        target = torch.autograd.Variable(target)
        if torch.cuda.is_available():
            text = text.cuda()
            target = target.cuda()
        if (text.size()[0] is not batch_size): # One of the batch returned by BucketIterator has length different than 32.
            continue
        optim.zero_grad()
        output = model(text)
#         print("prediction = ", prediction.shape)
#         print("target = ", target.shape)
#         print("prediction = ", prediction)
#         print("target = ", target)

        # Sigmoid layer and the BCELoss in one single class
        loss = loss_fn(output, target.type_as(output))
        print("loss = ", loss)
        
        score = torch.sigmoid(output).cpu()       
        predicted = torch.round(score)
        num_corrects = (predicted == target).sum()
        
#         num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()
        acc = 100.0 * num_corrects/len(batch)*num_classes
        loss.backward()
        clip_gradient(model, 1e-1)
        optim.step()
        steps += 1
        
        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()

        
    return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter)

In [80]:
def eval_model(model, val_iter, num_classes):
    total_epoch_loss = 0
    total_epoch_acc = 0
    total_attention =  0
    total_samples = 0 
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(val_iter):
            text = batch.text[0]
            if (text.size()[0] != 32):
                continue
            target = batch.labels
            target = torch.autograd.Variable(target)
            if torch.cuda.is_available():
                text = text.cuda()
                target = target.cuda()
            output = model(text)
            
            score = torch.sigmoid(output).cpu()       
            predicted = torch.round(score)
            total += target.size(0)
            num_corrects = (predicted == target).sum()
            # print("Test Prediction: ", prediction)

            loss =  loss_fn(output, target.type_as(output))
            
            if math.isnan(loss.item()):
                print(output, target)
            
#             num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()
            acc = 100.0 * num_corrects/len(batch)*num_classes
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()
            
    return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter)

In [36]:
def vectorize(x):
    vec = np.zeros(28)
    labels = x.split(',')
    for x in labels:
        vec[int(x)-1] = 1.0
    return vec

def load_data(batch_size= 16, embedding_length = 100):
    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=30)
    LABELS = data.Field(sequential=True, tokenize=vectorize, batch_first=True)

    train, val, test = data.TabularDataset.splits(
      path='/Users/prakruti/Documents/GoEmotions-classification/data', train='train.tsv',
      validation='dev.tsv', test='test.tsv', format='tsv',
      fields=[('text', TEXT), ('labels', LABELS)])
    
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
      (train, val, test), batch_sizes=(16, 16, 16),
      sort_key=lambda x: len(x.text), device=0)

    # build the vocabulary
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=embedding_length))
    LABELS.build_vocab(train)
    print(LABELS.vocab.__dict__)

    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, val_iter, test_iter

In [37]:
TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_data()

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


{'freqs': Counter({0.0: 1164377, 1.0: 51103}), 'itos': ['<unk>', '<pad>', 0.0, 1.0], 'unk_index': 0, 'stoi': defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fc0c5443fa0>>, {'<unk>': 0, '<pad>': 1, 0.0: 2, 1.0: 3}), 'vectors': None}


In [38]:
next(iter(train_iter))


[torchtext.data.batch.Batch of size 16]
	[.text]:('[torch.LongTensor of size 16x30]', '[torch.LongTensor of size 16]')
	[.labels]:[torch.LongTensor of size 16x28]

In [85]:
# Over-writing the loss function to simple cross entropy loss
loss_fn = loss = nn.BCEWithLogitsLoss()

learning_rate = 2e-4
batch_size = 16
output_size = 2
embedding_length = 100
num_classes = 28
mlp_out_size = 32
weights = word_embeddings
hidden_size = 100

model = Model(batch_size, num_classes, mlp_out_size, vocab_size, embedding_length, weights, hidden_size, biDirectional=False)
optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))

In [86]:
num_bad_epochs = 0
epoch = 0
least_loss = float('inf')
training_stats = pd.DataFrame(columns=['Epoch', 'Train_Loss', 'Train_Acc', 'Val_Loss', 'Val_Acc'])

while(True):
    train_loss, train_acc = train_model(model, optim, train_iter, epoch, batch_size, num_classes)
    val_loss, val_acc = eval_model(model, valid_iter, num_classes) 
    print(f'Epoch: {epoch+1:02}')
    if val_loss < least_loss:
        least_loss = val_loss
        num_bad_epochs = 0
        print("*** Least validation loss")
        torch.save(model.state_dict(), "LSTM")
    else:
        num_bad_epochs += 1
#     print(f'Epoch: {epoch+1:2}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%')
    print(f'Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%') 
    print(f'Val Loss: {val_loss:3f}, Val Acc: {val_acc:.2f}%')
    print("-------------")
    
    training_stats = training_stats.append(
        pd.Series([epoch+1, train_loss, train_acc, val_loss, val_acc], index=training_stats.columns), 
        ignore_index=True)
    if num_bad_epochs >= 10:
        break
        
    epoch += 1
    if epoch == 10:
        break

loss =  tensor(0.7231, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(0.6778, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(0.6307, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(0.5804, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(0.5088, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(0.4231, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(0.3265, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(0.1792, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(0.0183, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-0.1099, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-0.2712, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-0.4982, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-0.6723, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-0.8416, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
l

loss =  tensor(-13.5196, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-13.6839, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-13.6767, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-13.8028, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-13.9158, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-14.0591, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-14.1500, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-14.2468, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-14.3324, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-14.4851, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-14.6605, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-14.6391, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-14.7834, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-14.8837, grad_fn=<BinaryCrossEntrop

loss =  tensor(-26.0787, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-26.0313, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-26.3543, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-26.0707, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-26.2550, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-26.5166, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-26.6366, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-26.6809, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-26.7165, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-26.8222, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-26.9969, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-27.0029, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-27.2812, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-27.2645, grad_fn=<BinaryCrossEntrop

loss =  tensor(-38.2280, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-38.3341, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-38.1070, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-38.3057, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-38.6576, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-38.8447, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-38.5336, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-38.7371, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-39.1727, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-39.0266, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-39.1284, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-39.5797, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-39.5869, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-39.4475, grad_fn=<BinaryCrossEntrop

loss =  tensor(-50.3948, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-50.7142, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-50.3947, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-50.4932, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-50.7061, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-50.5903, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-50.9284, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-50.8107, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-51.0169, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-51.0320, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-51.3383, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-51.1133, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-51.6794, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-51.5622, grad_fn=<BinaryCrossEntrop

loss =  tensor(-62.5443, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-62.1049, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-62.2254, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-62.4571, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-62.5582, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-62.9395, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-63.1608, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-62.7652, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-63.5047, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-63.3646, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-63.4741, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-63.4565, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-63.8171, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-63.5187, grad_fn=<BinaryCrossEntrop

loss =  tensor(-74.1330, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-74.8715, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-74.5058, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-74.6160, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-75.0405, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-75.2983, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-74.5872, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-75.5071, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-74.9786, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-75.8951, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-75.3329, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-75.2833, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-75.7098, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
loss =  tensor(-76.3144, grad_fn=<BinaryCrossEntrop

KeyboardInterrupt: 