In [218]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
random.seed(0)
seed = 0

In [219]:
filename = 'data/train_conll_spanglish.csv'
import torchtext

def label2int(label):
    if label=='positive':
        return 1
    elif label=='negative':
        return -1
    else:
        return 0

text_field = torchtext.data.Field(sequential=True,      # text sequence
                                  tokenize=lambda x: x, # because are building a character-RNN
                                  include_lengths=False, # to track the length of sequences, for batching
                                  batch_first=True,
                                  use_vocab=True)       # to turn each character into an integer index
label_field = torchtext.data.Field(sequential=False,    # not a sequence
                                   use_vocab=False,     # don't need to track vocabulary
                                   is_target=True,      
                                   batch_first=True,
                                   preprocessing=lambda x: label2int(x)) # convert text to 0 and 1

fields = [('id', None),('text', text_field), ('label', label_field)]
dataset = torchtext.data.TabularDataset(filename, # name of the file
                                        "tsv",               # fields are separated by a tab
                                        fields)

In [220]:
for i in range(0,10):
    print(dataset[i].text, "---", dataset[i].label)

So that means tomorrow cruda segura lol --- 1
Tonight peda segura --- 0
Eres tan mala vieja bruja interesada#jamming --- -1
Yo kiero Pretzels lol --- 0
Fuck that ni ke el me vaya a mantener toda la vida lol --- -1
I always tell my dad ke me kiero kasar con una vieja rika and me regaña telling me ke no sea interesada ha --- -1
Ke me compre un carrito pa irme con mis friends and party lol --- 0
Why can I just find a rich bitch ke me mantenga y ya ha --- 0
Since I started working ya ni disfruto la vida lol --- -1
My dad me regano cuzs I was telling that to my brother and lo andaba molestando lol --- -1


In [221]:
train, val, test = dataset.split(split_ratio=[0.8,0.1,0.1])

In [222]:
print(len(train), len(val), len(test), len(dataset))

12000 1500 1500 15000


In [223]:
text_field.build_vocab(dataset)
text_field.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fb5335ed7d0>>,
            {'<unk>': 0,
             '<pad>': 1,
             ' ': 2,
             'e': 3,
             'a': 4,
             'o': 5,
             't': 6,
             's': 7,
             'n': 8,
             'i': 9,
             'r': 10,
             'l': 11,
             'c': 12,
             'd': 13,
             'u': 14,
             'm': 15,
             'h': 16,
             'p': 17,
             'y': 18,
             '.': 19,
             '/': 20,
             'g': 21,
             'b': 22,
             'v': 23,
             ':': 24,
             '@': 25,
             '!': 26,
             'f': 27,
             'A': 28,
             'j': 29,
             'E': 30,
             'T': 31,
             'R': 32,
             'q': 33,
             'k': 34,
             '#': 35,
             'w': 36,
             'M': 37,
             'S': 38,
             'O': 39,
             'L':

In [224]:
text_field.vocab.itos

['<unk>',
 '<pad>',
 ' ',
 'e',
 'a',
 'o',
 't',
 's',
 'n',
 'i',
 'r',
 'l',
 'c',
 'd',
 'u',
 'm',
 'h',
 'p',
 'y',
 '.',
 '/',
 'g',
 'b',
 'v',
 ':',
 '@',
 '!',
 'f',
 'A',
 'j',
 'E',
 'T',
 'R',
 'q',
 'k',
 '#',
 'w',
 'M',
 'S',
 'O',
 'L',
 'I',
 'N',
 'P',
 'C',
 'D',
 'z',
 ',',
 'G',
 'B',
 '1',
 'Y',
 '0',
 'J',
 'H',
 'U',
 'F',
 'x',
 'V',
 '2',
 '_',
 'í',
 'W',
 '"',
 '3',
 'K',
 '7',
 '?',
 'Z',
 '6',
 'Q',
 '4',
 '5',
 '8',
 '9',
 'á',
 "'",
 'ñ',
 '…',
 'é',
 'X',
 'ó',
 '-',
 '😂',
 ')',
 '(',
 '️',
 '|',
 '❤',
 '😍',
 'ú',
 '’',
 '“',
 '”',
 '^',
 '&',
 '¿',
 '😭',
 '*',
 '>',
 '$',
 ';',
 '😁',
 '¡',
 '👌',
 '・',
 '🔥',
 '🏼',
 '😱',
 '☺',
 '🚨',
 '🎉',
 '👏',
 'Í',
 '😘',
 'É',
 '😝',
 '😎',
 '🏻',
 '🏽',
 '🙌',
 '💕',
 '😊',
 '~',
 '♥',
 '•',
 'Á',
 'Ñ',
 '<',
 '🎵',
 '😩',
 '💀',
 '+',
 '😜',
 '👍',
 '😋',
 '😒',
 '✌',
 '😏',
 '😡',
 '%',
 'Ó',
 '👗',
 '💃',
 '💋',
 '💪',
 '=',
 '☀',
 '💁',
 '😔',
 '🎶',
 '😉',
 '😳',
 '😌',
 '🤔',
 '💯',
 '🍻',
 '👇',
 '😈',
 '🙄',
 '🙋',
 '👠',
 '🌸',
 '😅',
 '😫',
 '

In [225]:
small_batch = 32
train_iter = torchtext.data.BucketIterator(train,
                                           batch_size=small_batch,
                                           sort_key=lambda x: len(x.text), # to minimize padding
                                           sort_within_batch=True,        # sort within each batch
                                          )
#                                            repeat=True)                   # repeat the iterator for multiple epochs
val_iter = torchtext.data.BucketIterator(val,
                                           batch_size=small_batch,
                                           sort_key=lambda x: len(x.text), # to minimize padding
                                           sort_within_batch=True,        # sort within each batch
                                        )
#                                            repeat=True)                   # repeat the iterator for multiple epochs
test_iter = torchtext.data.BucketIterator(test,
                                           batch_size=small_batch,
                                           sort_key=lambda x: len(x.text), # to minimize padding
                                           sort_within_batch=True,        # sort within each batch
                                         )
#                                            repeat=True)                   # repeat the iterator for multiple epochs



In [226]:
for i, batch in enumerate(train_iter):
    if i >= 10:
        break
    print(batch.text)
    print(batch.label)

tensor([[43,  4, 10,  ..., 37, 30, 72],
        [32, 31, 25,  ..., 44, 27, 58],
        [32, 31, 25,  ..., 20, 78,  1],
        ...,
        [39, 37, 48,  ...,  3,  1,  1],
        [44, 14,  6,  ...,  6,  1,  1],
        [25, 44,  9,  ..., 11,  1,  1]])
tensor([-1,  1,  0,  1,  1,  1,  0,  0,  0,  1,  1,  0, -1,  1,  0,  0, -1,  1,
        -1,  1,  0,  0,  1,  0, -1,  1,  0,  1,  0,  1,  1,  1])
tensor([[ 35,  32,   3,  ...,   4,   4,  40],
        [ 25,  22,  10,  ...,  24,  84, 124],
        [ 54,   3,  18,  ...,  29,  74,  80],
        ...,
        [ 44,   5,   8,  ...,   6,  11,   1],
        [ 44,  10,   3,  ...,  23,  14,   1],
        [ 54,   3,  12,  ...,  31,  45,   1]])
tensor([ 1,  1,  1,  1,  1,  1,  0,  0, -1,  1,  1,  0,  1,  1, -1,  0,  1,  0,
         0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  1])
tensor([[ 25,   3,  11,  ...,  16,  24,  84],
        [ 30,  11,   2,  ...,  11,  56,  27],
        [ 25,  53,   4,  ...,   3, 187, 104],
        ...,
        [ 40,  

In [227]:
from torch import nn

class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
        super().__init__()
        # params: "n_" means dimension
        self.n_vocab = n_vocab     # number of unique words in vocabulary
        self.n_layers = n_layers   # number of LSTM layers 
        self.n_hidden = n_hidden   # number of hidden nodes in LSTM
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
    def forward (self, input_words):
                                             # INPUT   :  (batch_size, seq_length)
        batch_size = len(input_words)
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        lstm_out, h = self.lstm(embedded_words)         # (batch_size, seq_length, n_hidden)
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden) # (batch_size*seq_length, n_hidden)
        fc_out = self.fc(lstm_out)                      # (batch_size*seq_length, n_output)
        sigmoid_out = self.sigmoid(fc_out)              # (batch_size*seq_length, n_output)
        sigmoid_out = sigmoid_out.view(batch_size, -1)  # (batch_size, seq_length*n_output)
        
        # extract the output of ONLY the LAST output of the LAST element of the sequence
        sigmoid_last = sigmoid_out[:, -1].view([batch_size,1])               # (batch_size, 1)
        # print("this",sigmoid_last.shape, sigmoid_out.shape, input_words.shape)
        
        return sigmoid_last, h
    
    
    def init_hidden (self, batch_size):  # initialize hidden weights (h,c) to 0
        
        device = "cuda" if torch.cuda.is_available() else "cpu"
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return h

In [228]:
n_vocab = len(text_field.vocab.itos)
n_embed = 400
n_hidden = 512
n_output = 3   # 1 ("positive") or 0 ("negative")
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)

In [261]:
from torch import optim
device = 'cuda' if torch.cuda.is_available else 'cpu'

optimizer = optim.Adam(net.parameters(), lr = 0.001)
# criterion = nn.BCELoss()
criterion = nn.BCEWithLogitsLoss()
model = net.to(device)
criterion = criterion.to(device)

NameError: name 'BCEWithLogitsLoss' is not defined

In [253]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = torch.sum(correct)/ len(correct)

    return acc

In [254]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        h = model.init_hidden(len(batch))
        batch.text = batch.text.to(device)
        predictions, _ = model(batch.text)
        predictions = predictions.squeeze(1)
        # print(batch.text.shape, predictions.shape, batch.label.shape)

        target = torch.tensor(batch.label, dtype=torch.float, device=device)

        loss = criterion(predictions, target)

        acc = binary_accuracy(predictions, target)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()

        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [255]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            h = model.init_hidden(len(batch))
            batch.text = batch.text.to(device)
            predictions, _ = model(batch.text)
            predictions = predictions.squeeze(1)
            target = torch.tensor(batch.label, dtype=torch.float, device=device)


            loss = criterion(predictions, target)
            
            acc = binary_accuracy(predictions, target)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [256]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [258]:
N_EPOCHS = 100

best_valid_loss = float('inf')


for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


Epoch: 01 | Epoch Time: 0m 8s
	Train Loss: -1.381 | Train Acc: 52.15%
	 Val. Loss: 0.438 |  Val. Acc: 49.01%
Epoch: 02 | Epoch Time: 0m 8s
	Train Loss: -1.673 | Train Acc: 52.30%
	 Val. Loss: 0.201 |  Val. Acc: 49.44%
Epoch: 03 | Epoch Time: 0m 8s
	Train Loss: -2.001 | Train Acc: 52.48%
	 Val. Loss: 0.305 |  Val. Acc: 48.98%
Epoch: 04 | Epoch Time: 0m 8s
	Train Loss: -2.155 | Train Acc: 53.03%
	 Val. Loss: 0.655 |  Val. Acc: 48.03%
Epoch: 05 | Epoch Time: 0m 8s
	Train Loss: -2.450 | Train Acc: 52.84%
	 Val. Loss: 0.347 |  Val. Acc: 48.77%
Epoch: 06 | Epoch Time: 0m 8s
	Train Loss: -2.357 | Train Acc: 52.66%
	 Val. Loss: 0.360 |  Val. Acc: 48.76%
Epoch: 07 | Epoch Time: 0m 8s
	Train Loss: -2.605 | Train Acc: 53.19%
	 Val. Loss: 0.195 |  Val. Acc: 48.19%
Epoch: 08 | Epoch Time: 0m 8s
	Train Loss: -2.450 | Train Acc: 52.77%
	 Val. Loss: 0.493 |  Val. Acc: 48.51%
Epoch: 09 | Epoch Time: 0m 8s
	Train Loss: -2.893 | Train Acc: 53.22%
	 Val. Loss: 0.168 |  Val. Acc: 48.39%
Epoch: 10 | Epoch T

Epoch: 77 | Epoch Time: 0m 9s
	Train Loss: -7.419 | Train Acc: 59.97%
	 Val. Loss: 1.513 |  Val. Acc: 47.91%
Epoch: 78 | Epoch Time: 0m 8s
	Train Loss: -7.121 | Train Acc: 60.04%
	 Val. Loss: 1.611 |  Val. Acc: 48.63%
Epoch: 79 | Epoch Time: 0m 9s
	Train Loss: -7.341 | Train Acc: 59.67%
	 Val. Loss: 1.837 |  Val. Acc: 48.39%
Epoch: 80 | Epoch Time: 0m 9s
	Train Loss: -7.481 | Train Acc: 59.46%
	 Val. Loss: 2.769 |  Val. Acc: 47.45%
Epoch: 81 | Epoch Time: 0m 9s
	Train Loss: -7.156 | Train Acc: 59.85%
	 Val. Loss: 2.174 |  Val. Acc: 48.16%
Epoch: 82 | Epoch Time: 0m 9s
	Train Loss: -7.105 | Train Acc: 59.42%
	 Val. Loss: 2.070 |  Val. Acc: 48.23%
Epoch: 83 | Epoch Time: 0m 9s
	Train Loss: -7.393 | Train Acc: 59.50%
	 Val. Loss: 1.698 |  Val. Acc: 48.24%
Epoch: 84 | Epoch Time: 0m 9s
	Train Loss: -7.136 | Train Acc: 59.68%
	 Val. Loss: 1.492 |  Val. Acc: 49.03%
Epoch: 85 | Epoch Time: 0m 9s
	Train Loss: -7.216 | Train Acc: 59.68%
	 Val. Loss: 2.636 |  Val. Acc: 48.35%
Epoch: 86 | Epoch T

In [260]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: -0.107 | Test Acc: 51.59%
