In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
random.seed(0)
seed = 0

In [16]:
filename = 'data/train_conll_spanglish.csv'
import torchtext

def label2int(label):
    if label=='positive':
        return 1
    elif label=='negative':
        return -1
    else:
        return 0

text_field = torchtext.data.Field(sequential=True,      # text sequence
                                  tokenize=lambda x: x, # because are building a character-RNN
                                  include_lengths=True, # to track the length of sequences, for batching
                                  batch_first=True,
                                  use_vocab=True)       # to turn each character into an integer index
label_field = torchtext.data.Field(sequential=False,    # not a sequence
                                   use_vocab=False,     # don't need to track vocabulary
                                   is_target=True,      
                                   batch_first=True,
                                   preprocessing=lambda x: label2int(x)) # convert text to 0 and 1

fields = [('id', None),('text', text_field), ('label', label_field)]
dataset = torchtext.data.TabularDataset(filename, # name of the file
                                        "tsv",               # fields are separated by a tab
                                        fields)


In [17]:
for i in range(0,10):
    print(dataset[i].text, "---", dataset[i].label)

So that means tomorrow cruda segura lol --- 1
Tonight peda segura --- 0
Eres tan mala vieja bruja interesada#jamming --- -1
Yo kiero Pretzels lol --- 0
Fuck that ni ke el me vaya a mantener toda la vida lol --- -1
I always tell my dad ke me kiero kasar con una vieja rika and me regaña telling me ke no sea interesada ha --- -1
Ke me compre un carrito pa irme con mis friends and party lol --- 0
Why can I just find a rich bitch ke me mantenga y ya ha --- 0
Since I started working ya ni disfruto la vida lol --- -1
My dad me regano cuzs I was telling that to my brother and lo andaba molestando lol --- -1


In [18]:
train, val, test = dataset.split(split_ratio=[0.8,0.1,0.1])

In [19]:
print(len(train), len(val), len(test), len(dataset))

12000 1500 1500 15000


In [20]:
text_field.build_vocab(dataset)
text_field.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f1c81f7c2d0>>,
            {'<unk>': 0,
             '<pad>': 1,
             ' ': 2,
             'e': 3,
             'a': 4,
             'o': 5,
             't': 6,
             's': 7,
             'n': 8,
             'i': 9,
             'r': 10,
             'l': 11,
             'c': 12,
             'd': 13,
             'u': 14,
             'm': 15,
             'h': 16,
             'p': 17,
             'y': 18,
             '.': 19,
             '/': 20,
             'g': 21,
             'b': 22,
             'v': 23,
             ':': 24,
             '@': 25,
             '!': 26,
             'f': 27,
             'A': 28,
             'j': 29,
             'E': 30,
             'T': 31,
             'R': 32,
             'q': 33,
             'k': 34,
             '#': 35,
             'w': 36,
             'M': 37,
             'S': 38,
             'O': 39,
             'L':

In [21]:
text_field.vocab.itos

['<unk>',
 '<pad>',
 ' ',
 'e',
 'a',
 'o',
 't',
 's',
 'n',
 'i',
 'r',
 'l',
 'c',
 'd',
 'u',
 'm',
 'h',
 'p',
 'y',
 '.',
 '/',
 'g',
 'b',
 'v',
 ':',
 '@',
 '!',
 'f',
 'A',
 'j',
 'E',
 'T',
 'R',
 'q',
 'k',
 '#',
 'w',
 'M',
 'S',
 'O',
 'L',
 'I',
 'N',
 'P',
 'C',
 'D',
 'z',
 ',',
 'G',
 'B',
 '1',
 'Y',
 '0',
 'J',
 'H',
 'U',
 'F',
 'x',
 'V',
 '2',
 '_',
 'í',
 'W',
 '"',
 '3',
 'K',
 '7',
 '?',
 'Z',
 '6',
 'Q',
 '4',
 '5',
 '8',
 '9',
 'á',
 "'",
 'ñ',
 '…',
 'é',
 'X',
 'ó',
 '-',
 '😂',
 ')',
 '(',
 '️',
 '|',
 '❤',
 '😍',
 'ú',
 '’',
 '“',
 '”',
 '^',
 '&',
 '¿',
 '😭',
 '*',
 '>',
 '$',
 ';',
 '😁',
 '¡',
 '👌',
 '・',
 '🔥',
 '🏼',
 '😱',
 '☺',
 '🚨',
 '🎉',
 '👏',
 'Í',
 '😘',
 'É',
 '😝',
 '😎',
 '🏻',
 '🏽',
 '🙌',
 '💕',
 '😊',
 '~',
 '♥',
 '•',
 'Á',
 'Ñ',
 '<',
 '🎵',
 '😩',
 '💀',
 '+',
 '😜',
 '👍',
 '😋',
 '😒',
 '✌',
 '😏',
 '😡',
 '%',
 'Ó',
 '👗',
 '💃',
 '💋',
 '💪',
 '=',
 '☀',
 '💁',
 '😔',
 '🎶',
 '😉',
 '😳',
 '😌',
 '🤔',
 '💯',
 '🍻',
 '👇',
 '😈',
 '🙄',
 '🙋',
 '👠',
 '🌸',
 '😅',
 '😫',
 '

In [22]:
train_iter = torchtext.data.BucketIterator(train,
                                           batch_size=32,
                                           sort_key=lambda x: len(x.text), # to minimize padding
                                           sort_within_batch=True,        # sort within each batch
                                           repeat=True)                   # repeat the iterator for multiple epochs
val_iter = torchtext.data.BucketIterator(val,
                                           batch_size=32,
                                           sort_key=lambda x: len(x.text), # to minimize padding
                                           sort_within_batch=True,        # sort within each batch
                                           repeat=True)                   # repeat the iterator for multiple epochs
test_iter = torchtext.data.BucketIterator(test,
                                           batch_size=32,
                                           sort_key=lambda x: len(x.text), # to minimize padding
                                           sort_within_batch=True,        # sort within each batch
                                           repeat=True)                   # repeat the iterator for multiple epochs



In [23]:
for i, batch in enumerate(train_iter):
    if i >= 10:
        break
    print(batch.text)
    print(batch.label)

(tensor([[43,  4, 10,  ..., 37, 30, 72],
        [32, 31, 25,  ..., 44, 27, 58],
        [32, 31, 25,  ..., 20, 78,  1],
        ...,
        [39, 37, 48,  ...,  3,  1,  1],
        [44, 14,  6,  ...,  6,  1,  1],
        [25, 44,  9,  ..., 11,  1,  1]]), tensor([132, 132, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131,
        131, 131, 131, 131, 131, 131, 131, 131, 131, 130, 130, 130, 130, 130,
        130, 130, 130, 130]))
tensor([-1,  1,  0,  1,  1,  1,  0,  0,  0,  1,  1,  0, -1,  1,  0,  0, -1,  1,
        -1,  1,  0,  0,  1,  0, -1,  1,  0,  1,  0,  1,  1,  1])
(tensor([[ 35,  32,   3,  ...,   4,   4,  40],
        [ 25,  22,  10,  ...,  24,  84, 124],
        [ 54,   3,  18,  ...,  29,  74,  80],
        ...,
        [ 44,   5,   8,  ...,   6,  11,   1],
        [ 44,  10,   3,  ...,  23,  14,   1],
        [ 54,   3,  12,  ...,  31,  45,   1]]), tensor([122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 121, 121, 121,
        121, 121, 121, 121, 121, 121, 121, 

In [24]:
from torch import nn

class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
        super().__init__()
        # params: "n_" means dimension
        self.n_vocab = n_vocab     # number of unique words in vocabulary
        self.n_layers = n_layers   # number of LSTM layers 
        self.n_hidden = n_hidden   # number of hidden nodes in LSTM
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):
                                             # INPUT   :  (batch_size, seq_length)
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        lstm_out, h = self.lstm(embedded_words)         # (batch_size, seq_length, n_hidden)
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden) # (batch_size*seq_length, n_hidden)
        fc_out = self.fc(lstm_out)                      # (batch_size*seq_length, n_output)
        sigmoid_out = self.sigmoid(fc_out)              # (batch_size*seq_length, n_output)
        sigmoid_out = sigmoid_out.view(batch_size, -1)  # (batch_size, seq_length*n_output)
        
        # extract the output of ONLY the LAST output of the LAST element of the sequence
        sigmoid_last = sigmoid_out[:, -1]               # (batch_size, 1)
        
        return sigmoid_last, h
    
    
    def init_hidden (self, batch_size):  # initialize hidden weights (h,c) to 0
        
        device = "cuda" if torch.cuda.is_available() else "cpu"
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return h

In [25]:
n_vocab = len(text_field.vocab.itos)
n_embed = 400
n_hidden = 512
n_output = 3   # 1 ("positive") or 0 ("negative")
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)

In [26]:
from torch import optim

criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)


In [12]:
def get_accuracy(model, data_iter):
    correct, total = 0, 0
    for i, batch in enumerate(data_iter):
        output = model(batch.text[0]) # You may need to modify this, depending on your model setup
        print(output)
        pred = output.max(1, keepdim=True)[1]
        correct += pred.eq(batch.label.view_as(pred)).sum().item()
        total += batch.text[1].shape[0]
    return correct / total

In [13]:
###############################################################################
##########################  11. TRAIN THE NETWORK!  ###########################
###############################################################################
print_every = 100
step = 0
n_epochs = 4  # validation loss increases from ~ epoch 3 or 4
clip = 5  # for gradient clip to prevent exploding gradient problem in LSTM/RNN
device = 'cuda' if torch.cuda.is_available else 'cpu'

for epoch in range(n_epochs):
    h = net.init_hidden(batch_size)
    
    for inputs, labels in train_iter:
        step += 1
        
        # making requires_grad = False for the latest set of h
        h = tuple([each.data for each in h])   
        
        net.zero_grad()
        output, h = net(inputs)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            ######################
            ##### VALIDATION #####
            ######################
            net.eval()
            valid_losses = []
            v_h = net.init_hidden(batch_size)
            
            for v_inputs, v_labels in valid_loader:
                # v_inputs, v_labels = inputs.to(device), labels.to(device)
        
                v_h = tuple([each.data for each in v_h])
                
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()

NameError: name 'net' is not defined