# Dataset

In [None]:
# Colab only: mount Google Drive at /content/drive so the dataset and the
# embedding file stored under MyDrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download the Spanish spaCy model that is later loaded by name, both as the
# tokenizer of the TEXT field and in predict_sentiment.
!python -m spacy download es_core_news_sm

Collecting es_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.2.5/es_core_news_sm-2.2.5.tar.gz (16.2MB)
[K     |████████████████████████████████| 16.2MB 19.6MB/s 
Building wheels for collected packages: es-core-news-sm
  Building wheel for es-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for es-core-news-sm: filename=es_core_news_sm-2.2.5-cp37-none-any.whl size=16172935 sha256=6599daf09e03dd62d221593bc6b0fba9ed48f49b7f0e248e103d83477ebb5426
  Stored in directory: /tmp/pip-ephem-wheel-cache-zw1q2cly/wheels/05/4f/66/9d0c806f86de08e8645d67996798c49e1512f9c3a250d74242
Successfully built es-core-news-sm
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_sm')


In [None]:
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets

# Fix the RNG seed and ask cuDNN for deterministic kernels so runs are repeatable.
SEED = 1234
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)

# Text field: tokenized with the Spanish spaCy model. include_lengths makes every
# batch carry a (token_ids, lengths) pair, which the model needs to pack sequences.
TEXT = data.Field(
    tokenize='spacy',
    tokenizer_language='es_core_news_sm',
    include_lengths=True,
)

# Label field: float dtype to match the BCEWithLogitsLoss criterion used for training.
SENTIMENT = data.LabelField(dtype=torch.float)

In [None]:
# JSON key -> (batch attribute name, Field). Batches therefore expose the text
# as `batch.t` and the label as `batch.s`.
fields = {
    'texto': ('t', TEXT),
    'sentimiento': ('s', SENTIMENT),
}

In [None]:
# Load the three JSON splits from Drive; every record is parsed through `fields`.
# NOTE: the path is relative, so this only resolves when the working directory
# is the parent of the mounted `drive` folder (/content on Colab).
train_data, valid_data, test_data = data.TabularDataset.splits(
    path='drive/MyDrive/Saturdays.AI/data_sentimiento',
    format='json',
    fields=fields,
    train='train.json',
    validation='valid.json',
    test='test.json',
)

In [None]:
import torchtext.vocab as vocab

# Upper bound on the number of regular tokens kept in the vocabulary
# (the special tokens are added on top of this).
MAX_VOCAB_SIZE = 4000

# Pre-trained Spanish word vectors, read from (and cached in) Drive.
# unk_init has to be given HERE: when build_vocab receives an already-constructed
# Vectors object it does not forward its own unk_init argument, so tokens missing
# from the vector file would silently fall back to the Vectors default (all zeros).
spanish_embeddings = vocab.Vectors('SBW-vectors-300-min5.txt',
                                   cache='drive/MyDrive/Saturdays.AI',
                                   unk_init=torch.Tensor.normal_)

# Both vocabularies are built from the training split only, so nothing from the
# validation/test data leaks into the token or label inventories.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE, vectors = spanish_embeddings)
SENTIMENT.build_vocab(train_data)

In [None]:
# Mini-batch size shared by the three iterators.
BATCH_SIZE = 64

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Bucket examples of similar length together to minimise padding.
# sort_within_batch orders every batch by `sort_key` (token count), which the
# model's pack_padded_sequence call relies on.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.t),
    sort_within_batch=True,
    device=device,
)

# Modelo

In [None]:
import torch.nn as nn

class BiLSTM(nn.Module):
  """Stacked (optionally bidirectional) LSTM classifier over token ids.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    hidden_dim: hidden size of the LSTM, per direction.
    output_dim: number of logits produced per example.
    n_layers: number of stacked LSTM layers.
    bidirectional: whether the LSTM also reads the sequence right-to-left.
    dropout: dropout probability applied to the embeddings, between stacked
      LSTM layers, and to the final hidden state.
    pad_idx: vocabulary index of the padding token (passed as padding_idx to
      the embedding layer).
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

    # nn.LSTM applies dropout only between stacked layers; a non-zero value
    # with a single layer has no effect and only triggers a warning.
    self.rnn = nn.LSTM(embedding_dim,
                       hidden_dim,
                       num_layers=n_layers,
                       bidirectional=bidirectional,
                       dropout=dropout if n_layers > 1 else 0.0)

    # The classifier receives one final hidden state per direction.
    num_directions = 2 if bidirectional else 1
    self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, text, text_lengths):
    """Return the logits for a batch of padded sequences.

    Args:
      text: token ids laid out sequence-first, i.e. (seq_len, batch), because
        the LSTM is created without batch_first.
      text_lengths: true length of every sequence in the batch, in
        descending order (pack_padded_sequence enforces this by default).

    Returns:
      Tensor of shape (batch, output_dim) with raw, un-squashed logits.
    """
    embedded = self.dropout(self.embedding(text))
    # Packing lets the LSTM skip padded positions; lengths must live on the CPU.
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
    # Only the final hidden state is used, so the per-step outputs are not unpacked.
    _, (hidden, _) = self.rnn(packed_embedded)

    if self.rnn.bidirectional:
      # Last layer: hidden[-2] is the forward direction, hidden[-1] the backward one.
      last_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
    else:
      last_hidden = hidden[-1,:,:]

    return self.fc(self.dropout(last_hidden))

In [None]:
# Model hyper-parameters.
INPUT_DIM = len(TEXT.vocab)                 # vocabulary size, special tokens included
EMBEDDING_DIM = 300                         # has to match the width of the pre-trained vectors
HIDDEN_DIM = 450
OUTPUT_DIM = 1                              # a single logit for the binary label
N_LAYERS = 3
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]   # index of the padding token

model = BiLSTM(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 13,643,101 trainable parameters


In [None]:
# Vectors attached to the vocabulary by build_vocab; the printed shape is
# (number of vocabulary entries, embedding width).
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([4002, 300])


In [None]:
# Overwrite the embedding table in place with the vocabulary's pre-trained vectors.
model.embedding.weight.data.copy_(TEXT.vocab.vectors)

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [-3.9439e-02, -4.2805e-02, -5.0221e-02,  ..., -2.0496e-02,
          2.3131e-02, -8.1015e-02],
        [ 4.4232e-02, -5.5097e-02,  4.4623e-02,  ..., -7.1947e-02,
          7.4214e-02, -2.5300e-02],
        [-6.0870e-02, -4.6083e-02,  9.3000e-05,  ..., -8.2717e-02,
          1.1736e-01, -4.8698e-02]])

In [None]:
# Explicitly zero the <unk> and <pad> rows so neither special token starts out
# with a meaningful (or randomly initialised) embedding. UNK_IDX was previously
# computed but never used.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# Sanity check: display the embedding row of one ordinary vocabulary entry (index 4).
model.embedding.weight.data[4]

tensor([-2.9648e-02,  1.1336e-02,  1.9949e-02, -8.8832e-02, -2.5225e-02,
         5.6844e-02,  2.5473e-02,  1.4068e-02,  1.6369e-01, -6.7154e-02,
         1.4738e-02,  2.7134e-02,  6.6443e-02, -4.4846e-02, -4.4987e-02,
        -4.0898e-02,  3.0311e-02,  3.4196e-02, -4.9240e-02,  8.5370e-03,
        -6.8091e-02, -8.7938e-02,  3.5300e-02,  1.4939e-01, -1.2350e-02,
         1.2613e-02,  2.9350e-02,  6.9596e-02,  3.9111e-02,  5.7652e-02,
         6.9954e-02, -6.6217e-02, -4.1784e-02,  2.8623e-02,  2.6772e-02,
        -6.6392e-02,  2.9530e-03, -1.2188e-02, -3.0363e-02,  4.0222e-02,
         3.4858e-02,  2.7469e-02, -2.9034e-02, -4.8748e-02, -3.8582e-02,
        -5.1553e-02, -3.3501e-02, -1.9008e-02,  3.0430e-03,  1.1071e-01,
        -2.5096e-02,  1.1108e-01,  3.5244e-02,  1.1421e-01,  1.0195e-02,
         5.1511e-02, -4.0649e-02, -1.1394e-01,  4.4873e-02,  5.2011e-02,
         6.7360e-02,  4.9054e-02, -1.2709e-01, -3.1846e-02,  3.2848e-02,
         4.0825e-02, -8.4873e-02,  5.9801e-02, -6.7

# Train

In [None]:
import torch.optim as optim
# Adam over every model parameter; no learning rate or other option is passed,
# so the library defaults apply.
optimizer = optim.Adam(model.parameters())

In [None]:
# Binary cross-entropy computed directly on raw logits (the sigmoid is applied
# inside the loss). Loss and model are both placed on the selected device.
criterion = nn.BCEWithLogitsLoss().to(device)
model = model.to(device)

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch, as a tensor (8 right out of 10 -> 0.8, not 8).

    `preds` are raw logits: they are squashed with a sigmoid and rounded to 0/1
    before being compared element-wise with the targets `y`.
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()  # bool -> float so the sum can be divided
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
  """Run one optimisation pass over `iterator`.

  Every batch must expose `.t` as a (token_ids, lengths) pair and `.s` as the
  targets. Returns a (loss, accuracy) tuple, each averaged over the number of
  batches in the iterator.
  """
  model.train()  # training mode (dropout active)

  running_loss, running_acc = 0, 0
  for batch in iterator:
    tokens, lengths = batch.t
    targets = batch.s

    optimizer.zero_grad()
    # The model returns (batch, 1); drop the trailing dimension to match the targets.
    logits = model(tokens, lengths).squeeze(1)
    batch_loss = criterion(logits, targets)
    batch_acc = binary_accuracy(logits, targets)

    batch_loss.backward()
    optimizer.step()

    running_loss += batch_loss.item()
    running_acc += batch_acc.item()

  n_batches = len(iterator)
  return running_loss / n_batches, running_acc / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating any weights.

    Every batch must expose `.t` as a (token_ids, lengths) pair and `.s` as the
    targets. Returns a (loss, accuracy) tuple, each averaged over the number of
    batches in the iterator.
    """
    model.eval()  # evaluation mode (dropout disabled)

    loss_sum = 0
    acc_sum = 0
    # No gradients are needed while evaluating.
    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.t
            # The model returns (batch, 1); drop the trailing dimension to match the targets.
            logits = model(tokens, lengths).squeeze(1)
            loss_sum += criterion(logits, batch.s).item()
            acc_sum += binary_accuracy(logits, batch.s).item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and whole seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
# Number of full passes over the training iterator.
N_EPOCHS = 10

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # Time one training pass plus one validation pass.
    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves, so the saved file holds
    # the best weights seen so far rather than those of the last epoch.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 2s
	Train Loss: 0.701 | Train Acc: 42.07%
	 Val. Loss: 0.691 |  Val. Acc: 58.77%
Epoch: 02 | Epoch Time: 0m 1s
	Train Loss: 0.695 | Train Acc: 50.00%
	 Val. Loss: 0.686 |  Val. Acc: 59.38%
Epoch: 03 | Epoch Time: 0m 1s
	Train Loss: 0.620 | Train Acc: 71.63%
	 Val. Loss: 0.599 |  Val. Acc: 76.39%
Epoch: 04 | Epoch Time: 0m 1s
	Train Loss: 0.430 | Train Acc: 83.89%
	 Val. Loss: 0.413 |  Val. Acc: 89.58%
Epoch: 05 | Epoch Time: 0m 1s
	Train Loss: 0.350 | Train Acc: 86.78%
	 Val. Loss: 0.361 |  Val. Acc: 88.19%
Epoch: 06 | Epoch Time: 0m 1s
	Train Loss: 0.419 | Train Acc: 80.65%
	 Val. Loss: 0.465 |  Val. Acc: 73.44%
Epoch: 07 | Epoch Time: 0m 1s
	Train Loss: 0.368 | Train Acc: 83.77%
	 Val. Loss: 0.564 |  Val. Acc: 79.51%
Epoch: 08 | Epoch Time: 0m 1s
	Train Loss: 0.363 | Train Acc: 86.30%
	 Val. Loss: 0.948 |  Val. Acc: 54.60%
Epoch: 09 | Epoch Time: 0m 1s
	Train Loss: 0.526 | Train Acc: 75.84%
	 Val. Loss: 0.738 |  Val. Acc: 49.65%
Epoch: 10 | Epoch Time: 0m 1

In [None]:
# Restore the checkpoint with the lowest validation loss before the final test run.
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU runtime can still be loaded on a CPU-only one.
model.load_state_dict(torch.load('tut2-model.pt', map_location=device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.273 | Test Acc: 92.53%


# Test

In [None]:
import spacy
# Load the same Spanish spaCy model that the TEXT field names as its tokenizer,
# so sentences scored at inference time are split the same way as the training data.
nlp = spacy.load('es_core_news_sm')

def predict_sentiment(model, sentence):
    """Score one raw sentence with the trained model.

    The sentence is tokenized with the notebook-level spaCy pipeline `nlp`,
    mapped to ids through `TEXT.vocab`, and fed to the model as a batch of one.

    Args:
        model: trained network, called as model(token_ids, lengths).
        sentence: raw text to score.

    Returns:
        float: sigmoid of the model's single logit, i.e. the probability
        assigned to the label that was encoded as 1 during training.
        NOTE(review): which sentiment string is encoded as 1 is decided by
        SENTIMENT.vocab - check SENTIMENT.vocab.stoi before reading a high
        score as "positive".

    Raises:
        ValueError: if the sentence yields no tokens. The BiLSTM defined in
            this notebook packs its input and cannot handle a zero-length
            sequence.
    """
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError('sentence must contain at least one token')
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length_tensor = torch.LongTensor([len(indexed)])
    # Shape (seq_len, 1): sequence-first layout with a batch of one.
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
    # Inference only: skip autograd bookkeeping, as evaluate() does.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [None]:
# NOTE(review): the returned score is the sigmoid of the model's logit, i.e. the
# probability of whichever label SENTIMENT.vocab encodes as 1. Confirm the mapping
# with SENTIMENT.vocab.stoi before interpreting a low value as negative sentiment.
predict_sentiment(model, "Servicio excelente")

0.08902443200349808