In [None]:
"""
https://www.kaggle.com/kuldeep7688/simple-rnn-using-glove-embeddings-in-pytorch

it uses TorchText
"""

In [48]:
import torch
from torchtext import data

import pandas as pd
import numpy as np
import pyprind
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
import nltk
nltk.download('punkt')
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext

import random
from sklearn.metrics import classification_report

%matplotlib inline

[nltk_data] Downloading package punkt to /home/eduardo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Preparing data for train, validation and test set

In [11]:
main_df = pd.read_csv("../sentiment labelled sentences/imdb_labelled.txt",
                      header=None, sep='\t')
cols = ["text", "target"]
main_df.columns = cols
print(main_df.head())
classes = {0:"negative", 1:"positive"}
print(main_df.shape)

                                                text  target
0  A very, very, very slow-moving, aimless movie ...       0
1  Not sure who was more lost - the flat characte...       0
2  Attempting artiness with black & white and cle...       0
3       Very little music or anything to speak of.         0
4  The best scene in the movie was when Gerardo i...       1
(748, 2)


In [16]:
print(main_df.target.value_counts())
neg_class = main_df.loc[main_df.target == 0, :]
pos_class = main_df.loc[main_df.target == 1, :]
print("negative class shape:",neg_class.shape)
print("positive class shape:",pos_class.shape)

1    386
0    362
Name: target, dtype: int64
negative class shape: (362, 2)
positive class shape: (386, 2)


In [19]:
# preparing balanced test and validation set
# splitting test and train
test_negative = neg_class.iloc[:100, :]
test_positive = pos_class.iloc[:100, :]

valid_negative = neg_class.iloc[100:200, :]
valid_positive = pos_class.iloc[100:200, :]

train_negative = neg_class.iloc[200:, :]
train_positive = pos_class.iloc[200:, :]

train = pd.concat([train_negative, train_positive], axis = 0)
print("train shape:",train.shape)
print(train.target.value_counts())

valid = pd.concat([valid_negative, valid_positive], axis=0)
print("valid shape",valid.shape)
print(valid.target.value_counts())

test = pd.concat([test_negative, test_positive], axis=0)
print("test shape:",test.shape)
print(test.target.value_counts())

train shape: (348, 2)
1    186
0    162
Name: target, dtype: int64
valid shape (200, 2)
1    100
0    100
Name: target, dtype: int64
test shape: (200, 2)
1    100
0    100
Name: target, dtype: int64


Saving files to disk

In [None]:
!mkdir torchtext_data

In [21]:
train.to_csv("torchtext_data/train.csv", index=False)
test.to_csv("torchtext_data/test.csv", index=False)
valid.to_csv("torchtext_data/valid.csv", index=False)

Free up some memory

In [23]:
del main_df, train, test, valid, train_positive, train_negative, test_positive, test_negative, valid_positive, valid_negative

NameError: name 'main_df' is not defined

In [25]:
def tokenizer(text):
    return [tok for tok in nltk.word_tokenize(text)]

TEXT = data.Field(sequential=True, tokenize=tokenizer)
LABEL = data.LabelField(dtype=torch.long, sequential=False)

In [31]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


Loading train, test and validation data

In [57]:
train_data, valid_data, test_data = data.TabularDataset.splits(
    path="torchtext_data/", train="train.csv", validation="valid.csv", test="test.csv",
    format="csv", skip_header=True,
    fields=[("text", TEXT), ("target", LABEL)]
)

print("Number of training examples: {}".format(len(train_data)))
print("Number of validation examples: {}".format(len(valid_data)))
print("Number of testing examples: {}".format(len(test_data)))

Number of training examples: 348
Number of validation examples: 200
Number of testing examples: 200


Load the GloVe embeddings using the train vocalubary

In [35]:
TEXT.build_vocab(train_data, vectors=torchtext.vocab.Vectors("../glove.6B/glove.6B.50d.txt"),
                 max_size=20000, min_freq=10)

LABEL.build_vocab(train_data)

100%|█████████▉| 400000/400001 [00:14<00:00, 26922.49it/s]


In [58]:
print("Unique tokens in TEXT vocabulary: {}".format(len(TEXT.vocab)))
print("Unique tokens in LABEL vocabulary: {}".format(len(LABEL.vocab)))

Unique tokens in TEXT vocabulary: 92
Unique tokens in LABEL vocabulary: 2


Start with the Network

In [71]:
BATCH_SIZE = 20

#keep in mind the sort_key option
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), sort_key=lambda x: len(x.text),
    batch_size=BATCH_SIZE, device=device)

In [55]:
LABEL.vocab.freqs

Counter({'0': 162, '1': 186})

In [72]:
class RNN(nn.Module):

    def __init__(self, input_dims, embedding_dim, hidden_dim, output_dim):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(input_dims, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)

        assert torch.equal(output[-1, :, :], hidden.squeeze(0))

        out = self.fc(hidden)
        return out

In [73]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50
HIDDEN_DIM = 374
OUTPUT_DIM = 2

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

Using the pretrained embeddings

In [74]:
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)
model.embedding.weight.data = pretrained_embeddings.to(device)

torch.Size([92, 50])


Give weights to the classes

In [75]:
class_weights = torch.tensor([1.0, 15.0]).to(device)
print(class_weights)

tensor([ 1., 15.])


In [76]:
optimizer = optim.SGD(model.parameters(), lr=2e-3)
criterion = nn.CrossEntropyLoss(weight=class_weights)

model = model.to(device)
criterion = criterion.to(device)

Training and Evaluating

In [77]:
def binary_accuracy(preds, y):
    """
    returns accuracy per batch, i.e., if you get 8/10 right, this return 0.8, not 8
    :param preds:
    :param y:
    :return:
    """
    preds, ind = torch.max(F.softmax(preds, dim=-1), 1)
    correct = (ind == y).float()
    acc = correct.sum()/float(len(correct))
    return acc

In [78]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    bar = pyprind.ProgBar(len(iterator), bar_char="█")
    for batch in iterator:

        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(0)
        loss = criterion(predictions, batch.target)
        acc = binary_accuracy(predictions, batch.target)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        bar.update()

    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [79]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        bar = pyprind.ProgBar(len(iterator), bar_char="█")
        for batch in iterator:
            predictions = model(batch.text).squeeze(0)
            loss = criterion(predictions, batch.target)
            acc = binary_accuracy(predictions, batch.target)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            bar.update()

    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [85]:
N_EPOCHS = 2

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    print('| Epoch:{} | Train Loss: {} | Train Acc: {}% | Val. Loss: {} | Val. Acc: {}% |'
          .format(epoch+1, train_loss, train_acc*100, valid_loss, valid_acc*100))

0% [██████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:02
0% [██████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00
0% [██████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:02
0% [██████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00


| Epoch:1 | Train Loss: 0.4528486678997676 | Train Acc: 46.66666669978036% | Val. Loss: 0.4330769807100296 | Val. Acc: 50.0% |

| Epoch:2 | Train Loss: 0.430962140361468 | Train Acc: 46.25000043047799% | Val. Loss: 0.410518479347229 | Val. Acc: 50.0% |



Testing

In [86]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print('| Test Loss: {} | Test Acc: {}% |'.format(test_loss, test_acc*100))

0% [██████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00


| Test Loss: 0.40702800154685975 | Test Acc: 50.0% |


Make predictions

In [98]:
def predict_sentiment(sentence):
    tokenized = nltk.word_tokenize(sentence)
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    #print(tensor.shape)
    tensor = tensor.unsqueeze(1)
    #print(tensor.shape)
    prediction = model(tensor)
    preds, ind = torch.max(F.softmax(prediction.squeeze(0), dim=-1), 1)

    return preds, ind

In [107]:
text = "My voice range is A2-C5. My chest voice goes up to F4. " \
       "Included sample in my higher chest range. What is my voice type?"
print(classes[predict_sentiment(text)[1].item()])




positive
