![Pipeline](pipeline.jpg)

# Using LSTM for Sentiment Analysis of Amazon Reviews

Reference: https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/

## Part 1: Data Cleaning

In [109]:
TRAIN_SIZE = 800000 #3600000
TEST_SIZE = 200000 #40000

In [110]:
def file_to_list(fileName, size):
    lines = []
    with open(fileName) as file:
        for i in range(size):
            lines.append(file.readline())
    return lines

train_set = file_to_list('train.ft.txt', TRAIN_SIZE)
test_set = file_to_list('test.ft.txt', TEST_SIZE)

In [111]:
# Viewing the training set...
train_set[0]

'__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n'

## Separate sentence from label

\__label__2 <review>     <--- Positive Sentiment
                             
\__label__1 <review>     <--- Negative Sentiment

In [112]:
def extract_data(dataset): #Returns: (sentence,label)
    label = []
    sentences = []
    positive = 0
    negative = 0
    for line in dataset:
        sentences.append(line[11:])
        if line[9] == '2':
            positive += 1
            label.append(1) # Positive Sentiment
        else:
            negative += 1   # Negative Sentiment
            label.append(0)
    return sentences, label, positive, negative


In [113]:
print('Labelling Training Set...')
train_sentences, train_label, positive, negative = extract_data(train_set)
print(f'Found {positive}+ sentiments, {negative}- sentiments in Training Set')

print('Labelling Test Set...')
test_sentences, test_label, positive, negative = extract_data(test_set)
print(f'Found {positive}+ sentiments, {negative}- sentiments in Test Set')

Labelling Training Set...
Found 404094+ sentiments, 395906- sentiments in Training Set
Labelling Test Set...
Found 100565+ sentiments, 99435- sentiments in Test Set


In [114]:
# Cleaning the data
# replace \d with 0 in the vector
# Modify URLS to <url>
import re
from tqdm.autonotebook import tqdm # Progress Bar
for i in tqdm(range(len(train_sentences))):
    train_sentences[i] = re.sub('\d','0',train_sentences[i])
    if 'www.' in train_sentences[i] or 'http:' in train_sentences[i] or 'https:' in train_sentences[i] or '.com' in train_sentences[i]:
        train_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", train_sentences[i])
        
for i in tqdm(range(len(test_sentences))):
    test_sentences[i] = re.sub('\d','0',test_sentences[i])
    if 'www.' in test_sentences[i] or 'http:' in test_sentences[i] or 'https:' in test_sentences[i] or '.com' in test_sentences[i]:
        test_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", test_sentences[i])

HBox(children=(IntProgress(value=0, max=800000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))




In [115]:
# Remove empty strings
train_sentences = list(filter(None, train_sentences))
test_sentences = list(filter(None, test_sentences))
print(f'Valid Training Sentences: {len(train_sentences)}')
print(f'Valid Test Sentences: {len(test_sentences)}')

Valid Training Sentences: 800000
Valid Test Sentences: 200000


## Tokenization
Use ntlk.regexp_tokenize for faster tokenization over ntlk.word_tokenize. Regexp tokenizer is around 6x faster

Reference: https://towardsdatascience.com/benchmarking-python-nlp-tokenizers-3ac4735100c5

In [116]:
# Word to Frequency
from collections import Counter
import nltk
nltk.download('punkt') # Tokenizer
words = Counter()
for i, sentence in enumerate(tqdm(train_sentences)):
    try:
        #tokens = nltk.word_tokenize(sentence)
        tokens = nltk.regexp_tokenize(sentence, pattern="\s+", gaps = True)
        train_sentences[i] = []
        for word in tokens: # Tokenize the words
            words.update([word.lower()]) # To Lower Case
            train_sentences[i].append(word)
    except:
        print(sentence)
print("100% done")

[nltk_data] Downloading package punkt to /home/ryzen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


HBox(children=(IntProgress(value=0, max=800000), HTML(value='')))


100% done


In [117]:
# Remove infrequent words (i.e. words that only appear once)
words = {k:v for k,v in words.items() if v>1}
# Sort the words according to frequency, descending order
words = sorted(words, key=words.get, reverse=True)
# Add padding & unknown to corpus
words = ['_PAD','_UNK'] + words

In [14]:
# Dictionaries for fast mappings
word2idx = {w:i for i,w in enumerate(words)}
idx2word = {i:w for i,w in enumerate(words)}

In [118]:
# Convert word to indices
for i, sentence in enumerate(tqdm(train_sentences)):
    train_sentences[i] = [word2idx[word] if word in word2idx else 0 for word in sentence]
    
for i, sentence in enumerate(tqdm(test_sentences)):
    # For test sentences, we have to tokenize the sentences as well
    test_sentences[i] = [word2idx[word.lower()] if word in word2idx else 0 for word in nltk.regexp_tokenize(sentence, pattern="\s+", gaps = True)]

HBox(children=(IntProgress(value=0, max=800000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))




In [119]:
# save clean data
import pickle
with open('train_sentences.data', 'wb') as file:
    pickle.dump(train_sentences, file)
with open('test_sentences.data', 'wb') as file:
    pickle.dump(test_sentences, file)
with open('train_label.data', 'wb') as file:
    pickle.dump(train_label, file)
with open('test_label.data', 'wb') as file:
    pickle.dump(test_label, file)
with open('words.data', 'wb') as file:
    pickle.dump(words, file)

# Load Saved Data

In [34]:
# load data
from tqdm.autonotebook import tqdm
import pickle
import nltk
with open('train_sentences.data', 'rb') as file:
    train_sentences = pickle.load(file)
with open('test_sentences.data', 'rb') as file:
    test_sentences = pickle.load(file)
with open('train_label.data', 'rb') as file:
    train_label = pickle.load(file)
with open('test_label.data', 'rb') as file:
    test_label = pickle.load(file)
with open('words.data', 'rb') as file:
    words = pickle.load(file)

In [36]:
train_sentences[0]

array([     0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
      

In [37]:
# Padding
import numpy as np
def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len), dtype=int)
    for i, review in enumerate(tqdm(sentences)):
        if len(review) != 0:
            features[i, -len(review):] = np.array(review)[:seq_len]
    return features

SEQUENCE_LENGTH = 200
train_sentences = pad_input(train_sentences, SEQUENCE_LENGTH)
test_sentences = pad_input(test_sentences, SEQUENCE_LENGTH)

# Label to numpy array
train_label = np.array(train_label)
test_label = np.array(test_label)

HBox(children=(IntProgress(value=0, max=800000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




In [38]:
train_sentences[0]

array([     0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
      

In [39]:
TEST_VALIDATION_SPLIT = 0.5
split_index = int(TEST_VALIDATION_SPLIT * len(test_sentences))
val_sentences, test_sentences = test_sentences[:split_index], test_sentences[split_index:]
val_label, test_label = test_label[:split_index], test_label[split_index:]

## Neural Network Model

In [40]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

train_data = TensorDataset(
    torch.from_numpy(train_sentences),
    torch.from_numpy(train_label))
test_data = TensorDataset(
    torch.from_numpy(test_sentences),
    torch.from_numpy(test_label))
val_data = TensorDataset(
    torch.from_numpy(val_sentences),
    torch.from_numpy(val_label))

BATCH_SIZE = 100

train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_data, shuffle=True, batch_size=BATCH_SIZE)

## Use GPU (CUDA)

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using {device} in training models.')

Using cuda in training models.


In [42]:
class NLP(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(NLP, self).__init__()
        
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # Layers:
        # Embedding -> LSTM (n_layers times) -> Fully Connected
        
        # Create Embeddings
        # Word to Vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Define LSTM Model
        # nn.LSTM(input, hidden, num_hidden_layers, dropout, batch_first=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        # Dropout (Deactivate some neurons randomly)
        self.dropout = nn.Dropout(drop_prob)
        # Define the Fully Connected Layers
        self.fc = nn.Linear(hidden_dim, output_size)
        
        
        # Activation Function
        self.sigmoid = nn.Sigmoid()
        
        
    def forward(self, x, hidden):
        batch_size = x.size(0) # Rows
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (
            weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
            weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device)
                 )
        return hidden

In [43]:
vocab_size = len(word2idx) + 1
output_size = 1
embedding_dim = 250
hidden_dim = 350
n_layers = 2

model = NLP(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model.to(device)

lr=0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## Training the model

In [45]:
epochs = 2
counter = 0
print_every = 100
clip = 5
valid_loss_min = np.Inf

model.train()
for i in range(epochs):
    h = model.init_hidden(BATCH_SIZE)
    
    for inputs, labels in train_loader:
        counter += 1
        h = tuple([e.data for e in h])
        
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        if counter%print_every == 0:
            val_h = model.init_hidden(BATCH_SIZE)
            val_losses = []
            for inp, lab in val_loader:
                val_h = tuple([each.data for each in val_h])
                inp, lab = inp.to(device), lab.to(device)
                out, val_h = model(inp, val_h)
                val_loss = criterion(out.squeeze(), lab.float())
                val_losses.append(val_loss.item())
            model.train()
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)

Epoch: 1/2... Step: 100... Loss: 0.477714... Val Loss: 0.459629
Validation loss decreased (inf --> 0.459629).  Saving model ...
Epoch: 1/2... Step: 200... Loss: 0.315157... Val Loss: 0.337323
Validation loss decreased (0.459629 --> 0.337323).  Saving model ...
Epoch: 1/2... Step: 300... Loss: 0.360729... Val Loss: 0.307392
Validation loss decreased (0.337323 --> 0.307392).  Saving model ...
Epoch: 1/2... Step: 400... Loss: 0.354569... Val Loss: 0.291516
Validation loss decreased (0.307392 --> 0.291516).  Saving model ...
Epoch: 1/2... Step: 500... Loss: 0.327683... Val Loss: 0.278473
Validation loss decreased (0.291516 --> 0.278473).  Saving model ...
Epoch: 1/2... Step: 600... Loss: 0.287981... Val Loss: 0.281021
Epoch: 1/2... Step: 700... Loss: 0.211321... Val Loss: 0.264161
Validation loss decreased (0.278473 --> 0.264161).  Saving model ...
Epoch: 1/2... Step: 800... Loss: 0.517902... Val Loss: 0.281967
Epoch: 1/2... Step: 900... Loss: 0.225072... Val Loss: 0.257904
Validation loss

Epoch: 2/2... Step: 10200... Loss: 0.201613... Val Loss: 0.219252
Epoch: 2/2... Step: 10300... Loss: 0.249252... Val Loss: 0.215504
Epoch: 2/2... Step: 10400... Loss: 0.190738... Val Loss: 0.216093
Epoch: 2/2... Step: 10500... Loss: 0.237577... Val Loss: 0.217155
Epoch: 2/2... Step: 10600... Loss: 0.243320... Val Loss: 0.233817
Epoch: 2/2... Step: 10700... Loss: 0.265506... Val Loss: 0.217724
Epoch: 2/2... Step: 10800... Loss: 0.309450... Val Loss: 0.219982
Epoch: 2/2... Step: 10900... Loss: 0.239879... Val Loss: 0.223056
Epoch: 2/2... Step: 11000... Loss: 0.347995... Val Loss: 0.217781
Epoch: 2/2... Step: 11100... Loss: 0.167716... Val Loss: 0.226556
Epoch: 2/2... Step: 11200... Loss: 0.238422... Val Loss: 0.219296
Epoch: 2/2... Step: 11300... Loss: 0.208327... Val Loss: 0.225786
Epoch: 2/2... Step: 11400... Loss: 0.263808... Val Loss: 0.225351
Epoch: 2/2... Step: 11500... Loss: 0.243468... Val Loss: 0.217430
Epoch: 2/2... Step: 11600... Loss: 0.246750... Val Loss: 0.222807
Epoch: 2/2

In [54]:
# Loading the best model
model.load_state_dict(torch.load('./state_dict.pt'))

test_losses = []
num_correct = 0
h = model.init_hidden(BATCH_SIZE)

model.eval()
for inputs, labels in test_loader:
    h = tuple([each.data for each in h])
    inputs, labels = inputs.to(device), labels.to(device)
    output, h = model(inputs, h)
    
    # Compute for losses
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # Compute Accuracy
    pred = torch.round(output.squeeze())  # Rounds the output to 0/1
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    
    num_correct += np.sum(correct)

In [55]:
print("Test Loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test Accuracy: {:.3f}%".format(test_acc*100))

Test Loss: 0.211
Test Accuracy: 91.692%


In [57]:
for inputs, labels in test_loader:
    print(inputs.size())
    break

torch.Size([100, 200])


In [98]:
def predict(text):
    messages = text
    for i, sentence in enumerate(messages):
        # For test sentences, we have to tokenize the sentences as well
        messages[i] = [word2idx[word.lower()] if word in word2idx else 0 for word in nltk.regexp_tokenize(sentence, pattern="\s+", gaps = True)]
    messages = pad_input(messages, 200)
    h = model.init_hidden(1)
    for msg in messages:
        h = tuple([each.data for each in h])
        msg = torch.from_numpy(msg).to(device).unsqueeze(0)
        output, h = model(msg, h)
        pred = torch.round(output.squeeze())
        if pred.item() == 0.0:
            return 'Negative Review'
        else:
            return 'Positive Review'
        return pred.item()


# Predicting Sentiment

In [107]:
messages = [
    "The item was good and the seller, Kristian Espina, is also responsive to my inquiries."
]
print(f'Predicted Sentiment: {predict(messages)}')

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Predicted Sentiment: Positive Review


In [108]:
messages = [
"The shoes was good but Kristian Espina, the seller, was not responsive. The shipping time is also very slow"
]
print(f'Predicted Sentiment: {predict(messages)}')

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Predicted Sentiment: Negative Review
