In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from string import punctuation

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn

import os

In [2]:

with open('aclImdb/train/trainReviews.txt', 'r', encoding="utf8") as f:
     reviews_train = f.read()
labels_train = pd.read_csv('aclImdb/train/IMDBValuesTrain.txt', sep='delimiter', names=['Score'], engine = 'python')


# tockenizing training data 
reviews_train = reviews_train.lower()

all_text_train = ''.join([c for c in reviews_train if c not in punctuation])

reviews_split_train = all_text_train.split('\n')

labels_train = labels_train.to_numpy()

all_text2_train = ' '.join(reviews_split_train)

words_train = all_text2_train.split()

count_words_train = Counter(words_train)

total_words_train = len(words_train)
sorted_words_train = count_words_train.most_common(total_words_train)

vocab_to_int_train = {w:i+1 for i, (w,c) in enumerate(sorted_words_train)}


# Change Review Score to Binary Score
encoded_labels_train = [0 if label <= 5 else 1 for label in labels_train]
encoded_labels_train = np.array(encoded_labels_train)

reviews_int_train = []
for reviews in reviews_split_train:
    r = [vocab_to_int_train[w] for w in reviews.split()]
    reviews_int_train.append(r)
    

reviews_len_train = [len(x) for x in reviews_int_train]
reviews_int_train = [reviews_int_train[i] for i, l in enumerate(reviews_len_train) if l > 0]   #do we need this? 
encoded_labels_train = [encoded_labels_train[i] for i, l in enumerate(reviews_len_train) if l > 0]

In [3]:

with open('aclImdb/test/testReviews.txt', 'r', encoding="utf8") as f:
    reviews_test = f.read()
labels_test = pd.read_csv('aclImdb/test/IMDBValuesTest.txt', sep='delimiter', names=['Score'], engine = 'python')


# tockenizing test data 
reviews_test = reviews_test.lower()

all_text_test = ''.join([c for c in reviews_test if c not in punctuation])

reviews_split_test = all_text_test.split('\n')

labels_test = labels_test.to_numpy()

all_text2_test = ' '.join(reviews_split_test)

words_test = all_text2_test.split()

count_words_test = Counter(words_test)

total_words_test = len(words_test)
sorted_words_test = count_words_test.most_common(total_words_test)

vocab_to_int_test = {w:i+1 for i, (w,c) in enumerate(sorted_words_test)}


# Change Review Score to Binary Score
encoded_labels_test = [0 if label <= 5 else 1 for label in labels_test]
encoded_labels_test = np.array(encoded_labels_test)

reviews_int_test = []
for reviews in reviews_split_test:
    r = [vocab_to_int_test[w] for w in reviews.split()]
    reviews_int_test.append(r)
    

reviews_len_test = [len(x) for x in reviews_int_test]
reviews_int_test = [reviews_int_test[i] for i, l in enumerate(reviews_len_test) if l > 0]   # remove outliers
encoded_labels_test = [encoded_labels_test[i] for i, l in enumerate(reviews_len_test) if l > 0]

In [4]:
seq_length = 200
def pad_features(reviews_int, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's or truncated to the input seq_length.
    '''
    features = np.zeros((len(reviews_int), seq_length), dtype = int)
    
    for i, review in enumerate(reviews_int):
        review_len = len(review)
        
        if review_len <= seq_length:
            zeroes = list(np.zeros(seq_length-review_len))
            new = zeroes+review
        elif review_len > seq_length:
            new = review[0:seq_length]
        
        features[i,:] = np.array(new)
    
    return features

features_train = pad_features(reviews_int_train, seq_length)
features_test = pad_features(reviews_int_test, seq_length)

In [5]:
split_frac = 0.9
len_feat = len(features_test)

remaining_x = features_test[int(split_frac*len_feat):]
remaining_y = encoded_labels_test[int(split_frac*len_feat):]

valid_x = np.array(remaining_x[0:int(len(remaining_x)*0.5)])
valid_y = np.array(remaining_y[0:int(len(remaining_y)*0.5)])

In [6]:
# create Tensor datasets

# convert features and encoded_labels to np array for torch library 
features_train = np.array(features_train); encoded_labels_train = np.array(encoded_labels_train)
features_test = np.array(features_test); encoded_labels_test = np.array(encoded_labels_test)

train_data = TensorDataset(torch.from_numpy(features_train), torch.from_numpy(encoded_labels_train))
test_data = TensorDataset(torch.from_numpy(features_test), torch.from_numpy(encoded_labels_test))
valid_data = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))

# Dataloaders - Shuffle Data
batch_size = 50
                           
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[    0,     0,     0,  ...,   625,     1,  1434],
        [    0,     0,     0,  ..., 13198,    12,  1816],
        [    3,   579,     4,  ..., 48030,  2596,  1754],
        ...,
        [    0,     0,     0,  ...,    57,    64, 13732],
        [    0,     0,     0,  ..., 28229, 55287, 43161],
        [    0,     0,     0,  ...,     3,  1378,  3824]], dtype=torch.int32)

Sample label size:  torch.Size([50])
Sample label: 
 tensor([0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
        0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
        1, 0], dtype=torch.int32)


In [7]:
class SentimentLSTM(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.5)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [21]:
# Instantiate the model w/ hyperparams
vocab_size = len(vocab_to_int_train)+1 # +1 for the 0 padding
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2
net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers) # call our RNN pytorch module

In [22]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


# training params

epochs = 3 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    net.cuda()
    device = "cuda:0"
    net = net.to(device)


net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        inputs = inputs.type(torch.LongTensor)
        inputs = inputs.to(device)
        output, h = net(inputs, h)
        output = output.to(device)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                inputs = inputs.type(torch.LongTensor)
                inputs = inputs.to(device)
                output, val_h = net(inputs, val_h)
                output = output.to(device)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/3... Step: 100... Loss: 0.655830... Val Loss: 0.597463
Epoch: 1/3... Step: 200... Loss: 0.719412... Val Loss: 0.468063
Epoch: 1/3... Step: 300... Loss: 0.765935... Val Loss: 1.267863
Epoch: 1/3... Step: 400... Loss: 0.752900... Val Loss: 1.212787
Epoch: 1/3... Step: 500... Loss: 0.360630... Val Loss: 0.296609
Epoch: 2/3... Step: 600... Loss: 0.543922... Val Loss: 1.133109
Epoch: 2/3... Step: 700... Loss: 0.610358... Val Loss: 0.425605
Epoch: 2/3... Step: 800... Loss: 0.328867... Val Loss: 0.420704
Epoch: 2/3... Step: 900... Loss: 0.520030... Val Loss: 1.077079
Epoch: 2/3... Step: 1000... Loss: 0.322840... Val Loss: 0.974163
Epoch: 3/3... Step: 1100... Loss: 0.198299... Val Loss: 0.960107
Epoch: 3/3... Step: 1200... Loss: 0.195233... Val Loss: 1.189246
Epoch: 3/3... Step: 1300... Loss: 0.329660... Val Loss: 1.522535
Epoch: 3/3... Step: 1400... Loss: 0.237844... Val Loss: 1.249859
Epoch: 3/3... Step: 1500... Loss: 0.393153... Val Loss: 1.536370


In [23]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    inputs = inputs.type(torch.LongTensor)
    inputs = inputs.to(device)
    output, h = net(inputs, h)
    output = output.to(device)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 1.048
Test accuracy: 0.541
