In [1]:
from collections import Counter
import os
import string
import re
from tqdm import tqdm

import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords 
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import preprocessor as pre

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xbbncc8/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Configuration: every tunable value read by the cells below lives in this dict.
config = {
    'smoke_test_size': 500,   # Length of training set. 0 for all reviews.
    'epochs': 4,              # Total number of epochs
    'batch_size': 100,        # Number of reviews per mini-batch
    'training_dim': 200,      # Number of tokens (words) to put into each review.
    'vocab_size': 7000,       # Vocabulary size
    'output_dim': 1,          # Width of the model's final linear layer
    'embedding_dim': 400,     # Size of each word embedding (also the LSTM input size)
    'hidden_dim': 256,        # LSTM hidden state size
    'n_layers': 2,            # Number of stacked LSTM layers
    'dropout_prob': 0.3,      # Dropout probability handed to the model
    'lr': 0.001,              # Adam learning rate
    'grad_clip': 5            # Max gradient norm used when clipping
}

# Later cells look these values up under different names (`tokenize` reads
# 'sequence_len', the model-construction cell reads 'output_size'). Derive the
# aliases from the canonical keys so the two spellings can never drift apart.
config['sequence_len'] = config['training_dim']
config['output_size'] = config['output_dim']


In [3]:
# Choose the compute device once; later cells move the model and every batch onto it.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')
print('GPU is available.' if is_cuda else 'GPU not available.')

GPU not available.


In [4]:
# Load the IMDB reviews CSV from ./aclImdb relative to the current working directory.
data_file = os.path.join(os.getcwd(), 'aclImdb', 'IMDB Dataset.csv')
df = pd.read_csv(data_file)

X, y = df['review'].values, df['sentiment'].values

# A fixed seed makes the partition (and therefore every metric reported further
# down) reproducible across Restart & Run All.
split_seed = 42

# 50 % of the reviews are held out for testing; the remaining half is split 80/20
# into training and validation. Both splits are stratified on the sentiment label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.5, stratify=y, random_state=split_seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, train_size=.8, stratify=y_train, random_state=split_seed)

In [None]:
# NOTE(review): this cell has no execution count and rebinds the X_train / y_train /
# X_valid / y_valid / X_test / y_test names already assigned by the train_test_split
# cell. What the `pre` helpers return is not visible from this notebook, so confirm
# whether this cell or the inline split + tokenize path is the intended data source
# before running both.
X_train, y_train, X_valid, y_valid = pre.preprocess_train_valid_data(config)
X_test, y_test = pre.get_test_data()

In [5]:
# Report the size of each split (the second line describes the validation split).
print(f'shape of train data is {X_train.shape}')
print(f'shape of valid data is {X_valid.shape}')
print(f'shape of test data is {X_test.shape}')

shape of train data is (20000,)
shape of train data is (5000,)
shape of test data is (25000,)


In [None]:
# Class balance of the training labels.
# value_counts() orders by frequency, so pairing it with hard-coded x labels can
# swap the bars. Sorting by label puts 'negative' before 'positive' for the string
# labels read from the CSV.
# NOTE(review): if y_train holds 0/1 instead, this assumes 0 means negative — confirm.
dd = pd.Series(y_train).value_counts().sort_index()
ax = sns.barplot(x=np.array(['negative','positive']), y=dd.values)
ax.set(xlabel='sentiment', ylabel='number of reviews')
plt.show()

In [6]:
def preprocess_string(s):
    """Strip punctuation, whitespace and digits from ``s`` and return what is left.

    Three regex substitutions run in order, each replacing its matches with the
    empty string:

    1. any character that is neither a word character (``\\w``, which includes
       the underscore) nor whitespace,
    2. every run of whitespace,
    3. every digit.
    """
    for pattern in (r'[^\w\s]', r'\s+', r'\d'):
        s = re.sub(pattern, '', s)
    return s

def pad(X, sequence_len):
    """Return a ``(len(X), sequence_len)`` integer array built from token-id lists.

    Reviews shorter than ``sequence_len`` are left-padded with zeros; longer
    reviews keep only their first ``sequence_len`` ids. Empty reviews stay all
    zeros.
    """
    padded = np.zeros((len(X), sequence_len), dtype=int)
    for row, tokens in zip(padded, X):
        if len(tokens) == 0:
            continue
        kept = np.array(tokens)[:sequence_len]
        # `row` is a view into `padded`. A negative start beyond the row length
        # clamps to 0, so an over-long review fills the whole row.
        row[-len(tokens):] = kept
    return padded

def create_tokens(X_train, config):
    """Build a word -> integer id mapping from the training reviews.

    Each review is lower-cased and split on whitespace, and every word is
    cleaned with ``preprocess_string``. Words that end up empty or that appear
    in NLTK's English stop-word list are ignored. The ``config['vocab_size'] - 1``
    most frequent remaining words receive ids 1, 2, ... in descending frequency
    order; id 0 is never assigned.
    """
    vocab_size = config['vocab_size']
    ignored = set(stopwords.words('english'))

    def cleaned_words():
        # Stream the cleaned words so the Counter sees them in corpus order.
        for review in X_train:
            for raw_word in review.lower().split():
                cleaned = preprocess_string(raw_word)
                if cleaned != '' and cleaned not in ignored:
                    yield cleaned

    frequencies = Counter(cleaned_words())
    # Stable sort: words with equal counts keep first-seen order.
    ranked = sorted(frequencies, key=frequencies.get, reverse=True)
    most_frequent = ranked[:vocab_size-1]
    return {word: rank for rank, word in enumerate(most_frequent, start=1)}

def tokenize(X, y, mapping, config):
    """Convert raw reviews and string labels into padded id arrays.

    Args:
        X: iterable of review strings.
        y: iterable of labels; ``'positive'`` maps to 1 and anything else to 0.
        mapping: word -> integer id dict; words missing from it are dropped.
        config: dict providing the padded length under ``'sequence_len'`` or,
            failing that, ``'training_dim'``.

    Returns:
        ``(features, labels)`` as NumPy arrays. ``features`` has one row of
        ``sequence_len`` ids per review, produced by ``pad``.

    Raises:
        KeyError: if ``config`` holds neither length key.
    """
    # This function reads 'sequence_len', but the notebook's config cell names
    # the same quantity 'training_dim'. Accept either instead of raising KeyError.
    sequence_len = config['sequence_len'] if 'sequence_len' in config else config['training_dim']

    new_X = []
    for entry in X:
        # Clean each word once, then keep only those present in the vocabulary.
        cleaned = (preprocess_string(word) for word in entry.lower().split())
        new_X.append([mapping[word] for word in cleaned if word in mapping])

    new_X = pad(new_X, sequence_len)
    new_y = [1 if label == 'positive' else 0 for label in y]

    return np.array(new_X), np.array(new_y)

In [7]:
# The vocabulary is built from the training reviews only, then used to encode
# all three splits. This cell rebinds the X_* / y_* names to the encoded arrays,
# so re-running it on its own would feed those arrays back into tokenize().
word_to_int_mapping = create_tokens(X_train, config)

(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = [
    tokenize(reviews, labels, word_to_int_mapping, config)
    for reviews, labels in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test))
]

In [8]:
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(X_valid), torch.from_numpy(y_valid))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

# dataloaders
batch_size = config['batch_size']

# Only the training set is shuffled. Evaluation metrics should not depend on
# batch order, so the validation and test loaders iterate in a fixed order.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [9]:
class SentimentLSTM(nn.Module):
    '''
    Sentiment classifier: embedding -> stacked LSTM -> dropout -> linear -> sigmoid.

    One sigmoid score per input sequence is produced from the LSTM output at the
    final time step.
    '''

    def __init__(self, vocab_size, output_dim, embedding_dim, hidden_dim, n_layers, batch_size, dropout_prob):
        '''
        Initialize the model and set up the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            output_dim: width of the final linear layer.
            embedding_dim: size of each embedding vector (the LSTM input size).
            hidden_dim: LSTM hidden state size.
            n_layers: number of stacked LSTM layers.
            batch_size: default batch size used by ``init_hidden``.
            dropout_prob: dropout probability applied before the linear layer.
        '''
        super(SentimentLSTM, self).__init__()

        self.output_dim = output_dim
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.dropout_prob = dropout_prob

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM Layer
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)

        # Honour the constructor argument instead of a hard-coded probability.
        self.dropout = nn.Dropout(dropout_prob)

        # Linear layer
        self.fcl = nn.Linear(hidden_dim, output_dim)

        # Sigmoid layer
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        '''
        Forward pass.

        Args:
            x: integer token ids with the batch on dimension 0 (the LSTM is
                built with ``batch_first=True``).
            hidden: ``(h0, c0)`` tuple as returned by ``init_hidden``.

        Returns:
            ``(scores, hidden)``: ``scores`` is a 1-D tensor with one sigmoid
            output per sequence, ``hidden`` is the LSTM's final state tuple.
        '''
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the last time step is ever returned, so run the classifier head
        # on that slice alone instead of on every step of every sequence.
        last_step = lstm_out[:, -1, :]

        out = self.fcl(self.dropout(last_step))
        sig_out = self.sig(out)

        # Keep the last output unit of each sample -> shape (batch_size,)
        sig_out = sig_out.view(batch_size, -1)[:, -1]

        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size=None):
        '''
        Initializes hidden state.

        Creates two zero tensors of size n_layers x batch_size x hidden_dim for
        the LSTM's hidden state and cell state, on the same device as the
        model's parameters.

        Args:
            batch_size: number of sequences in the batch; when omitted (or
                falsy) the batch size given to the constructor is used.

        Returns:
            ``(h0, c0)`` tuple.
        '''
        if not batch_size:
            batch_size = self.batch_size

        # Follow the model's own parameters rather than a notebook-level global,
        # so the state always lives where the weights do.
        param_device = next(self.parameters()).device
        state_shape = (self.n_layers, batch_size, self.hidden_dim)

        h0 = torch.zeros(state_shape, device=param_device)
        c0 = torch.zeros(state_shape, device=param_device)
        hidden = (h0,c0)
        return hidden


In [12]:
batch_size = config['batch_size']
n_layers = config['n_layers']
vocab_size = config['vocab_size']
embedding_dim = config['embedding_dim']
hidden_dim = config['hidden_dim']

# The config cell spells the output width 'output_dim' and has no dropout entry.
# Accept either spelling, and fall back to 0.3, the dropout probability shown in
# the model summary printed by this cell.
output_size = config['output_size'] if 'output_size' in config else config['output_dim']
dropout_prob = config.get('dropout_prob', 0.3)

model = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, batch_size, dropout_prob)

#moving to gpu
model.to(device)

print(model)

SentimentLSTM(
  (embedding): Embedding(7000, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fcl): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [13]:
# Training objective and optimiser: binary cross-entropy on the model's sigmoid
# outputs, minimised with Adam at the configured learning rate.
lr = config['lr']
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

# Accuracy function
def acc(pred,label):
    """Return how many rounded predictions equal their labels (a count, not a ratio)."""
    predicted_classes = pred.squeeze().round()
    matches = predicted_classes.eq(label.squeeze())
    return matches.sum().item()

In [14]:
grad_clip = config['grad_clip']
epochs = config['epochs']

# np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0.
# (This value is initialised here but not updated anywhere in this cell.)
valid_loss_min = np.inf
training_loss_by_epoch, valid_loss_by_epoch = [],[]
training_acc_by_epoch, valid_acc_by_epoch = [],[]

for epoch in range(epochs):
    # ---- training pass ----
    train_losses = []
    train_acc = 0.0
    model.train()
    for inputs, labels in train_loader:

        # If system has a GPU this will move the data to the GPU's memory.
        inputs, labels = inputs.to(device), labels.to(device)

        # Each batch holds unrelated, shuffled reviews, so start from a fresh
        # zero state sized to this batch. That avoids carrying state between
        # unrelated reviews and keeps a smaller final batch from failing on a
        # hidden-state shape mismatch.
        h = model.init_hidden(inputs.size(0))

        model.zero_grad()
        output, h = model(inputs, h)

        # The model already returns a 1-D tensor of length batch, so no
        # squeeze() is needed (it would collapse a batch of one to a scalar).
        loss = criterion(output, labels.float())
        loss.backward()

        train_losses.append(loss.item())

        # acc() returns a count of correct predictions; normalised after the loop
        accuracy = acc(output,labels)
        train_acc += accuracy

        #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()

    # ---- validation pass ----
    val_losses = []
    val_acc = 0.0
    model.eval()
    # No gradients are needed for evaluation; skipping the autograd graph saves
    # memory and time.
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            val_h = model.init_hidden(inputs.size(0))
            output, val_h = model(inputs, val_h)
            val_loss = criterion(output, labels.float())

            val_losses.append(val_loss.item())

            accuracy = acc(output,labels)
            val_acc += accuracy

    # ---- per-epoch metrics ----
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_acc = train_acc / len(train_loader.dataset)
    epoch_val_acc = val_acc / len(valid_loader.dataset)

    training_loss_by_epoch.append(epoch_train_loss)
    valid_loss_by_epoch.append(epoch_val_loss)
    training_acc_by_epoch.append(epoch_train_acc)
    valid_acc_by_epoch.append(epoch_val_acc)

    print(f'Epoch {epoch+1}') 
    print(f'Training Loss: {epoch_train_loss} Validation Loss: {epoch_val_loss}')
    print(f'Training Accuracy: {epoch_train_acc*100} Validation Accuracy: {epoch_val_acc*100}')

    print(25*'==')
    

Epoch 1
train_loss : 0.5220129236578941 val_loss : 0.4340591138601303
train_accuracy : 74.75500000000001 val_accuracy : 82.39999999999999


FileNotFoundError: [Errno 2] No such file or directory: '../working/state_dict.pt'