# LSTM model

Import the necessary libraries

In [None]:
import torch
from matplotlib import pyplot as plt
import numpy as np
import random
import pickle
import os
import nltk
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import tensorflow as tf 
import string
import re
import pandas as pd 
from collections import Counter
from torch.utils.data import DataLoader, TensorDataset
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, pos_tag
from sklearn.model_selection import train_test_split
from google.colab import drive
import nltk
nltk.download('omw-1.4')
drive.mount('/content/drive')
!ls '/content/drive/Shareddrives/Deep Learning/DeepLearning_2022/Final project/Data/'
myDrive = '/content/drive/Shareddrives/Deep Learning/DeepLearning_2022/Final project/Data/'
results_path = '/content/drive/Shareddrives/Deep Learning/DeepLearning_2022/Final project/Results/'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


Mounted at /content/drive
 all_words.txt	 data_clean.csv		 'IMDB Dataset.csv'
 data_1		 data_no_stop_words.csv   sorted_words.txt


Import data

In [None]:
# Load the cleaned reviews; the cells below use its 'review' and 'sentiment' columns.
data = pd.read_csv(f'{myDrive}data_clean.csv')

# Read the word list through a context manager so the file handle is closed
# as soon as its contents are in memory.
with open(f'{myDrive}all_words.txt', 'r') as file1:
  all_words = file1.read().splitlines()

# NOTE(review): sorted_words.txt is opened but never read in the visible cells.
# The handle is kept only in case a later cell still reads from `file2`;
# if nothing does, this line can be removed.
file2 = open(f'{myDrive}sorted_words.txt', 'r')

# Rank the words by frequency and give the most frequent word id 1.
# Id 0 is left unused here so it can serve as the padding value later on.
count_words = Counter(all_words)
sorted_words = count_words.most_common(len(all_words))
vocab_to_int = {w: i+1 for i, (w, c) in enumerate(sorted_words)}


In [None]:
# Map the sentiment labels to integers: "negative" -> 0, anything else -> 1.
# An already-encoded 0 is kept as 0, so re-running this cell does not silently
# turn every label into 1.
data.sentiment = [0 if each in ("negative", 0) else 1 for each in data.sentiment]

Encode the words of the reviews to integers using vocab_to_int dictionary

In [None]:
def encode_review(review, vocab=None):
  """Encode a review as a list of integer word ids.

  Args:
    review: Review text whose words are separated by single spaces.
    vocab: Optional word -> id mapping. When omitted, the notebook-level
      ``vocab_to_int`` dictionary is used.

  Returns:
    A list holding one id per token produced by ``review.split(' ')``.

  Raises:
    KeyError: If a token is missing from the mapping.
  """
  if vocab is None:
    vocab = vocab_to_int
  return [vocab[word] for word in review.split(' ')]

In [None]:
# Keep a copy of the frame as it was before the text reviews are overwritten.
data2 = data.copy()
# Replace each review string by its list of word ids, then store how many
# ids each review contains.
data['review'] = data['review'].apply(encode_review)
data['review_length'] = data['review'].apply(len)

All input sequences must have the same length, so we prepare two variants of the data:
1. Compute the average review length (in words) and truncate or pad every review to that length.
2. Compute the maximum review length and pad every shorter review up to that length.

In [None]:
review_lengths = data['review_length']
# Average review length (in tokens), truncated to an integer.
sequence_length = int(review_lengths.sum() / data.shape[0])
# Length of the longest review.
sequence_length_max = int(review_lengths.max())
print(sequence_length)

def truncate_or_pad(review, length=None):
  """Force a list of word ids to contain exactly ``length`` entries.

  Shorter reviews are left-padded with zeros; longer reviews keep only
  their first ``length`` ids.

  Args:
    review: List of integer word ids.
    length: Target length. When omitted, the notebook-level
      ``sequence_length`` is used.

  Returns:
    A new list of ``length`` ids.
  """
  if length is None:
    length = sequence_length
  missing = length - len(review)
  if missing > 0:
    # Pad with integer zeros (not np.zeros floats) so the sequence keeps a
    # single element type.
    return [0] * missing + review
  return review[:length]

def pad(review, length=None):
  """Left-pad a list of word ids with zeros up to ``length`` entries.

  Reviews that already have ``length`` or more ids are returned unchanged
  (the same list object, no truncation).

  Args:
    review: List of integer word ids.
    length: Minimum length of the result. When omitted, the notebook-level
      ``sequence_length_max`` is used.

  Returns:
    The padded list, or ``review`` itself when no padding is needed.
  """
  if length is None:
    length = sequence_length_max
  missing = length - len(review)
  if missing > 0:
    # Pad with integer zeros (not np.zeros floats) so the sequence keeps a
    # single element type.
    return [0] * missing + review
  return review

# Build the two fixed-length variants: `data` is truncated/padded to the
# average length, `data_pad` is padded up to the longest review.
data_pad = data.copy()
data['review'] = data['review'].apply(truncate_or_pad)
data_pad['review'] = data_pad['review'].apply(pad)

118


Store the reviews in a matrix that will be the input data of our model

In [None]:
# One row per review, one column per token position (average-length variant).
n_reviews = data.shape[0]
data_matrix = np.zeros((n_reviews, sequence_length), dtype=int)
for row_idx, tokens in enumerate(data['review']):
  data_matrix[row_idx] = np.asarray(tokens)
print(data_matrix[:15])

[[    3  1020   326 ...    23  1356    12]
 [    0     0     0 ...  1816  1676    19]
 [    0     0     0 ...    13     6   122]
 ...
 [ 3771   105   110 ...   631     2   546]
 [    0     0     0 ...    23    13   341]
 [    0     0     0 ...  1104 11136  6458]]
85


In [None]:
# One row per review, one column per token position (maximum-length variant).
n_reviews_pad = data_pad.shape[0]
data_matrix_pad = np.zeros((n_reviews_pad, sequence_length_max), dtype=int)
for row_idx, tokens in enumerate(data_pad['review']):
  data_matrix_pad[row_idx] = np.asarray(tokens)
print(data_matrix_pad[:15])

[[    0     0     0 ...   469  3431   323]
 [    0     0     0 ...  1816  1676    19]
 [    0     0     0 ...    13     6   122]
 ...
 [    0     0     0 ...   179    90   160]
 [    0     0     0 ...    23    13   341]
 [    0     0     0 ...  1104 11136  6458]]
85


# Create and train LSTM model 

Now, split into train and test datasets

In [None]:
# Hold out 30% of the reviews for testing. The fixed seed makes the split
# (and therefore the reported accuracy) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data_matrix_pad, list(data_pad.sentiment), test_size=0.3, random_state=42)

In [None]:
# Wrap features and labels as float tensors. The training and test loops cast
# the review ids back to integer tensors before feeding the model.
train_features, train_labels = torch.FloatTensor(X_train), torch.FloatTensor(y_train)
test_features, test_labels = torch.FloatTensor(X_test), torch.FloatTensor(y_test)
train_data = TensorDataset(train_features, train_labels)
test_data = TensorDataset(test_features, test_labels)

Divide the data into batches

In [None]:
# Mini-batch size shared by both loaders.
batch_size = 50
# Both loaders reshuffle their samples on every pass.
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

Let's define our LSTM neural network

In [None]:
class SentimentalLSTM(nn.Module):
    """Embedding -> stacked LSTM -> three linear layers -> sigmoid.

    The forward pass returns one value in [0, 1] per sequence of the batch,
    computed from the LSTM output at the last time step.

    Args:
        vocab_size: Number of rows of the embedding table.
        output_size: Number of units of the last linear layer.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)

        # Dropout layer applied before each linear layer.
        # NOTE(review): this rate is fixed at 0.5 and does not follow drop_prob.
        self.dropout = nn.Dropout(0.5)

        # Linear and sigmoid layers
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.fc2 = nn.Linear(64, 16)
        self.fc3 = nn.Linear(16, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """Perform a forward pass of the model on some input and hidden state.

        Args:
            x: Integer tensor of word ids, indexed as (batch, time step)
                because the LSTM is built with ``batch_first=True``.
            hidden: Tuple ``(h_0, c_0)`` as returned by ``init_hidden``.

        Returns:
            A tuple ``(sig_out, hidden)``: ``sig_out`` holds one sigmoid
            output per sequence (the last output unit at the last time
            step) and ``hidden`` is the LSTM state after the batch.
        """
        # Number of sequences in the batch. x.size() without an index returns
        # the full torch.Size, which cannot be combined with -1 in view().
        batch_size = x.size(0)

        # Embedding and LSTM output
        embedd = self.embedding(x)
        lstm_out, hidden = self.lstm(embedd, hidden)

        # Only the last time step reaches the returned value, so the dense head
        # is applied to that step alone instead of to every step of the sequence.
        last_out = lstm_out[:, -1, :]

        # Dropout and fully connected layers
        out = self.dropout(last_out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.dropout(out)
        out = self.fc3(out)
        sig_out = self.sigmoid(out)

        # Keep the last output unit of each sequence -> shape (batch,)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]

        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Create the zero-initialised hidden and cell states of the LSTM.

        Args:
            batch_size: Number of sequences in the batch the state is for.

        Returns:
            A tuple of two zero tensors of size
            n_layers x batch_size x hidden_dim (hidden state, cell state),
            with the dtype and device of the model parameters.
        """
        weight = next(self.parameters()).data
        shape = (self.n_layers, batch_size, self.hidden_dim)
        # new_zeros follows the device of the parameters, so the state matches
        # the model on GPU and on CPU alike (no unconditional .cuda()).
        return (weight.new_zeros(shape), weight.new_zeros(shape))

The following function will be used to train the model using train dataset

In [None]:
def train_loop(model, model_name = 'model.ckpt', device = 'cuda'):
    """Train ``model`` on ``train_loader`` with binary cross-entropy loss.

    Relies on notebook-level names: ``train_loader``, ``num_epochs``,
    ``optimizer`` and ``results_path``. After every epoch the model weights
    are written to ``results_path + model_name`` (overwriting the file).

    Args:
        model: Network exposing ``init_hidden(batch_size)`` and a forward
            pass ``model(review, hidden) -> (outputs, hidden)``.
        model_name: File name of the checkpoint inside ``results_path``.
        device: Device the batches are moved to; the model is expected to
            already live there.

    Returns:
        A list with the mean training loss of each epoch.
    """
    model.train()
    total_step = len(train_loader)
    losses_list = []
    criterion = nn.BCELoss()

    for epoch in range(num_epochs):
        loss_avg = 0
        nBatches = 0
        # The hidden state is created on the first batch of the epoch.
        h = None
        # TRAINING LOOP
        for i, (review, sentiment) in enumerate(train_loader):
            review = review.type(torch.LongTensor).to(device)
            sentiment = sentiment.to(device)

            current_batch = review.size(0)
            if h is None or h[0].size(1) != current_batch:
                # Fresh zero state at the start of an epoch, and whenever the
                # batch is smaller than the previous one (last batch of a
                # dataset whose size is not a multiple of the batch size).
                h = model.init_hidden(current_batch)
            else:
                # Detach the carried state so backpropagation stops at the
                # batch boundary.
                # NOTE(review): the state is carried between shuffled batches,
                # as in the original code; confirm this is intended.
                h = tuple(each.data for each in h)

            # Forward pass
            outputs, h = model(review, h)
            # Binary cross-entropy loss
            loss = criterion(outputs, sentiment)

            # Backpropagate
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_avg += loss.cpu().item()
            nBatches += 1
            if (i+1) % 200 == 0:
                print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                       .format(epoch+1, num_epochs, i+1, total_step, loss_avg / nBatches))

        # max(..., 1) keeps the summary defined when the loader yields no batch.
        epoch_loss = loss_avg / max(nBatches, 1)
        print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                       .format(epoch+1, num_epochs, nBatches, total_step, epoch_loss))
        losses_list.append(epoch_loss)
        torch.save(model.state_dict(), results_path+model_name)

    return losses_list


Test function to run after training: it loads the saved checkpoint and evaluates it on the test dataset

In [None]:
#Load and test function
def load_and_test(model, device='cuda', model_name = 'model.ckpt'):
  """Load a checkpoint into ``model`` and print its accuracy on ``test_loader``.

  Relies on notebook-level names: ``test_loader`` and ``results_path``.

  Args:
    model: Network exposing ``init_hidden(batch_size)`` and a forward pass
      ``model(review, hidden) -> (outputs, hidden)``.
    device: Device the batches are moved to and the checkpoint is mapped to;
      the model is expected to already live there.
    model_name: File name of the checkpoint inside ``results_path``.

  Returns:
    The test accuracy as a percentage (NaN when the loader yields no sample).
  """
  # map_location lets a checkpoint saved on GPU be restored on another device.
  model.load_state_dict(torch.load(results_path+model_name, map_location=device))
  # Test the model
  model.eval() # Set the model in evaluation mode

  correct = 0
  total = 0
  h = None
  # Compute testing accuracy
  with torch.no_grad():
    for review, sentiment in test_loader:
      review = review.type(torch.LongTensor).to(device)
      sentiment = sentiment.to(device)

      current_batch = review.size(0)
      if h is None or h[0].size(1) != current_batch:
        # Fresh zero state on the first batch and whenever the batch size
        # changes (smaller last batch).
        h = model.init_hidden(current_batch)
      else:
        h = tuple(each.data for each in h)

      # get network predictions
      outputs, h = model(review, h)

      # get predicted class (threshold the sigmoid output at 0.5)
      predicted = torch.round(outputs)
      # compare with the ground-truth
      total += sentiment.size(0)
      correct += (predicted == sentiment).sum().item()

  accuracy = 100 * correct / total if total else float('nan')
  print('Test Accuracy of the model: {} %'.format(accuracy))
  return accuracy

Instantiation of the model with hyperparams:

In [None]:
# Rows of the embedding table: one per word id (ids start at 1) plus row 0,
# which is the value used for padding. Derived from the vocabulary instead of
# a hard-coded count so it cannot drift from the data.
vocab_size = len(vocab_to_int) + 1
# One output unit: the sigmoid score of the review.
output_size = 1
# Size of each word embedding vector.
embedding_dim = 100
# Size of the LSTM hidden state.
hidden_dim = 256
# Number of stacked LSTM layers.
n_layers = 2

Checking different learning rates for the Adam optimizer

In [None]:
num_epochs = 6
# Device configuration (choose GPU if it is available). It does not depend on
# the learning rate, so it is resolved once and then actually used below.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

for lr_i in [0.001, 0.0005, 0.002]:
  # Label the run so the logs of the different learning rates can be told apart.
  print('Adam, lr = {}'.format(lr_i))
  net = SentimentalLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
  net = net.to(device)
  optimizer = torch.optim.Adam(net.parameters(), lr = lr_i)

  # NOTE: every run writes the same checkpoint file, so only the weights of the
  # last learning rate remain on disk after the sweep.
  train_loop(net, model_name = 'model_4.ckpt', device = device)
  load_and_test(net, device = device, model_name = 'model_4.ckpt')

Epoch [1/6], Step [200/700], Loss: 0.6488
Epoch [1/6], Step [400/700], Loss: 0.6151
Epoch [1/6], Step [600/700], Loss: 0.6418
Epoch [1/6], Step [700/700], Loss: 0.6492
Epoch [2/6], Step [200/700], Loss: 0.6860
Epoch [2/6], Step [400/700], Loss: 0.6406
Epoch [2/6], Step [600/700], Loss: 0.5756
Epoch [2/6], Step [700/700], Loss: 0.5499
Epoch [3/6], Step [200/700], Loss: 0.3475
Epoch [3/6], Step [400/700], Loss: 0.3327
Epoch [3/6], Step [600/700], Loss: 0.3270
Epoch [3/6], Step [700/700], Loss: 0.3232
Epoch [4/6], Step [200/700], Loss: 0.2342
Epoch [4/6], Step [400/700], Loss: 0.2345
Epoch [4/6], Step [600/700], Loss: 0.2317
Epoch [4/6], Step [700/700], Loss: 0.2308
Epoch [5/6], Step [200/700], Loss: 0.1700
Epoch [5/6], Step [400/700], Loss: 0.1728
Epoch [5/6], Step [600/700], Loss: 0.1725
Epoch [5/6], Step [700/700], Loss: 0.1739
Epoch [6/6], Step [200/700], Loss: 0.1287
Epoch [6/6], Step [400/700], Loss: 0.1236
Epoch [6/6], Step [600/700], Loss: 0.1272
Epoch [6/6], Step [700/700], Loss:

Checking different learning rates for the SGD optimizer with momentum

In [None]:
# Device configuration (choose GPU if it is available). It does not depend on
# the learning rate, so it is resolved once and then actually used below.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

for lr_i in [0.001, 0.01, 0.05, 0.06]:
  # Label the run so the logs of the different learning rates can be told apart.
  print('SGD (momentum = 0.9), lr = {}'.format(lr_i))
  net = SentimentalLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
  net = net.to(device)
  optimizer = torch.optim.SGD(net.parameters(), lr=lr_i, momentum=0.9)

  # NOTE: every run writes the same checkpoint file, so only the weights of the
  # last learning rate remain on disk after the sweep.
  train_loop(net, model_name = 'model_4.ckpt', device = device)
  load_and_test(net, device = device, model_name = 'model_4.ckpt')

Epoch [1/6], Step [200/700], Loss: 0.6952
Epoch [1/6], Step [400/700], Loss: 0.6945
Epoch [1/6], Step [600/700], Loss: 0.6943
Epoch [1/6], Step [700/700], Loss: 0.6942
Epoch [2/6], Step [200/700], Loss: 0.6932
Epoch [2/6], Step [400/700], Loss: 0.6933
Epoch [2/6], Step [600/700], Loss: 0.6934
Epoch [2/6], Step [700/700], Loss: 0.6934
Epoch [3/6], Step [200/700], Loss: 0.6933
Epoch [3/6], Step [400/700], Loss: 0.6934
Epoch [3/6], Step [600/700], Loss: 0.6933
Epoch [3/6], Step [700/700], Loss: 0.6932
Epoch [4/6], Step [200/700], Loss: 0.6934
Epoch [4/6], Step [400/700], Loss: 0.6933
Epoch [4/6], Step [600/700], Loss: 0.6933
Epoch [4/6], Step [700/700], Loss: 0.6934
Epoch [5/6], Step [200/700], Loss: 0.6931
Epoch [5/6], Step [400/700], Loss: 0.6932
Epoch [5/6], Step [600/700], Loss: 0.6933
Epoch [5/6], Step [700/700], Loss: 0.6933
Epoch [6/6], Step [200/700], Loss: 0.6931
Epoch [6/6], Step [400/700], Loss: 0.6932
Epoch [6/6], Step [600/700], Loss: 0.6931
Epoch [6/6], Step [700/700], Loss:

In [None]:
# Device configuration (choose GPU if it is available). It does not depend on
# the momentum value, so it is resolved once and then actually used below.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

for mom in [0.8, 0.85, 0.89, 0.95]:
  # Label the run so the logs of the different momentum values can be told apart.
  print('SGD (lr = 0.05), momentum = {}'.format(mom))
  net = SentimentalLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
  net = net.to(device)
  optimizer = torch.optim.SGD(net.parameters(), lr=0.05, momentum=mom)

  # NOTE: every run writes the same checkpoint file, so only the weights of the
  # last momentum value remain on disk after the sweep.
  train_loop(net, model_name = 'model_4.ckpt', device = device)
  load_and_test(net, device = device, model_name = 'model_4.ckpt')

Epoch [1/6], Step [200/700], Loss: 0.6941
Epoch [1/6], Step [400/700], Loss: 0.6932
Epoch [1/6], Step [600/700], Loss: 0.6929
Epoch [1/6], Step [700/700], Loss: 0.6919
Epoch [2/6], Step [200/700], Loss: 0.6705
Epoch [2/6], Step [400/700], Loss: 0.6590
Epoch [2/6], Step [600/700], Loss: 0.6531
Epoch [2/6], Step [700/700], Loss: 0.6523
Epoch [3/6], Step [200/700], Loss: 0.6161
Epoch [3/6], Step [400/700], Loss: 0.6089
Epoch [3/6], Step [600/700], Loss: 0.6089
Epoch [3/6], Step [700/700], Loss: 0.6059
Epoch [4/6], Step [200/700], Loss: 0.5675
Epoch [4/6], Step [400/700], Loss: 0.5702
Epoch [4/6], Step [600/700], Loss: 0.5763
Epoch [4/6], Step [700/700], Loss: 0.5769
Epoch [5/6], Step [200/700], Loss: 0.6953
Epoch [5/6], Step [400/700], Loss: 0.6944
Epoch [5/6], Step [600/700], Loss: 0.6941
Epoch [5/6], Step [700/700], Loss: 0.6940
Epoch [6/6], Step [200/700], Loss: 0.6937
Epoch [6/6], Step [400/700], Loss: 0.6933
Epoch [6/6], Step [600/700], Loss: 0.6927
Epoch [6/6], Step [700/700], Loss: