In [32]:
# choose dataset from 'NPOV', 'WNC', 'CrowS-Pairs', 'Stereo', 'Mixed'
dataset = 'Stereo'

# Imports and Set-up

In [2]:
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.utils.data.dataloader import default_collate
import numpy as np
import math
import pandas as pd
import shutil
from string import punctuation
from collections import Counter 
import os

# Evaluation
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Dataset and Dataloader

In [33]:
# import dataset
source_folder = "/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/Processed Datasets/"
df = pd.read_csv(source_folder + dataset + '.csv', delimiter=',')

In [34]:
texts = df['text'].to_numpy().ravel('F').tolist() # turn into a list
labels = df['label'].to_numpy().ravel('F').tolist() # turn into a list

In [35]:
# Remove punctuation and get the vocabulary of the dataset (i.e. all the words in the sentences)

all_texts=list()
for text in texts:
    text = text.lower()
    text = "".join([ch for ch in text if ch not in punctuation])
    all_texts.append(text)
all_text = " ".join(all_texts) # list of the texts in "texts" but without punctuation
all_words = all_text.split() # all words in "all_text" with repetitions

In [36]:
# Count all the words using Counter Method

count_words = Counter(all_words)
total_words=len(all_words)
sorted_words=count_words.most_common(total_words)

In [37]:
# Top 10 occurring words

sorted_words[:10]

[('the', 1641),
 ('a', 946),
 ('to', 734),
 ('was', 599),
 ('and', 569),
 ('is', 551),
 ('are', 461),
 ('in', 448),
 ('of', 443),
 ('he', 418)]

In [38]:
# Create a dictionary and give each work an ID which is equal to its rank of occurrence
# We reserve ID 0 for padding

vocab_to_int={w:i+1 for i,(w,c) in enumerate(sorted_words)}

In [39]:
# We express the texts as encodings (i.e. the words are replaced with their respective IDs)

encoded_texts = list()
for text in all_texts:
  encoded_text = list()
  for word in text.split():
    if word not in vocab_to_int.keys():
      #if word is not available in vocab_to_int put 0 in that place
      encoded_text.append(0)
    else:
      encoded_text.append(vocab_to_int[word])
  encoded_texts.append(encoded_text)

In [40]:
# make all encoded texts of the same length (i.e. add pre-padding)

sequence_length = 0

for text in encoded_texts:
    if len(text) > sequence_length:
        sequence_length = len(text)

features=np.zeros((len(encoded_texts), sequence_length), dtype=int)
for i, text in enumerate(encoded_texts):
  text_len=len(text)
  if (text_len<=sequence_length):
    zeros=list(np.zeros(sequence_length-text_len))
    new=zeros+text # note that we add the padding at the beginning instead of at the end
  else:
    new=text[:sequence_length]
  
  features[i,:]=np.array(new)

In [41]:
features

array([[   0,    0,    0, ...,  538,   39,   15],
       [   0,    0,    0, ...,   19,    1, 1271],
       [   0,    0,    0, ...,  617,    3,   59],
       ...,
       [   0,    0,    0, ...,   42, 4557,  355],
       [   0,    0,    0, ..., 4560,   25,  739],
       [   0,    0,    0, ..., 1984,    1,  169]])

In [42]:
# Split data into training/validation/testing --> 80/10/10

train_x=features[:int(0.8*len(features))]
train_y=labels[:int(0.8*len(features))]
valid_x=features[int(0.8*len(features)):int(0.9*len(features))]
valid_y=labels[int(0.8*len(features)):int(0.9*len(features))]
test_x=features[int(0.9*len(features)):]
test_y=labels[int(0.9*len(features)):]
print(len(train_y), len(valid_y), len(test_y))

1776 222 223


In [43]:
#create Tensor Dataset
train_data=TensorDataset(torch.FloatTensor(train_x), torch.FloatTensor(train_y))
valid_data=TensorDataset(torch.FloatTensor(valid_x), torch.FloatTensor(valid_y))
test_data=TensorDataset(torch.FloatTensor(test_x), torch.FloatTensor(test_y))

#dataloader
batch_size=50
train_loader=DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader=DataLoader(valid_data, batch_size=batch_size, shuffle=False)
test_loader=DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [44]:
# Analyse the DataLoader --> obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 40])
Sample input: 
 tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 2.6500e+02, 1.0000e+00,
         8.1500e+02],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 8.0000e+00, 1.0000e+00,
         2.1800e+02],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 2.0000e+00, 1.0600e+02,
         5.0300e+02],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 3.0800e+02, 6.0000e+00,
         2.2630e+03],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 3.1400e+03, 2.6000e+01,
         3.1410e+03],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 4.0420e+03, 8.0000e+00,
         1.8370e+03]])
Sample label size:  torch.Size([50])
Sample label: 
 tensor([0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0.,
        0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0.,
        0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0.])


# Model Architecture

In [45]:
# Create architecture

class SentimentalLSTM(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.3):
        """
        Initialize the model by setting up the layers
        """
        super().__init__()
        self.output_size=output_size
        self.n_layers=n_layers
        self.hidden_dim=hidden_dim
        
        #Embedding and LSTM layers
        self.embedding=nn.Embedding(vocab_size, embedding_dim)
        self.lstm=nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        
        #dropout layer
        self.dropout=nn.Dropout(0.3)
        
        #Linear and sigmoid layer
        self.fc1=nn.Linear(hidden_dim, output_size)
        self.sigmoid=nn.Sigmoid()
        
    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """

        x = x.to(torch.int64) # added, to solve RuntimeError
        # x = torch.tensor(x).to(torch.int64) # added, to solve RuntimeError

        batch_size=x.size()
        
        #Embedding and LSTM output
        embedd=self.embedding(x)
        lstm_out, hidden=self.lstm(embedd, hidden)
        
        #stack up the lstm output
        lstm_out=lstm_out.contiguous().view(-1, self.hidden_dim)
        
        #dropout and fully connected layers
        out=self.dropout(lstm_out)
        out=self.fc1(out)
        sig_out=self.sigmoid(out)
        
        sig_out=sig_out.view(batch_size, -1)
        sig_out=sig_out[:, -1]
        
        return sig_out, hidden
    
    def init_hidden(self, batch_size):
        """Initialize Hidden STATE"""
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

# Model and Hyperparameters

In [46]:
# Instantiate the model w/ hyperparams

vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding
output_size = 1
embedding_dim = 300
hidden_dim = 128
n_layers = 2
bi_directional = False

net = SentimentalLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(net)

'''
Output:
 SentimentalLSTM(   
(embedding): Embedding(74073, 400)   
(lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)   (dropout): Dropout(p=0.3)   
(fc): Linear(in_features=256, out_features=1, bias=True)   (sigmoid): Sigmoid() )
'''

SentimentalLSTM(
  (embedding): Embedding(4561, 300)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


'\nOutput:\n SentimentalLSTM(   \n(embedding): Embedding(74073, 400)   \n(lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)   (dropout): Dropout(p=0.3)   \n(fc): Linear(in_features=256, out_features=1, bias=True)   (sigmoid): Sigmoid() )\n'

# Model Checkpoint

In [19]:
# Save and Load Functions

def save_checkpoint(save_path, model, optimizer, valid_loss):

    if save_path == None:
        return
    
    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'valid_loss': valid_loss}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_checkpoint(load_path, model, optimizer):

    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    
    return state_dict['valid_loss']

In [47]:
# dataset = 'NPOV' # load NPOV trained model

net.to(device)
lr=0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params =  net.parameters(), lr=lr)

destination_folder = "/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Saved Models and Checkpoints"

try:
  load_checkpoint(destination_folder + '/lstm_model_' + dataset + '.pt', net, optimizer) # comment this if you wannt to train the model from zero
  print('lstm_model_' + dataset + '.pt --> loaded')
except:
  print('training lstm_model_' + dataset + ' from scratch')


Model loaded from <== /content/drive/MyDrive/Colab Notebooks/Amplifi Project/Saved Models and Checkpoints/lstm_model_NPOV.pt
training lstm_model_NPOV from scratch


# Train Model

In [None]:
if dataset == 'Stereo':
  val_loss_min = 0.858123
elif dataset == 'NPOV':
  val_loss_min = 0.522309
elif dataset == 'Mixed':
  val_loss_min = np.Inf
else:
  val_loss_min = np.Inf

In [22]:
# check if CUDA is available
train_on_gpu = torch.cuda.is_available()
val_loss_min = np.Inf

# training params

epochs = 10 # 3-4 is approx where I noticed the validation loss stop decreasing

print_every = len(train_loader)/6
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    counter = 0
    for inputs, labels in train_loader:
        counter += 1
        if counter == len(train_loader): # the last bacth doesn't have the batch size specified and it gives an error, so we skip it.
          continue

        if(train_on_gpu):
            inputs=inputs.cuda()
            labels=labels.cuda()
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            counter2 = 0
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                counter2 += 1
                if counter2 == len(valid_loader):
                  continue

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                  inputs, labels = inputs.cuda(), labels.cuda()  

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            # create checkpoint variable and add important data
            checkpoint = {
                  'epoch': e + 1,
                  'valid_loss_min': val_loss,
                  'state_dict': net.state_dict(),
                  'optimizer': optimizer.state_dict()
            }

            ## TODO: save the model if validation loss has decreased
            if val_loss <= val_loss_min:
              print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(val_loss_min,val_loss))
              # save checkpoint as best model
              save_checkpoint(destination_folder + '/lstm_model_' + dataset + '.pt', net, optimizer, val_loss_min)
              val_loss_min = val_loss

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Validation loss decreased (inf --> 1.129760).  Saving model ...
Model saved to ==> /content/drive/MyDrive/Colab Notebooks/Amplifi Project/Saved Models and Checkpoints/lstm_model_Stereo.pt
Epoch: 1/10... Step: 6... Loss: 0.620617... Val Loss: 0.742576
Epoch: 1/10... Step: 12... Loss: 0.562976... Val Loss: 0.871720
Epoch: 1/10... Step: 18... Loss: 0.545510... Val Loss: 0.702273
Epoch: 1/10... Step: 24... Loss: 0.412801... Val Loss: 0.684113
Epoch: 1/10... Step: 30... Loss: 0.566342... Val Loss: 0.765450
Epoch: 2/10... Step: 6... Loss: 0.265729... Val Loss: 0.801111
Epoch: 2/10... Step: 12... Loss: 0.447123... Val Loss: 0.787933
Epoch: 2/10... Step: 18... Loss: 0.229172... Val Loss: 0.852352
Epoch: 2/10... Step: 24... Loss: 0.420937... Val Loss: 0.804489
Epoch: 2/10... Step: 30... Loss: 0.373012... Val Loss: 0.770287
Epoch: 3/10... Step: 6... Loss: 0.277467... Val Loss: 0.875895
Epoch: 3/10... Step: 12... Loss: 0.243606... Val Loss: 0.891707
Epoch: 3/10... Step: 18... Loss: 0.328635... Va

# Test model accuracy

In [51]:
# Test model accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
counter3 = 0
for inputs, labels in test_loader:

    counter3 += 1
    if counter3 == len(test_loader):
      continue
    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()

    output, h = net(inputs, h)

    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer

    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.784
Test accuracy: 0.374
