In [87]:
# Importing Libraries and Functions
import torch
import torch.nn as nn
from functools import partial
from torch.utils.data import DataLoader
import torch.optim as optim
import pandas as pd
import nltk
nltk.download("punkt")
nltk.download("stopwords")
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from torch.nn.utils.rnn import pad_sequence

# Reading in the dataset from Google Drive (Train-Test split is implemented in an 80-20 ratio)
dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/FinancialData.csv', names = ["Sentiment", "Review"], encoding='utf-8')
training_data = dataset.sample(frac = 0.8, random_state = 42)
training_data = training_data.drop(index = 4845)
test_data = dataset.drop(training_data.index)
test_data = test_data.drop(index = 4845)
# Splitting corpus and labels from training and test sets
training_corpus = list(training_data["Review"])
test_corpus = list(test_data["Review"])
training_labels = list(training_data["Sentiment"])
test_labels = list(test_data["Sentiment"])

# Defining a tensor of lengths of sequence pre-padding so that loss calculation is accurate
training_lengths = []
for sentence in training_corpus:
  training_lengths.append(len(sentence))
training_lengths = torch.tensor(training_lengths, dtype=torch.float)
test_lengths = []
for sentence in test_corpus:
  test_lengths.append(len(sentence))
test_lengths = torch.tensor(training_lengths, dtype=torch.float)

# Mapping each sentiment to a number -1 = Negative, 0 = Neutral, 1 = Positive
def map_sentiment(labels):
  mapped_labels = []
  for label in labels:
    if label == "negative": mapped_labels.append(-1)
    elif label == "neutral": mapped_labels.append(0)
    else: mapped_labels.append(1)
  return mapped_labels

training_labels = map_sentiment(training_labels)
test_labels = map_sentiment(test_labels)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [94]:
corpus = dataset["Review"]
# Defining a function to preprocess the corpus - convert all letters to lowercase and splitting on the occurence of spaces, removing stop words and numbers
def preprocess(corpus):
  tokenized_corpus = []
  for sentence in corpus:
    tokenized_sentence  = nltk.word_tokenize(sentence.lower())
    punc_free_sent = [token for token in tokenized_sentence if token.isalpha()]
    keepers = ["up", "down", "off", "on", "above", "below", "too", "very", "between", "against", "between", "same", "not", "no", "only", "too", "very"]
    stop_words = [word for word in stopwords.words("english") if word not in keepers]
    stop_free_sent = [token for token in punc_free_sent if token not in stopwords.words('english')]
    if len(stop_free_sent) != 0:
      tokenized_corpus.append(stop_free_sent)
  return tokenized_corpus

corpus = preprocess(corpus)

# Training a word2vec model for the training corpus
word_embeds = Word2Vec(corpus, size = 75, min_count = 1)

# Mapping each token of each sentence (sequence) to its respective embedding
def map_sequence(corpus, embeds):
  corpus_embeds = []
  for sentence in corpus:
    sentence_embeds = []
    for token in sentence:
      sentence_embeds.append(torch.from_numpy(embeds[token]))
    sentence_embeds = torch.stack(sentence_embeds, dim=0)
    corpus_embeds.append(sentence_embeds)
  corpus_embeds = pad_sequence(corpus_embeds, batch_first = True)
  return corpus_embeds

corpus = map_sequence(corpus, word_embeds)
training_corpus = []
test_corpus = []
index = training_data.index.to_list()
for i in index:
  # Filtering out of bounds indices
  training_corpus.append(corpus[i])
index = test_data.index.to_list()
for i in index:
  test_corpus.append(corpus[i])

# training_embeds = map_sequence(training_corpus, word_embeds)
# test_embeds = map_sequence(test_corpus, word_embeds)
# Creating a dictionary of hyperparameters for the model
hyperparameters = {
    "input_size": 75,
    "hidden_size": 256,
    "sequence_length": 35,
    "num_layers": 3,
    "batch_size": 256,
    "num_classes": 3,
    "num_epochs": 1000,
    "learning_rate": 0.001,
}

# Defining the architecture of the simple RNN model
class SentimentRNN(nn.Module):
  def __init__(self, hyperparameters):
    super(SentimentRNN, self).__init__()
    self.input_size = hyperparameters["input_size"]
    self.hidden_size = hyperparameters["hidden_size"]
    self.num_layers = hyperparameters["num_layers"]
    self.num_classes = hyperparameters["num_classes"]
    self.sequence_length = hyperparameters["sequence_length"]
    # Creating hidden layer for each token, output matrix will be B x L x H, 256 x 35 x 256
    self.recurrent_layer = nn.RNN(self.input_size, self.hidden_size, num_layers = self.num_layers, bidirectional=True, batch_first=True)
    # Adding a final linear layer followed by the Sigmoid() non-linearity to make a probability distribution, outputs a matrix of dimensions B x 1 - 256 x 1
    self.linear_layer = nn.Sequential(
        nn.Linear(self.hidden_size * 2 * self.sequence_length, 1),
        nn.Sigmoid())
    
  def forward(self, inputs):
    hidden_output, _ = self.recurrent_layer(inputs)
    hidden_output = hidden_output.reshape(hidden_output.shape[0], -1)
    linear_output = self.linear_layer(hidden_output)
    linear_output = linear_output.squeeze(1)
    return linear_output

# Defining the architecture of the LSTM model
class SentimentLSTM(nn.Module):
  def __init__(self, hyperparameters):
    super(SentimentLSTM, self).__init__()
    self.input_size = hyperparameters["input_size"]
    self.hidden_size = hyperparameters["hidden_size"]
    self.num_layers = hyperparameters["num_layers"]
    self.num_classes = hyperparameters["num_classes"]
    self.sequence_length = hyperparameters["sequence_length"]
    # Creating hidden layer for each token, output matrix will be B x L x H, 256 x 35 x 256
    self.recurrent_layer = nn.LSTM(self.input_size, self.hidden_size, num_layers = self.num_layers, bidirectional=True, batch_first=True)
    # Adding a final linear layer followed by the Sigmoid() non-linearity to make a probability distribution, outputs a matrix of dimensions B x 1 - 256 x 1
    self.linear_layer = nn.Sequential(
        nn.Linear(self.hidden_size * 2 * self.sequence_length, 1),
        nn.Sigmoid())
    
  def forward(self, inputs):
    hidden_output, _ = self.recurrent_layer(inputs)
    hidden_output = hidden_output.reshape(hidden_output.shape[0], -1)
    linear_output = self.linear_layer(hidden_output)
    linear_output = linear_output.squeeze(1)
    return linear_output

# Defining the architecture of the GRU model
class SentimentGRU(nn.Module):
  def __init__(self, hyperparameters):
    super(SentimentGRU, self).__init__()
    self.input_size = hyperparameters["input_size"]
    self.hidden_size = hyperparameters["hidden_size"]
    self.num_layers = hyperparameters["num_layers"]
    self.num_classes = hyperparameters["num_classes"]
    self.sequence_length = hyperparameters["sequence_length"]
    # Creating hidden layer for each token, output matrix will be B x L x H, 256 x 35 x 256
    self.recurrent_layer = nn.GRU(self.input_size, self.hidden_size, num_layers = self.num_layers, bidirectional=True, batch_first=True)
    # Adding a final linear layer followed by the Sigmoid() non-linearity to make a probability distribution, outputs a matrix of dimensions B x 1 - 256 x 1
    self.linear_layer = nn.Sequential(
        nn.Linear(self.hidden_size * 2 * self.sequence_length, 1),
        nn.Sigmoid())
    
  def forward(self, inputs):
    hidden_output, _ = self.recurrent_layer(inputs)
    hidden_output = hidden_output.reshape(hidden_output.shape[0], -1)
    linear_output = self.linear_layer(hidden_output)
    linear_output = linear_output.squeeze(1)
    return linear_output






In [97]:
# Parsing the data into batches using DataLoader
data = list(zip(training_corpus, training_labels, training_lengths))
loader = DataLoader(data, batch_size = hyperparameters["batch_size"], shuffle = True)

# Defining a custom loss function to account for variable length
def loss_func(y, preds, length):
  loss = nn.BCELoss()
  loss_val = loss(preds, y.float())
  loss_val = loss_val / length.sum().float()
  return loss_val

# Initializing the models
RNN_model = SentimentRNN(hyperparameters)
LSTM_model = SentimentLSTM(hyperparameters)
GRU_model = SentimentGRU(hyperparameters)
# Initializing an SGD optimizer for each model
RNN_SGD = optim.SGD(RNN_model.parameters(), lr=hyperparameters["learning_rate"])
LSTM_SGD = optim.SGD(LSTM_model.parameters(), lr=hyperparameters["learning_rate"])
GRU_SGD = optim.SGD(GRU_model.parameters(), lr=hyperparameters["learning_rate"])
models = [RNN_model, LSTM_model, GRU_model]
optimizers = [RNN_SGD, LSTM_SGD, GRU_SGD]
# Defining a train function that will train the models over a given number of epochs
def train(models, optimizers, loader, nepoch=100):
  for i in range(nepoch):
    for x_batch, y_batch, length_batch in loader:
      optimizers[0].zero_grad()
      preds = models[0](x_batch)
      RNN_loss = loss_func(y_batch, preds, length_batch)
      RNN_loss.backward()
      optimizers[0].step()

      optimizers[1].zero_grad()
      preds = models[1](x_batch)
      LSTM_loss = loss_func(y_batch, preds, length_batch)
      LSTM_loss.backward()
      optimizers[1].step()

      optimizers[2].zero_grad()
      preds = models[2](x_batch)
      GRU_loss = loss_func(y_batch, preds, length_batch)
      GRU_loss.backward()
      optimizers[2].step()
    if i % 10 == 0:
      print(f"Simple RNN Loss: {RNN_loss}")
      print(f"Simple LSTM Loss: {LSTM_loss}")
      print(f"Simple GRU Loss: {GRU_loss}")

train(models, optimizers, loader)

Simple RNN Loss: 0.00016884817159734666
Simple LSTM Loss: 0.00016688869800418615
Simple GRU Loss: 0.00016816092829685658
Simple RNN Loss: 0.000153010492795147
Simple LSTM Loss: 0.00015051108493935317
Simple GRU Loss: 0.00015171938866842538
Simple RNN Loss: 0.00016742543084546924
Simple LSTM Loss: 0.00016304533346556127
Simple GRU Loss: 0.00016478399629704654
Simple RNN Loss: 0.00015183120558504015
Simple LSTM Loss: 0.00014834340254310519
Simple GRU Loss: 0.00014937165542505682
Simple RNN Loss: 0.00014401554653886706
Simple LSTM Loss: 0.00014173315139487386
Simple GRU Loss: 0.00014277802256401628
Simple RNN Loss: 0.0001378691813442856
Simple LSTM Loss: 0.00013555561599787325
Simple GRU Loss: 0.00013628765009343624
Simple RNN Loss: 0.0001554391928948462
Simple LSTM Loss: 0.0001529199507785961
Simple GRU Loss: 0.0001539181830594316
Simple RNN Loss: 0.0001323157048318535
Simple LSTM Loss: 0.00013137435598764569
Simple GRU Loss: 0.00013175740605220199
Simple RNN Loss: 0.00016618764493614435

In [99]:
data = list(zip(test_corpus, test_labels, test_lengths))
loader = DataLoader(data, batch_size = 256, shuffle = True)

def test(models, loader):
  counter = 0
  total_rnn_loss = 0
  total_lstm_loss = 0
  total_gru_loss = 0
  for x_batch, y_batch, length_batch in loader:
    preds = models[0](x_batch)
    total_rnn_loss += loss_func(y_batch, preds, length_batch)
    preds = models[1](x_batch)
    total_lstm_loss += loss_func(y_batch, preds, length_batch)
    preds = models[2](x_batch)
    total_gru_loss += loss_func(y_batch, preds, length_batch)
    counter += 1
  losses = [total_rnn_loss.item()/counter, total_lstm_loss.item()/counter, total_gru_loss.item()/counter]
  return losses

losses = test(models, loader)
print(f"Final Loss of the simple RNN model on the Test Set: {losses[0]}")
print(f"Final Loss of the LSTM model on the Test Set: {losses[1]}")
print(f"Final Loss of the GRU model on the Test Set: {losses[2]}")

Final Loss of the simple RNN model on the Test Set: 2.314013363502454e-05
Final Loss of the LSTM model on the Test Set: 2.2718817490385845e-05
Final Loss of the GRU model on the Test Set: 2.2915784938959405e-05
