In [8]:
# Importing Libraries and Functions
import torch
import torch.nn as nn
from functools import partial
from torch.utils.data import DataLoader
import torch.optim as optim
import pandas as pd
import nltk
nltk.download("punkt")
nltk.download("stopwords")
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from torch.nn.utils.rnn import pad_sequence

# Load the sentiment dataset from Google Drive and carve out an 80/20 train/test split
dataset = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/FinancialData.csv',
    names = ["Sentiment", "Review"],
    encoding = 'utf-8',
)
# A fixed random_state keeps the sampled 80% training subset the same on every run
training_data = dataset.sample(frac = 0.8, random_state = 42)
# Every row that was not sampled for training becomes test data
test_data = dataset.drop(training_data.index)
corpus = dataset["Review"]
# Pull the review texts and sentiment labels out of each split as plain Python lists
training_corpus = training_data["Review"].tolist()
training_labels = training_data["Sentiment"].tolist()
test_corpus = test_data["Review"].tolist()
test_labels = test_data["Sentiment"].tolist()

# Defining a function to preprocess the corpus - lowercase each sentence, tokenize it with NLTK, and drop non-alphabetic tokens (punctuation, numbers) and stop words
def preprocess(corpus):
  """Tokenize and clean every sentence in ``corpus``.

  Each sentence is lowercased, tokenized with ``nltk.word_tokenize``, stripped of
  tokens that are not purely alphabetic (punctuation, numbers), and then stripped
  of English stop words. Words listed in ``keepers`` (e.g. "up", "down", "not")
  are deliberately kept even though NLTK lists them as stop words.

  Args:
    corpus: iterable of sentence strings.

  Returns:
    A list of token lists, one per sentence that still has at least one token
    after cleaning. Sentences that end up empty are dropped, so the result can
    be shorter than ``corpus``.
  """
  keepers = {"up", "down", "off", "on", "above", "below", "too", "very", "between", "against", "same", "not", "no", "only"}
  # Build the effective stop-word set once, not per sentence/token, and actually
  # use it below so that the keepers survive the filter.
  stop_words = {word for word in stopwords.words("english") if word not in keepers}
  tokenized_corpus = []
  for sentence in corpus:
    tokenized_sentence = nltk.word_tokenize(sentence.lower())
    cleaned_sentence = [
        token for token in tokenized_sentence
        if token.isalpha() and token not in stop_words
    ]
    if cleaned_sentence:
      tokenized_corpus.append(cleaned_sentence)
  return tokenized_corpus

corpus = preprocess(corpus)
training_corpus = preprocess(training_corpus)
test_corpus = preprocess(test_corpus)
# NOTE(review): preprocess() drops sentences that end up with no tokens, but
# training_labels / test_labels are not filtered to match, so labels can fall out of
# step with their sentences after the first dropped one - verify the lengths agree.

# Defining a tensor of lengths of sequence pre-padding so that loss calculation is accurate
# (each tensor is built from its own split; the name `lengths` was never defined here)
training_lengths = torch.tensor([len(sentence) for sentence in training_corpus], dtype=torch.float)
test_lengths = torch.tensor([len(sentence) for sentence in test_corpus], dtype=torch.float)
# Kept as an alias for any later cell that refers to the training lengths as `lengths`
lengths = training_lengths

# Mapping each sentiment to a number -1 = Negative, 0 = Neutral, 1 = Positive
def map_sentiment(labels):
  """Convert sentiment labels into integer codes.

  "negative" becomes -1, "neutral" becomes 0, and every other value becomes 1.

  Args:
    labels: iterable of sentiment labels.

  Returns:
    A new list with one integer code per input label, in the same order.
  """
  return [
      -1 if label == "negative" else (0 if label == "neutral" else 1)
      for label in labels
  ]

# Swap the string labels of both splits for their integer codes
training_labels, test_labels = map_sentiment(training_labels), map_sentiment(test_labels)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [13]:
# Training a word2vec model on the full preprocessed corpus (training + test sentences);
# with min_count = 1 every token in that corpus receives an embedding
try:
  # gensim >= 4.0 names the embedding dimension `vector_size`
  word_embeds = Word2Vec(corpus, vector_size = 75, min_count = 1)
except TypeError:
  # gensim < 4.0 only understands the older `size` keyword
  word_embeds = Word2Vec(corpus, size = 75, min_count = 1)

# Mapping each token of each sentence (sequence) to its respective embedding
def map_sequence(corpus, embeds):
  """Turn tokenized sentences into one zero-padded tensor of token embeddings.

  Args:
    corpus: list of sentences, each a non-empty list of tokens known to ``embeds``.
    embeds: a trained gensim ``Word2Vec`` model, or any mapping from a token to a
      1-D numpy vector (for example a plain dict).

  Returns:
    A tensor of shape (number of sentences, longest sentence length, embedding
    size). Sentences shorter than the longest one are padded with zeros at the end.

  Raises:
    KeyError: if a token has no embedding in ``embeds``.
  """
  # Indexing a Word2Vec model directly is deprecated in gensim 3.x and removed in
  # 4.0; the vectors live on `.wv`. Objects without `.wv` are indexed as they are.
  vectors = getattr(embeds, "wv", embeds)
  corpus_embeds = []
  for sentence in corpus:
    sentence_embeds = [torch.from_numpy(vectors[token]) for token in sentence]
    # Stack the per-token vectors into one (sentence length, embedding size) tensor
    corpus_embeds.append(torch.stack(sentence_embeds, dim=0))
  return pad_sequence(corpus_embeds, batch_first = True)

# Embed and pad the training and test splits with the shared word2vec model
corpus_embeds, test_embeds = (
    map_sequence(split, word_embeds) for split in (training_corpus, test_corpus)
)
# Hyperparameters for the model, gathered in a single dictionary
hyperparameters = dict(
    input_size = 75,
    hidden_size = 256,
    sequence_length = 35,
    num_layers = 3,
    batch_size = 256,
    num_classes = 3,
    num_epochs = 1000,
    learning_rate = 0.001,
)

# Defining the architecture of the model
class SentimentRNN(nn.Module):
  """Bidirectional multi-layer RNN followed by a single sigmoid output unit.

  Args:
    hyperparameters: dict providing "input_size", "hidden_size", "num_layers",
      "num_classes" and "sequence_length".

  NOTE(review): the head emits one sigmoid value per example even though
  "num_classes" is stored; confirm that a single output is what is intended for
  the -1/0/1 labels used elsewhere in this notebook.
  """

  def __init__(self, hyperparameters):
    super().__init__()
    self.input_size = hyperparameters["input_size"]
    self.hidden_size = hyperparameters["hidden_size"]
    self.num_layers = hyperparameters["num_layers"]
    self.num_classes = hyperparameters["num_classes"]
    self.sequence_length = hyperparameters["sequence_length"]
    # Recurrent stack: for a B x L x input_size batch it yields B x L x (2 * H),
    # the factor 2 coming from the two directions
    self.recurrent_layer = nn.RNN(self.input_size, self.hidden_size, num_layers = self.num_layers, bidirectional=True, batch_first=True)
    # Final linear layer over the flattened hidden states of all `sequence_length`
    # time steps, followed by Sigmoid() to squash the single output into (0, 1)
    self.linear_layer = nn.Sequential(
        nn.Linear(self.hidden_size * 2 * self.sequence_length, 1),
        nn.Sigmoid())

  def forward(self, inputs):
    """Score a batch of embedded sequences.

    Args:
      inputs: tensor of shape (batch, time steps, input_size). Batches with fewer
        than ``sequence_length`` time steps are zero-padded at the end, which is
        the same padding ``pad_sequence`` applies to short sentences.

    Returns:
      Tensor of shape (batch,) holding one sigmoid output per example.

    Raises:
      ValueError: if ``inputs`` has more than ``sequence_length`` time steps.
    """
    steps = inputs.shape[1]
    if steps > self.sequence_length:
      raise ValueError(
          "inputs have %d time steps but the model was built for at most %d"
          % (steps, self.sequence_length))
    if steps < self.sequence_length:
      # The linear head has a fixed input width, so bring the batch up to the
      # configured length before running the RNN
      padding = inputs.new_zeros(inputs.shape[0], self.sequence_length - steps, inputs.shape[2])
      inputs = torch.cat([inputs, padding], dim=1)
    hidden_output, _ = self.recurrent_layer(inputs)
    # Flatten every time step's hidden state into one feature vector per example
    hidden_output = hidden_output.reshape(hidden_output.shape[0], -1)
    linear_output = self.linear_layer(hidden_output)
    linear_output = linear_output.squeeze(1)
    return linear_output





  # Remove the CWD from sys.path while we load stuff.


torch.Size([3876, 35, 75])


In [6]:
# Parsing the data into batches using DataLoader
# Each item is (padded embeddings, label, pre-padding length of that training sentence)
data = list(zip(corpus_embeds, training_labels, training_lengths))
loader = DataLoader(data, batch_size = hyperparameters["batch_size"], shuffle = True)

# Defining a custom loss function to account for variable sequence lengths
def loss_func(y, preds, length):
  """Binary cross-entropy of a batch, scaled down by the batch's total length.

  Args:
    y: target tensor; it is cast to float before the loss is computed.
    preds: predicted probabilities, same shape as ``y``.
    length: tensor of per-example sequence lengths.

  Returns:
    Scalar tensor: the mean binary cross-entropy between ``preds`` and ``y``,
    divided by the sum of ``length``.

  NOTE(review): binary cross-entropy presumably expects targets within [0, 1];
  confirm that the label values passed in as ``y`` satisfy that.
  """
  targets = y.float()
  mean_bce = nn.functional.binary_cross_entropy(preds, targets)
  total_length = length.sum().float()
  return mean_bce / total_length

# Build the network from the shared hyperparameter dictionary
model = SentimentRNN(hyperparameters = hyperparameters)
# Plain stochastic gradient descent over every model parameter
SGD = optim.SGD(params = model.parameters(), lr = hyperparameters["learning_rate"])

def train(model, optimizer, loader, nepoch=100):
  """Run ``nepoch`` passes over ``loader``, updating ``model`` with ``optimizer``.

  Args:
    model: callable mapping a batch of inputs to predictions.
    optimizer: optimizer whose ``zero_grad`` / ``step`` drive the updates.
    loader: iterable yielding ``(x_batch, y_batch, length_batch)`` tuples.
    nepoch: number of passes over ``loader``.

  Every 10th epoch (starting with the first) the loss of that epoch's last batch
  is printed. Nothing is returned.
  """
  for i in range(nepoch):
    # Stays None when the loader yields no batch, so the print below is skipped
    loss = None
    for x_batch, y_batch, length_batch in loader:
      # Use the optimizer that was passed in, not a global one
      optimizer.zero_grad()
      preds = model(x_batch)
      loss = loss_func(y_batch, preds, length_batch)
      loss.backward()
      optimizer.step()
    if i % 10 == 0 and loss is not None:
      print(loss)

# Runs with train()'s default of 100 epochs; hyperparameters["num_epochs"] is not passed here
train(model, SGD, loader)

tensor(0.0015, grad_fn=<DivBackward0>)
tensor(0.0016, grad_fn=<DivBackward0>)
tensor(0.0016, grad_fn=<DivBackward0>)
tensor(0.0019, grad_fn=<DivBackward0>)
tensor(0.0016, grad_fn=<DivBackward0>)
tensor(0.0016, grad_fn=<DivBackward0>)
tensor(0.0014, grad_fn=<DivBackward0>)
tensor(0.0015, grad_fn=<DivBackward0>)
tensor(0.0017, grad_fn=<DivBackward0>)
tensor(0.0017, grad_fn=<DivBackward0>)
