<a href="https://colab.research.google.com/github/justi-lai/Business_News_Analytics/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import requests
import json
import numpy as np
import pandas as pd
import string
import random
import tqdm


In [2]:
def load_data(train_data_size, valid_data_size, testing_size):
  splits = {'train': 'sent_train.csv', 'validation': 'sent_valid.csv'}
  df = pd.read_csv("hf://datasets/zeroshot/twitter-financial-news-sentiment/" + splits["train"])

  data = df.to_numpy()
  train_data = data[:int(train_data_size * len(data))]
  valid_data = data[int(train_data_size * len(data)):int((train_data_size + valid_data_size) * len(data))]
  test_data = data[int((train_data_size + valid_data_size) * len(data)):]

  vocab = []
  vocab.append('<unk>')
  for i in train_data:
    temp = i[0].translate(str.maketrans('', '', string.punctuation))
    for j in temp.split():
      word = j.lower()
      if 'http' in word:
        continue
      if word not in vocab:
        vocab.append(word)

  return train_data, valid_data, test_data, vocab

In [36]:
class SentimentClassifier(nn.Module):
  def __init__(self, embedding_dim, vocab, hidden_dim, output_dim):
    super(SentimentClassifier, self).__init__()
    self.vocab = vocab
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim
    self.embedding = nn.Embedding(len(vocab), embedding_dim)
    self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True) # Added batch_first=True
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    # Reshape input for LSTM with batch_first=True
    x = x.unsqueeze(0) # Add a batch dimension

    embedded = self.embedding(x)
    output, (hidden, cell) = self.rnn(embedded)
    # hidden state shape is now (num_layers, batch_size, hidden_size)
    prediction = self.fc(hidden[-1]) # Use the last layer's hidden state
    # prediction shape should now be (batch_size, output_dim)

    result = F.sigmoid(prediction)
    return result

  def compute_loss(self, x, y):
    choice = torch.argmax(x).item()
    loss = abs(choice-y.item())
    return loss



In [42]:
embedding_dim = 256
hidden_dim = 256
output_dim = 3

num_epochs = 10
learning_rate = 0.01
batch_size = 64

train_data_size = 0.8
valid_data_size = 0.1
testing_size = 0.1

print('==== Loading Data ====')
train_data, valid_data, test_data, vocab = load_data(train_data_size, valid_data_size, testing_size)

print('==== Creating Model ====')
model = SentimentClassifier(embedding_dim, vocab, hidden_dim, output_dim)

print('==== Training Model ====')
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
  random.shuffle(train_data)
  model.train()
  print('Starting epoch: ' + str(epoch))
  loss = 0
  count = 0

  for i in tqdm.tqdm(range(0, len(train_data), batch_size)):
    batch = train_data[i:i+batch_size]

    for single in batch:
      count += 1
      optimizer.zero_grad()
      model.zero_grad()

      embeddings = []
      labels = [single[1]]
      temp = single[0].translate(str.maketrans('', '', string.punctuation))
      for word in temp.split():
        if word.lower() in model.vocab:
          embeddings.append(model.vocab.index(word.lower()))
        else:
          embeddings.append(model.vocab.index('<unk>'))

      embeddings = F.pad(torch.tensor(embeddings, dtype=torch.int64), (0, embedding_dim - len(embeddings)), value=0)
      labels = torch.tensor(labels, dtype=torch.int64)

      predictions = model(embeddings)
      loss += model.compute_loss(predictions, labels)
      optimizer.step()
  epoch_loss = loss / len(train_data)
  print('Epoch: ' + str(epoch) + ' | Loss: ' + str(epoch_loss))
print('==== Evaluating Model ====')


==== Loading Data ====
==== Creating Model ====
==== Training Model ====
Starting epoch: 0


100%|██████████| 120/120 [01:05<00:00,  1.83it/s]


Epoch: 0 | Loss: 1.39324076499869
Starting epoch: 1


100%|██████████| 120/120 [01:03<00:00,  1.88it/s]


Epoch: 1 | Loss: 1.1152737752161384
Starting epoch: 2


100%|██████████| 120/120 [01:02<00:00,  1.92it/s]


Epoch: 2 | Loss: 0.8052135184700027
Starting epoch: 3


100%|██████████| 120/120 [01:01<00:00,  1.96it/s]


Epoch: 3 | Loss: 0.5009169504846738
Starting epoch: 4


100%|██████████| 120/120 [00:59<00:00,  2.03it/s]


Epoch: 4 | Loss: 0.2813728058684831
Starting epoch: 5


100%|██████████| 120/120 [00:59<00:00,  2.01it/s]


Epoch: 5 | Loss: 0.14540214828399267
Starting epoch: 6


100%|██████████| 120/120 [01:00<00:00,  2.00it/s]


Epoch: 6 | Loss: 0.07165313073094053
Starting epoch: 7


100%|██████████| 120/120 [01:00<00:00,  1.97it/s]


Epoch: 7 | Loss: 0.034058160859313596
Starting epoch: 8


100%|██████████| 120/120 [01:00<00:00,  1.99it/s]


Epoch: 8 | Loss: 0.014278228975635316
Starting epoch: 9


100%|██████████| 120/120 [01:00<00:00,  2.00it/s]

Epoch: 9 | Loss: 0.003536809012313335
==== Evaluating Model ====





In [46]:
random.shuffle(valid_data)
model.eval()

for single in tqdm.tqdm(valid_data):
  optimizer.zero_grad()
  model.zero_grad()

  embeddings = []
  labels = [single[1]]
  temp = single[0].translate(str.maketrans('', '', string.punctuation))
  for word in temp.split():
    if word.lower() in model.vocab:
      embeddings.append(model.vocab.index(word.lower()))
    else:
      embeddings.append(model.vocab.index('<unk>'))

  embeddings = F.pad(torch.tensor(embeddings, dtype=torch.int64), (0, embedding_dim - len(embeddings)), value=0)
  labels = torch.tensor(labels, dtype=torch.int64)

  predictions = model(embeddings)
  loss += model.compute_loss(predictions, labels)
  optimizer.step()
print('Loss: ' + str(loss / len(valid_data)))

100%|██████████| 954/954 [00:08<00:00, 107.10it/s]

Loss: 1.5628930817610063



