<a href="https://colab.research.google.com/github/justi-lai/Business_News_Analytics/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import requests
import json
import numpy as np
import pandas as pd
import string
import random


In [53]:
def load_data(train_data_size, valid_data_size, testing_size):
  # apikey = '602a1f9dab464b98b1628cf062fb6b27'
  # url = 'https://newsapi.org/v2/top-headlines'

  # params = {
  #     "country": "us",
  #     "category": "business",
  #     "apiKey": apikey
  # }

  # response = requests.get(url, params)

  # if response.status_code == 200:
  #     print("Success")
  # else:
  #     print("Error")

  # train_data = response.json()

  splits = {'train': 'sent_train.csv', 'validation': 'sent_valid.csv'}
  df = pd.read_csv("hf://datasets/zeroshot/twitter-financial-news-sentiment/" + splits["train"])

  data = df.to_numpy()
  train_data = data[:int(train_data_size * len(data))]
  valid_data = data[int(train_data_size * len(data)):int((train_data_size + valid_data_size) * len(data))]
  test_data = data[int((train_data_size + valid_data_size) * len(data)):]

  vocab = []
  vocab.append('<unk>')
  for i in train_data:
    temp = i[0].translate(str.maketrans('', '', string.punctuation))
    for j in temp.split():
      word = j.lower()
      if 'http' in word:
        continue
      if word not in vocab:
        vocab.append(word)

  return train_data, valid_data, test_data, vocab

In [54]:
class SentimentClassifier(nn.Module):
  def __init__(self, embedding_dim, vocab, hidden_dim, output_dim):
    super(SentimentClassifier, self).__init__()
    self.vocab = vocab
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim
    self.embedding = nn.Embedding(len(vocab), embedding_dim)
    self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True) # Added batch_first=True
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    # Reshape input for LSTM with batch_first=True
    x = x.unsqueeze(0) # Add a batch dimension

    embedded = self.embedding(x)
    output, (hidden, cell) = self.rnn(embedded)
    # hidden state shape is now (num_layers, batch_size, hidden_size)
    prediction = self.fc(hidden[-1]) # Use the last layer's hidden state
    # prediction shape should now be (batch_size, output_dim)

    result = F.softmax(prediction, dim=1) # Apply softmax along dim=1
    return prediction # Or return result if you need the softmax output


  def compute_loss(self, x, y):
    loss = F.cross_entropy(x, y)
    return loss



In [56]:
embedding_dim = 256
hidden_dim = 256
output_dim = 3

num_epochs = 10
learning_rate = 0.01
batch_size = 64

train_data_size = 0.8
valid_data_size = 0.1
testing_size = 0.1

print('==== Loading Data ====')
train_data, valid_data, test_data, vocab = load_data(train_data_size, valid_data_size, testing_size)

print('==== Creating Model ====')
model = SentimentClassifier(embedding_dim, vocab, hidden_dim, output_dim)

print('==== Training Model ====')
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
  random.shuffle(train_data)
  model.train()
  print('Starting epoch: ' + str(epoch))

  for i in range(0, len(train_data), batch_size):
    batch = train_data[i:i+batch_size]

    for single in batch:
      optimizer.zero_grad()
      model.zero_grad()

      embeddings = []
      labels = [single[1]]
      temp = single[0].translate(str.maketrans('', '', string.punctuation))
      for word in temp.split():
        if word.lower() in model.vocab:
          embeddings.append(model.vocab.index(word.lower()))
        else:
          embeddings.append(model.vocab.index('<unk>'))

      embeddings = F.pad(torch.tensor(embeddings), (0, embedding_dim - len(embeddings)), value=0)
      labels = torch.tensor(labels)

      predictions = model(embeddings)
      loss = model.compute_loss(predictions, labels)
      loss.backward()
      optimizer.step()
    print('Epoch: ' + str(epoch) + ' | Loss: ' + str(loss.item()))
print('==== Evaluating Model ====')


==== Loading Data ====
==== Creating Model ====
==== Training Model ====
Starting epoch: 0
Epoch: 0 | Loss: 0.0
Epoch: 0 | Loss: 0.0
Epoch: 0 | Loss: 1.3803420066833496


KeyboardInterrupt: 