In [57]:
import csv
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import spacy
nlp = spacy.load("en_core_web_lg")

def samplestring_to_num(sampleoutput):
    """Takes a list (sample, output) where sample is "Positive", "Negative", "Neutral",
    or "Irrelevant", and returns the same list, but the aforementioned terms are
    replaced with 1, -1, 0, or None
    """
    match sampleoutput[0]:
        case "Positive":
            return [0, sampleoutput[1]]
        case "Negative":
            return [1, sampleoutput[1]]
        case "Neutral":
            return [2, sampleoutput[1]]
        case "Irrelevant":
            return None

def format_data(input_data):
    """Takes unformated data from training/validation, and formats it into a list

    Input: input_data -- a string, represeting one line from twitter_training/validation
    """
    data = next(csv.reader([input_data]))[2:]
    return samplestring_to_num(data)
    
def create_data(data_csv, max_samples=None):
    """Create data formatted for training and validation
    
    Inputs:
    data_csv -- a string, directory of a csv of training/validation data
    max_samples -- int, how many samples to include, default is all (useful for debugging)
    
    Output: (samples, outputs) samples -- a list of vectorized data, outputs -- the model output given
    the vectorized data
    """
    with open("archive/twitter_training.csv", 'r') as infile:
        training = infile.read().split("\n")[:max_samples]
        formatted = [format_data(data) for data in training]

        (outputs, samples) = ([], [])
        for data in formatted:
            if data == None:
                continue
            tokens = [token.vector for token in nlp(data[1]) if not token.is_space]
            if len(tokens) == 0:
                continue    
            outputs.append(data[0])
            samples.append(tokens)

        

    return (samples, outputs)

In [59]:
samples, outputs = create_data("archive/twitter_training.csv", max_samples=200)

In [93]:
class BiRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, num_outputs=3):
        super(BiRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size=self.input_size, hidden_size=self.hidden_size,\
        num_layers=self.num_layers, batch_first=True, bidirectional=True)

        self.fc = nn.Linear(hidden_size * 2, num_outputs)

    def forward(self, x):
        initial_hidden = torch.zeros(2 * self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(x.device)
        out, dummy = self.rnn(x, initial_hidden)
        out = out[:, -1, :]

        return self.fc(out)
        
model = BiRNN(input_size=300, hidden_size=32, num_layers=1, num_outputs=3)



def train(model, samples, outputs, epochs, device):
    """Trains a bidirectional RNN for a given amount of epochs"""
    model = model.to(device)
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    
    samples = pad_sequence([torch.tensor(sample, dtype=torch.float32) for sample in samples], batch_first=True).to(device)
    outputs = torch.tensor(outputs, dtype=torch.long).to(device)

    dataset = TensorDataset(samples, outputs)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
    
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for (sample, actual) in dataloader:
            sample = sample.to(device)
            actual = actual.to(device)
            
            optimizer.zero_grad()
            outputs= model(sample)
            output_loss = loss(outputs, actual)
            output_loss.backward()
            optimizer.step()
            epoch_loss += output_loss.item()
        
        if epoch % 5 == 0:
            print(format(f"Epoch {epoch} - Loss: {epoch_loss}",))

    print(format(f"Epoch {epochs} - Loss: {epoch_loss}",))
    return model


train(model, samples, outputs, 25, "cpu")

Epoch 0 - Loss: 13.011891484260559
Epoch 5 - Loss: 12.465294599533081
Epoch 10 - Loss: 12.721186339855194
Epoch 15 - Loss: 12.709371984004974
Epoch 20 - Loss: 12.485625863075256
Epoch 25 - Loss: 9.850455284118652


BiRNN(
  (rnn): RNN(300, 32, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=64, out_features=3, bias=True)
)