In [10]:
import csv
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
# Load the `en_core_web_lg` spaCy pipeline once at module level; create_data()
# uses it to tokenize each tweet and read the per-token `.vector` arrays.
nlp = spacy.load("en_core_web_lg")

def samplestring_to_num(sampleoutput):
    """Convert a ``[label, text]`` pair into ``[score, text]``.

    Input:
    sampleoutput -- a sequence whose first item is the sentiment label
    ("Positive", "Negative", "Neutral" or "Irrelevant") and whose second item
    is the tweet text.

    Output: ``[1, text]``, ``[-1, text]`` or ``[0, text]`` for Positive,
    Negative and Neutral respectively. Returns None for "Irrelevant", for any
    unrecognised label, and for input with fewer than two items (which is what
    a blank or truncated CSV line produces).
    """
    # Guard first: indexing a short list would raise IndexError and abort the
    # whole load because of a single malformed line.
    if sampleoutput is None or len(sampleoutput) < 2:
        return None

    label, text = sampleoutput[0], sampleoutput[1]
    label_scores = {"Positive": 1, "Negative": -1, "Neutral": 0}

    # "Irrelevant" is deliberately absent from the table so it maps to None.
    score = label_scores.get(label)
    if score is None:
        return None
    return [score, text]

def format_data(input_data):
    """Parse one raw CSV line and convert its label/text columns.

    Input: input_data -- a string, representing one line from
    twitter_training/validation. The first two columns are dropped; the third
    is treated as the sentiment label and the fourth as the tweet text.

    Output: whatever samplestring_to_num returns for ``[label, text]``, or
    None when the line is blank or has fewer than four columns.
    """
    # csv.reader copes with quoted commas inside the tweet text; the default
    # keeps next() from raising if the reader yields nothing.
    row = next(csv.reader([input_data]), [])

    # A blank or truncated line cannot supply both label and text, so skip it
    # instead of letting the conversion step fail on a missing index.
    if len(row) < 4:
        return None
    return samplestring_to_num(row[2:])
    
def create_data(data_csv, max_samples=None):
    """Create data formatted for training and validation.

    Inputs:
    data_csv -- a string, path to a csv of training/validation data
    max_samples -- int or None, how many CSV records to read from the start of
    the file; default is all (useful for debugging). Records labelled
    "Irrelevant" still count towards this limit but are dropped from the result.

    Output: (samples, outputs)
    samples -- a list with one entry per kept tweet; each entry is a list of
    the spaCy vectors of that tweet's tokens
    outputs -- a list of the numeric labels (1, -1 or 0), aligned with samples
    """
    # newline="" hands line-ending handling to the csv module, so a quoted
    # tweet that spans several physical lines is read as ONE record rather than
    # being cut apart by a naive split on "\n".
    with open(data_csv, "r", newline="") as infile:
        rows = list(csv.reader(infile))[:max_samples]

    labelled = []
    for row in rows:
        # Blank lines (e.g. the trailing newline) and truncated rows have no
        # label/text columns; skip them instead of crashing.
        if len(row) < 4:
            continue
        converted = samplestring_to_num(row[2:])
        if converted is not None:
            labelled.append(converted)

    outputs = [label for label, _ in labelled]
    texts = [text for _, text in labelled]

    # nlp.pipe processes the texts in batches and yields Docs in input order,
    # which is faster than calling nlp() once per tweet.
    samples = [[token.vector for token in doc] for doc in nlp.pipe(texts)]

    return (samples, outputs)

In [64]:
# Smoke test: max_samples=1 limits the load to the first entry of the file.
samples, output = create_data("archive/twitter_training.csv", max_samples=1)

# Display the vector of the first token in the first sample.
samples[0][0]

array([ 1.8733e-01,  4.0595e-01, -5.1174e-01, -5.5482e-01,  3.9716e-02,
        1.2887e-01,  4.5137e-01, -5.9149e-01,  1.5591e-01,  1.5137e+00,
       -8.7020e-01,  5.0672e-02,  1.5211e-01, -1.9183e-01,  1.1181e-01,
        1.2131e-01, -2.7212e-01,  1.6203e+00, -2.4884e-01,  1.4060e-01,
        3.3099e-01, -1.8061e-02,  1.5244e-01, -2.6943e-01, -2.7833e-01,
       -5.2123e-02, -4.8149e-01, -5.1839e-01,  8.6262e-02,  3.0818e-02,
       -2.1253e-01, -1.1378e-01, -2.2384e-01,  1.8262e-01, -3.4541e-01,
        8.2611e-02,  1.0024e-01, -7.9550e-02, -8.1721e-01,  6.5621e-03,
        8.0134e-02, -3.9976e-01, -6.3131e-02,  3.2260e-01, -3.1625e-02,
        4.3056e-01, -2.7270e-01, -7.6020e-02,  1.0293e-01, -8.8653e-02,
       -2.9087e-01, -4.7214e-02,  4.6036e-02, -1.7788e-02,  6.4990e-02,
        8.8451e-02, -3.1574e-01, -5.8522e-01,  2.2295e-01, -5.2785e-02,
       -5.5981e-01, -3.9580e-01, -7.9849e-02, -1.0933e-02, -4.1722e-02,
       -5.5576e-01,  8.8707e-02,  1.3710e-01, -2.9873e-03, -2.62

In [106]:
class BiRNN(nn.Module):
    """Bidirectional RNN encoder followed by a linear classification head.

    Args:
        input_size: size of each input vector (one per time step).
        hidden_size: size of the hidden state of each direction.
        num_layers: number of stacked recurrent layers.
        num_outputs: number of values produced per sequence.

    forward() takes a batch-first tensor of shape (batch, seq_len, input_size)
    and returns a tensor of shape (batch, num_outputs).
    """

    def __init__(self, input_size, hidden_size, num_layers=1, num_outputs=3):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(
            input_size=self.input_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=True,
        )

        # Forward and backward states are concatenated, hence hidden_size * 2.
        self.fc = nn.Linear(hidden_size * 2, num_outputs)

    def forward(self, x):
        """Encode a batch of sequences and map each one to num_outputs values."""
        # No explicit h0: nn.RNN defaults to a zero initial state that matches
        # the input, whereas a hand-built torch.zeros() is always CPU/float32
        # and fails once the model is moved to another device or dtype.
        _, final_hidden = self.rnn(x)

        # final_hidden is (num_layers * 2, batch, hidden_size); the last two
        # entries are the top layer's forward and backward final states. The
        # backward state here has read the whole sequence, unlike the backward
        # half of out[:, -1, :], which has only seen the last time step.
        summary = torch.cat((final_hidden[-2], final_hidden[-1]), dim=-1)

        return self.fc(summary)
        
# input_size=300 must match the length of the spaCy token vectors in `samples`.
model = BiRNN(input_size=300, hidden_size=32, num_layers=1, num_outputs=3)

# samples[0] is a list of per-token numpy arrays. Stack it into a single
# (seq_len, 300) array first: torch.tensor() on a list of ndarrays is very slow
# and makes PyTorch emit a warning. unsqueeze(0) adds the batch dimension.
model(torch.tensor(np.stack(samples[0]), dtype=torch.float32).unsqueeze(0))

tensor([[-0.0144, -0.0015, -0.2548]], grad_fn=<AddmmBackward0>)