In [207]:
import csv
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import spacy
import time
nlp = spacy.load("en_core_web_lg")

def samplestring_to_num(sampleoutput):
    """Takes a list (sample, output) where sample is "Positive", "Negative", "Neutral",
    or "Irrelevant", and returns the same list, but the aforementioned terms are
    replaced with 1, -1, 0, or None
    """
    match sampleoutput[0]:
        case "Positive":
            return [0, sampleoutput[1]]
        case "Negative":
            return [1, sampleoutput[1]]
        case "Neutral":
            return [2, sampleoutput[1]]
        case "Irrelevant":
            return None

def format_data(input_data):
    """Parse one CSV line and convert it into [class_index, text].

    Input: input_data -- a string, representing one line from
    twitter_training/validation. The first two columns are discarded; the
    third is read as the sentiment label and the fourth as the text.

    Output: whatever samplestring_to_num returns for [label, text], or None
    when the line is blank or has fewer than four columns.
    """
    data = next(csv.reader([input_data]))[2:]
    # Blank lines parse to [] and truncated lines lack a label or text;
    # neither can be turned into a sample.
    if len(data) < 2:
        return None
    return samplestring_to_num(data)
    
def create_data(data_csv, max_samples=None):
    """Create data formatted for training and validation

    Inputs:
    data_csv -- a string, path of a csv of training/validation data
    max_samples -- int, how many CSV records to read from the start of the
    file, default is all (useful for debugging). Records that are skipped
    (see below) still count towards this limit.

    Output: (samples, outputs) samples -- a list with one entry per kept record,
    each entry being the list of token vectors of that record's text,
    outputs -- the class index for each entry of samples, in the same order

    Records are skipped when they have fewer than four columns, when
    samplestring_to_num returns None for them, or when their text yields no
    non-whitespace tokens.
    """
    # Parse the whole file with the csv module (newline="" as the csv docs
    # require) so that quoted fields containing line breaks stay one record.
    with open(data_csv, 'r', newline="") as infile:
        rows = list(csv.reader(infile))[:max_samples]

    (outputs, samples) = ([], [])
    for row in rows:
        # Column 2 is the label and column 3 the text; shorter rows
        # (including blank lines) carry no usable sample.
        if len(row) < 4:
            continue
        data = samplestring_to_num(row[2:])
        if data is None:
            continue
        tokens = [token.vector for token in nlp(data[1]) if not token.is_space]
        if len(tokens) == 0:
            continue
        outputs.append(data[0])
        samples.append(tokens)

    return (samples, outputs)

In [237]:
# Build the (samples, outputs) lists used for training; max_samples=400
# limits how much of the CSV is read so the cell stays quick to run.
samples, outputs = create_data(
    "archive/twitter_training.csv",
    max_samples=400,
)

In [256]:
class BiRNN(nn.Module):
    """Bidirectional RNN followed by a linear layer that emits class logits.

    Inputs to __init__:
    input_size -- int, length of each per-token feature vector
    hidden_size -- int, number of hidden units per direction
    num_layers -- int, number of stacked recurrent layers
    num_outputs -- int, number of logits produced per sequence
    """

    def __init__(self, input_size, hidden_size, num_layers = 1, num_outputs= 3 ):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_outputs = num_outputs
        self.bidirectional = True
        # Pass num_layers through instead of pinning the RNN to one layer.
        self.rnn = nn.RNN(input_size, hidden_size, num_layers = num_layers, bidirectional = True)
        # Both directions are concatenated, hence hidden_size * 2 features.
        self.fc = nn.Linear(hidden_size * 2, num_outputs)

    def forward(self, x):
        """Return logits of shape (batch, num_outputs), in the same order as x.

        x -- a list of float tensors, each (seq_len, input_size) and possibly of
        different lengths, or a single (batch, seq_len, input_size) tensor.
        Every sequence must contain at least one timestep.
        """
        lengths = torch.tensor([len(seq) for seq in x]) #Get lengths of inputs
        padded_sequence = pad_sequence(x, batch_first = True)
        # enforce_sorted=False lets PyTorch sort by length internally and undo
        # that sort when unpacking, so row i of the result belongs to x[i].
        packed_sequence = pack_padded_sequence(
            padded_sequence, lengths, batch_first = True, enforce_sorted = False
        )
        output, _hidden = self.rnn(packed_sequence)
        output, _lengths = pad_packed_sequence(output, batch_first = True)

        batch_indices = torch.arange(output.size(0), device = output.device)
        last_timesteps = (lengths - 1).to(output.device)

        # NOTE(review): this reads both directions at each sequence's final real
        # timestep. At that position the backward direction has only processed
        # the last token; concatenating the final hidden states of the two
        # directions may be what is intended -- confirm before changing, since
        # it alters what a trained model computes.
        return self.fc(output[batch_indices, last_timesteps, :])
    

        
# input_size has to equal the length of the token vectors the samples are
# built from (presumably 300 for the loaded spaCy model -- verify).
model = BiRNN(
    input_size=300,
    hidden_size=32,
    num_layers=1,
    num_outputs=3,
)



def train(model, samples, outputs, epochs, device):
    """Train model one sample at a time with Adam and cross-entropy loss.

    Inputs:
    model -- an nn.Module; it is called with a float32 tensor of shape
    (1, len(sample), feature_size) and must return logits of shape (1, num_classes)
    samples -- a list where each entry is a sequence of equal-length feature vectors
    outputs -- a list of integer class indices, parallel to samples
    epochs -- int, number of passes over the data
    device -- torch device (or device string) the model and tensors are moved to

    Output: the same model object, trained in place. If samples is empty the
    model is returned untouched.

    Prints the mean loss every fifth epoch and, during the second epoch, a
    rough estimate of the total run time (only when there are more than 100 samples).
    """
    sampleslen = len(samples)
    if sampleslen == 0:
        # Nothing to learn from, and the mean loss would divide by zero.
        return model

    model = model.to(device)
    model.train()
    loss_func = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    for epoch in range(epochs):
        overallloss = 0
        for sampleid, (sample, actual) in enumerate(zip(samples, outputs)):
            if sampleid == 1 and epoch == 1:
                timea = time.time()
            optimizer.zero_grad() #Clear previous gradients
            # Stack the vectors into one array first; building a tensor straight
            # from a Python list of arrays is much slower.
            features = torch.as_tensor(
                np.asarray(sample), dtype=torch.float32, device=device
            ).unsqueeze(0)
            target = torch.tensor(actual, device=device).unsqueeze(0)
            loss = loss_func(model(features), target)
            overallloss += loss.item()
            loss.backward()
            optimizer.step()
            if sampleid == 100 and epoch == 1:
                # Time for ~100 samples, scaled to the full set and all epochs.
                overalltime = time.time() - timea
                print(f"ETA: {(overalltime * (sampleslen/100)) * epochs} seconds")
        if epoch % 5 == 0:
            print(f"Epoch {epoch}: Loss = {overallloss/sampleslen}")

    return model

# Fit the model on the loaded samples: 100 passes over the data, on the CPU.
model = train(
    model,
    samples,
    outputs,
    epochs=100,
    device="cpu",
)



Epoch 0: Loss = 1.0710766715120201
ETA: 66.01144409179686 seconds
Epoch 5: Loss = 0.06395531084087837
Epoch 10: Loss = 0.0033793560277564803
Epoch 15: Loss = 0.0009574272833647445
Epoch 20: Loss = 0.00033566628063512715
Epoch 25: Loss = 0.00012603269680521126
Epoch 30: Loss = 4.876349944929618e-05
Epoch 35: Loss = 1.9516887287253786e-05
Epoch 40: Loss = 0.001310447652724072
Epoch 45: Loss = 0.0004050917894689014
Epoch 50: Loss = 0.00016904721634714451
Epoch 55: Loss = 7.404154975286348e-05
Epoch 60: Loss = 3.275602297004614e-05
Epoch 65: Loss = 1.4265907269373775e-05
Epoch 70: Loss = 6.075912933473326e-06
Epoch 75: Loss = 2.5464550421584225e-06
Epoch 80: Loss = 1.0561849603435502e-06
Epoch 85: Loss = 4.2775054791592396e-07
Epoch 90: Loss = 1.6929715948211652e-07
Epoch 95: Loss = 6.210903076747172e-08


In [254]:
# Run the trained model on a single piece of raw text and report its prediction.
# NOTE(review): preprocess_text is not defined in any cell visible here; this
# cell only works if it already exists in the kernel -- confirm where it lives.
input_text = "For some reason, my laptop can run Borderlands 3, War thunder and Warframe all on high settings.. . BUT. . The moment I even try to even launch Csgo my entire computer crashes..."
input_tensor = preprocess_text(input_text)

model.eval()
with torch.no_grad():
    # No gradients are needed for inference.
    logits = model(input_tensor)
    probs = logits.softmax(dim=1)
    predicted_class = probs.argmax(dim=1).item()

print(f"Logits: {logits}")
print(f"Probabilities: {probs}")
print(f"Predicted class index: {predicted_class}")

Logits: tensor([[-10.1431,  -1.0854,  10.3965]])
Probabilities: tensor([[1.2017e-09, 1.0315e-05, 9.9999e-01]])
Predicted class index: 2
