In [None]:
# Pre-Processing
import pandas as pd
import re
import emoji
import contractions

# Machine Learning
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

## Import Dataset

In [None]:
# Relative path to the test CSV (under the COVIDSenti-main data folder) that is
# loaded with pd.read_csv in a later cell.
test_data_fp = 'twitter_data/custom_data/COVIDSenti-main/test.csv'

In [None]:
# Load the test CSV into a DataFrame; later cells read its 'tweet' and 'label' columns.
test_data_df = pd.read_csv(test_data_fp)

In [None]:
# Evaluate on the first 1000 rows only (presumably to bound the per-tweet
# inference time below — adjust or remove this cell to score the full file).
test_data_df = test_data_df.head(1000)

In [None]:
# Reference labels, one per row, as a plain Python list
# (assumed to be class indices 0/1/2 — TODO confirm against the CSV).
test_sentiment_scores = test_data_df['label'].tolist()
# Tweet texts in the same row order; these are what the model scores.
pruned_docs = test_data_df['tweet'].tolist()

## Importing Model

In [None]:
# Load the tokenizer and the sequence-classification model from a local
# fine-tuning checkpoint directory.
checkpoint_dir = 'sentiment_model_results/checkpoint-1125'

# num_labels configures the classification head, so it belongs to the model only;
# the tokenizer has no such option and is loaded from the checkpoint as-is.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir, num_labels=3)

In [None]:
def analyze_sentiment(text):
    """Return softmax class probabilities for ``text``.

    Uses the notebook-level ``tokenizer`` and ``model``. The input is tokenized
    into PyTorch tensors, truncated to at most 280 tokens (``max_length`` counts
    tokens, not characters) and padded to the longest sequence in the call.

    Args:
        text: Input passed straight to the tokenizer (a single tweet string in
            this notebook).

    Returns:
        torch.Tensor: ``softmax(logits, dim=1)``; callers in this notebook read
        it as ``probabilities[0][class_index]``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=280)
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for every tweet that is scored.
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=1)
    return probabilities

In [None]:
def sentiment_score(values):
    """Return the position of the largest entry in a three-element list.

    Args:
        values: Sequence of exactly three numbers (one score per class).

    Returns:
        int: Index of the maximum value; the first one wins on ties.

    Raises:
        ValueError: If ``values`` does not contain exactly three items.
    """
    if len(values) == 3:
        # index() reports the first occurrence of the maximum.
        return values.index(max(values))
    raise ValueError("The input list must contain exactly three numbers.")

In [None]:
def closest_value(val):
    """Snap ``val`` to the nearest of the targets 0, 1 and 2.

    Args:
        val: Number to categorize.

    Returns:
        int: Whichever of 0, 1, 2 has the smallest absolute distance to
        ``val``; the lower target wins when two are equally close.
    """
    nearest = 0
    for candidate in (1, 2):
        # Strict comparison keeps the earlier (lower) target on a tie.
        if abs(val - candidate) < abs(val - nearest):
            nearest = candidate
    return nearest

In [None]:
# Class names ordered by class index: 0 = negative, 1 = neutral, 2 = positive.
# This is the same order in which the prediction loop reads the model's three
# probabilities. The list itself is not referenced by the cells shown here.
sentiments = ["negative", "neutral", "positive"]

In [None]:
# Score every tweet one at a time and collect the predicted class index.
test_sentiment_predictions = []
for doc in pruned_docs:
    # Single-tweet call, so row 0 of the result holds this tweet's probabilities.
    probabilities = analyze_sentiment(doc)

    # Columns 0, 1, 2 are read as negative, neutral and positive respectively.
    negative_score, neutral_score, positive_score = (
        probabilities[0][column].item() for column in range(3)
    )

    # Index of the most probable class.
    pred_sent_score = sentiment_score([negative_score, neutral_score, positive_score])
    # pred_sent_score is already one of 0/1/2, so closest_value returns it unchanged.
    categorized_sent_score = closest_value(pred_sent_score)
    test_sentiment_predictions.append(categorized_sent_score)

## Evaluation Methods

In [None]:
# Absolute gap between predicted class index and reference label, per tweet.
differences = [
    abs(predicted - expected)
    for predicted, expected in zip(test_sentiment_predictions, test_sentiment_scores)
]

# Number of tweets whose prediction equals the reference label exactly
# (a count of matches, not of differences).
count_equal = sum(
    1
    for predicted, expected in zip(test_sentiment_predictions, test_sentiment_scores)
    if predicted == expected
)

In [None]:
# Mean of the per-tweet absolute differences between predicted and reference
# class indices. Raises ZeroDivisionError if `differences` is empty.
difference_avg = sum(differences) / len(differences)

In [None]:
# Display the mean absolute difference (0 would mean every prediction matched).
difference_avg

In [None]:
# Display how many predictions matched their reference label exactly.
count_equal