In [323]:
# Pre-Processing
import pandas as pd
import re
import emoji
import contractions

# Machine Learning
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

## Import Dataset

In [324]:
test_data_fp = 'twitter_data/custom_data/COVIDSenti-main/COVIDSenti-A.csv'

In [325]:
test_data_df = pd.read_csv(test_data_fp)

In [326]:
test_data_df = test_data_df.head(500)

In [327]:
test_data_df

Unnamed: 0,tweet,label
0,Coronavirus | Human Coronavirus Types | CDC ht...,neu
1,"@shehryar_taseer That‚Äôs üíØ true , \nCorona...",neu
2,"TLDR: Not SARS, possibly new coronavirus. Diff...",neg
3,Disease outbreak news from the WHO: Middle Eas...,neu
4,China - Media: WSJ says sources tell them myst...,neu
...,...,...
495,@realDonaldTrump is the American coronavirus o...,neu
496,New coronavirus not yet a global health concer...,neu
497,"Wuhan, China, shutting down transportation to ...",neg
498,@DomenickBeskos Wuhan population 11 million Ch...,neg


## Preprocessing

In [328]:
# Gathering documents
docs = test_data_df['tweet'].tolist()
pruned_docs = docs

In [329]:
# Removal of links
def remove_urls(doc):
    return re.sub(r'http\S+', '', doc)

pruned_docs = [remove_urls(doc) for doc in pruned_docs]

def convert_emojis(doc):
    # delimiters are what is used around the emoji description, in this case spaces are used
    return emoji.replace_emoji(doc, replace='')

pruned_docs = [convert_emojis(doc) for doc in pruned_docs]

def remove_hashtags(doc):
    return doc.replace('#', '')
    #return re.sub(r'#\w+', '', doc)

pruned_docs = [remove_hashtags(doc) for doc in pruned_docs]

def remove_numbers(doc):
    return re.sub(r'\d+', '', doc)

pruned_docs = [remove_numbers(doc) for doc in pruned_docs]

def remove_user_mentions(doc):
    return re.sub(r'@\w+', '', doc)

pruned_docs = [remove_user_mentions(doc) for doc in pruned_docs]

def fix_contractions(doc):
    return contractions.fix(doc)

pruned_docs = [fix_contractions(doc) for doc in pruned_docs]

def remove_punctuation(doc):
    return re.sub(r'[^\w\s]', '', doc)

pruned_docs = [remove_punctuation(doc) for doc in pruned_docs]

def remove_amp(doc):
    return re.sub(r'\bamp\b', '', doc).strip() # strip removes the surrounding white space

pruned_docs = [remove_amp(doc) for doc in pruned_docs]

def remove_special_character_combinations(doc):
    # Remove all combinations of \r and \n in any order
    cleaned_text = re.sub(r'[\r\n\xa0]+', '', doc)
    return cleaned_text
    
pruned_docs = [remove_special_character_combinations(doc) for doc in pruned_docs]

def remove_non_english_characters(doc):
    return re.sub(r'[^\x00-\x7F]+', '', doc)

pruned_docs = [remove_non_english_characters(doc) for doc in pruned_docs]

In [330]:
pruned_docs

['Coronavirus  Human Coronavirus Types  CDC',
 'Thats  true  Corona virus swine flue Bird flu in December when whole Pk is busy in Marriage',
 'TLDR Not SARS possibly new coronavirus Difficult to confirm because patients identified later in infection when',
 'Disease outbreak news from the WHO Middle East respiratory syndrome coronavirus MERSCoV  The United Arab Emira',
 'China  Media WSJ says sources tell them mystery pneumonia is a new coronavirus  something that has been speculat',
 'The mystery new virus causing China pneumonia outbreak is possibly new coronavirus same family as sars and mers',
 'Virologists weigh in on novel coronavirus in Chinas outbreak',
 'Chinese authorities have made a preliminary determination of a novel or new coronavirus identified in a hospit',
 'Why CCP keep on saying unknown because of pneumonia The because is obviously related to corona virus Let us',
 'Chinese report says mysterious illnesses may be from new coronavirus',
 'China identifies new strain

## Setting Sentiment Scores

In [331]:
# Used to get sentiment score for the predictions
def sentiment_score(negative, neutral, positive):
    # Finds positive/negative difference returning a value between -1 to 1
    score = positive - negative
    # This is normalized to the range 0 to 1 because it's simpler to divide the sentiments into 5 parts between 0 and 1
    normalized_score = (score + 1) / 2
    # Round the score to 4 decimal places
    normalized_score = round(normalized_score, 4)
    return normalized_score

In [332]:
possible_sentiment_values = [0.25, 0.5, 0.75]

# Round values to sentiment values
def closest_value(sent_score):
    return min(possible_sentiment_values, key=lambda x: abs(sent_score - x))

In [333]:
# Used to get categorize sentiment score for the training data
def categorize_sentiment(score):
    if score == "neg":
        return 0.25
    elif score == "neg":
        return 0.25
    elif score == "neu":
        return 0.5
    elif score == "pos":
        return 0.75
    else:
        return 0.75

In [334]:
# Test Data Sentiment Score List
test_sentiment_scores = []
test_sentiments = test_data_df['label'].to_list()

for sentiment in test_sentiments:
    sent_score = categorize_sentiment(sentiment)
    test_sentiment_scores.append(sent_score)

In [335]:
len(test_sentiments)

500

## Performing BERT Sentiment Analysis

In [336]:
# Function used to convert the texts to what is needed:
# - turn the text into tensors
# - truncate and pad the tweets to 280 characters
def analyze_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=280)
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    return probabilities

In [337]:
# Loads BERT tokenizer and model from a BERT model pre-trained on emotion dataset
# Found here: https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis
# This was based off of BERTweet which can be found here: https://github.com/VinAIResearch/BERTweet?tab=readme-ov-file
# The model was then trained with SemEval 2017 corpus (around ~40k tweets) to refine sentiment
tokenizer = AutoTokenizer.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')
model = AutoModelForSequenceClassification.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')

In [338]:
# Sentiments list
sentiments = ["negative", "neutral", "positive"]

In [339]:
# Predicting sentiment for the individual tweets
test_sentiment_predictions = []
for doc in pruned_docs:
    # Getting probabilities
    probabilities = analyze_sentiment(doc)

    # Saving the probabilities to respective variables
    negative_score = probabilities[0][0].item()
    neutral_score = probabilities[0][1].item()
    positive_score = probabilities[0][2].item()
    
    # Getting an overall sentiment score
    pred_sent_score = sentiment_score(negative_score, neutral_score, positive_score)
    categorized_sent_score = closest_value(pred_sent_score)
    test_sentiment_predictions.append(categorized_sent_score)

## Evaluation Methods

In [340]:
test_sentiment_predictions

[0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.25,
 0.5,
 0.25,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.25,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.75,
 0.25,
 0.25,
 0.25,
 0.25,
 0.25,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.25,
 0.5,
 0.25,
 0.25,
 0.5,
 0.25,
 0.25,
 0.5,
 0.5,
 0.25,
 0.25,
 0.25,
 0.25,
 0.5,
 0.25,
 0.5,
 0.25,
 0.5,
 0.5,
 0.25,
 0.25,
 0.5,
 0.5,
 0.25,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.25,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.25,
 0.25,
 0.25,
 0.5,
 0.25,
 0.25,
 0.5,
 0.75,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.25,
 0.5,
 0.25,
 0.5,
 0.25,
 0.25,
 0.25,
 0.5,
 0.25,
 0.5,
 0.5,
 0.25,
 0.5,
 0.25,
 0.5,
 0.25,
 0.2

In [341]:
test_sentiment_scores

[0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.75,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.75,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.75,
 0.5,
 0.75,
 0.5,
 0.5,
 0.25,
 0.25,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.75,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.25,
 0.5,
 0.25,
 0.25,
 0.5,
 0.5,
 0.25,
 0.25,
 0.5,
 0.5,
 0.5,
 0.75,
 0.5,
 0.75,
 0.5,
 0.25,
 0.75

In [342]:
# Calculates the list of absolute differences
differences = [abs(a - b) for a, b in zip(test_sentiment_predictions, test_sentiment_scores)]

# Calculates the total differences
count_equal = sum(a == b for a, b in zip(test_sentiment_predictions, test_sentiment_scores))

In [343]:
difference_avg = sum(differences) / len(differences)

In [344]:
difference_avg

0.1045

In [345]:
count_equal

301