In [50]:
# Pre-Processing
import pandas as pd
import re
import emoji
import contractions

# Machine Learning
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

## Import Datasets

In [51]:
# Filepaths
tweet_data_fp = 'twitter_data/custom_data/pruned_media_users.csv'

In [52]:
tweet_data_df = pd.read_csv(tweet_data_fp)
tweet_data_df = tweet_data_df[tweet_data_df['Lang_code'] == 'en']

## Preprocessing

In [53]:
# Gathering documents
docs = tweet_data_df['Tweet'].tolist()
pruned_docs = docs

In [54]:
# Removal of links
def remove_urls(doc):
    return re.sub(r'http\S+', '', doc)

pruned_docs = [remove_urls(doc) for doc in pruned_docs]

def convert_emojis(doc):
    # delimiters are what is used around the emoji description, in this case spaces are used
    return emoji.replace_emoji(doc, replace='')

pruned_docs = [convert_emojis(doc) for doc in pruned_docs]

def remove_hashtags(doc):
    return doc.replace('#', '')
    #return re.sub(r'#\w+', '', doc)

pruned_docs = [remove_hashtags(doc) for doc in pruned_docs]

def remove_numbers(doc):
    return re.sub(r'\d+', '', doc)

pruned_docs = [remove_numbers(doc) for doc in pruned_docs]

def remove_user_mentions(doc):
    return re.sub(r'@\w+', '', doc)

pruned_docs = [remove_user_mentions(doc) for doc in pruned_docs]

def fix_contractions(doc):
    return contractions.fix(doc)

pruned_docs = [fix_contractions(doc) for doc in pruned_docs]

def remove_punctuation(doc):
    return re.sub(r'[^\w\s]', '', doc)

pruned_docs = [remove_punctuation(doc) for doc in pruned_docs]

def remove_amp(doc):
    return re.sub(r'\bamp\b', '', doc).strip() # strip removes the surrounding white space

pruned_docs = [remove_amp(doc) for doc in pruned_docs]

## Importing BERT Sentiment Analysis

In [55]:
# Function used to convert the texts to what is needed:
# - turn the text into tensors
# - truncate and pad the tweets to 280 characters
def analyze_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=280)
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    return probabilities

In [56]:
# Loads BERT tokenizer and model from a BERT model pre-trained on emotion dataset
# Found here: https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis
# This was based off of BERTweet which can be found here: https://github.com/VinAIResearch/BERTweet?tab=readme-ov-file
# The model was then trained with SemEval 2017 corpus (around ~40k tweets) to refine sentiment
tokenizer = AutoTokenizer.from_pretrained('sentiment_model_results/checkpoint-1125')
model = AutoModelForSequenceClassification.from_pretrained('sentiment_model_results/checkpoint-1125')

In [57]:
# Sentiments list
sentiments = ["negative", "neutral", "positive"]

In [58]:
# Predicting sentiment for the individual tweets
for doc in pruned_docs:
    print(doc)
    probabilities = analyze_sentiment(doc)
    for i, label in enumerate(sentiments):
        print(f"{label}: {probabilities[0][i].item():.4f}")
    print()

Through this campaign we wanted to challenge misinformation dispel fear and support the health workers and others at the forefront of the fight against the virus Ram Devineni the IndianAmerican creator of the Priya comic series said
negative: 0.1185
neutral: 0.8663
positive: 0.0152

Interesting news story Leaked documents reveal Chinas mishandling of the early stages of Covid
 pages of leaked documents from the Hubei Provincial Center for Disease Control and Prevention shared with and verified by CNN
negative: 0.0034
neutral: 0.0193
positive: 0.9773

These findings suggest that SARSCoV may have been introduced into the United States prior to January   Serologic testing of YOUS blood donations to identify SARSCoVreactive antibodies Dec Jan   Clinical Infectious Diseases  Oxford
negative: 0.2633
neutral: 0.7247
positive: 0.0120

Interesting perspective from a respected vaccine expert Dr Paul Offit
negative: 0.0027
neutral: 0.0121
positive: 0.9852

Gov Hogan Worst part of this entire cris

KeyboardInterrupt: 

In [None]:
Through this campaign we wanted to challenge misinformation dispel fear and support the health workers and others at the forefront of the fight against the virus Ram Devineni the IndianAmerican creator of the Priya comic series said
negative: 0.0044
neutral: 0.3677
positive: 0.6280

Interesting news story Leaked documents reveal Chinas mishandling of the early stages of Covid
 pages of leaked documents from the Hubei Provincial Center for Disease Control and Prevention shared with and verified by CNN
negative: 0.1531
neutral: 0.8284
positive: 0.0185

These findings suggest that SARSCoV may have been introduced into the United States prior to January   Serologic testing of YOUS blood donations to identify SARSCoVreactive antibodies Dec Jan   Clinical Infectious Diseases  Oxford
negative: 0.0144
neutral: 0.9644
positive: 0.0212

Interesting perspective from a respected vaccine expert Dr Paul Offit
negative: 0.0015
neutral: 0.1047
positive: 0.8938

Gov Hogan Worst part of this entire crisis is still ahead of us in Maryland  Maryland hospitals now have a week to get a plan ready for the looming surge
negative: 0.9605
neutral: 0.0370
positive: 0.0025

FBI warns of new coronavirus email autoforwarding scam
negative: 0.8956
neutral: 0.1008
positive: 0.0037

Austin mayor stressed residents need to stay home He was vacationing in Cabo at the time SmartNews
negative: 0.4334
neutral: 0.5604
positive: 0.0062

The abrupt halt in world trade and tourism and the impact of lockdowns on international migration and remittances dealt a ruinous blow
negative: 0.9611
neutral: 0.0364
positive: 0.0025

Coronavirus Hackers targeted Covid vaccine supply cold chain phishing emails were sent out across six countries which targeted organisations linked to Cold Chain Equipment Optimisation Platform of Gavi the international vaccine alliance BBC News
negative: 0.8842
neutral: 0.1123
positive: 0.0035

Many clinicians worry about the toll that widespread public doubts and misinformation about the coronavirus are taking on their institutions overall ability to provide medical care
negative: 0.7934
neutral: 0.2007
positive: 0.0059
