In [98]:
# Pre-Processing
import pandas as pd
import re
import emoji
import contractions

# Machine Learning
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

## Import Datasets

In [99]:
# Filepaths
# Relative path to the tweet CSV that is read with pd.read_csv below
tweet_data_fp = 'twitter_data/custom_data/pruned_medicine_and_research_users.csv'

In [100]:
# Read the CSV and keep only the rows whose 'Lang_code' column equals 'en'
tweet_data_df = (
    pd.read_csv(tweet_data_fp)
    .loc[lambda frame: frame['Lang_code'] == 'en']
)

## Preprocessing

In [101]:
# Collect the 'Tweet' column as a plain list.
# `pruned_docs` starts out as the very same list object as `docs`; the cleaning
# steps that follow rebind `pruned_docs` to new lists rather than editing it in place.
pruned_docs = docs = tweet_data_df['Tweet'].tolist()

In [102]:
# Link stripping
def remove_urls(doc):
    """Return `doc` with every 'http...' run (up to the next whitespace) deleted."""
    link_pattern = re.compile(r'http\S+')
    return link_pattern.sub('', doc)

# Strip links from every document
pruned_docs = list(map(remove_urls, pruned_docs))

def convert_emojis(doc):
    """Return `doc` with its emojis removed.

    Despite the name, nothing is converted to text: every emoji is replaced
    with the empty string.
    """
    without_emojis = emoji.replace_emoji(doc, replace='')
    return without_emojis

# Drop emojis from every document
pruned_docs = list(map(convert_emojis, pruned_docs))

def remove_hashtags(doc):
    """Return `doc` with every '#' character deleted.

    Only the symbol is removed; the tag's word stays in the text. (Dropping the
    whole tag with r'#\\w+' was tried by the author and left disabled.)
    """
    return ''.join(doc.split('#'))

# Remove the '#' symbol from every document
pruned_docs = list(map(remove_hashtags, pruned_docs))

def remove_numbers(doc):
    """Return `doc` with every run of digits deleted; surrounding spaces are left as they are."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', doc)

# Delete digits from every document
pruned_docs = list(map(remove_numbers, pruned_docs))

def remove_user_mentions(doc):
    """Return `doc` with every '@' followed by word characters deleted."""
    mention = re.compile(r'@\w+')
    return mention.sub('', doc)

# Remove @mentions from every document
pruned_docs = list(map(remove_user_mentions, pruned_docs))

def fix_contractions(doc):
    """Return the result of passing `doc` through `contractions.fix`."""
    expanded = contractions.fix(doc)
    return expanded

# Run the contraction fixer over every document
pruned_docs = list(map(fix_contractions, pruned_docs))

def remove_punctuation(doc):
    """Return `doc` keeping only word characters and whitespace.

    Underscores survive because the pattern's `\\w` class includes them.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', doc)

# Strip punctuation from every document
pruned_docs = list(map(remove_punctuation, pruned_docs))

def remove_amp(doc):
    """Return `doc` without the standalone word 'amp', trimmed of outer whitespace.

    Any whole-word 'amp' is removed, while words merely containing it
    (e.g. 'ramp') are kept.
    NOTE(review): presumably 'amp' is what is left of an HTML '&amp;' entity
    once punctuation has been stripped — confirm.
    """
    without_amp = re.compile(r'\bamp\b').sub('', doc)
    # Only leading/trailing whitespace is trimmed; gaps inside the text are not collapsed
    return without_amp.strip()

# Remove leftover 'amp' words from every document
pruned_docs = list(map(remove_amp, pruned_docs))

## Importing BERT

In [103]:
# Scores text with the loaded sentiment model:
# - tokenizes the text into PyTorch tensors
# - truncates to at most 280 tokens (the limit counts tokens, not characters) and pads batches
def analyze_sentiment(text):
    """Return softmax class probabilities for `text`.

    Relies on the notebook-level `tokenizer` and `model` objects, so the cell
    that loads them must have been run before this function is called.

    Args:
        text: The text to score. The prediction loop passes one string at a time.

    Returns:
        A tensor of probabilities, obtained by applying softmax over dimension 1
        of the model's logits; the prediction loop reads it as
        `probabilities[0][i]`. The tensor carries no autograd history because
        the forward pass runs under `torch.no_grad()`.
    """
    # Never request more tokens than the tokenizer says the model accepts. When the
    # tokenizer advertises no real limit (Transformers then reports a very large
    # sentinel value) this stays at 280, i.e. the previous behaviour.
    # NOTE(review): the checkpoint's actual positional limit is not visible here — confirm.
    max_length = min(280, tokenizer.model_max_length)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    # Inference only: skip gradient tracking so no autograd graph is built on every call
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=1)
    return probabilities

In [104]:
# Load the tokenizer and the sequence-classification model from one checkpoint:
# a BERTweet-based sentiment classifier.
# Checkpoint page: https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis
# Base model (BERTweet): https://github.com/VinAIResearch/BERTweet?tab=readme-ov-file
# Per the author's notes, the checkpoint was trained with the SemEval 2017 corpus
# (around ~40k tweets) to refine sentiment.
MODEL_CHECKPOINT = 'finiteautomata/bertweet-base-sentiment-analysis'
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

In [105]:
# Sentiments list
# Order matters: the prediction loop pairs entry i with probabilities[0][i].
# NOTE(review): assumes the model's class indices follow this negative/neutral/positive order — confirm against the model config
sentiments = ["negative", "neutral", "positive"]

In [106]:
# Score each cleaned tweet: print its text, then one "label: probability" line
# per sentiment class, then a blank separator line
for doc in pruned_docs:
    print(doc)
    probabilities = analyze_sentiment(doc)
    # Convert the first (only) row to plain Python floats once instead of per class
    class_scores = probabilities[0].tolist()
    for idx, label in enumerate(sentiments):
        print(f"{label}: {class_scores[idx]:.4f}")
    print()

Speaking of Kids and infections
negative: 0.3198
neutral: 0.6688
positive: 0.0114

In practice by announcing a  price tag notably during the deepest global recession since the second World War Nature has failed to enact their own diversity and inclusion pledge

Nailed it Dr Pai
negative: 0.7584
neutral: 0.2286
positive: 0.0130

Awesome
negative: 0.0038
neutral: 0.0216
positive: 0.9746

Good
negative: 0.0048
neutral: 0.1105
positive: 0.8846

Out with a whimper
negative: 0.3935
neutral: 0.5937
positive: 0.0128

Steady Tuesday twitterverse
negative: 0.0037
neutral: 0.7009
positive: 0.2954

One recent study suggested that the Families First Coronavirus Response Act FFCRA which granted some workers paid sick leave or expanded family and medical leave due to COVID helped reduce the spread of the virus

Learn more
negative: 0.0061
neutral: 0.5886
positive: 0.4053

Anyone looking for poster ideas to encourage vaccination in your hospital or clinic
negative: 0.0068
neutral: 0.9390
positive: 0.0

KeyboardInterrupt: 