## Import Libraries

In [21]:
# Pre-Processing
import pandas as pd
import re
import emoji
import contractions
from sklearn.model_selection import train_test_split

# Machine Learning
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

## Import Models

In [22]:
# Sentiment Model
sentiment_model_dir = '../models/sentiment_emotion_models/sentiment_pn_model'

sentiment_model_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_dir, num_labels=2)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_dir, num_labels=2)

# Official Reporting Model
reporting_model_dir = '../models/sentiment_emotion_models/official_report_emotion_model'

reporting_model_tokenizer = AutoTokenizer.from_pretrained(reporting_model_dir, num_labels=2)
reporting_model = AutoModelForSequenceClassification.from_pretrained(reporting_model_dir, num_labels=2)

# Joyful Emotion Model
joyful_model_dir = '../models/sentiment_emotion_models/joyful_emotion_model'

joyful_model_tokenizer = AutoTokenizer.from_pretrained(joyful_model_dir, num_labels=2)
joyful_model = AutoModelForSequenceClassification.from_pretrained(joyful_model_dir, num_labels=2)

# Love Emotion Model
love_model_dir = '../models/sentiment_emotion_models/love_emotion_model'

love_model_tokenizer = AutoTokenizer.from_pretrained(love_model_dir, num_labels=2)
love_model = AutoModelForSequenceClassification.from_pretrained(love_model_dir, num_labels=2)

# Anger Emotion Model
anger_model_dir = '../models/sentiment_emotion_models/anger_emotion_model'

anger_model_tokenizer = AutoTokenizer.from_pretrained(anger_model_dir, num_labels=2)
anger_model = AutoModelForSequenceClassification.from_pretrained(anger_model_dir, num_labels=2)

# Sadness Emotion Model
sadness_model_dir = '../models/sentiment_emotion_models/sadness_emotion_model'

sadness_model_tokenizer = AutoTokenizer.from_pretrained(sadness_model_dir, num_labels=2)
sadness_model = AutoModelForSequenceClassification.from_pretrained(sadness_model_dir, num_labels=2)

## Import Data

In [23]:
# Data for sentiment analysis
sentiment_analysis_dataset = '../twitter_data/custom_data/full_covid_sent.csv'
sentiment_analysis_df = pd.read_csv(sentiment_analysis_dataset)

# Data for official report
official_report_dataset = '../twitter_data/custom_data/official_report_dataset.csv'
official_report_df = pd.read_csv(official_report_dataset)

# Data for joyful emotion
joyful_dataset = '../twitter_data/custom_data/geoemotion_joyful_group.csv'
joyful_df = pd.read_csv(joyful_dataset)

# Data for love emotion
love_dataset = '../twitter_data/custom_data/geoemotion_love_group.csv'
love_df = pd.read_csv(love_dataset)

# Data for anger emotion
anger_dataset = '../twitter_data/custom_data/geoemotion_anger_group.csv'
anger_df = pd.read_csv(anger_dataset)

# Data for sadness emotion
sadness_dataset = '../twitter_data/custom_data/geoemotion_sadness_group.csv'
sadness_df = pd.read_csv(sadness_dataset)

## Data Pre-Processing

### Pre-Processing Functions

In [24]:
def remove_urls(doc):
    return re.sub(r'http\S+', '', doc)

def convert_emojis(doc):
    # delimiters are what is used around the emoji description, in this case spaces are used
    return emoji.replace_emoji(doc, replace='')

def remove_hashtags(doc):
    return doc.replace('#', '')
    #return re.sub(r'#\w+', '', doc)

def remove_numbers(doc):
    return re.sub(r'\d+', '', doc)

def remove_user_mentions(doc):
    return re.sub(r'@\w+', '', doc)

def fix_contractions(doc):
    return contractions.fix(doc)

def remove_punctuation(doc):
    return re.sub(r'[^\w\s]', '', doc)

def remove_amp(doc):
    return re.sub(r'\bamp\b', ' and ', doc).strip() # strip removes the surrounding white space

def remove_special_character_combinations(doc):
    # Remove all combinations of \r and \n in any order
    cleaned_text = re.sub(r'[\r\n\xa0]+', '', doc)
    return cleaned_text

def remove_non_english_characters(doc):
    return re.sub(r'[^\x00-\x7F]+', '', doc)

def lowercase_doc(doc):
    return doc.lower()

### Pre-Processing Master Function

In [25]:
def preprocess_tweet(doc):
    doc = remove_urls(doc)
    doc = convert_emojis(doc)
    doc = remove_hashtags(doc)
    #doc = remove_numbers(doc)
    doc = remove_user_mentions(doc)
    doc = fix_contractions(doc)
    #doc = remove_punctuation(doc)
    doc = remove_amp(doc)
    doc = remove_special_character_combinations(doc)
    doc = remove_non_english_characters(doc)
    #doc = lowercase_doc(doc)
    return doc

### Applying the Pre-Processing

In [26]:
# Sentiment Analysis
sentiment_analysis_df = sentiment_analysis_df[sentiment_analysis_df['label'] != 1]
sentiment_analysis_df['label'] = sentiment_analysis_df['label'].replace(2, 1)
sentiment_analysis_df['Tweet'] = sentiment_analysis_df['Tweet'].apply(preprocess_tweet)

# Official Report
official_report_df['Tweet'] = official_report_df['Tweet'].apply(preprocess_tweet)

# Joyful Emotion
joyful_df['text'] = joyful_df['text'].apply(preprocess_tweet)

# Love Emotion
love_df['text'] = love_df['text'].apply(preprocess_tweet)

# Anger Emotion
anger_df['text'] = anger_df['text'].apply(preprocess_tweet)

# Sadness Emotion
sadness_df['text'] = sadness_df['text'].apply(preprocess_tweet)

### Get Test Data

In [27]:
# Sentiment Analysis
sentiment_train_df, sentiment_test_val_df = train_test_split(sentiment_analysis_df, test_size=0.1, random_state=23)
sentiment_val_df, sentiment_test_df = train_test_split(sentiment_test_val_df, test_size=0.5, random_state=23)

# Official Report
official_report_train_df, official_report_test_val_df = train_test_split(official_report_df, test_size=0.1, random_state=23)
official_report_val_df, official_report_test_df = train_test_split(official_report_test_val_df, test_size=0.5, random_state=23)

# Joyful Emotion
joyful_train_df, joyful_test_val_df = train_test_split(joyful_df, test_size=0.1, random_state=23)
joyful_val_df, joyful_test_df = train_test_split(joyful_test_val_df, test_size=0.5, random_state=23)

# Love Emotion
love_train_df, love_test_val_df = train_test_split(love_df, test_size=0.1, random_state=23)
love_val_df, love_test_df = train_test_split(love_test_val_df, test_size=0.5, random_state=23)

# Anger Emotion
anger_train_df, anger_test_val_df = train_test_split(anger_df, test_size=0.1, random_state=23)
anger_val_df, anger_test_df = train_test_split(anger_test_val_df, test_size=0.5, random_state=23)

# Sadness Emotion
sadness_train_df, sadness_test_val_df = train_test_split(sadness_df, test_size=0.1, random_state=23)
sadness_val_df, sadness_test_df = train_test_split(sadness_test_val_df, test_size=0.5, random_state=23)

## Evaluating Models

### Sentiment Model

In [29]:
def analyze_sentiment(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=280)
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    return probabilities

In [51]:
def list_compare(predicted, actual):
    return sum(1 for a, b in zip(predicted, actual) if a == b)

In [177]:
# Predicting sentiment for the individual tweets
sentiments = [0, 1]
predicted_sentiments = []
actual_sentiments = []

for index, row in sentiment_test_df.iterrows():

    text = row['text']
    label = row['label']

    # Calculate the probabilities
    probabilities = analyze_sentiment(text, sadness_model, sadness_model_tokenizer).tolist()

    # Get the index with the highest probability
    max_index = probabilities[0].index(max(probabilities[0]))

    predicted_sentiments.append(max_index)
    actual_sentiments.append(label)

In [178]:
list_compare(predicted_sentiments, actual_sentiments)

2084

In [179]:
len(sadness_train_df)

25866