## Import Libraries

In [62]:
# Pre-Processing
import pandas as pd
import re
import emoji
import contractions
from sklearn.model_selection import train_test_split

# Machine Learning
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

# Accuracy Metrics
from sklearn.metrics import precision_score, recall_score, f1_score

## Import Models

In [63]:
# Sentiment Model
sentiment_model_dir = '../models/sentiment_emotion_models/sentiment_pn_model'

sentiment_model_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_dir, num_labels=2)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_dir, num_labels=2)

# Official Reporting Model
reporting_model_dir = '../models/sentiment_emotion_models/official_report_emotion_model'

reporting_model_tokenizer = AutoTokenizer.from_pretrained(reporting_model_dir, num_labels=2)
reporting_model = AutoModelForSequenceClassification.from_pretrained(reporting_model_dir, num_labels=2)

# Joyful Emotion Model
joyful_model_dir = '../models/sentiment_emotion_models/joyful_emotion_model'

joyful_model_tokenizer = AutoTokenizer.from_pretrained(joyful_model_dir, num_labels=2)
joyful_model = AutoModelForSequenceClassification.from_pretrained(joyful_model_dir, num_labels=2)

# Love Emotion Model
love_model_dir = '../models/sentiment_emotion_models/love_emotion_model'

love_model_tokenizer = AutoTokenizer.from_pretrained(love_model_dir, num_labels=2)
love_model = AutoModelForSequenceClassification.from_pretrained(love_model_dir, num_labels=2)

# Anger Emotion Model
anger_model_dir = '../models/sentiment_emotion_models/anger_emotion_model'

anger_model_tokenizer = AutoTokenizer.from_pretrained(anger_model_dir, num_labels=2)
anger_model = AutoModelForSequenceClassification.from_pretrained(anger_model_dir, num_labels=2)

# Sadness Emotion Model
sadness_model_dir = '../models/sentiment_emotion_models/sadness_emotion_model'

sadness_model_tokenizer = AutoTokenizer.from_pretrained(sadness_model_dir, num_labels=2)
sadness_model = AutoModelForSequenceClassification.from_pretrained(sadness_model_dir, num_labels=2)

## Import Data

In [64]:
# Data for sentiment analysis
sentiment_analysis_dataset = '../twitter_data/custom_data/full_covid_sent.csv'
sentiment_analysis_df = pd.read_csv(sentiment_analysis_dataset)

# Data for official report
official_report_dataset = '../twitter_data/custom_data/official_report_dataset.csv'
official_report_df = pd.read_csv(official_report_dataset)

# Data for joyful emotion
joyful_dataset = '../twitter_data/custom_data/geoemotion_joyful_group.csv'
joyful_df = pd.read_csv(joyful_dataset)

# Data for love emotion
love_dataset = '../twitter_data/custom_data/geoemotion_love_group.csv'
love_df = pd.read_csv(love_dataset)

# Data for anger emotion
anger_dataset = '../twitter_data/custom_data/geoemotion_anger_group.csv'
anger_df = pd.read_csv(anger_dataset)

# Data for sadness emotion
sadness_dataset = '../twitter_data/custom_data/geoemotion_sadness_group.csv'
sadness_df = pd.read_csv(sadness_dataset)

## Data Pre-Processing

### Pre-Processing Functions

In [65]:
def remove_urls(doc):
    return re.sub(r'http\S+', '', doc)

def convert_emojis(doc):
    # delimiters are what is used around the emoji description, in this case spaces are used
    return emoji.replace_emoji(doc, replace='')

def remove_hashtags(doc):
    return doc.replace('#', '')
    #return re.sub(r'#\w+', '', doc)

def remove_numbers(doc):
    return re.sub(r'\d+', '', doc)

def remove_user_mentions(doc):
    return re.sub(r'@\w+', '', doc)

def fix_contractions(doc):
    return contractions.fix(doc)

def remove_punctuation(doc):
    return re.sub(r'[^\w\s]', '', doc)

def remove_amp(doc):
    return re.sub(r'\bamp\b', ' and ', doc).strip() # strip removes the surrounding white space

def remove_special_character_combinations(doc):
    # Remove all combinations of \r and \n in any order
    cleaned_text = re.sub(r'[\r\n\xa0]+', '', doc)
    return cleaned_text

def remove_non_english_characters(doc):
    return re.sub(r'[^\x00-\x7F]+', '', doc)

def lowercase_doc(doc):
    return doc.lower()

### Pre-Processing Master Function

In [66]:
def preprocess_tweet(doc):
    doc = remove_urls(doc)
    doc = convert_emojis(doc)
    doc = remove_hashtags(doc)
    #doc = remove_numbers(doc)
    doc = remove_user_mentions(doc)
    doc = fix_contractions(doc)
    #doc = remove_punctuation(doc)
    doc = remove_amp(doc)
    doc = remove_special_character_combinations(doc)
    doc = remove_non_english_characters(doc)
    #doc = lowercase_doc(doc)
    return doc

### Applying the Pre-Processing

In [67]:
# Sentiment Analysis
sentiment_analysis_df = sentiment_analysis_df[sentiment_analysis_df['label'] != 1]
sentiment_analysis_df['label'] = sentiment_analysis_df['label'].replace(2, 1)
sentiment_analysis_df['Tweet'] = sentiment_analysis_df['Tweet'].apply(preprocess_tweet)

# Official Report
official_report_df['Tweet'] = official_report_df['Tweet'].apply(preprocess_tweet)

# Joyful Emotion
joyful_df['text'] = joyful_df['text'].apply(preprocess_tweet)

# Love Emotion
love_df['text'] = love_df['text'].apply(preprocess_tweet)

# Anger Emotion
anger_df['text'] = anger_df['text'].apply(preprocess_tweet)

# Sadness Emotion
sadness_df['text'] = sadness_df['text'].apply(preprocess_tweet)

### Get Test Data

In [68]:
# Sentiment Analysis
sentiment_train_df, sentiment_test_val_df = train_test_split(sentiment_analysis_df, test_size=0.1, random_state=23)
sentiment_val_df, sentiment_test_df = train_test_split(sentiment_test_val_df, test_size=0.5, random_state=23)

# Official Report
official_report_train_df, official_report_test_val_df = train_test_split(official_report_df, test_size=0.1, random_state=23)
official_report_val_df, official_report_test_df = train_test_split(official_report_test_val_df, test_size=0.5, random_state=23)

# Joyful Emotion
joyful_train_df, joyful_test_val_df = train_test_split(joyful_df, test_size=0.1, random_state=23)
joyful_val_df, joyful_test_df = train_test_split(joyful_test_val_df, test_size=0.5, random_state=23)

# Love Emotion
love_train_df, love_test_val_df = train_test_split(love_df, test_size=0.1, random_state=23)
love_val_df, love_test_df = train_test_split(love_test_val_df, test_size=0.5, random_state=23)

# Anger Emotion
anger_train_df, anger_test_val_df = train_test_split(anger_df, test_size=0.1, random_state=23)
anger_val_df, anger_test_df = train_test_split(anger_test_val_df, test_size=0.5, random_state=23)

# Sadness Emotion
sadness_train_df, sadness_test_val_df = train_test_split(sadness_df, test_size=0.1, random_state=23)
sadness_val_df, sadness_test_df = train_test_split(sadness_test_val_df, test_size=0.5, random_state=23)

## Evaluating Models

### Individual Models

In [93]:
def analyze_sentiment(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=280)
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    return probabilities

In [94]:
def list_compare(predicted, actual):
    return sum(1 for a, b in zip(predicted, actual) if a == b)

In [128]:
# Predicting sentiment for the individual tweets
sentiments = [0, 1]
predicted_sentiments = []
actual_sentiments = []

for index, row in sadness_train_df[:2500].iterrows():

    text = row['text']
    label = row['label']

    # Calculate the probabilities
    probabilities = analyze_sentiment(text,sadness_model, sadness_model_tokenizer).tolist()

    # Get the index with the highest probability
    max_index = probabilities[0].index(max(probabilities[0]))

    predicted_sentiments.append(max_index)
    actual_sentiments.append(label)

In [129]:
list_compare(predicted_sentiments, actual_sentiments)

2084

In [130]:
f1_score(predicted_sentiments, actual_sentiments)

0.717391304347826

### Model Conglomeration

In [11]:
go_joyful_df = joyful_test_df.copy()

go_love_df = love_test_df.copy()
go_love_df['label'] = love_test_df['label'].replace(1, 2)

go_anger_df = anger_test_df.copy()
go_anger_df['label'] = anger_test_df['label'].replace(1, 3)

go_sadness_df = sadness_test_df.copy()
go_sadness_df['label'] = sadness_test_df['label'].replace(1, 4)

In [12]:
go_emotion_df = pd.concat([go_joyful_df, go_love_df, go_anger_df, go_sadness_df], ignore_index=True, sort=False)

In [13]:
# Creating the columns
go_emotion_df['joyful'] = 0
go_emotion_df['love'] = 0
go_emotion_df['anger'] = 0
go_emotion_df['sadness'] = 0

In [14]:
# Labelling the columns
go_emotion_df.loc[go_emotion_df['label'] == 1, 'joyful'] = 1
go_emotion_df.loc[go_emotion_df['label'] == 2, 'love'] = 1
go_emotion_df.loc[go_emotion_df['label'] == 3, 'anger'] = 1
go_emotion_df.loc[go_emotion_df['label'] == 4, 'sadness'] = 1

In [15]:
# Creating generated label columns
go_emotion_df['pred_joyful'] = 0
go_emotion_df['pred_love'] = 0
go_emotion_df['pred_anger'] = 0
go_emotion_df['pred_sadness'] = 0

In [16]:
go_emotion_df

Unnamed: 0.1,Unnamed: 0,text,label,joyful,love,anger,sadness,pred_joyful,pred_love,pred_anger,pred_sadness
0,55265,You people.,0,0,0,0,0,0,0,0,0
1,55228,Its almost as if everyone is too afraid to be ...,0,0,0,0,0,0,0,0,0
2,28111,What sorcery is this!,0,0,0,0,0,0,0,0,0
3,53301,This is very deceptive. If you have not read i...,0,0,0,0,0,0,0,0,0
4,42358,SCREENSHOT THIS.,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
11297,6538,That is sad. You can be acquaintances with wom...,4,0,0,0,1,0,0,0,0
11298,18887,He was in the merge lane and should have yield...,0,0,0,0,0,0,0,0,0
11299,19971,"Fortunately, those defibs only require being a...",0,0,0,0,0,0,0,0,0
11300,12924,If we shoot by accident we do not keep feeding...,0,0,0,0,0,0,0,0,0


In [17]:
joyful_pred_probs = []
love_pred_probs = []
anger_pred_probs = []
sadness_pred_probs = []

In [49]:
sentiment_pred_probs = []

In [50]:
sentiments = [0, 1]
predicted_sentiments = []
actual_sentiments = []

for index, row in go_emotion_df.iterrows():

    text = row['text']
    label = row['joyful']

    # Calculate the probabilities
    probabilities = analyze_sentiment(text, sentiment_model, sentiment_model_tokenizer).tolist()

    # Get the index with the highest probability
    max_index = probabilities[0].index(max(probabilities[0]))
    max_prob = probabilities[0][1]

    predicted_sentiments.append(max_index)
    sentiment_pred_probs.append(max_prob)
    
    actual_sentiments.append(label)

In [51]:
list_compare(predicted_sentiments, actual_sentiments)

6456

In [52]:
len(go_emotion_df)

11302

In [53]:
len(predicted_sentiments)

11302

In [54]:
sentiment_pred_probs

[0.43733760714530945,
 0.019343124702572823,
 0.5478744506835938,
 0.8604565262794495,
 0.8875943422317505,
 0.975978672504425,
 0.007864605635404587,
 0.9937297105789185,
 0.2872035503387451,
 0.2667981684207916,
 0.00693399365991354,
 0.9674035906791687,
 0.763714611530304,
 0.03145994618535042,
 0.04558457434177399,
 0.34075668454170227,
 0.115138940513134,
 0.024892626330256462,
 0.9984726309776306,
 0.9105734825134277,
 0.01578531600534916,
 0.8080649375915527,
 0.009785931557416916,
 0.9973529577255249,
 0.990127444267273,
 0.9973326921463013,
 0.35957977175712585,
 0.02470272034406662,
 0.12480852007865906,
 0.009704387746751308,
 0.9948025941848755,
 0.9952049255371094,
 0.008700610138475895,
 0.9573862552642822,
 0.2172137051820755,
 0.1610669046640396,
 0.9324118494987488,
 0.9645538330078125,
 0.11800592392683029,
 0.9973798394203186,
 0.004591766744852066,
 0.0468747578561306,
 0.7460547685623169,
 0.02242080494761467,
 0.012255697511136532,
 0.04310249537229538,
 0.9913836

In [55]:
predicted_sentiments

[0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,


In [43]:
go_emotion_df['pred_joyful'] = joyful_pred_probs
go_emotion_df['pred_love'] = love_pred_probs
go_emotion_df['pred_anger'] = anger_pred_probs
go_emotion_df['pred_sadness'] = sadness_pred_probs

In [56]:
go_emotion_df['pred_sentiment'] = sentiment_pred_probs

In [61]:
go_emotion_df.to_csv('go_emotion_predictions.csv', index=False)