## Import Libraries

In [31]:
# Pre-Processing
import pandas as pd
import re
import emoji
import contractions

# Machine Learning
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

## Import Models

In [32]:
# Sentiment Model
sentiment_model_dir = '../models/sentiment_emotion_models/sentiment_pn_model'

sentiment_model_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_dir, num_labels=2)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_dir, num_labels=2)

# Official Reporting Model
reporting_model_dir = '../models/sentiment_emotion_models/official_report_emotion_model'

reporting_model_tokenizer = AutoTokenizer.from_pretrained(reporting_model_dir, num_labels=2)
reporting_model = AutoModelForSequenceClassification.from_pretrained(reporting_model_dir, num_labels=2)

# Joyful Emotion Model
joyful_model_dir = '../models/sentiment_emotion_models/joyful_emotion_model'

joyful_model_tokenizer = AutoTokenizer.from_pretrained(joyful_model_dir, num_labels=2)
joyful_model = AutoModelForSequenceClassification.from_pretrained(joyful_model_dir, num_labels=2)

# Love Emotion Model
love_model_dir = '../models/sentiment_emotion_models/love_emotion_model'

love_model_tokenizer = AutoTokenizer.from_pretrained(love_model_dir, num_labels=2)
love_model = AutoModelForSequenceClassification.from_pretrained(love_model_dir, num_labels=2)

# Anger Emotion Model
anger_model_dir = '../models/sentiment_emotion_models/anger_emotion_model'

anger_model_tokenizer = AutoTokenizer.from_pretrained(anger_model_dir, num_labels=2)
anger_model = AutoModelForSequenceClassification.from_pretrained(anger_model_dir, num_labels=2)

# Sadness Emotion Model
sadness_model_dir = '../models/sentiment_emotion_models/sadness_emotion_model'

sadness_model_tokenizer = AutoTokenizer.from_pretrained(sadness_model_dir, num_labels=2)
sadness_model = AutoModelForSequenceClassification.from_pretrained(sadness_model_dir, num_labels=2)

## Import Data

In [34]:
# Full Dataset
pruned_network_df = pd.read_csv('../labelled_data/pruned_topic_network_data.csv')

# Media
pruned_media_users_df = pd.read_csv('../labelled_data/pruned_topic_media_data.csv')

# Medicine and Researchers
pruned_medicine_and_research_users_df = pd.read_csv('../labelled_data/pruned_topic_medicine_and_research_data.csv')

## Data Pre-Processing

### Pre-Processing Functions

In [35]:
def remove_urls(doc):
    return re.sub(r'http\S+', '', doc)

def convert_emojis(doc):
    # delimiters are what is used around the emoji description, in this case spaces are used
    return emoji.replace_emoji(doc, replace='')

def remove_hashtags(doc):
    return doc.replace('#', '')
    #return re.sub(r'#\w+', '', doc)

def remove_numbers(doc):
    return re.sub(r'\d+', '', doc)

def remove_user_mentions(doc):
    return re.sub(r'@\w+', '', doc)

def fix_contractions(doc):
    return contractions.fix(doc)

def remove_punctuation(doc):
    return re.sub(r'[^\w\s]', '', doc)

def remove_amp(doc):
    return re.sub(r'\bamp\b', ' and ', doc).strip() # strip removes the surrounding white space

def remove_special_character_combinations(doc):
    # Remove all combinations of \r and \n in any order
    cleaned_text = re.sub(r'[\r\n\xa0]+', '', doc)
    return cleaned_text

def remove_non_english_characters(doc):
    return re.sub(r'[^\x00-\x7F]+', '', doc)

def lowercase_doc(doc):
    return doc.lower()

### Pre-Processing Master Function

In [36]:
def preprocess_tweet(doc):
    doc = remove_urls(doc)
    doc = convert_emojis(doc)
    doc = remove_hashtags(doc)
    #doc = remove_numbers(doc)
    doc = remove_user_mentions(doc)
    doc = fix_contractions(doc)
    #doc = remove_punctuation(doc)
    doc = remove_amp(doc)
    doc = remove_special_character_combinations(doc)
    doc = remove_non_english_characters(doc)
    #doc = lowercase_doc(doc)
    return doc

### Applying the Pre-Processing

In [37]:
pruned_network_df['Tweet'] = pruned_network_df['Tweet'].apply(preprocess_tweet)
pruned_media_users_df['Tweet'] = pruned_media_users_df['Tweet'].apply(preprocess_tweet)
pruned_medicine_and_research_users_df['Tweet'] = pruned_medicine_and_research_users_df['Tweet'].apply(preprocess_tweet)

## Dataset Labeller

In [38]:
def analyze_sentiment(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=280)
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    return probabilities

In [39]:
# Predicting sentiment for the individual tweets
sentiments = [0, 1]
predicted_sentiment_list = []
predicted_official_report_list = []
predicted_joyful_list = []
predicted_love_list = []
predicted_anger_list = []
predicted_sadness_list = []

for index, row in pruned_network_df.iterrows():

    text = row['Tweet']

    # Calculate probabilities for sentiment
    predicted_sentiment = analyze_sentiment(text, sentiment_model, sentiment_model_tokenizer).tolist()
    sentiment_max_index = predicted_sentiment[0][1]

    # Calculate probabilities for official report
    predicted_official_report = analyze_sentiment(text, reporting_model, reporting_model_tokenizer).tolist()
    official_report_max_index = predicted_official_report[0][1]

    # Calculate probabilities for joyful
    predicted_joyful = analyze_sentiment(text, joyful_model, joyful_model_tokenizer).tolist()
    joyful_max_index = predicted_joyful[0][1]

    # Calculate probabilities for love
    predicted_love = analyze_sentiment(text, love_model, love_model_tokenizer).tolist()
    love_max_index = predicted_love[0][1]

    # Calculate probabilities for anger
    predicted_anger = analyze_sentiment(text, anger_model, anger_model_tokenizer).tolist()
    anger_max_index = predicted_anger[0][1]

    # Calculate probabilities for anger
    predicted_sadness = analyze_sentiment(text, sadness_model, sadness_model_tokenizer).tolist()
    sadness_max_index = predicted_sadness[0][1]

    # Saving sentiments in a list
    predicted_sentiment_list.append(sentiment_max_index)
    predicted_official_report_list.append(official_report_max_index)
    predicted_joyful_list.append(joyful_max_index)
    predicted_love_list.append(love_max_index)
    predicted_anger_list.append(anger_max_index)
    predicted_sadness_list.append(sadness_max_index)

    if(index % 100 == 0):
        perc_current = (index / len(pruned_network_df)) * 100
        print(str(perc_current) + "%")


0.0%
0.1267154098609932%
0.2534308197219864%
0.38014622958297956%
0.5068616394439728%
0.633577049304966%
0.7602924591659591%
0.8870078690269524%
1.0137232788879456%
1.1404386887489388%
1.267154098609932%
1.3938695084709252%
1.5205849183319182%
1.6473003281929115%
1.7740157380539048%
1.900731147914898%
2.027446557775891%
2.1541619676368846%
2.2808773774978777%
2.4075927873588707%
2.534308197219864%
2.661023607080857%
2.7877390169418503%
2.9144544268028434%
3.0411698366638364%
3.16788524652483%
3.294600656385823%
3.4213160662468165%
3.5480314761078096%
3.6747468859688026%
3.801462295829796%
3.928177705690789%
4.054893115551782%
4.181608525412775%
4.308323935273769%
4.435039345134762%
4.561754754995755%
4.688470164856748%
4.8151855747177414%
4.9419009845787345%
5.068616394439728%
5.195331804300721%
5.322047214161714%
5.448762624022708%
5.575478033883701%
5.702193443744694%
5.828908853605687%
5.95562426346668%
6.082339673327673%
6.209055083188667%
6.33577049304966%
6.462485902910653%
6.589

In [40]:
test_network_df = pruned_network_df.copy()

In [41]:
test_network_df['sentiment_results'] = predicted_sentiment_list
test_network_df['official_report_results'] = predicted_official_report_list
test_network_df['joyful_results'] = predicted_joyful_list
test_network_df['love_results'] = predicted_love_list
test_network_df['anger_results'] = predicted_anger_list
test_network_df['sadness_results'] = predicted_sadness_list

In [42]:
test_network_df.to_csv('../labelled_data/pruned_topic_sentiment_network_data.csv', index=False)

In [43]:
test_network_df

Unnamed: 0,Vertex1,Vertex2,Relationship_type,Relationship Date (UTC),Tweet,Lang_code,Tweet Date (UTC),Tweet_ID,Conversation_ID,Author_ID,...,topic_number,topic_name,topic_words,topic_probabilities,sentiment_results,official_report_results,joyful_results,love_results,anger_results,sadness_results
0,michaelmina_lab,diseaseecology,Quote,2020-12-01T00:02:24.000Z,Speaking of Kids and infections.,en,2020-12-01T00:02:24.000Z,1333562054498783233,1.333350e+18,1094762324097822720,...,1,1_school_kid_child_pediatric,school - kid - child - pediatric - reopen - ki...,0.728295,0.088899,0.234023,0.031881,0.033416,0.364663,0.084098
1,angie_rasmussen,paimadhu,Quote,2020-12-01T00:04:28.000Z,"""In practice, by announcing a 9,500 price tag ...",en,2020-12-01T00:04:28.000Z,1333562574877573120,1.333563e+18,394087611,...,-1,-1_pandemic_health_people_public,pandemic - health - people - public - get - wo...,0.000000,0.855803,0.216912,0.039175,0.337592,0.665193,0.470304
2,drericding,drericding,Tweet,2020-12-01T00:07:20.000Z,BREAKINGTrumps pandemic advisor Scott Atlas ha...,en,2020-12-01T00:07:20.000Z,1333563296608362499,1.333563e+18,18831926,...,189,189_resign_resignation_disgrace_mulvaney,resign - resignation - disgrace - mulvaney - a...,0.229864,0.027745,0.811745,0.051454,0.026335,0.246732,0.262267
3,florian_krammer,kindrachukjason,Quote,2020-12-01T00:08:17.000Z,Awesome!,en,2020-12-01T00:08:17.000Z,1333563532475031553,1.333564e+18,704282873231237121,...,412,412_awesome_relieve_luck_exactly,awesome - relieve - luck - exactly - always - ...,1.000000,0.998337,0.042719,0.200408,0.978655,0.007541,0.013860
4,florian_krammer,rover829,Quote,2020-12-01T00:08:51.000Z,Good!!!,en,2020-12-01T00:08:51.000Z,1333563678034190337,1.333564e+18,704282873231237121,...,232,232_good_ad_sound_prepare,good - ad - sound - prepare - - - - - -,0.791279,0.998333,0.035249,0.227623,0.913730,0.007356,0.009316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78912,kindrachukjason,lisabarrettid,Quote,2022-02-23T23:18:26.000Z,Well said,en,2022-02-23T23:18:26.000Z,1496625529755156485,1.496626e+18,1071978579934765061,...,191,191_say_saidbut_well_aloud,say - saidbut - well - aloud - tshirt - mentee...,0.948415,0.975359,0.022210,0.058609,0.946853,0.108973,0.034826
78913,collignonpeter,collignonpeter,Tweet,2022-02-23T23:27:10.000Z,"For influenza, but if done for Covid would lik...",en,2022-02-23T23:27:10.000Z,1496627725741993984,1.496628e+18,707987761,...,-1,-1_pandemic_health_people_public,pandemic - health - people - public - get - wo...,0.000000,0.522020,0.870899,0.098961,0.077678,0.088220,0.147294
78914,gregggonsalves,gregggonsalves,Tweet,2022-02-23T23:36:08.000Z,OMG. I laughed so hard. Elmo replaces Timothee...,en,2022-02-23T23:36:08.000Z,1496629982013050882,1.496630e+18,30844417,...,-1,-1_pandemic_health_people_public,pandemic - health - people - public - get - wo...,0.000000,0.988945,0.009265,0.945885,0.010625,0.031764,0.035295
78915,drericding,drericding,Tweet,2022-02-23T23:50:13.000Z,BIRD FLU spreading moreThe YOU.S. Department o...,en,2022-02-23T23:50:13.000Z,1496633525092577282,1.496634e+18,18831926,...,143,143_avian_hn_bird_flu,avian - hn - bird - flu - swine - poultry - hu...,0.700800,0.045838,0.899922,0.032816,0.046925,0.516923,0.259603
