## Import Libraries

In [57]:
# Pre-Processing
import pandas as pd
import re
import emoji
import contractions

# Machine Learning
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

## Import Models

In [58]:
# Sentiment Model
sentiment_model_dir = '../models/sentiment_emotion_models/sentiment_pn_model'

sentiment_model_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_dir, num_labels=2)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_dir, num_labels=2)

# Official Reporting Model
reporting_model_dir = '../models/sentiment_emotion_models/official_report_emotion_model'

reporting_model_tokenizer = AutoTokenizer.from_pretrained(reporting_model_dir, num_labels=2)
reporting_model = AutoModelForSequenceClassification.from_pretrained(reporting_model_dir, num_labels=2)

# Joyful Emotion Model
joyful_model_dir = '../models/sentiment_emotion_models/joyful_emotion_model'

joyful_model_tokenizer = AutoTokenizer.from_pretrained(joyful_model_dir, num_labels=2)
joyful_model = AutoModelForSequenceClassification.from_pretrained(joyful_model_dir, num_labels=2)

# Love Emotion Model
love_model_dir = '../models/sentiment_emotion_models/love_emotion_model'

love_model_tokenizer = AutoTokenizer.from_pretrained(love_model_dir, num_labels=2)
love_model = AutoModelForSequenceClassification.from_pretrained(love_model_dir, num_labels=2)

# Anger Emotion Model
anger_model_dir = '../models/sentiment_emotion_models/anger_emotion_model'

anger_model_tokenizer = AutoTokenizer.from_pretrained(anger_model_dir, num_labels=2)
anger_model = AutoModelForSequenceClassification.from_pretrained(anger_model_dir, num_labels=2)

# Sadness Emotion Model
sadness_model_dir = '../models/sentiment_emotion_models/sadness_emotion_model'

sadness_model_tokenizer = AutoTokenizer.from_pretrained(sadness_model_dir, num_labels=2)
sadness_model = AutoModelForSequenceClassification.from_pretrained(sadness_model_dir, num_labels=2)

## Import Data

In [59]:
# Full Dataset
pruned_network_df = pd.read_csv('../twitter_data/network_data/pruned_network_data.csv')

# Media
pruned_media_users_df = pd.read_csv('../twitter_data/network_data/pruned_media_users.csv')

# Medicine and Researchers
pruned_medicine_and_research_users_df = pd.read_csv('../twitter_data/network_data/pruned_medicine_and_research_users.csv')

## Data Pre-Processing

### Pre-Processing Functions

In [60]:
def remove_urls(doc):
    return re.sub(r'http\S+', '', doc)

def convert_emojis(doc):
    # delimiters are what is used around the emoji description, in this case spaces are used
    return emoji.replace_emoji(doc, replace='')

def remove_hashtags(doc):
    return doc.replace('#', '')
    #return re.sub(r'#\w+', '', doc)

def remove_numbers(doc):
    return re.sub(r'\d+', '', doc)

def remove_user_mentions(doc):
    return re.sub(r'@\w+', '', doc)

def fix_contractions(doc):
    return contractions.fix(doc)

def remove_punctuation(doc):
    return re.sub(r'[^\w\s]', '', doc)

def remove_amp(doc):
    return re.sub(r'\bamp\b', ' and ', doc).strip() # strip removes the surrounding white space

def remove_special_character_combinations(doc):
    # Remove all combinations of \r and \n in any order
    cleaned_text = re.sub(r'[\r\n\xa0]+', '', doc)
    return cleaned_text

def remove_non_english_characters(doc):
    return re.sub(r'[^\x00-\x7F]+', '', doc)

def lowercase_doc(doc):
    return doc.lower()

### Pre-Processing Master Function

In [61]:
def preprocess_tweet(doc):
    doc = remove_urls(doc)
    doc = convert_emojis(doc)
    doc = remove_hashtags(doc)
    #doc = remove_numbers(doc)
    doc = remove_user_mentions(doc)
    doc = fix_contractions(doc)
    #doc = remove_punctuation(doc)
    doc = remove_amp(doc)
    doc = remove_special_character_combinations(doc)
    doc = remove_non_english_characters(doc)
    #doc = lowercase_doc(doc)
    return doc

### Applying the Pre-Processing

In [62]:
pruned_network_df['Tweet'] = pruned_network_df['Tweet'].apply(preprocess_tweet)
pruned_media_users_df['Tweet'] = pruned_media_users_df['Tweet'].apply(preprocess_tweet)
pruned_medicine_and_research_users_df['Tweet'] = pruned_medicine_and_research_users_df['Tweet'].apply(preprocess_tweet)

## Dataset Labeller

In [63]:
def analyze_sentiment(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=280)
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    return probabilities

In [65]:
# Predicting sentiment for the individual tweets
sentiments = [0, 1]
predicted_sentiment_list = []
predicted_official_report_list = []
predicted_joyful_list = []
predicted_love_list = []
predicted_anger_list = []
predicted_sadness_list = []

for index, row in pruned_medicine_and_research_users_df.iterrows():

    text = row['Tweet']

    # Calculate probabilities for sentiment
    predicted_sentiment = analyze_sentiment(text, sentiment_model, sentiment_model_tokenizer).tolist()
    sentiment_max_index = predicted_sentiment[0].index(max(predicted_sentiment[0]))

    # Calculate probabilities for official report
    predicted_official_report = analyze_sentiment(text, reporting_model, reporting_model_tokenizer).tolist()
    official_report_max_index = predicted_official_report[0].index(max(predicted_official_report[0]))

    # Calculate probabilities for joyful
    predicted_joyful = analyze_sentiment(text, joyful_model, joyful_model_tokenizer).tolist()
    joyful_max_index = predicted_joyful[0].index(max(predicted_joyful[0]))

    # Calculate probabilities for love
    predicted_love = analyze_sentiment(text, love_model, love_model_tokenizer).tolist()
    love_max_index = predicted_love[0].index(max(predicted_love[0]))

    # Calculate probabilities for anger
    predicted_anger = analyze_sentiment(text, anger_model, anger_model_tokenizer).tolist()
    anger_max_index = predicted_anger[0].index(max(predicted_anger[0]))

    # Calculate probabilities for anger
    predicted_sadness = analyze_sentiment(text, sadness_model, sadness_model_tokenizer).tolist()
    sadness_max_index = predicted_sadness[0].index(max(predicted_sadness[0]))

    # Saving sentiments in a list
    predicted_sentiment_list.append(sentiment_max_index)
    predicted_official_report_list.append(official_report_max_index)
    predicted_joyful_list.append(joyful_max_index)
    predicted_love_list.append(love_max_index)
    predicted_anger_list.append(anger_max_index)
    predicted_sadness_list.append(sadness_max_index)

    if(index % 500 == 0):
        perc_current = (index / len(pruned_medicine_and_research_users_df)) * 100
        print(str(perc_current) + "%")


0.0%
1.7887807670291929%
3.5775615340583857%
5.366342301087579%
7.1551230681167715%
8.943903835145965%
10.732684602175159%
12.521465369204352%
14.310246136233543%
16.099026903262736%
17.88780767029193%
19.676588437321122%
21.465369204350317%
23.254149971379505%
25.042930738408703%
26.831711505437895%
28.620492272467086%
30.40927303949628%
32.19805380652547%
33.98683457355467%
35.77561534058386%
37.56439610761305%
39.353176874642244%
41.14195764167144%
42.930738408700634%
44.71951917572982%
46.50829994275901%
48.29708070978821%
50.085861476817406%
51.874642243846594%
53.66342301087579%
55.452203777904984%
57.24098454493417%
59.02976531196337%
60.81854607899256%
62.60732684602175%
64.39610761305094%
66.18488838008014%
67.97366914710933%
69.76244991413853%
71.55123068116772%
73.3400114481969%
75.1287922152261%
76.9175729822553%
78.70635374928449%
80.49513451631368%
82.28391528334288%
84.07269605037206%
85.86147681740127%
87.65025758443046%
89.43903835145964%
91.22781911848884%
93.01659988

In [66]:
test_network_df = pruned_medicine_and_research_users_df.copy()

In [67]:
test_network_df['sentiment_results'] = predicted_sentiment_list
test_network_df['official_report_results'] = predicted_official_report_list
test_network_df['joyful_results'] = predicted_joyful_list
test_network_df['love_results'] = predicted_love_list
test_network_df['anger_results'] = predicted_anger_list
test_network_df['sadness_results'] = predicted_sadness_list

In [70]:
test_network_df.to_csv('pruned_labelled_medicine_and_research_users.csv', index=False)

In [71]:
test_network_df

Unnamed: 0,Vertex1,Vertex2,Relationship_type,Relationship Date (UTC),Tweet,Lang_code,Tweet Date (UTC),Tweet_ID,Conversation_ID,Author_ID,...,Sourcetweet_id,Sourcetweet_text,Sourcetweet_author_id,Sourcetweet_lang,sentiment_results,official_report_results,joyful_results,love_results,anger_results,sadness_results
0,michaelmina_lab,diseaseecology,Quote,2020-12-01T00:02:24.000Z,Speaking of Kids and infections.,en,2020-12-01T00:02:24.000Z,1333562054498783233,1.333350e+18,1094762324097822720,...,1.333532e+18,Age-patterns of infection from random sampling...,1.647830e+09,en,0,0,0,0,0,0
1,angie_rasmussen,paimadhu,Quote,2020-12-01T00:04:28.000Z,"""In practice, by announcing a 9,500 price tag ...",en,2020-12-01T00:04:28.000Z,1333562574877573120,1.333563e+18,394087611,...,1.333551e+18,"My new @forbes piece is about ""prestige journa...",3.419251e+09,en,1,0,0,0,1,0
2,florian_krammer,kindrachukjason,Quote,2020-12-01T00:08:17.000Z,Awesome!,en,2020-12-01T00:08:17.000Z,1333563532475031553,1.333564e+18,704282873231237121,...,1.333563e+18,💥 https://t.co/iZldo8Cruo,1.071979e+18,und,1,0,0,1,0,0
3,florian_krammer,rover829,Quote,2020-12-01T00:08:51.000Z,Good!!!,en,2020-12-01T00:08:51.000Z,1333563678034190337,1.333564e+18,704282873231237121,...,1.333561e+18,Reuters: DR. SCOTT ATLAS HAS RESIGNED AS SPEC...,1.268167e+08,en,1,0,0,1,0,0
4,angie_rasmussen,angie_rasmussen,Tweet,2020-12-01T00:08:53.000Z,Out with a whimper,en,2020-12-01T00:08:53.000Z,1333563683952156672,1.333564e+18,394087611,...,,,,,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27947,michaelmina_lab,michaelmina_lab,Tweet,2022-02-23T22:40:42.000Z,"I just received this question:[Michael,] Any t...",en,2022-02-23T22:40:42.000Z,1496616031753670656,1.496616e+18,1094762324097822720,...,,,,,0,0,0,0,0,0
27948,doctorsoumya,doctorsoumya,Tweet,2022-02-23T22:44:17.000Z,WHO creates training hub to boost pharmaceutic...,en,2022-02-23T22:44:17.000Z,1496616936008003586,1.496617e+18,2855536962,...,,,,,1,0,0,0,0,1
27949,michaelmina_lab,michaelmina_lab,Quote,2022-02-23T22:50:37.000Z,2 years ago today!Testing in US was already fa...,en,2022-02-23T22:50:37.000Z,1496618526261399554,1.496619e+18,1094762324097822720,...,1.231504e+18,"Reminder: As of today (Feb 23), the US remains...",1.094762e+18,en,0,0,0,0,0,0
27950,gregggonsalves,gregggonsalves,Tweet,2022-02-23T23:36:08.000Z,OMG. I laughed so hard. Elmo replaces Timothee...,en,2022-02-23T23:36:08.000Z,1496629982013050882,1.496630e+18,30844417,...,,,,,1,0,1,0,0,0
