In [29]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import re # for regular expressions

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import TweetTokenizer
from textblob import TextBlob
import advertools as adv

In [2]:
# read tweets from csv file
tweets = pd.read_csv('./data/0401_UkraineCombinedTweetsDeduped.csv')
tweets = tweets[tweets['language']=="en"]
tweets = tweets[["tweetcreatedts", "text"]]
tweets.head()
tweets.reset_index(drop=True)



  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,tweetcreatedts,text
0,2022-04-01 00:00:00.000000,⚡The Ukrainian Air Force would like to address...
1,2022-04-01 00:00:00.000000,Chernihiv oblast. Ukrainians welcome their lib...
2,2022-04-01 00:00:00.000000,America 🇺🇸 is preparing for something worse th...
3,2022-04-01 00:00:00.000000,JUST IN: #Anonymous has hacked &amp; released ...
4,2022-04-01 00:00:00.000000,***PUBLIC MINT NOW LIVE***\n\nFor \n@billionai...
...,...,...
254621,2022-04-01 23:59:57.000000,14-year-old Yura from #Bucha told how a Russia...
254622,2022-04-01 23:59:57.000000,#RussianUkrainianWar #UkraineRussianWar #Russi...
254623,2022-04-01 23:59:58.000000,“From where Winston stood it was just possible...
254624,2022-04-01 23:59:58.000000,When I said tonight in front of 3000 people In...


In [30]:
from collections import Counter
from tqdm import tqdm
from Lexicon import lexicon
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
tqdm.pandas()

emoji_lexicon = pd.read_csv('resource_folder/EmoTag1200-scores.csv', sep =',')
emoji_dict = emoji_lexicon.set_index('emoji').T.to_dict()
emoji_factor = 0.1

# extention of NRCLex, rewrite build_word_affect mehtod to add emoji analysis
class NRCLexEx():
    def build_word_affect(self):
        #print('build_word_affect')
        # Build word affect function
        affect_list = []
        affect_dict = dict()
        affect_frequencies = Counter()
        lexicon_keys = lexicon.keys()
        for word in self.words:
            if word in lexicon_keys:
                affect_list.extend(lexicon[word])
                affect_dict.update({word: lexicon[word]})
        for word in affect_list:
            affect_frequencies[word] += 1
        sum_values = sum(affect_frequencies.values())
        affect_percent = {'fear': 0.0, 'anger': 0.0, 'anticipation': 0.0, 'trust': 0.0, 'surprise': 0.0, 'sadness': 0.0, 'disgust': 0.0, 'joy': 0.0}
        emoji_affect_percent = {'fear': 0.0, 'anger': 0.0, 'anticipation': 0.0, 'trust': 0.0, 'surprise': 0.0, 'sadness': 0.0, 'disgust': 0.0, 'joy': 0.0}
        for key in affect_frequencies.keys():
            affect_percent.update({key: float(affect_frequencies[key]) / float(sum_values)})

        emoji_affect_frequencies = Counter()
        if len(affect_list) > 0:
            for emoji in self.emojis:
                #print(emoji)
                if emoji in emoji_dict.keys():
                    emoji_affect_dic = emoji_dict[emoji]
                    #print(emoji_affect_frequencies)
                    for key in affect_percent.keys():
                        if emoji_affect_dic[key] > 0:
                            emoji_affect_frequencies[key] += emoji_affect_dic[key]
        emoji_sum_values = sum(emoji_affect_frequencies.values())
        #print('emoji_sum_values',emoji_sum_values)
        #print('emoji_affect_frequencies',emoji_affect_frequencies)
        for key in emoji_affect_frequencies.keys():
            #print(float(emoji_affect_frequencies[key]),float(emoji_sum_values),float(emoji_affect_frequencies[key]) / float(emoji_sum_values))
            #freq = float(emoji_affect_frequencies[key]) / float(emoji_sum_values)
            #print(freq)
            emoji_affect_percent.update({key: float(emoji_affect_frequencies[key]) / float(emoji_sum_values)})
            #print(key,'emoji_affect_frequencies',emoji_affect_frequencies)
        self.affect_list = affect_list
        self.raw_emotion_scores = dict(affect_frequencies)
        self.affect_frequencies = affect_percent
        self.emoji_affect_frequencies = emoji_affect_percent

    def __init__(self,text,emojis):
        self.text = text
        blob = TextBlob(text)
        self.words = list(blob.words)
        self.sentences = list(blob.sentences)
        self.emojis = emojis
        self.build_word_affect()
        self.top_emotions()

    # get top emotins based on accumulate score
    def top_emotions(self):
        emo_dict = self.affect_frequencies
        max_value = max(emo_dict.values())
        top_emotions = []
        for key in emo_dict.keys():
            if emo_dict[key] == max_value and max_value > 0:
                top_emotions.append((key, max_value))
        self.top_emotions = top_emotions
        emoji_emo_dict = self.emoji_affect_frequencies
        emoji_max_value = max(emoji_emo_dict.values())
        top_emoji_emotions = []
        for key in emoji_emo_dict.keys():
            if emoji_emo_dict[key] == emoji_max_value and emoji_max_value > 0:
                top_emoji_emotions.append((key, emoji_max_value))
        self.top_emoji_emotions = top_emoji_emotions

# Test
#text_object = NRCLexEx("o denote the measurements of some unknown quantity V,"
                        " and sought the  estimator of that quantity",['🌈', '👏', '👏','🙄'])
#print(text_object.affect_frequencies);
#print(text_object.top_emotions);
#print(text_object.top_emoji_emotions);

{'fear': 0.5, 'anger': 0.0, 'anticipation': 0.5, 'trust': 0.0, 'surprise': 0.0, 'sadness': 0.0, 'disgust': 0.0, 'joy': 0.0}
[('fear', 0.5), ('anticipation', 0.5)]
[('joy', 0.3568840579710145)]


In [31]:
class EmotionDetector:
    # copyed from nrclex.py, remove positive and negative emotions

    wordnet_lemmatizer = WordNetLemmatizer()
    tk = TweetTokenizer()
    stop_words = set(stopwords.words('english'))
    analyser = SentimentIntensityAnalyzer()
    stemmer = SnowballStemmer("english")

    # initialize lexicon
    def __init__(self, emoji_factor = 1):
        # load lexico from https://github.com/abushoeb/EmoTag for emoji emotion detection
        self.emoji_factor = emoji_factor

    def process(self, dataFrame, columnName='text'):
        print("begin to extract emoji ")
        emoji_summary = adv.extract_emoji(dataFrame[columnName])
        dataFrame['emojis'] = emoji_summary['emoji']
        print("pre process")
        pre_process_res= dataFrame[columnName].progress_apply(EmotionDetector.pre_process)
        dataFrame = pd.concat([dataFrame, pre_process_res], axis=1)
        print("emotion analysis")
        res = dataFrame.progress_apply(self.emotion_analysis,axis=1)
        res.head()
        dataFrame = pd.concat([dataFrame, res], axis=1)
        print("sentiment analysis")
        sentiment_analysis_res = dataFrame.progress_apply(self.sentiment_analysis,axis=1)
        dataFrame = pd.concat([dataFrame, sentiment_analysis_res], axis=1)
        return dataFrame

    # sentiment analysis method
    def sentiment_analysis(self, row):
        words = row["tokens"]
        score = EmotionDetector.analyser.polarity_scores(str(words))
        score=score['compound']
        res = 'Neutral'
        if score>=0.05:
            res = 'Positive'
        elif score<=-0.05:
            res = 'Negative'
        else:
            res = 'Neutral'
        return pd.Series([score,res],
                         index=['sentiment_score', 'sentiment'])

    # emotion analysis method
    def emotion_analysis(self, row):
        #print("1111")
        #print(row)
        text = row["text"]
        emojis = row['emojis']
        emotions = NRCLexEx(text,emojis)
        top_emotions = emotions.top_emotions
        top_emoji_emotions = emotions.top_emoji_emotions
        top_emotions_dict = {'fear': 0, 'anger': 0, 'anticipation': 0, 'trust': 0, 'surprise': 0,
                             'sadness': 0, 'disgust': 0, 'joy': 0}
        top_eomji_emotions_dict = {'e_fear': 0, 'e_anger': 0, 'e_anticipation': 0, 'e_trust': 0, 'e_surprise': 0,
                             'e_sadness': 0, 'e_disgust': 0, 'e_joy': 0}
        #print(top_emotions)
        for key,value in top_emotions:
            top_emotions_dict[key] = value
        for key,value in top_emoji_emotions:
            top_eomji_emotions_dict['e_'+key] = value
        # print(top_emotions_dict.values())
        # print(top_emotions_dict.keys())
        # res = [affect_list,affect_dict,dict(affect_frequencies),affect_percent,top_emotions]
        # res = res.append()
        return pd.Series([top_emotions,*top_emotions_dict.values(),*top_eomji_emotions_dict.values()],
                         index=['top_emotions',*top_emotions_dict.keys(),*top_eomji_emotions_dict.keys()])

    @staticmethod
    def pre_process(text):
        #print(text)
        # clean text
        text = EmotionDetector.cleanText(str(text))
        #print(text)
        #emoji_summary = adv.extract_emoji([text])
        #print(emoji_summary['emoji'])
        #tokenization = nltk.word_tokenize(text)
        # tokenization = EmotionDetector.tk.tokenize(text)

        blob = TextBlob(text)
        filtered_words = [word.lower() for word in blob.words]
        nouns = []
        adjectives = []
        for word, tag in pos_tag(filtered_words):
            # since most tweets contain such words, it is not helpful for future analysis
            if word.startswith("russi") or word.startswith("ukrai"):
                continue
            if tag.startswith("NN"): #Nouns
                nouns.append(word)
            elif tag.startswith("JJ"): #Adjective
                adjectives.append(word)

        # map_object = map(EmotionDetector.stemmer.stem, filtered_words)
        # lemma_words = list(map_object)
        #print(lemma_words)
        return pd.Series([' '.join(filtered_words),' '.join(nouns),' '.join(adjectives)],
                         index=['tokens', 'nouns', 'adjectives'])


    @staticmethod
    def cleanText(text):
        text=text.lower()
        text = re.sub(r'(?i)RT @\w+: ','', text) # remove RT
        text = re.sub(r'@\w+','', text) # remove @member
        text = re.sub(r'#','', text) # remove # symbol
        text = re.sub(r'https?:\/\/\S+','', text) # remove the hyper link
        text = re.sub(r'[^\w\s]', '', text) # remove punctuations
        return text

In [27]:
# test
tweets_short = tweets[0:100]
ed = EmotionDetector()
res = ed.process(tweets_short)
res.head(100)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataFrame['emojis'] = emoji_summary['emoji']


begin to extract emoji 
pre process


100%|██████████| 100/100 [00:00<00:00, 386.04it/s]


emotion analysis


100%|██████████| 100/100 [00:00<00:00, 917.21it/s]


sentiment analysis


100%|██████████| 100/100 [00:00<00:00, 1785.81it/s]


Unnamed: 0,tweetcreatedts,text,emojis,tokens,nouns,adjectives,top_emotions,fear,anger,anticipation,...,e_fear,e_anger,e_anticipation,e_trust,e_surprise,e_sadness,e_disgust,e_joy,sentiment_score,sentiment
0,2022-04-01 00:00:00.000000,⚡The Ukrainian Air Force would like to address...,"[⚡, 🇺🇦, 🧵]",the ukrainian air force would like to address ...,air force misinformation media outlets situati...,multiple western protectuаsky,[],0.00,0.00,0.00,...,0,0,0.000000,0,0,0,0.000000,0.0,0.4404,Positive
1,2022-04-01 00:00:00.000000,Chernihiv oblast. Ukrainians welcome their lib...,[],chernihiv oblast ukrainians welcome their libe...,chernihiv oblast liberators standwithukraine s...,putinisawarcriminal,[],0.00,0.00,0.00,...,0,0,0.000000,0,0,0,0.000000,0.0,0.4588,Positive
2,2022-04-01 00:00:00.000000,America 🇺🇸 is preparing for something worse th...,"[🇺🇸, 🇹🇼, 🇷🇺, 🇺🇸, 🇨🇳, 👇]",america is preparing for something worse than ...,america something month war policy future rela...,worse last new cold best taiwan full,"[(fear, 0.5)]",0.50,0.00,0.00,...,0,0,0.333333,0,0,0,0.000000,0.0,-0.4215,Negative
3,2022-04-01 00:00:00.000000,JUST IN: #Anonymous has hacked &amp; released ...,[],just in anonymous has hacked amp released 6200...,amp emails marathon group investment firm olig...,anonymous eu foreign,[],0.00,0.00,0.00,...,0,0,0.000000,0,0,0,0.000000,0.0,-0.4019,Negative
4,2022-04-01 00:00:00.000000,***PUBLIC MINT NOW LIVE***\n\nFor \n@billionai...,[],public mint now live for win 100000 during pub...,mint sale mint visit information luck nft mint,public public further good,"[(anticipation, 1.0)]",0.00,0.00,1.00,...,0,0,0.000000,0,0,0,0.000000,0.0,0.8658,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,2022-04-01 00:00:30.000000,💙💛 Cover of the April issue of Polish Vogue \...,"[💙, 💛]",cover of the april issue of polish vogue it wa...,cover issue vogue fashion photographer artist ...,april polish,[],0.00,0.00,0.00,...,0,0,0.000000,0,0,0,0.000000,0.0,0.2500,Neutral
124,2022-04-01 00:00:30.000000,#Ukrainian forces successfully conducted local...,[],ukrainian forces successfully conducted local ...,forces counterattacks kyiv towards sumy kherso...,local further northwest coming full,"[(anticipation, 1.0)]",0.00,0.00,1.00,...,0,0,0.000000,0,0,0,0.000000,0.0,0.4939,Positive
125,2022-04-01 00:00:31.000000,⚡The Ukrainian Air Force would like to address...,"[⚡, 🇺🇦, 🧵]",the ukrainian air force would like to address ...,air force misinformation media outlets situati...,multiple western protectuаsky,[],0.00,0.00,0.00,...,0,0,0.000000,0,0,0,0.000000,0.0,0.4404,Positive
127,2022-04-01 00:00:31.000000,❌Myth: The US and @NATO are providing #Ukraine...,"[❌, ✅]",myth the us and are providing ukraine with nee...,weapons allies everything involvement war trut...,needed possible short direct,"[(fear, 0.25), (anger, 0.25), (anticipation, 0...",0.25,0.25,0.25,...,0,0,0.000000,0,0,0,0.210909,0.0,-0.1779,Neutral


In [32]:
### Process dataset
files = ["Putin.csv","Russia.csv","Ukraine.csv","Ukraine Russia War.csv","Zelensky.csv"]
#files = ["Putin.csv"]
ed = EmotionDetector()

for file in files:
    tweets = pd.read_csv('./data/'+file)
    tweets = tweets.rename(columns={'0': 'text'})
    tweets.head()
    tweets.reset_index(drop=True)
    res = ed.process(tweets)
    res.to_csv('./processed_dataset/'+file)

begin to extract emoji 
pre process


100%|██████████| 56962/56962 [03:20<00:00, 284.81it/s]


emotion analysis


100%|██████████| 56962/56962 [01:02<00:00, 916.47it/s] 


sentiment analysis


100%|██████████| 56962/56962 [00:32<00:00, 1770.67it/s]


begin to extract emoji 
pre process


100%|██████████| 50966/50966 [01:55<00:00, 442.52it/s]


emotion analysis


100%|██████████| 50966/50966 [00:49<00:00, 1034.66it/s]


sentiment analysis


100%|██████████| 50966/50966 [00:26<00:00, 1951.29it/s]


begin to extract emoji 
pre process


100%|██████████| 61459/61459 [02:20<00:00, 436.73it/s]


emotion analysis


100%|██████████| 61459/61459 [01:04<00:00, 946.35it/s] 


sentiment analysis


100%|██████████| 61459/61459 [00:36<00:00, 1677.40it/s]


begin to extract emoji 
pre process


100%|██████████| 56962/56962 [02:12<00:00, 431.03it/s]


emotion analysis


100%|██████████| 56962/56962 [01:07<00:00, 849.76it/s] 


sentiment analysis


100%|██████████| 56962/56962 [00:34<00:00, 1640.18it/s]


begin to extract emoji 
pre process


100%|██████████| 59960/59960 [02:09<00:00, 464.34it/s]


emotion analysis


100%|██████████| 59960/59960 [00:55<00:00, 1086.12it/s]


sentiment analysis


100%|██████████| 59960/59960 [00:28<00:00, 2127.43it/s]
