In [33]:
# install necessary libs
!pip install nltk
!pip install pandas
!pip install -U textblob
!pip install advertools

Requirement already up-to-date: textblob in c:\users\51588\anaconda3\lib\site-packages (0.17.1)


In [36]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import re # for regular expressions

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import TweetTokenizer
from textblob import TextBlob
import advertools as adv

In [43]:
# read tweets from csv file
tweets = pd.read_csv('./data/Putin.csv')
tweets = tweets.rename(columns={'0': 'text'})
tweets.head()
tweets.reset_index(drop=True)



Unnamed: 0,text
0,RT @TimothyDSnyder: It is senseless to shelter...
1,"@chaplinez70 Murderous Putin, over to you"
2,RT @EmbassyofRussia: 🇷🇺President #Putin on US ...
3,RT @AnonOpsSE: Putin's propagandist Vladimir S...
4,RT @co_co_no5: @OnlinePalEng I think these Isr...
...,...
56957,RT @yarotrof: They built monuments to Grandma ...
56958,@BorisJohnson @CMShehbaz Mr Putin is an evil m...
56959,RT @McFaul: Those arguing against weapons tran...
56960,RT @GlasnostGone: Illustrating peace talks wit...


In [40]:
from collections import Counter
from tqdm import tqdm
from Lexicon import lexicon
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
tqdm.pandas()

emoji_lexicon = pd.read_csv('resource_folder/EmoTag1200-scores.csv', sep =',')
emoji_dict = emoji_lexicon.set_index('emoji').T.to_dict()
emoji_factor = 0.1

# extention of NRCLex, rewrite build_word_affect mehtod to add emoji analysis
class NRCLexEx():
    def build_word_affect(self):
        #print('build_word_affect')
        # Build word affect function
        affect_list = []
        affect_dict = dict()
        affect_frequencies = Counter()
        lexicon_keys = lexicon.keys()
        for word in self.words:
            if word in lexicon_keys:
                affect_list.extend(lexicon[word])
                affect_dict.update({word: lexicon[word]})
        for word in affect_list:
            affect_frequencies[word] += 1
        sum_values = sum(affect_frequencies.values())
        affect_percent = {'fear': 0.0, 'anger': 0.0, 'anticipation': 0.0, 'trust': 0.0, 'surprise': 0.0, 'sadness': 0.0, 'disgust': 0.0, 'joy': 0.0}
        emoji_affect_percent = {'fear': 0.0, 'anger': 0.0, 'anticipation': 0.0, 'trust': 0.0, 'surprise': 0.0, 'sadness': 0.0, 'disgust': 0.0, 'joy': 0.0}
        for key in affect_frequencies.keys():
            affect_percent.update({key: float(affect_frequencies[key]) / float(sum_values)})

        emoji_affect_frequencies = Counter()
        if len(affect_list) > 0:
            for emoji in self.emojis:
                #print(emoji)
                if emoji in emoji_dict.keys():
                    emoji_affect_dic = emoji_dict[emoji]
                    #print(emoji_affect_frequencies)
                    for key in affect_percent.keys():
                        if emoji_affect_dic[key] > 0:
                            emoji_affect_frequencies[key] += emoji_affect_dic[key]
        emoji_sum_values = sum(emoji_affect_frequencies.values())
        #print('emoji_sum_values',emoji_sum_values)
        #print('emoji_affect_frequencies',emoji_affect_frequencies)
        for key in emoji_affect_frequencies.keys():
            #print(float(emoji_affect_frequencies[key]),float(emoji_sum_values),float(emoji_affect_frequencies[key]) / float(emoji_sum_values))
            #freq = float(emoji_affect_frequencies[key]) / float(emoji_sum_values)
            #print(freq)
            emoji_affect_percent.update({key: float(emoji_affect_frequencies[key]) / float(emoji_sum_values)})
            #print(key,'emoji_affect_frequencies',emoji_affect_frequencies)
        self.affect_list = affect_list
        self.raw_emotion_scores = dict(affect_frequencies)
        self.affect_frequencies = affect_percent
        self.emoji_affect_frequencies = emoji_affect_percent

    def __init__(self,text,emojis):
        self.text = text
        blob = TextBlob(text)
        self.words = list(blob.words)
        self.sentences = list(blob.sentences)
        self.emojis = emojis
        self.build_word_affect()
        self.top_emotions()

    # get top emotins based on accumulate score
    def top_emotions(self):
        emo_dict = self.affect_frequencies
        max_value = max(emo_dict.values())
        top_emotions = []
        for key in emo_dict.keys():
            if emo_dict[key] == max_value and max_value > 0:
                top_emotions.append((key, max_value))
        self.top_emotions = top_emotions
        emoji_emo_dict = self.emoji_affect_frequencies
        emoji_max_value = max(emoji_emo_dict.values())
        top_emoji_emotions = []
        for key in emoji_emo_dict.keys():
            if emoji_emo_dict[key] == emoji_max_value and emoji_max_value > 0:
                top_emoji_emotions.append((key, emoji_max_value))
        self.top_emoji_emotions = top_emoji_emotions

# Test
#text_object = NRCLexEx("o denote the measurements of some unknown quantity V,"
#                        " and sought the  estimator of that quantity",['🌈', '👏', '👏','🙄'])
#print(text_object.affect_frequencies);
#print(text_object.top_emotions);
#print(text_object.top_emoji_emotions);

In [41]:
class EmotionDetector:
    # copyed from nrclex.py, remove positive and negative emotions

    wordnet_lemmatizer = WordNetLemmatizer()
    tk = TweetTokenizer()
    stop_words = set(stopwords.words('english'))
    analyser = SentimentIntensityAnalyzer()
    stemmer = SnowballStemmer("english")

    # initialize lexicon
    def __init__(self, emoji_factor = 1):
        # load lexico from https://github.com/abushoeb/EmoTag for emoji emotion detection
        self.emoji_factor = emoji_factor

    def process(self, dataFrame, columnName='text'):
        print("begin to extract emoji ")
        emoji_summary = adv.extract_emoji(dataFrame[columnName])
        dataFrame['emojis'] = emoji_summary['emoji']
        print("pre process")
        pre_process_res= dataFrame[columnName].progress_apply(EmotionDetector.pre_process)
        dataFrame = pd.concat([dataFrame, pre_process_res], axis=1)
        print("emotion analysis")
        res = dataFrame.progress_apply(self.emotion_analysis,axis=1)
        res.head()
        dataFrame = pd.concat([dataFrame, res], axis=1)
        print("sentiment analysis")
        sentiment_analysis_res = dataFrame.progress_apply(self.sentiment_analysis,axis=1)
        dataFrame = pd.concat([dataFrame, sentiment_analysis_res], axis=1)
        return dataFrame

    # sentiment analysis method
    def sentiment_analysis(self, row):
        words = row["tokens"]
        score = EmotionDetector.analyser.polarity_scores(str(words))
        score=score['compound']
        res = 'Neutral'
        if score>=0.05:
            res = 'Positive'
        elif score<=-0.05:
            res = 'Negative'
        else:
            res = 'Neutral'
        return pd.Series([score,res],
                         index=['sentiment_score', 'sentiment'])

    # emotion analysis method
    def emotion_analysis(self, row):
        #print("1111")
        #print(row)
        text = row["text"]
        emojis = row['emojis']
        emotions = NRCLexEx(text,emojis)
        top_emotions = emotions.top_emotions
        top_emoji_emotions = emotions.top_emoji_emotions
        top_emotions_dict = {'fear': 0, 'anger': 0, 'anticipation': 0, 'trust': 0, 'surprise': 0,
                             'sadness': 0, 'disgust': 0, 'joy': 0}
        top_eomji_emotions_dict = {'e_fear': 0, 'e_anger': 0, 'e_anticipation': 0, 'e_trust': 0, 'e_surprise': 0,
                             'e_sadness': 0, 'e_disgust': 0, 'e_joy': 0}
        #print(top_emotions)
        for key,value in top_emotions:
            top_emotions_dict[key] = value
        for key,value in top_emoji_emotions:
            top_eomji_emotions_dict['e_'+key] = value
        # print(top_emotions_dict.values())
        # print(top_emotions_dict.keys())
        # res = [affect_list,affect_dict,dict(affect_frequencies),affect_percent,top_emotions]
        # res = res.append()
        return pd.Series([top_emotions,*top_emotions_dict.values(),*top_eomji_emotions_dict.values()],
                         index=['top_emotions',*top_emotions_dict.keys(),*top_eomji_emotions_dict.keys()])

    @staticmethod
    def pre_process(text):
        #print(text)
        # clean text
        text = EmotionDetector.cleanText(str(text))
        #print(text)
        #emoji_summary = adv.extract_emoji([text])
        #print(emoji_summary['emoji'])
        #tokenization = nltk.word_tokenize(text)
        # tokenization = EmotionDetector.tk.tokenize(text)

        blob = TextBlob(text)
        filtered_words = [word.lower() for word in blob.words]
        nouns = []
        adjectives = []
        for word, tag in pos_tag(filtered_words):
            # since most tweets contain such words, it is not helpful for future analysis
            if word.startswith("russi") or word.startswith("ukrai"):
                continue
            if tag.startswith("NN"): #Nouns
                nouns.append(word)
            elif tag.startswith("JJ"): #Adjective
                adjectives.append(word)

        # map_object = map(EmotionDetector.stemmer.stem, filtered_words)
        # lemma_words = list(map_object)
        #print(lemma_words)
        return pd.Series([' '.join(filtered_words),' '.join(nouns),' '.join(adjectives)],
                         index=['tokens', 'nouns', 'adjectives'])


    @staticmethod
    def cleanText(text):
        text=text.lower()
        text = re.sub(r'(?i)RT @\w+: ','', text) # remove RT
        text = re.sub(r'@\w+','', text) # remove @member
        text = re.sub(r'#','', text) # remove # symbol
        text = re.sub(r'https?:\/\/\S+','', text) # remove the hyper link
        text = re.sub(r'[^\w\s]', '', text) # remove punctuations
        return text

In [44]:
# test
tweets_short = tweets[0:100]
ed = EmotionDetector()
res = ed.process(tweets_short)
res.head(100)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataFrame['emojis'] = emoji_summary['emoji']


begin to extract emoji 
pre process


100%|██████████| 100/100 [00:00<00:00, 178.91it/s]


emotion analysis


100%|██████████| 100/100 [00:00<00:00, 609.39it/s]


sentiment analysis


100%|██████████| 100/100 [00:00<00:00, 1086.96it/s]


Unnamed: 0,text,emojis,tokens,nouns,adjectives,top_emotions,fear,anger,anticipation,trust,...,e_fear,e_anger,e_anticipation,e_trust,e_surprise,e_sadness,e_disgust,e_joy,sentiment_score,sentiment
0,RT @TimothyDSnyder: It is senseless to shelter...,[],it is senseless to shelter putin from the sens...,putin sense,senseless,"[(anger, 0.25), (sadness, 0.25)]",0.000000,0.250000,0.000000,0.000000,...,0,0,0,0,0.0,0,0,0.0,-0.3818,Negative
1,"@chaplinez70 Murderous Putin, over to you",[],murderous putin over to you,putin,murderous,[],0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.0,0,0,0.0,-0.6369,Negative
2,RT @EmbassyofRussia: 🇷🇺President #Putin on US ...,[🇷🇺],president putin on us biolabs in ukraine their...,president putin task materials spread viruses,biological analyze,[],0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.0,0,0,0.0,0.0000,Neutral
3,RT @AnonOpsSE: Putin's propagandist Vladimir S...,[],putins propagandist vladimir solovyov will nev...,putins solovyov villa shores lake como italy s...,vladimir,[],0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.0,0,0,0.0,0.0000,Neutral
4,RT @co_co_no5: @OnlinePalEng I think these Isr...,[],i think these israelites are definitely more b...,i israelites putin,barbaric,"[(fear, 0.3333333333333333), (anger, 0.3333333...",0.333333,0.333333,0.000000,0.000000,...,0,0,0,0,0.0,0,0,0.0,0.4019,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,RT @suzannelynch1: The ambassadors of #Finland...,[],the ambassadors of finland and sweden are due ...,ambassadors finland sweden headquarters brusse...,due nato,"[(anticipation, 1.0)]",0.000000,0.000000,1.000000,0.000000,...,0,0,0,0,0.0,0,0,0.0,0.4939,Positive
96,RT @TimothyDSnyder: It is senseless to create ...,[],it is senseless to create an offramp in the re...,offramp world putin world,senseless real virtual,"[(fear, 0.14285714285714285), (anger, 0.142857...",0.142857,0.142857,0.000000,0.142857,...,0,0,0,0,0.0,0,0,0.0,0.2732,Positive
97,@longshortgamma @simonkwo2012 Pictures tell a ...,[],pictures tell a thousand words videos tell ten...,pictures thousand words video,,"[(anger, 1.0)]",0.000000,1.000000,0.000000,0.000000,...,0,0,0,0,0.0,0,0,0.0,0.3400,Positive
98,"RT @maxseddon: Putin: ""Giving up Russian energ...",[],putin giving up russian energy resources will ...,putin energy resources region energy costs wor...,highest,"[(joy, 0.3333333333333333)]",0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.0,0,0,0.0,0.6808,Positive


In [None]:
### Process dataset
files = ["Putin.csv","Russia.csv","Ukraine.csv","Ukraine Russia War.csv","Zelensky.csv"]
#files = ["Putin.csv"]
ed = EmotionDetector()

for file in files:
    tweets = pd.read_csv('./data/'+file)
    tweets = tweets.rename(columns={'0': 'text'})
    tweets.head()
    tweets.reset_index(drop=True)
    res = ed.process(tweets)
    res.to_csv('./processed_dataset/'+file)

begin to extract emoji 
pre process


100%|██████████| 56962/56962 [04:10<00:00, 227.27it/s]


emotion analysis


  3%|▎         | 1474/56962 [00:02<03:50, 240.93it/s]