# Sentiment analysis - graph based

General idea:

For each text in dataset to get its sentiment:
1. Get nouns, verbs, adjectives and NERs (e.g. with nltk library) - they will be the nodes of the graph. Single words, bigrams in format word_word.
2. The weights of edges between nodes are created based on the distance in the original text between two given words. The weight is the sum of inverses of distances between them in the whole text.
3. DONE (update: parameter to change if compound or singular function) For each word in node the sentiment is calculated (e.g. with VADER model). The format of final outcome depends on the value of parameter *compound* - if True the normalised compound score from the VADER model is returned as the sentiment of a word in a given node. If False, value -1 (for negative), 0 (for neutral) or 1 (for positive) is returned.
4. The sentiment is later "weighted". This means that for each node the sentiment from model (VADER) is multiplied by scaled_sum_of_weights_of_edges_to_this_node (mean of sum of weights * number of occurencies of the word in original text).
5. The sentiment of the whole text is the normalized sum of weights for all nodes. There is a certain threshold, which scores are treated as neutral. In general, if sum_of_sentiment>0 than positive, < 0 negative, ~0 neutral.



More details about possible configurations are described in Master Thesis. The possibilities include choosing:
* maximum number of nodes in the graph - selected are topk words with most occurencies in the text
* max_distance between words to add the inverse to edge weight
* NER_list to provide NERs as the important ones to calculate and return the sentiment score for them instead of for the whole text
* calculate_overall_score -> to say if user wants to get the score of the whole text or just for given NERs

# Setup

In [15]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
import numpy as np
import pandas as pd
import time
from sklearn.metrics import balanced_accuracy_score
from config_experiments import (
    PATH_TWITTER_DATA,
    PATH_TO_SAVE_RESULTS,
    PATH_NEWS_DATA,
    PATH_NOT_PREPROCESSED,
    PATH_MY_GT,
    PATH_4O,
    PATH_HUMAN,
    PATH_REGEX
)
from langdetect import detect
import random

seed=42
random.seed(seed)
np.random.seed(seed)

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\akaga\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Implementation with examples

## Find words for nodes

In [16]:
def find_ents(doc):
  list_of_ents=[]
  if doc.ents:
    for ent in doc.ents:
      list_of_ents.append(ent.text)
  return list_of_ents

In [17]:
def get_words_for_nodes(text, nlp, list_ners=[], lemmatization=False, max_nodes=0):
  doc1 = nlp(text)
  # find ners
  if type(list_ners)==list and len(list_ners)>0:
    ners=list_ners
  else:
    ners = find_ents(doc1)
  if lemmatization:
    lemmatized_tokens=[token.lemma_ for token in doc1]
    text = ' '.join(lemmatized_tokens)
    doc1=nlp(text)
  tags=['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
  nouns_verbs_etc = [token.text for token in doc1 if token.tag_ in tags]
  text2=" ".join([str(word).replace(" ", "_") for word in list(doc1)])
  all_nodes=[str(word).replace(" ", "_") for word in list(set(ners+nouns_verbs_etc))]
  if max_nodes !=0:
    word_dict = {word: 0 for word in all_nodes}
    for word in text2.split():
      if word in word_dict.keys():
        word_dict[word] +=1
    word_dict_sorted=sorted(word_dict.items(), key=lambda x: x[1], reverse=True)[:max_nodes]
    all_nodes = [item[0] for item in word_dict_sorted]
  else:
    word_dict=[]
  return all_nodes,text2, ners, word_dict

In [18]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("merge_noun_chunks")
all, text3, ners, word_dict = get_words_for_nodes("what is super cool in NewYork and Abu Dhabi, this is NewYork", lemmatization=True, max_nodes=2, nlp=nlp)

In [19]:
word_dict

{'super': 1, 'be': 2, 'Abu_Dhabi': 1, 'NewYork': 2, 'cool': 1}

In [20]:
all

['be', 'NewYork']

In [21]:
all, text3, ners, word_dict = get_words_for_nodes("what is super cool in NewYork and Abu Dhabi, this is NewYork", lemmatization=True, max_nodes=3, nlp=nlp)

In [22]:
all

['be', 'NewYork', 'super']

In [23]:
all, text3, ners, word_dict = get_words_for_nodes("what is super cool in NewYork and Abu Dhabi, this is NewYork", lemmatization=True, max_nodes=5, nlp=nlp)

In [24]:
all

['be', 'NewYork', 'super', 'Abu_Dhabi', 'cool']

In [25]:
word_dict

{'super': 1, 'be': 2, 'Abu_Dhabi': 1, 'NewYork': 2, 'cool': 1}

In [26]:
text3

'what be super cool in NewYork and Abu_Dhabi , this be NewYork'

## Calculate weights of edges

In [27]:
def get_weights_of_edges(text, words, max_distance=20, ner_list=[]):

  list_from_text = text.split()
  weight_matrix=np.zeros((len(words), len(words)))
  occurences_list=np.zeros(len(words))
  for i, word1 in enumerate(list_from_text):
    j=i+1
    try:
      index1=words.index(word1)
      occurences_list[index1]+=1
    except ValueError:
      pass
    while j<=i+max_distance and j<len(list_from_text):
      word2=list_from_text[j]
      if len(ner_list) != 0 and (word1 in ner_list or word2 in ner_list):
        try:
          index1=words.index(word1)
          index2=words.index(word2)
          distance=j-i
          if distance!=0:
            inv_distance=1/distance
            weight_matrix[index1][index2]+=inv_distance
        except ValueError:
          pass
      elif len(ner_list)==0:
        try:
          index1=words.index(word1)
          index2=words.index(word2)
          distance=j-i
          if distance!=0:
            inv_distance=1/distance
            weight_matrix[index1][index2]+=inv_distance
        except ValueError:
          pass
      else:
        pass
      j+=1
  return weight_matrix, occurences_list

In [28]:
text3

'what be super cool in NewYork and Abu_Dhabi , this be NewYork'

In [29]:
all

['be', 'NewYork', 'super', 'Abu_Dhabi', 'cool']

In [30]:
weight_matrix, occ_list=get_weights_of_edges(text3, all, max_distance=20, ner_list=[])

In [31]:
occ_list

array([2., 2., 1., 1., 1.])

In [32]:
weight_matrix

array([[0.11111111, 1.35      , 1.        , 0.16666667, 0.5       ],
       [0.2       , 0.16666667, 0.        , 0.5       , 0.        ],
       [0.125     , 0.44444444, 0.        , 0.2       , 1.        ],
       [0.33333333, 0.25      , 0.        , 0.        , 0.        ],
       [0.14285714, 0.625     , 0.        , 0.25      , 0.        ]])

In [33]:
weight_matrix2, occ_list2=get_weights_of_edges(text3, all, max_distance=20, ner_list=['super'])
weight_matrix2

array([[0.        , 0.        , 1.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.125     , 0.44444444, 0.        , 0.2       , 1.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

## Find sentiment for each node

'compound' - The normalized compound score which calculates the sum of all lexicon ratings and takes values from -1 to 1

In [34]:
def calculate_sentiment_for_nodes(text, compound=False):
  analyzer = SentimentIntensityAnalyzer()
  sentiment_scores=np.zeros(len(text))
  for i, text in enumerate(text):
    scores=analyzer.polarity_scores(text)
    if compound:
      sentiment_scores[i]=scores['compound']
    else:
      if scores['neg']==1.0:
        sentiment_scores[i]=-1
      elif scores['pos']==1.0:
        sentiment_scores[i]=1
  return sentiment_scores

In [35]:
calculate_sentiment_for_nodes(all, compound=False)

array([0., 0., 1., 0., 1.])

In [36]:
sentiment_scores=calculate_sentiment_for_nodes(all, compound=True)

## Weighting of the sentiment

In [37]:
weight_matrix

array([[0.11111111, 1.35      , 1.        , 0.16666667, 0.5       ],
       [0.2       , 0.16666667, 0.        , 0.5       , 0.        ],
       [0.125     , 0.44444444, 0.        , 0.2       , 1.        ],
       [0.33333333, 0.25      , 0.        , 0.        , 0.        ],
       [0.14285714, 0.625     , 0.        , 0.25      , 0.        ]])

In [39]:
sentiment_scores

array([0.    , 0.    , 0.5994, 0.    , 0.3182])

In [40]:
occ_list

array([2., 2., 1., 1., 1.])

In [41]:
def weighted_sentiment_func(weight_matrix, occ_list, sentiment_scores, words, ner_list=[]):
  sum_columns_weights=np.sum(weight_matrix, axis=0)
  sum_rows_weights=np.sum(weight_matrix, axis=1)
  if len(ner_list)!=0: #if ners are predefined, calculate only for them
    for i, word in enumerate(words):
      if word not in ner_list:
        sum_columns_weights[i]=0
        sum_rows_weights[i]=0
  sum_all_weights=sum_columns_weights+sum_rows_weights
  count_nonzero_weights_columns=np.count_nonzero(weight_matrix, axis=0)
  count_nonzero_weights_rows=np.count_nonzero(weight_matrix, axis=1)
  count_nonzero_weights=count_nonzero_weights_columns+count_nonzero_weights_rows
  count_nonzero_weights[count_nonzero_weights==0]=1
  mean_columns_weights=sum_all_weights/count_nonzero_weights
  total_weight=mean_columns_weights*occ_list
  return total_weight*sentiment_scores

In [42]:
weight_matrix2

array([[0.        , 0.        , 1.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.125     , 0.44444444, 0.        , 0.2       , 1.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

In [43]:
weight_matrix

array([[0.11111111, 1.35      , 1.        , 0.16666667, 0.5       ],
       [0.2       , 0.16666667, 0.        , 0.5       , 0.        ],
       [0.125     , 0.44444444, 0.        , 0.2       , 1.        ],
       [0.33333333, 0.25      , 0.        , 0.        , 0.        ],
       [0.14285714, 0.625     , 0.        , 0.25      , 0.        ]])

In [44]:
weighted_sentiment=weighted_sentiment_func(weight_matrix, occ_list, sentiment_scores, all)

In [45]:
weighted_sentiment

array([0.        , 0.        , 0.332001  , 0.        , 0.16023643])

In [46]:
weighted_sentiment=weighted_sentiment_func(weight_matrix2, occ_list, sentiment_scores, all, ner_list=["super"])

In [47]:
all #'super' is third on list

['be', 'NewYork', 'super', 'Abu_Dhabi', 'cool']

In [48]:
weighted_sentiment #value of sentiment for 'super' is the third one - the only non zero here

array([0.      , 0.      , 0.332001, 0.      , 0.      ])

## Sentiment of the whole text

In [49]:
def calculate_sentiment_of_text(weighted_sentiment, threshold=0.05, output_number=False):
  sum_sentiment=np.sum(weighted_sentiment)
  if output_number:
    if sum_sentiment>threshold:
      return 1
    elif sum_sentiment<-threshold:
      return -1
    else:
      return 0
  else:
    if sum_sentiment>threshold:
      return "positive"
    elif sum_sentiment<-threshold:
      return "negative"
    else:
      return "neutral"

In [50]:
calculate_sentiment_of_text(weighted_sentiment)

'positive'

In [51]:
calculate_sentiment_of_text([-1,2,-4])

'negative'

In [52]:
calculate_sentiment_of_text([0.5, -0.45], output_number=True)

0

## Run graph-based solution

In [53]:
def graph_sentiment_analysis(text, nlp, lemmatization=False, max_distance=20, ner_list=[], compound=False, output_number=False, calculate_overall_score=1, threshold=0.05, max_nodes=0):
  words_all, text_all, ners_all, words_dict = get_words_for_nodes(text, lemmatization=lemmatization, nlp=nlp, max_nodes=max_nodes)
  weight_matrix_all, occ_list_all=get_weights_of_edges(text_all, words_all, max_distance=max_distance, ner_list=ner_list)
  sentiment_scores_all=calculate_sentiment_for_nodes(words_all, compound=compound)
  weighted_sentiment_all=weighted_sentiment_func(weight_matrix_all, occ_list_all, sentiment_scores_all, words=words_all, ner_list=ner_list)
  if calculate_overall_score==1:
    return calculate_sentiment_of_text(weighted_sentiment_all, output_number=output_number, threshold=threshold)
  elif calculate_overall_score==0:
    return dict(zip(words_all, weighted_sentiment_all))
  else:
    words_all.append("overall_sentiment")
    weighted_sentiment_all=list(weighted_sentiment_all)
    weighted_sentiment_all.append(calculate_sentiment_of_text(weighted_sentiment_all, output_number=True))
    return dict(zip(words_all, weighted_sentiment_all))

In [54]:
graph_sentiment_analysis("you are liar, Tom", calculate_overall_score=1, nlp=nlp)

'negative'

In [55]:
graph_sentiment_analysis("you are liar, Tom", calculate_overall_score=0, nlp=nlp)

{'Tom': np.float64(0.0), 'liar': np.float64(-0.75), 'are': np.float64(0.0)}

In [56]:
graph_sentiment_analysis("you are liar, Tom", calculate_overall_score=3, nlp=nlp, max_nodes=2)

{'Tom': np.float64(0.0), 'liar': np.float64(-0.5), 'overall_sentiment': -1}

In [57]:
graph_sentiment_analysis("you are liar, Tom", calculate_overall_score=3, nlp=nlp)

{'Tom': np.float64(0.0),
 'liar': np.float64(-0.75),
 'are': np.float64(0.0),
 'overall_sentiment': -1}

In [58]:
graph_sentiment_analysis("you are liar, Tom", calculate_overall_score=3, ner_list=['Tom', 'liar'], nlp=nlp)

{'Tom': np.float64(0.0),
 'liar': np.float64(-0.75),
 'are': np.float64(0.0),
 'overall_sentiment': -1}

# Tests for twitter sentiment extraction dataset

Source of dataset: https://www.kaggle.com/competitions/tweet-sentiment-extraction/data

In [59]:
test_tse_data=pd.read_csv(PATH_TWITTER_DATA)

In [60]:
test_tse_data

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive
...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative
3530,416863ce47,All alone in this old house again. Thanks for...,positive
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive


In [61]:
test_tse_data["predicted_sentiment"]=''

In [64]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("merge_noun_chunks")
for i in range(test_tse_data.shape[0]):
  if i%200==0:
    print(i)
  text_to_check=test_tse_data['text'][i]
  test_tse_data['predicted_sentiment'][i]=graph_sentiment_analysis(text_to_check, calculate_overall_score=1, nlp=nlp)

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400


In [65]:
test_tse_data

Unnamed: 0,textID,text,sentiment,predicted_sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,neutral
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,negative
3,01082688c6,happy bday!,positive,neutral
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,neutral
...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative,negative
3530,416863ce47,All alone in this old house again. Thanks for...,positive,positive
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative,negative
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive,positive


In [66]:
accuracy=np.sum(test_tse_data['sentiment']==test_tse_data['predicted_sentiment'])/test_tse_data.shape[0]
ba_tse=balanced_accuracy_score(test_tse_data['sentiment'], test_tse_data['predicted_sentiment'])
accuracy, ba_tse

(np.float64(0.601018675721562), np.float64(0.5960121474174058))

## compound=True

In [67]:
test_tse_data["predicted_sentiment_compound"]=''

In [68]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("merge_noun_chunks")
for i in range(test_tse_data.shape[0]):
  if i%200==0:
    print(i)
  text_to_check=test_tse_data['text'][i]
  test_tse_data['predicted_sentiment_compound'][i]=graph_sentiment_analysis(text_to_check, calculate_overall_score=1, nlp=nlp, compound=True)

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400


In [69]:
accuracy_compound=np.sum(test_tse_data['sentiment']==test_tse_data['predicted_sentiment_compound'])/test_tse_data.shape[0]
ba_tse_compound=balanced_accuracy_score(test_tse_data['sentiment'], test_tse_data['predicted_sentiment_compound'])
accuracy_compound, ba_tse_compound

(np.float64(0.6063950198075835), np.float64(0.5945859821653113))

In [70]:
test_tse_data.to_csv(PATH_TO_SAVE_RESULTS + 'test_tse_data_graph.csv', index=False)

# Tests for news sentiment

Source: https://www.kaggle.com/datasets/hoshi7/news-sentiment-dataset

In [71]:
test_ns_data2=pd.read_csv(PATH_NEWS_DATA)
test_ns_data2

Unnamed: 0,news_title,reddit_title,sentiment,text,url
0,Mark Cuban launches generic drug company,Billionaire Mark Cuban just launched a drug co...,1.0,Billionaire investor and Shark Tank star Mark ...,https://www.beckershospitalreview.com/pharmacy...
1,From Defendant to Defender: One Wrongfully Con...,"Man falsely imprisoned for 10 years, uses pris...",1.0,Attorney Jarrett Adams recently helped overtur...,https://www.nbcnews.com/news/us-news/defendant...
2,"Amazon Tribe Wins Lawsuit Against Big Oil, Sav...",Amazon tribe wins legal battle against oil com...,1.0,The Amazon Rainforest is well known across the...,https://www.disclose.tv/amazon-tribe-wins-laws...
3,Newark police: No officer fired a single shot ...,Newark police: No officer fired a single shot ...,1.0,Newark police: No officer fired a single shot ...,https://newjersey.news12.com/newark-police-no-...
4,Ingen barn døde i trafikken i 2019,No children died in traffic accidents in Norwa...,1.0,I 1970 døde det 560 mennesker i den norske tra...,https://www.nrk.no/trondelag/ingen-barn-dode-i...
...,...,...,...,...,...
843,Dee Why attack: Man allegedly choked and threa...,Dee Why attack: Man allegedly choked and threa...,0.0,Frightening details have emerged about a toile...,https://www.9news.com.au/2018/11/30/17/55/sydn...
844,Africa: Children and HIV/Aids - 'We Need to Ta...,Africa: Children and HIV/Aids - 'We Need to Ta...,0.0,"interview\n\nJohannesburg — 360,000 adolescent...",https://allafrica.com/stories/201811300567.html
845,Terrorism suspected in Eilat attack,Terrorism suspected in Eilat attack,0.0,A violent attack in the southern Israeli port ...,http://www.israelnationalnews.com/News/News.as...
846,Anti-Semitism never disappeared in Europe. It'...,Anti-Semitism never disappeared in Europe. It'...,0.0,"It's a 17-year-old boy, too frightened to wear...",https://edition.cnn.com/2018/11/27/europe/anti...


In [72]:
def map_values(x):
  if x==1:
    return 'positive'
  elif x==0:
    return 'negative'
  else:
    return 'neutral'

test_ns_data2['sentiment']=test_ns_data2['sentiment'].apply(map_values)
test_ns_data2["predicted_sentiment"]=''

In [73]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("merge_noun_chunks")
for i in range(test_ns_data2.shape[0]):
  if i%200==0:
    print(i)
  text_to_check=test_ns_data2['text'][i]
  test_ns_data2['predicted_sentiment'][i]=graph_sentiment_analysis(text_to_check, calculate_overall_score=1, nlp=nlp, threshold=0.0)

0
200
400
600
800


In [74]:
accuracy_ns=np.sum(test_ns_data2['sentiment']==test_ns_data2['predicted_sentiment'])/test_ns_data2.shape[0]
ba_ns=balanced_accuracy_score(test_ns_data2['sentiment'], test_ns_data2['predicted_sentiment'])
accuracy_ns, ba_ns

(np.float64(0.7570754716981132), np.float64(0.767005347593583))

In [75]:
test_ns_data2['predicted_sentiment'].value_counts()

predicted_sentiment
positive    583
negative    258
neutral       7
Name: count, dtype: int64

## compound=True

In [76]:
test_ns_data2["predicted_sentiment_compound"]=''

In [77]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("merge_noun_chunks")
for i in range(test_ns_data2.shape[0]):
  if i%200==0:
    print(i)
  text_to_check=test_ns_data2['text'][i]
  test_ns_data2['predicted_sentiment_compound'][i]=graph_sentiment_analysis(text_to_check, calculate_overall_score=1, nlp=nlp, threshold=0.0, compound=True)

0
200
400
600
800


In [78]:
accuracy_ns_compound=np.sum(test_ns_data2['sentiment']==test_ns_data2['predicted_sentiment_compound'])/test_ns_data2.shape[0]

In [79]:
ba_ns_compound=balanced_accuracy_score(test_ns_data2['sentiment'], test_ns_data2['predicted_sentiment_compound'])

In [80]:
accuracy_ns_compound,ba_ns_compound

(np.float64(0.7429245283018868), np.float64(0.7633155080213904))

In [81]:
test_ns_data2.to_csv(PATH_TO_SAVE_RESULTS+'/test_ns_data2.csv', index=False)

In [82]:
test_ns_data2['predicted_sentiment_compound'].value_counts()

predicted_sentiment_compound
positive    569
negative    272
neutral       7
Name: count, dtype: int64

# Check which ones are not in English

Using: https://github.com/fedelopez77/langdetect

In [83]:
def is_english(text):
  try:
    lang = detect(text)
    return lang == 'en'
  except: #e.g. unsupported language
    return False

### News Sentiment dataset

In [84]:
test_ns_data2.loc[63]['text']

'Üniversite birçok öğrenci için kazandıktan sonra rahatlayacağını düşündüğü bir aşama. Gençliğin tam olarak yaşandığı, özgürlüğün tadıldığı ve derslerin biraz ikinci planda kaldığı bu evre bazıları için hedeflerine ulaşacakları zorlu bir yol. Kocaeli’nde yaşayan 22 yaşındaki Berru Merve Kul da 4 yıl önce Sakarya Üniversitesi’ni kazandı ancak görme engelli olduğu için önünde zorlu ve uzun bir yol vardı. Neyse ki annesi 4 yıl boyunca olduğu gibi mezun olurken de yanındaydı…\n\n22 yaşındaki Berru Merve Kul da 4 yıl önce Sakarya Üniversitesi Hukuk Fakültesi’ni kazandı. Ancak görme engelli olduğu için üniversite yaşamı onun için diğer öğrencilere göre biraz daha zordu\n\n4 yıllık eğitim hayatı boyunca onun eli ayağı olan kişi ise annesiydi. Annesi Havva Kul 4 yıl boyunca kızına tüm notlarını, kitaplarını okuyarak ödevlerini yapmasını, sınavlardan başarıyla geçmesini sağladı\n\nVe 4 yılın ardından Berru Merve Kul okuldan başarıyla mezun oldu. Sakarya Üniversitesi Hukuk Fakültesi Binasında ya

In [85]:
is_english(test_ns_data2.loc[63]['text'])

False

In [86]:
test_ns_data_english=test_ns_data2[test_ns_data2['text'].apply(is_english)]
test_ns_data_english

Unnamed: 0,news_title,reddit_title,sentiment,text,url,predicted_sentiment,predicted_sentiment_compound
0,Mark Cuban launches generic drug company,Billionaire Mark Cuban just launched a drug co...,positive,Billionaire investor and Shark Tank star Mark ...,https://www.beckershospitalreview.com/pharmacy...,positive,positive
1,From Defendant to Defender: One Wrongfully Con...,"Man falsely imprisoned for 10 years, uses pris...",positive,Attorney Jarrett Adams recently helped overtur...,https://www.nbcnews.com/news/us-news/defendant...,positive,positive
2,"Amazon Tribe Wins Lawsuit Against Big Oil, Sav...",Amazon tribe wins legal battle against oil com...,positive,The Amazon Rainforest is well known across the...,https://www.disclose.tv/amazon-tribe-wins-laws...,positive,positive
3,Newark police: No officer fired a single shot ...,Newark police: No officer fired a single shot ...,positive,Newark police: No officer fired a single shot ...,https://newjersey.news12.com/newark-police-no-...,negative,negative
5,"Budweiser will sit out Super Bowl, funneling m...","Budweiser will sit out Super Bowl, funneling m...",positive,Budweiser will not be running a commercial dur...,https://www.cnbc.com/2021/01/25/super-bowl-bud...,positive,positive
...,...,...,...,...,...,...,...
843,Dee Why attack: Man allegedly choked and threa...,Dee Why attack: Man allegedly choked and threa...,negative,Frightening details have emerged about a toile...,https://www.9news.com.au/2018/11/30/17/55/sydn...,negative,negative
844,Africa: Children and HIV/Aids - 'We Need to Ta...,Africa: Children and HIV/Aids - 'We Need to Ta...,negative,"interview\n\nJohannesburg — 360,000 adolescent...",https://allafrica.com/stories/201811300567.html,positive,positive
845,Terrorism suspected in Eilat attack,Terrorism suspected in Eilat attack,negative,A violent attack in the southern Israeli port ...,http://www.israelnationalnews.com/News/News.as...,negative,negative
846,Anti-Semitism never disappeared in Europe. It'...,Anti-Semitism never disappeared in Europe. It'...,negative,"It's a 17-year-old boy, too frightened to wear...",https://edition.cnn.com/2018/11/27/europe/anti...,negative,negative


In [87]:
accuracy_ns_compound_english=np.sum(test_ns_data_english['sentiment']==test_ns_data_english['predicted_sentiment_compound'])/test_ns_data_english.shape[0]
ba_ns_compound_english=balanced_accuracy_score(test_ns_data_english['sentiment'], test_ns_data_english['predicted_sentiment_compound'])
accuracy_ns_compound_english, ba_ns_compound_english

(np.float64(0.747016706443914), np.float64(0.7763123113095983))

In [88]:
test_ns_data_english['predicted_sentiment_compound'].value_counts()

predicted_sentiment_compound
positive    565
negative    271
neutral       2
Name: count, dtype: int64

In [89]:
test_ns_data2[~test_ns_data2['text'].apply(is_english)] #seems to perform good for longer texts

Unnamed: 0,news_title,reddit_title,sentiment,text,url,predicted_sentiment,predicted_sentiment_compound
4,Ingen barn døde i trafikken i 2019,No children died in traffic accidents in Norwa...,positive,I 1970 døde det 560 mennesker i den norske tra...,https://www.nrk.no/trondelag/ingen-barn-dode-i...,positive,positive
63,Görme Engelli Kızına 4 Yıl Boyunca Notlarını O...,Turkish mom who read lecture notes for four ye...,positive,Üniversite birçok öğrenci için kazandıktan son...,https://listelist.com/sakarya-gorme-engelli-an...,positive,positive
104,Incendie à Notre-Dame : la famille Pinault déb...,French billionaire François-Henri Pinault pled...,positive,La famille Pinault va débloquer cent millions ...,http://www.lefigaro.fr/flash-actu/notre-dame-d...,neutral,neutral
159,Монголия приняла меры по облегчению положения ...,"Mongolia will pay for electricity, water, heat...",positive,13 декабря состоялось внеочередное заседание к...,http://www.mongolnow.com/mongoliya-prinyala-me...,neutral,neutral
187,Natale: 94enne solo a casa chiama Cc per fare ...,"94 years old man calls the police: ""I got ever...",positive,"(ANSA) - BOLOGNA, 25 DIC - Ha telefonato ai Ca...",https://www.ansa.it/amp/emiliaromagna/notizie/...,positive,positive
458,Coronavirus-Pandemie: Bosch erfindet eigenen C...,German company Bosch produces 95% accurate tes...,positive,Schneller und sicherer auf das Virus testen – ...,https://www.faz.net/aktuell/wirtschaft/digitec...,negative,negative
538,"Coronavirus, a Rimini guarisce anziano di 101 ...","101 year old man, born during the Spanish flu,...",positive,Nemmeno a 101 anni il futuro è scritto.Non lo ...,http://www.today.it/attualita/coronavirus-guar...,positive,positive
752,Больше половины краж в Казахстане не раскрываю...,More than half of thefts in Kazakstan are not ...,negative,"По данным Министерства внутренних дел, в 2017 ...",https://tengrinews.kz/crime/bolshe-polovinyi-k...,neutral,neutral
766,Мать бросила младенца на обочине дороги в спор...,Mother threw the baby on the side of the road ...,negative,В Туркестанской области женщина бросила новоро...,https://www.nur.kz/1746113-mat-brosila-mladenc...,neutral,neutral
771,Издевательства мальчика над женщиной в Капшага...,Bullying of a boy over a woman in Kapshagai: t...,negative,В ДВД Алматинской области прокомментировали ви...,https://tengrinews.kz/kazakhstan_news/izdevate...,neutral,neutral


### Tweet sentiment extraction dataset

In [90]:
test_tse_data_english=test_tse_data[test_tse_data['text'].apply(is_english)]

In [91]:
accuracy_tse_compound_english=np.sum(test_tse_data_english['sentiment']==test_tse_data_english['predicted_sentiment_compound'])/test_tse_data_english.shape[0]

In [92]:
ba_tse_compound_english=balanced_accuracy_score(test_tse_data_english['sentiment'], test_tse_data_english['predicted_sentiment_compound'])

In [93]:
accuracy_tse_compound_english, ba_tse_compound_english #in this dataset nothing changes

(np.float64(0.6047516198704104), np.float64(0.5950546252576678))

# test on ACTAWARE data - chosen subset

In [94]:
path_list=(PATH_NOT_PREPROCESSED)
df_my=pd.read_csv(PATH_MY_GT, sep=';')
with open(path_list, 'r') as file:
  list_of_contents_new = file.readlines()

In [95]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("merge_noun_chunks")

<function spacy.pipeline.functions.merge_noun_chunks(doc: spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc>

In [96]:
actaware_df=pd.DataFrame({'Text': list_of_contents_new, 'Sentiment': ""})
actaware_df.head()

Unnamed: 0,Text,Sentiment
0,People work in the Amazon Fulfillment Center i...,
1,A federal agency is seeking to force Starbucks...,
2,You might have seen a new energy drink on Amaz...,
3,The BBC's director-general has tried to calm t...,
4,Amazon is running a competition to give its br...,


In [97]:
for i in range(len(list_of_contents_new)):
  if i%200==0:
    print(i)
  text_to_check=list_of_contents_new[i]
  actaware_df['Sentiment'][i]=graph_sentiment_analysis(text_to_check, calculate_overall_score=1, nlp=nlp, threshold=0.0, compound=True)

0


In [98]:
actaware_df.to_csv(PATH_TO_SAVE_RESULTS+'actaware_df_with_sentiment_graph_chosen_articles.csv', index=False)

In [99]:
actaware_df.head()

Unnamed: 0,Text,Sentiment
0,People work in the Amazon Fulfillment Center i...,negative
1,A federal agency is seeking to force Starbucks...,positive
2,You might have seen a new energy drink on Amaz...,positive
3,The BBC's director-general has tried to calm t...,positive
4,Amazon is running a competition to give its br...,positive


In [100]:
balanced_accuracy_score(df_my['Sentiment'], actaware_df['Sentiment'])

np.float64(0.5584921614333379)

# Experiments

In [101]:
def run_actaware(file_name, df_my, nlp):
  if file_name=='chosen_articles':
    path_list=(PATH_NOT_PREPROCESSED)
  elif file_name=='chosen_articles_cleaned_4o':
    path_list=(PATH_4O)
  elif file_name=='chosen_articles_cleaned_by_me':
    path_list=(PATH_HUMAN)
  elif file_name=='chosen_articles_cleaned_regex':
    path_list=(PATH_REGEX)
  else:
    print("Wrong dataset name.")
  with open(path_list, 'r') as file:
    list_of_contents_new = file.readlines()
  actaware_df=pd.DataFrame({'Text': list_of_contents_new, 'Sentiment': ""})
  for i in range(len(list_of_contents_new)):
    text_to_check=list_of_contents_new[i]
    actaware_df['Sentiment'][i]=graph_sentiment_analysis(text_to_check, calculate_overall_score=1, nlp=nlp, threshold=0.0, compound=True)
  actaware_df.to_csv(f'{PATH_TO_SAVE_RESULTS}/actaware_df_with_sentiment_graph_{file_name}.csv', index=False)
  return balanced_accuracy_score(df_my['Sentiment'], actaware_df['Sentiment'])

In [102]:
df_my=pd.read_csv(PATH_MY_GT, sep=';')
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("merge_noun_chunks")

<function spacy.pipeline.functions.merge_noun_chunks(doc: spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc>

In [110]:
list_names_scores_iter_time=[]
for i in range(3):
  file_name='chosen_articles'
  print(i, file_name)
  start_time=time.time()
  ba_1=run_actaware(file_name, df_my, nlp)
  end_time=time.time()
  print(ba_1)
  list_names_scores_iter_time.append([file_name, ba_1, i, end_time-start_time])


for i in range(3):
  file_name='chosen_articles_cleaned_4o'
  print(i, file_name)
  start_time=time.time()
  ba_1=run_actaware(file_name, df_my, nlp)
  end_time=time.time()
  print(ba_1)
  list_names_scores_iter_time.append([file_name, ba_1, i, end_time-start_time])


for i in range(3):
  file_name='chosen_articles_cleaned_by_me'
  print(i, file_name)
  start_time=time.time()
  ba_1=run_actaware(file_name, df_my, nlp)
  end_time=time.time()
  print(ba_1)
  list_names_scores_iter_time.append([file_name, ba_1, i, end_time-start_time])

for i in range(3):
  file_name='chosen_articles_cleaned_regex'
  print(i, file_name)
  start_time=time.time()
  ba_1=run_actaware(file_name, df_my, nlp)
  end_time=time.time()
  print(ba_1)
  list_names_scores_iter_time.append([file_name, ba_1, i, end_time-start_time])

0 chosen_articles
0.5584921614333379
1 chosen_articles
0.5584921614333379
2 chosen_articles
0.5584921614333379
0 chosen_articles_cleaned_4o
0.6049179578591343
1 chosen_articles_cleaned_4o
0.6049179578591343
2 chosen_articles_cleaned_4o
0.6049179578591343
0 chosen_articles_cleaned_by_me
0.5584921614333379
1 chosen_articles_cleaned_by_me
0.5584921614333379
2 chosen_articles_cleaned_by_me
0.5584921614333379
0 chosen_articles_cleaned_regex
0.5584921614333379
1 chosen_articles_cleaned_regex
0.5584921614333379
2 chosen_articles_cleaned_regex
0.5584921614333379


As the algorithm is deterministic, each run for graph based solution gives exactly the same results for the same dataset.

In [112]:
list_names_scores_iter_time

[['chosen_articles', np.float64(0.5584921614333379), 0, 30.808367252349854],
 ['chosen_articles', np.float64(0.5584921614333379), 1, 19.118587970733643],
 ['chosen_articles', np.float64(0.5584921614333379), 2, 19.342491149902344],
 ['chosen_articles_cleaned_4o',
  np.float64(0.6049179578591343),
  0,
  14.115129947662354],
 ['chosen_articles_cleaned_4o',
  np.float64(0.6049179578591343),
  1,
  16.85368847846985],
 ['chosen_articles_cleaned_4o',
  np.float64(0.6049179578591343),
  2,
  19.26258420944214],
 ['chosen_articles_cleaned_by_me',
  np.float64(0.5584921614333379),
  0,
  23.69186496734619],
 ['chosen_articles_cleaned_by_me',
  np.float64(0.5584921614333379),
  1,
  19.002342224121094],
 ['chosen_articles_cleaned_by_me',
  np.float64(0.5584921614333379),
  2,
  22.143868684768677],
 ['chosen_articles_cleaned_regex', 0.5584921614333379, 0, 21.723713636398315],
 ['chosen_articles_cleaned_regex', 0.5584921614333379, 1, 20.625104904174805],
 ['chosen_articles_cleaned_regex', 0.5584

In [4]:
with open(PATH_TO_SAVE_RESULTS+'graph_sentiment_results_chosen_actaware.txt', 'w') as f:
    for line in list_names_scores_iter_time:
        f.write(f"{line}\n")