In [1]:
!pip install nltk



Vader scores calculation: https://github.dev/cjhutto/vaderSentiment/blob/master/vaderSentiment/vaderSentiment.py
- compound: float(sum(sentiments_of_words)) +/- coefficient connected to number of "!" and "?" in the sentence; later normalized
- pos: pos_sum +/- coefficient connected to number of "!" and "?", divided by total sentiment and abs
- neg: neg_sum +/- coefficient connected to number of "!" and "?", divided by total sentiment(pos+abs(neg)+neutral) and abs
- neu: neu_count, divided by total sentiment(pos+abs(neg)+neutral) and abs


# Sentiment analysis - graph based

Sentiment analysis - graph based - general idea:

For each text in dataset to get its sentiment:
1. Get nouns, verbs, adjectives and NERs (e.g. with nltk library) - they will be the nodes of the graph. Only single words, no bigrams.
2. The weights of edges between nodes will be created based on the distance in the original text between two given words. The weight will be the sum of inverses of distances between them in the whole text.
3. For each word in node the sentiment will be calculated (e.g. with VADER model). The final outcome will be -1 for negative, 0 for neutral and 1 for positive.
4. The sentiment will later be "weighted". This means that for each node the sentiment from VADER will be multiplied by scaled_sum_of_weights_of_edges_to_this_node (mean of sum of weights * number of occurencies of the word in original text).
5. The sentiment of the whole text will be the normalized sum of weights for all nodes. There will be a certain threshold, which scores will be treated as neutral. In general, if sum_of_sentiment > 0 than positive, < 0 negative, ~0 neutral.



More details:
- there will be a possibility to choose
		- maximum number of nodes in the graph - selected will be topk words with most occurencies in the text
		- max_distance between words to add the inverse to edge weight
		- NER_list to provide NERs as the important ones to calculate and return the sentiment score for them instead of for the whole text
		- if_calculate_overall_score -> to say if user wants to get the score of the whole text or just for given NERs
- maybe just available dict with words and their sentiment

# Ideas used:
- for NER https://spacy.io/api/entityrecognizer

- https://www.nltk.org/book/ch05.html

- "NLTK offers flexible algorithms for tasks like tokenization and part-of-speech tagging, while spaCy is renowned for its speed and performance, ideal for efficient NLP solutions."

- https://spacy.io/usage/linguistic-features

In [1]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
import spacy

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


In [2]:
def find_ents(doc):
  list_of_ents=[]
  if doc.ents:
    for ent in doc.ents:
      list_of_ents.append(ent.text)
  return list_of_ents

In [3]:
def get_words_for_nodes(text, list_ners=[], lemmatization=False):
  nlp = spacy.load('en_core_web_sm')
  nlp.add_pipe("merge_noun_chunks")
  doc1 = nlp(text)
  # find ners
  if type(list_ners)==list and len(list_ners)>0:
    ners=list_ners
  else:
    ners = find_ents(doc1) #maybe TO DO: find better way to get NER (what was Actaware idea for that? they are happy with it, so...)
    print(ners)
  # if I put lemmatization before ners, they do not catch everything, e.g. "NY"
  if lemmatization:
    #a co z podmiankami w tekście, jeśli on zwróci all w których jest "be" a w tekście są "is"?
    #TO DO: one of the experiments: does lemmatization change accuracy? and what is the influence on the performance?
    lemmatized_tokens=[token.lemma_ for token in doc1]
    text = ' '.join(lemmatized_tokens)
    doc1=nlp(text)
    # print(text)
  tags=['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
  nouns_verbs_etc = [token.text for token in doc1 if token.tag_ in tags]
  text2=" ".join([str(word).replace(" ", "_") for word in list(doc1)])
  all_nodes=[str(word).replace(" ", "_") for word in list(set(ners+nouns_verbs_etc))]
  return all_nodes,text2, ners

In [21]:
all, text3, ners = get_words_for_nodes("what is super cool in NewYork and Abu Dhabi, this is NewYork", lemmatization=True)

['NewYork', 'Abu Dhabi', 'NewYork']


In [22]:
all

['be', 'NewYork', 'super', 'cool', 'Abu_Dhabi']

In [23]:
text3

'what be super cool in NewYork and Abu_Dhabi , this be NewYork'

Links:
- https://spacy.io/universe/project/video-spacys-ner-model
- https://stackoverflow.com/questions/15388831/what-are-all-possible-pos-tags-of-nltk

In [7]:
str1="what is good in New York?"
str2="New York"

str2 in str1

True

## weights of edges

In [8]:
import numpy as np

In [72]:
def get_weights_of_edges(text, words, max_distance=20, ner_list=[]):
  # if ner_list not empty, we should calculate the distances only between ners and other words
  # and do not between other words and other words

  list_from_text = text.split()
  weight_matrix=np.zeros((len(words), len(words)))
  occurences_list=np.zeros(len(words))
  for i, word1 in enumerate(list_from_text):
    j=i+1
    try:
      index1=words.index(word1)
      occurences_list[index1]+=1
    except ValueError:
      pass
    while j<=i+max_distance and j<len(list_from_text):
      word2=list_from_text[j]
      if len(ner_list) != 0 and (word1 in ner_list or word2 in ner_list):
        try:
          index1=words.index(word1)
          index2=words.index(word2)
          distance=j-i
          if distance!=0:
            inv_distance=1/distance
            weight_matrix[index1][index2]+=inv_distance
        except ValueError:
          pass
      elif len(ner_list)==0:
        try:
          index1=words.index(word1)
          index2=words.index(word2)
          distance=j-i
          if distance!=0:
            inv_distance=1/distance
            weight_matrix[index1][index2]+=inv_distance
        except ValueError:
          pass
      else:
        pass
      j+=1
  upper_right_ones=np.triu(np.ones(len(words)))
  return (weight_matrix+weight_matrix.T)*upper_right_ones, occurences_list

In [62]:
text3

'what be super cool in NewYork and Abu_Dhabi , this be NewYork'

In [63]:
all

['be', 'NewYork', 'super', 'cool', 'Abu_Dhabi']

In [89]:
weight_matrix, occ_list=get_weights_of_edges(text3, all, max_distance=20, ner_list=[])

In [74]:
occ_list

array([2., 2., 1., 1., 1.])

In [75]:
weight_matrix

array([[0.22222222, 1.55      , 1.125     , 0.64285714, 0.5       ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

# sentiment for each node

'compound' - The normalized compound score which calculates the sum of all lexicon ratings and takes values from -1 to 1

In [76]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [82]:
def calculate_sentiment_for_nodes(text, compound=False):
  analyzer = SentimentIntensityAnalyzer()
  # Loop through the words/ bigrams from 1. and get the sentiment scores for each one
  sentiment_scores=np.zeros(len(text))
  for i, text in enumerate(text):
    scores=analyzer.polarity_scores(text)
    if compound:
      sentiment_scores[i]=scores['compound']
    else:
      if scores['neg']==1.0:
        sentiment_scores[i]=-1
      elif scores['pos']==1.0:
        sentiment_scores[i]=1
    print(text)
    print(scores)
  return sentiment_scores

In [83]:
calculate_sentiment_for_nodes(all, compound=False)

be
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
NewYork
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
super
{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.5994}
cool
{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.3182}
Abu_Dhabi
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


array([0., 0., 1., 1., 0.])

In [85]:
sentiment_scores=calculate_sentiment_for_nodes(all, compound=True)

be
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
NewYork
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
super
{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.5994}
cool
{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.3182}
Abu_Dhabi
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [79]:
scores

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [81]:
scores['neg']

0.0

# weighting of the sentiment

In [90]:
weight_matrix

array([[0.22222222, 1.55      , 1.125     , 0.64285714, 0.5       ],
       [0.        , 0.33333333, 0.44444444, 0.625     , 0.75      ],
       [0.        , 0.        , 0.        , 1.        , 0.2       ],
       [0.        , 0.        , 0.        , 0.        , 0.25      ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

In [87]:
sentiment_scores

array([0.    , 0.    , 0.5994, 0.3182, 0.    ])

In [88]:
occ_list

array([2., 2., 1., 1., 1.])

In [103]:
def weighted_sentiment(weight_matrix, occ_list, sentiment_scores):
  sum_columns_weights=np.sum(weight_matrix, axis=0)
  count_nonzero_weights=np.count_nonzero(weight_matrix, axis=0)
  mean_columns_weights=np.sum(weight_matrix, axis=0)/np.count_nonzero(weight_matrix, axis=0)
  total_weight=(np.sum(weight_matrix, axis=0)/np.count_nonzero(weight_matrix, axis=0))*occ_list
  return total_weight*sentiment_scores

In [105]:
weighted_sentiment=weighted_sentiment(weight_matrix, occ_list, sentiment_scores)

# sentiment of the whole text

normalized?

In [108]:
np.sum(weighted_sentiment)

0.7109065476190476

In [111]:
def calculate_sentiment_of_text(weighted_sentiment, threshold=0.05):
  sum_sentiment=np.sum(weighted_sentiment)
  if sum_sentiment>threshold:
    return "positive"
  elif sum_sentiment<-threshold:
    return "negative"
  else:
    return "neutral"

In [112]:
calculate_sentiment_of_text(weighted_sentiment)

'positive'

In [113]:
calculate_sentiment_of_text([-1,2,-4])

'negative'

In [114]:
calculate_sentiment_of_text([0.5, -0.45])

'neutral'