# Anonalyze: An NLP-Enhanced and ML-Driven Platform for Sentiment and Insight Extraction
A platform designed like an online discussion board where users can freely share their thoughts and opinions anonymously. It uses AI, ML, and language processing tools to analyze the posts, helping to understand the overall mood and key ideas in the discussions.

## Initialization

In [1]:
import pickle
import nltk
import re
import numpy as np
import joblib

from nltk import pos_tag, sent_tokenize, RegexpParser
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [2]:
# NLTK resources (corpora, tokenizer models, taggers, chunkers, tag-set help)
# fetched for this notebook, listed in the order they are downloaded.
_NLTK_RESOURCES = (
  'stopwords',
  'wordnet',
  'punkt',
  'punkt_tab',
  'omw-1.4',
  'averaged_perceptron_tagger',
  'averaged_perceptron_tagger_eng',
  'maxent_ne_chunker',
  'words',
  'maxent_ne_chunker_tab',
  'tagsets',
  'tagsets_json',
)

for _resource in _NLTK_RESOURCES:
  nltk.download(_resource)

[nltk_data] Downloading package stopwords to /home/jiji/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jiji/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/jiji/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jiji/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jiji/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jiji/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jiji/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[n

True

Loading the pickled vectorizer, feature selectors, and models

In [3]:
# Load the serialized TF-IDF vectorizer, the two feature selectors and the two
# classifiers (one selector/classifier pair each for sentiment and emotion).
# NOTE: joblib.load unpickles arbitrary Python objects, so only load artifact
# files that come from a trusted source.
# NOTE: ModelSentiment and ModelEmotion below lazily load these same files
# into their own class attributes.
vectorizer = joblib.load('./models/sentiment-emotion-classification/pkl/tfidf_vectorizer.pkl')
selector_sentiment = joblib.load('./models/sentiment-emotion-classification/pkl/selector_sentiment.pkl')
selector_emotion = joblib.load('./models/sentiment-emotion-classification/pkl/selector_emotion.pkl')
model_sentiment = joblib.load('./models/sentiment-emotion-classification/pkl/model_sentiment.pkl')
model_emotion = joblib.load('./models/sentiment-emotion-classification/pkl/model_emotion.pkl')

## Execution

Building a data pre-processing pipeline consisting of:
* Denoising: removing `@mentions`, URLs, and all non-alphabetical characters from the content, then lower-casing it
* Removing stopwords: removing stopwords such as `[a, an, the, and, but]`
* Lemmatizing: reducing words to their base form e.g. `[changing, changed, change] -> change`

In [4]:
class Preprocessor:
  """Text clean-up steps applied to a post before it is vectorized.

  `process_text` is the entry point; it chains `denoiser`,
  `stopwords_remover` and `lemmatizer` in that order. All methods are static
  and take/return plain strings.
  """

  # Compiled stopword pattern; built on the first call to `stopwords_remover`
  # so the large alternation is not re-assembled for every post.
  _stopword_matcher = None

  @staticmethod
  def denoiser(text: str) -> str:
    """Strip mentions, non-letters and URL remnants, then lower-case.

    Args:
      text: Raw post content.

    Returns:
      The text reduced to ASCII letters and spaces, trimmed at both ends and
      lower-cased. Inner runs of spaces are left as they are.
    """
    text = re.sub(r'@\w+', '', text)
    # Everything except ASCII letters and spaces goes away here. A URL such as
    # "https://a.b/c" therefore collapses into the single token "httpsabc",
    # which the next substitution removes as a whole.
    text = re.sub(r'[^a-zA-Z ]', '', text)
    # Case-insensitive so that "HTTP://..." / "Https://..." are removed too;
    # `http\w+` also covers the "https" prefix.
    text = re.sub(r'http\w+', '', text, flags=re.IGNORECASE)
    return text.strip().lower()

  @staticmethod
  def stopwords_remover(text: str) -> str:
    """Delete English stopwords (NLTK list) and normalise whitespace.

    Args:
      text: Text to filter; matching is case-sensitive and whole-word.

    Returns:
      The remaining words joined by single spaces.
    """
    if Preprocessor._stopword_matcher is None:
      Preprocessor._stopword_matcher = re.compile(
        r"|".join(fr"\b{re.escape(word)}\b" for word in stopwords.words("english"))
      )
    without_stopwords = Preprocessor._stopword_matcher.sub('', text)
    return " ".join(without_stopwords.split())

  @staticmethod
  def lemmatizer(text: str) -> str:
    """Lemmatize every token using its part-of-speech tag.

    Args:
      text: Text to lemmatize.

    Returns:
      The lemmatized tokens joined by single spaces. Tokens whose tag does not
      map to a WordNet part of speech are kept unchanged.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    tokenizer = WordPunctTokenizer()

    # First letter of the POS tag -> WordNet part of speech.
    wordnet_pos_tag_map = {
      "J": wordnet.ADJ,
      "N": wordnet.NOUN,
      "V": wordnet.VERB,
      "R": wordnet.ADV,
    }

    lemmatized_tokens = []
    for token, tag in pos_tag(tokenizer.tokenize(text)):
      wordnet_tag = wordnet_pos_tag_map.get(tag[0].upper())
      if wordnet_tag is None:
        lemmatized_tokens.append(token)
      else:
        lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token, wordnet_tag))

    return ' '.join(lemmatized_tokens)

  @staticmethod
  def process_text(text: str) -> str:
    """Run the full pipeline: denoise, drop stopwords, lemmatize."""
    text = Preprocessor.denoiser(text)
    text = Preprocessor.stopwords_remover(text)
    text = Preprocessor.lemmatizer(text)
    return text

Making a static class for the sentiment model to simplify the workflow

In [5]:
class ModelSentiment:
  """Static wrapper around the sentiment classification artifacts.

  The vectorizer, feature selector and classifier are loaded from disk with
  joblib the first time they are needed and then kept on the class.
  """

  vectorizer = None
  selector_sentiment = None
  model_sentiment = None

  # Class label -> human-readable sentiment name.
  sentiment_label_description_map = {
    0: 'negative',
    1: 'positive',
    2: 'neutral',
  }

  @staticmethod
  def _initialize():
    """Load any artifact that has not been loaded (or injected) yet."""
    if ModelSentiment.vectorizer is None:
      ModelSentiment.vectorizer = joblib.load('./models/sentiment-emotion-classification/pkl/tfidf_vectorizer.pkl')

    if ModelSentiment.selector_sentiment is None:
      ModelSentiment.selector_sentiment = joblib.load('./models/sentiment-emotion-classification/pkl/selector_sentiment.pkl')

    if ModelSentiment.model_sentiment is None:
      ModelSentiment.model_sentiment = joblib.load('./models/sentiment-emotion-classification/pkl/model_sentiment.pkl')

  @staticmethod
  def _vectorize(text: str):
    """Transform a single text into the vectorizer's feature representation."""
    ModelSentiment._initialize()
    return ModelSentiment.vectorizer.transform([text])

  @staticmethod
  def _select_best_features(vector):
    """Reduce a feature vector with the sentiment feature selector."""
    ModelSentiment._initialize()
    return ModelSentiment.selector_sentiment.transform(vector)

  @staticmethod
  def predict(text: str):
    """Classify the sentiment of an already pre-processed text.

    Args:
      text: Pre-processed text.

    Returns:
      A `(label, probability)` tuple for the most probable class. On ties the
      class that comes first in the model's `classes_` wins.
    """
    ModelSentiment._initialize()
    vector = ModelSentiment._select_best_features(ModelSentiment._vectorize(text))

    model = ModelSentiment.model_sentiment
    label_names = ModelSentiment.sentiment_label_description_map
    probabilities = model.predict_proba(vector)[0]

    # predict_proba columns follow model.classes_, not the insertion order of
    # the label map, so each column is named via its class. A class missing
    # from the map is reported under its raw value.
    scored_labels = [
      (label_names.get(target_class, target_class), probability)
      for target_class, probability in zip(model.classes_, probabilities)
    ]

    return max(scored_labels, key=lambda pair: pair[1])


Making a static class for the emotion model to simplify the workflow

In [6]:
class ModelEmotion:
  """Static wrapper around the emotion classification artifacts.

  The vectorizer, feature selector and classifier are loaded from disk with
  joblib the first time they are needed and then kept on the class.
  """

  vectorizer = None
  selector_emotion = None
  model_emotion = None

  # Class label -> human-readable emotion name.
  emotion_label_description_map = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprised',
  }

  @staticmethod
  def _initialize():
    """Load any artifact that has not been loaded (or injected) yet."""
    if ModelEmotion.vectorizer is None:
      ModelEmotion.vectorizer = joblib.load('./models/sentiment-emotion-classification/pkl/tfidf_vectorizer.pkl')

    if ModelEmotion.selector_emotion is None:
      ModelEmotion.selector_emotion = joblib.load('./models/sentiment-emotion-classification/pkl/selector_emotion.pkl')

    if ModelEmotion.model_emotion is None:
      ModelEmotion.model_emotion = joblib.load('./models/sentiment-emotion-classification/pkl/model_emotion.pkl')

  @staticmethod
  def _vectorize(text: str):
    """Transform a single text into the vectorizer's feature representation."""
    ModelEmotion._initialize()
    return ModelEmotion.vectorizer.transform([text])

  @staticmethod
  def _select_best_features(vector):
    """Reduce a feature vector with the emotion feature selector."""
    ModelEmotion._initialize()
    return ModelEmotion.selector_emotion.transform(vector)

  @staticmethod
  def predict(text: str):
    """Classify the emotion of an already pre-processed text.

    Args:
      text: Pre-processed text.

    Returns:
      A `(label, probability)` tuple for the most probable class. On ties the
      class that comes first in the model's `classes_` wins.
    """
    ModelEmotion._initialize()
    vector = ModelEmotion._select_best_features(ModelEmotion._vectorize(text))

    model = ModelEmotion.model_emotion
    label_names = ModelEmotion.emotion_label_description_map
    probabilities = model.predict_proba(vector)[0]

    # predict_proba columns follow model.classes_, not the insertion order of
    # the label map, so each column is named via its class. A class missing
    # from the map is reported under its raw value.
    scored_labels = [
      (label_names.get(target_class, target_class), probability)
      for target_class, probability in zip(model.classes_, probabilities)
    ]

    return max(scored_labels, key=lambda pair: pair[1])

### Simulating the platform
**Thread question**: What do you think about the impact of online anonymity on user behavior in social media platforms?

In [None]:
# Simulated anonymous replies to the thread question; every later cell that
# iterates over posts reads from this list.
responses = [
  "I believe online anonymity encourages more honest and open communication, allowing users to express their true opinions",
  "In my view, online anonymity can lead to a significant increase in negative behaviors, such as trolling and cyberbullying, because users feel shielded from accountability.",
  "I think anonymity provides a double-edged sword; while it allows for free expression, it also creates an environment where people may engage in harmful or deceitful actions.",
  "Online anonymity empowers marginalized voices to speak out, but it also makes it difficult to identify and address harmful content effectively.",
  "I see online anonymity as a critical factor in fostering diverse discussions, but it also contributes to the spread of misinformation, as sources cannot always be verified.",
  "I think that online anonymity can lead to more genuine interactions in certain communities, but it may also reduce the quality of discourse by enabling users to avoid responsibility for their words.",
  "Anonymity online is essential for privacy, but it can also encourage users to engage in behavior they might avoid if their identity were known.",
  "In my opinion, the impact of online anonymity is largely context-dependent; it can promote both positive and negative behaviors depending on the platform and community norms.",
  "I believe online anonymity amplifies both the best and worst aspects of human behavior, providing a space for both creativity and cruelty.",
  "I think online anonymity allows people to connect more authentically, but it can also lead to a lack of trust and credibility in online interactions.",
]

In [15]:
# Clean each simulated reply, then classify its sentiment and emotion.
for response in responses:
  preprocessed_text = Preprocessor.process_text(response)
  # Both predict() calls return a (label, probability) tuple.
  predicted_sentiment = ModelSentiment.predict(preprocessed_text)
  predicted_emotion = ModelEmotion.predict(preprocessed_text)
  # Only the first 90 characters of the reply are shown to keep rows short.
  print(f"{response[:90]}... \tsentiment: {predicted_sentiment} \temotion: {predicted_emotion}")

I believe online anonymity encourages more honest and open communication, allowing users t... 	sentiment: ('positive', np.float64(0.5444077019724737)) 	emotion: ('joy', np.float64(0.687167818271886))
In my view, online anonymity can lead to a significant increase in negative behaviors, suc... 	sentiment: ('negative', np.float64(0.5055017115435506)) 	emotion: ('joy', np.float64(0.6239800353485))
I think anonymity provides a double-edged sword; while it allows for free expression, it a... 	sentiment: ('positive', np.float64(0.5884018484494042)) 	emotion: ('joy', np.float64(0.6389433285739233))
Online anonymity empowers marginalized voices to speak out, but it also makes it difficult... 	sentiment: ('negative', np.float64(0.5839376905372277)) 	emotion: ('joy', np.float64(0.7105077533039598))
I see online anonymity as a critical factor in fostering diverse discussions, but it also ... 	sentiment: ('positive', np.float64(0.6507373940271595)) 	emotion: ('joy', np.float64(0.927362399432482))


In [9]:
# Load the pretrained "google-t5/t5-small" tokenizer and seq2seq model and
# wrap them in a text2text-generation pipeline.
# NOTE: the name `tokenizer` is rebound to a WordPunctTokenizer in the cells
# below; `pipe` keeps the tokenizer object it was given here.
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
pipe = pipeline('text2text-generation', model = model, tokenizer = tokenizer)

Subject-Verb-Object Extraction

In [10]:
# Tag -> help entry lookup loaded from NLTK's Penn Treebank tag-set resource;
# element [0] of each entry is the description printed below.
tagdict = nltk.data.load('help/tagsets/upenn_tagset.pickle')
# Rebinds `tokenizer`, which previously held the T5 tokenizer.
tokenizer = WordPunctTokenizer()

# Tag only the first simulated reply as an illustration.
tokens = tokenizer.tokenize(responses[0])
tagged_tokens = pos_tag(tokens)

# Print every token with its POS tag and the tag's description.
for tagged_token in tagged_tokens:
  print(f"{tagged_token[0]} \t\t\t[{tagged_token[1]}] {tagdict[tagged_token[1]][0]}")


I 			[PRP] pronoun, personal
believe 			[VBP] verb, present tense, not 3rd person singular
online 			[JJ] adjective or numeral, ordinal
anonymity 			[NN] noun, common, singular or mass
encourages 			[VBZ] verb, present tense, 3rd person singular
more 			[JJR] adjective, comparative
honest 			[JJ] adjective or numeral, ordinal
and 			[CC] conjunction, coordinating
open 			[JJ] adjective or numeral, ordinal
communication 			[NN] noun, common, singular or mass
, 			[,] comma
allowing 			[VBG] verb, present participle or gerund
users 			[NNS] noun, common, plural
to 			[TO] "to" as preposition or infinitive marker
express 			[VB] verb, base form
their 			[PRP$] pronoun, possessive
true 			[JJ] adjective or numeral, ordinal
opinions 			[NNS] noun, common, plural


In [11]:
tokenizer = WordPunctTokenizer()

for response in responses:
  tokens = tokenizer.tokenize(response)
  tagged_tokens = pos_tag(tokens)

  for i, (token, tag) in enumerate(tagged_tokens):
    # Only verb-tagged tokens can anchor a subject-verb-object triple.
    if not tag.startswith('VB'):
      continue

    verb = token

    # Subject candidate: closest noun-tagged (NN*) token left of the verb.
    subj = next(
      (word for word, pos in reversed(tagged_tokens[:i]) if pos.startswith('NN')),
      None,
    )

    # Object candidate: closest noun-tagged (NN*) token right of the verb.
    obj = next(
      (word for word, pos in tagged_tokens[i + 1:] if pos.startswith('NN')),
      None,
    )

    # Emit the triple only when both sides were found.
    if subj and obj:
      print(f"{subj} {verb} {obj}")

anonymity encourages communication
communication allowing users
users express opinions
anonymity lead increase
users feel accountability
users shielded accountability
anonymity provides sword
anonymity edged sword
sword allows expression
expression creates environment
people engage harmful
empowers marginalized voices
voices speak content
voices makes content
voices identify content
voices address content
factor fostering discussions
discussions contributes spread
anonymity lead interactions
communities reduce quality
discourse enabling users
users avoid responsibility
online is privacy
privacy encourage users
users engage behavior
behavior avoid identity
anonymity is dependent
dependent promote behaviors
behaviors depending platform
behavior providing space
anonymity allows people
people connect lack
people lead lack


In [12]:
tokenizer = WordPunctTokenizer()

# The grammar and parser do not depend on the reply being processed, so they
# are built once instead of once per loop iteration.
# NOTE(review): the VP rule refers to PP, but this grammar defines no PP rule.
grammar = r"""
    NP: {<DT>?<JJ.*>*<NN.*>+}
    VP: {<VB.*><NP|PP|CLAUSE>+}
    CLAUSE: {<NP><VP>}
  """
chunker = RegexpParser(grammar)

for response in responses:
  tokens = tokenizer.tokenize(response)
  tagged_tokens = pos_tag(tokens)
  chunked = chunker.parse(tagged_tokens)

  # Only direct children of the parse root are inspected; plain (word, tag)
  # tuples are skipped and CLAUSE chunks are printed as text.
  for tagged_token_group in chunked:
    if isinstance(tagged_token_group, nltk.Tree) and tagged_token_group.label() == "CLAUSE":
      print(" ".join(word for word, tag in tagged_token_group.leaves()))
      

Online anonymity empowers marginalized voices


In [13]:
tokenizer = WordPunctTokenizer()

# The grammar and parser do not depend on the reply being processed, so they
# are built once instead of once per loop iteration.
# NOTE(review): the VP rule refers to PP and CLAUSE, but this grammar defines
# neither of them.
grammar = r"""
    NP: {<DT>?<JJ.*>*<NN.*>+}
    VP: {<VB.*><NP|PP|CLAUSE>+}
  """
chunker = RegexpParser(grammar)

for response in responses:
  tokens = tokenizer.tokenize(response)
  tagged_tokens = pos_tag(tokens)
  chunked = chunker.parse(tagged_tokens)

  for i, tagged_token in enumerate(chunked):
    # Only top-level VP chunks start a "noun phrase + verb phrase" pair.
    if not (isinstance(tagged_token, nltk.Tree) and tagged_token.label() == 'VP'):
      continue

    verb_phrase = " ".join(word for word, tag in tagged_token.leaves())
    start_noun_phrase = None

    # Walk backwards to the closest top-level NP chunk before this VP.
    for j in range(i - 1, -1, -1):
      tagged_token_group = chunked[j]
      if isinstance(tagged_token_group, nltk.Tree) and tagged_token_group.label() == 'NP':
        start_noun_phrase = " ".join(word for word, tag in tagged_token_group.leaves())
        break

    # Print only when a preceding NP exists and the VP text is non-empty.
    if start_noun_phrase and verb_phrase:
      print(f"{start_noun_phrase} {verb_phrase}")

open communication allowing users
free expression creates an environment
Online anonymity empowers marginalized voices
Online anonymity empowers address harmful content
a critical factor fostering diverse discussions
certain communities reduce the quality
discourse enabling users
discourse avoid responsibility
privacy encourage users
human behavior providing a space
