In [1]:
import pandas as pd
import numpy as np
from numpy import argmax

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

"""
Sentiment Analysis model and tokenizer
"""

roberta = "cardiffnlp/twitter-roberta-base-sentiment-latest"
model = AutoModelForSequenceClassification.from_pretrained(roberta)
tokenizer = AutoTokenizer.from_pretrained(roberta)

  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|██████████| 929/929 [00:00<00:00, 277kB/s]
Downloading: 100%|██████████| 501M/501M [00:39<00:00, 12.7MB/s] 
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 1.76MB/s]
Downloading: 100%|██████████| 456k/456k [00:0

## Sentiment Labels:
* 0: Negative
* 1: Neutral
* 2: Positive

In [70]:
from numpy import argmax

def roberta_preprocess(text: str) -> str:
    """
    Preprocess tweet text for the model
    """
    words = []
    for word in text.split(" "):
        if word.startswith("@") and len(word) > 1:
            word = "@user"
        elif word.startswith("http"):
            word = "http"
        words.append(word)
    return " ".join(words)


def roberta_sentiment(text: str):
    # labels = ["Negative", "Neutral", "Positive"]
    processed_text = roberta_preprocess(text)
    # sentiment analysis
    encoded_tweet = tokenizer(processed_text, return_tensors="pt")
    output = model(**encoded_tweet)

    # Convert output pytorch tensor to numpy array by detaching the computational graph
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ind = argmax(scores)

    # for label, score in zip(labels, scores):
        # print(f'\t{label}: {score}')
        # pass

    # sentiment = labels[ind]
    # return sentiment
    return ind

In [71]:
roberta_sentiment("Hello there amazing friend")

2

In [194]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
sentence = "The food was great!"
vs = analyzer.polarity_scores(sentence)
# print("{:-<65} {}".format(sentence, str(vs)))
print(sentence)
print(vs)
print(argmax(list(vs.values())[:3]))

def vader_sentiment(text: str):
    scores = analyzer.polarity_scores(text)
    # Drop compound score
    # print(scores)
    return argmax(list(scores.values())[:3])

The food was great!
{'neg': 0.0, 'neu': 0.406, 'pos': 0.594, 'compound': 0.6588}
2


In [172]:
vader_sentiment("average car")

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


1

In [126]:
from flair.models import TextClassifier
from flair.data import Sentence
# classifier = TextClassifier.load('en-sentiment')
classifier = TextClassifier.load("sentiment-fast")

def flair_sentiment(text):
  # Return neutral if confidence < 0.75, value determined through hyperparameter tuning
  sentence = Sentence(text)
  classifier.predict(sentence)
  if sentence.labels[0].score < 0.75:
    return 1
  return 0 if sentence.labels[0].value == "NEGATIVE" else 2

2023-03-04 14:54:08,524 loading file /Users/jasonzhang/.flair/models/sentiment-en-mix-ft-rnn_v8.pt


In [129]:
flair_sentiment("Ouch")

0

In [134]:
# Maybe label sentence as neutral if confidence < 0.65

sentence=Sentence("Oh")
classifier.predict(sentence)
sentence.labels
sentence.labels[0].to_dict()
# sentence.labels[0]

{'value': 'POSITIVE', 'confidence': 0.5508431196212769}

In [302]:
from textblob import TextBlob

def textblob_sentiment(text):
    blob = TextBlob(text)
    # return int(testimonial.sentiment.polarity>0.5)
    polarity = blob.sentiment.polarity
    if np.abs(polarity) < 0.15:
        return 1
    elif polarity < 0:
        return 0
    return 2

textblob_sentiment("big car")

1

# Performance Test
* Use labeled Tweets dataset to evaluate each model
* Track performance using:
  * Classification metrics (precision, recall, etc.)
  * Inference time (per tweet)

In [212]:
# Load test set
test = pd.read_csv("data/test_set.csv")
test

Unnamed: 0,text,sentiment
0,How unhappy some dogs like it though,0
1,talking to my over driver about where I'm goin...,0
2,Does anybody know if the Rand's likely to fall...,0
3,I miss going to gigs in Liverpool unhappy,0
4,There isnt a new Riverdale tonight ? unhappy,0
...,...,...
3346,Haha thanks fansnim! happy,2
3347,Annapurna studios production no.29 on floors t...,2
3348,I don't think you'd understand how much a stre...,2
3349,Well,2


In [141]:
test_sample = pd.concat([test.iloc[0:50], test.iloc[1500:1550], test.iloc[2500:2550]])
test_sample.sentiment.value_counts()

0    50
1    50
2    50
Name: sentiment, dtype: int64

In [277]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import time

def performance_test(pred_func, text_data, true_labels, class_labels = [0,1,2]):
    preds = []
    fails = []
    
    start_time = time.time()
    for text in text_data:
        try:
            preds.append(pred_func(text))
        except Exception as e:
            print(e, text)
            fails.append(text)
    exec_time = time.time() - start_time
    print(classification_report(true_labels, preds, labels=class_labels))
    report = classification_report(true_labels, preds, labels=class_labels, output_dict=True)
    return report, exec_time
    # return f"Total execution time: {exec_time}s, execution time per sample: {exec_time / len(text_data)}s", report
    


In [278]:
roberta_report = performance_test(roberta_sentiment, test.text, test.sentiment)

              precision    recall  f1-score   support

           0       0.75      0.67      0.71      1117
           1       0.68      0.79      0.73      1117
           2       0.83      0.78      0.80      1117

    accuracy                           0.75      3351
   macro avg       0.75      0.75      0.75      3351
weighted avg       0.75      0.75      0.75      3351



In [279]:
vader_report = performance_test(vader_sentiment, test.text, test.sentiment)

              precision    recall  f1-score   support

           0       0.81      0.22      0.34      1117
           1       0.42      0.94      0.58      1117
           2       0.85      0.40      0.54      1117

    accuracy                           0.52      3351
   macro avg       0.69      0.52      0.49      3351
weighted avg       0.69      0.52      0.49      3351



In [290]:
flair_report = performance_test(flair_sentiment, test.text, test.sentiment)

              precision    recall  f1-score   support

           0       0.66      0.75      0.70      1117
           1       0.53      0.37      0.44      1117
           2       0.61      0.71      0.66      1117

    accuracy                           0.61      3351
   macro avg       0.60      0.61      0.60      3351
weighted avg       0.60      0.61      0.60      3351



In [301]:

textblob_report = performance_test(textblob_sentiment, test.text, test.sentiment)

              precision    recall  f1-score   support

           0       0.91      0.66      0.76      1117
           1       0.65      0.80      0.72      1117
           2       0.77      0.80      0.78      1117

    accuracy                           0.75      3351
   macro avg       0.78      0.75      0.75      3351
weighted avg       0.78      0.75      0.75      3351



In [282]:
flair_report

({'0': {'precision': 0.6220028208744711,
   'recall': 0.7896150402864817,
   'f1-score': 0.6958579881656805,
   'support': 1117},
  '1': {'precision': 0.5550755939524838,
   'recall': 0.23008057296329454,
   'f1-score': 0.32531645569620254,
   'support': 1117},
  '2': {'precision': 0.5782312925170068,
   'recall': 0.7609668755595345,
   'f1-score': 0.6571318129107073,
   'support': 1117},
  'accuracy': 0.5935541629364369,
  'macro avg': {'precision': 0.5851032357813205,
   'recall': 0.5935541629364369,
   'f1-score': 0.5594354189241968,
   'support': 3351},
  'weighted avg': {'precision': 0.5851032357813206,
   'recall': 0.5935541629364369,
   'f1-score': 0.5594354189241968,
   'support': 3351}},
 5.092592000961304)

In [292]:
reports = [roberta_report, vader_report, flair_report, textblob_report]
metrics = {model: model_metrics[0] for model, model_metrics in zip(["r", "v", "f", "t"], reports)}
exec_times = {model: model_metrics[1] for model, model_metrics in zip(["r", "v", "f", "t"], reports)}
metrics
# exec_times

{'r': {'0': {'precision': 0.753,
   'recall': 0.6741271262309758,
   'f1-score': 0.711384034010392,
   'support': 1117},
  '1': {'precision': 0.6787644787644788,
   'recall': 0.7869292748433303,
   'f1-score': 0.7288557213930349,
   'support': 1117},
  '2': {'precision': 0.8267045454545454,
   'recall': 0.7815577439570277,
   'f1-score': 0.8034974689369534,
   'support': 1117},
  'accuracy': 0.747538048343778,
  'macro avg': {'precision': 0.752823008073008,
   'recall': 0.7475380483437779,
   'f1-score': 0.7479124081134602,
   'support': 3351},
  'weighted avg': {'precision': 0.7528230080730081,
   'recall': 0.747538048343778,
   'f1-score': 0.7479124081134602,
   'support': 3351}},
 'v': {'0': {'precision': 0.81,
   'recall': 0.21754700089525514,
   'f1-score': 0.3429781227946366,
   'support': 1117},
  '1': {'precision': 0.41570805236017455,
   'recall': 0.9382273948075202,
   'f1-score': 0.5761407366684992,
   'support': 1117},
  '2': {'precision': 0.8452830188679246,
   'recall': 0

In [293]:
# Compare accuracies of 4 models
for model, metric_report in metrics.items():
    print(f"Model: {model}, accuracy: {metric_report['accuracy']}")

Model: r, accuracy: 0.747538048343778
Model: v, accuracy: 0.5189495672933453
Model: f, accuracy: 0.6108624291256342
Model: t, accuracy: 0.7391823336317517


We can clearly see that the Roberta and Textblob models are far outperforming Vader and Flair in the accuracy measure. Because our test dataset was class-balanced, accuracy score is an effective metrics, so we can focus our analysis on the Roberta and Textblob models.

In [305]:
from pprint import pprint

pprint(roberta_report[0])

{'0': {'f1-score': 0.711384034010392,
       'precision': 0.753,
       'recall': 0.6741271262309758,
       'support': 1117},
 '1': {'f1-score': 0.7288557213930349,
       'precision': 0.6787644787644788,
       'recall': 0.7869292748433303,
       'support': 1117},
 '2': {'f1-score': 0.8034974689369534,
       'precision': 0.8267045454545454,
       'recall': 0.7815577439570277,
       'support': 1117},
 'accuracy': 0.747538048343778,
 'macro avg': {'f1-score': 0.7479124081134602,
               'precision': 0.752823008073008,
               'recall': 0.7475380483437779,
               'support': 3351},
 'weighted avg': {'f1-score': 0.7479124081134602,
                  'precision': 0.7528230080730081,
                  'recall': 0.747538048343778,
                  'support': 3351}}
