In [12]:
import numpy as np
import pandas as pd

from tqdm import tqdm 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score
)

# Module-level VADER analyzer, created once and reused by every infer() call.
analyzer = SentimentIntensityAnalyzer()
# The three sentiment class names; infer() returns one of these strings.
labels = ['negative', 'neutral', 'positive']


def preprocess(text):
    """Blank out mention and link tokens in ``text``.

    The string is split on single spaces. Any token longer than one
    character that begins with ``@`` or ``http`` is replaced by an empty
    string; every other token (including a lone ``@``) is kept unchanged.
    The tokens are then re-joined with single spaces, so a blanked token
    still leaves its surrounding separators in the result.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` with mention/link tokens replaced by empty strings.
    """
    def _scrub(token):
        # Blank a token only if it has the prefix AND is longer than one char.
        has_prefix = token.startswith(('@', 'http'))
        return '' if has_prefix and len(token) > 1 else token

    return " ".join(_scrub(token) for token in text.split(" "))


def infer(text, threshold=0.05):
    """Classify ``text`` as positive, neutral or negative using VADER.

    The decision is made on the ``compound`` entry of the dict returned by
    ``analyzer.polarity_scores(text)``:

    * ``compound >= threshold``  -> ``'positive'``
    * ``compound <= -threshold`` -> ``'negative'``
    * anything else              -> ``'neutral'``

    Args:
        text: The string to score. It is handed to the analyzer as-is; no
            preprocessing is applied.
        threshold: Distance from zero at which the compound score stops
            being treated as neutral. Defaults to ``0.05``.

    Returns:
        tuple: ``(text, label)`` -- the unchanged input and one of
        ``'positive'``, ``'neutral'`` or ``'negative'``.
    """
    compound_score = analyzer.polarity_scores(text)['compound']

    # One mutually exclusive chain: exactly one branch runs, so `label` is
    # always bound. A score matching neither cut-off (including NaN, which
    # fails every comparison) falls through to 'neutral'.
    if compound_score >= threshold:
        label = 'positive'
    elif compound_score <= -threshold:
        label = 'negative'
    else:
        label = 'neutral'

    return text, label

In [13]:
def evaluate(predictions, true_labels):
    """Print evaluation metrics for ``predictions`` against ``true_labels``.

    Computes scikit-learn's classification report, confusion matrix and
    accuracy score (each called with ``true_labels`` first, ``predictions``
    second) and prints them in that order.

    Args:
        predictions: Predicted labels, one per sample.
        true_labels: Ground-truth labels, aligned with ``predictions``.

    Returns:
        None. The metrics are only written to standard output.
    """
    # All three metrics are computed before anything is printed.
    report = classification_report(true_labels, predictions)
    matrix = confusion_matrix(true_labels, predictions)
    score = accuracy_score(true_labels, predictions)

    sections = (
        ("Classification Report:\n", report),
        ("Confusion Matrix:\n", matrix),
        ("\n\nAccuracy: ", score),
    )
    for heading, value in sections:
        print(heading, value)


def test_batch(csv_path):
    """Run ``infer`` over every row of a CSV file and print the metrics.

    The file is read with ``pd.read_csv``. Its ``text`` column supplies the
    inputs and its ``expected_sentiment`` column the ground-truth labels.
    Each text is classified with ``infer`` and the collected predictions are
    passed to ``evaluate`` together with the ground truth.

    Args:
        csv_path: Location of the CSV file, in any form ``pd.read_csv``
            accepts.

    Returns:
        None. Results are printed by ``evaluate``.

    Raises:
        KeyError: If the ``text`` or ``expected_sentiment`` column is absent.
    """
    df = pd.read_csv(csv_path)
    # Empty cells are read as NaN (a float); turn them into empty strings so
    # the analyzer is always handed a str.
    data = df['text'].fillna('').astype(str).tolist()
    true_labels = df['expected_sentiment'].tolist()

    # Only the predicted label is needed; the text echoed back by infer is
    # discarded.
    predictions = [
        infer(text)[1] for text in tqdm(data, desc="Processing")
    ]

    evaluate(predictions, true_labels)

In [14]:
# Path to the labelled test cases, relative to the notebook's working directory.
csv_path = '../data/sentiment_test_cases.csv'
# Score every row and print the classification report, confusion matrix and accuracy.
test_batch(csv_path)

Processing: 100%|██████████| 498/498 [00:00<00:00, 15655.43it/s]

Classification Report:
               precision    recall  f1-score   support

    negative       0.84      0.64      0.73       177
     neutral       0.67      0.70      0.68       139
    positive       0.68      0.81      0.74       182

    accuracy                           0.72       498
   macro avg       0.73      0.72      0.72       498
weighted avg       0.73      0.72      0.72       498

Confusion Matrix:
 [[114  25  38]
 [ 10  97  32]
 [ 11  23 148]]


Accuracy:  0.7208835341365462



