## Performance evaluation of the trained BERT model
#### Author: Rishikesh Kakde

#### Import Required Libraries

In [1]:
import pandas as pd
from empath import Empath
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Initialize the EMPATH tool: one shared lexicon instance, created once here and
# reused by assign_ground_truth_label() to score every review text.
lexicon = Empath()



#### Load the Dataset with Predicted Sentiments

In [2]:
# Read the reviews (with the model's predicted sentiments) from disk
file_path = 'analysis_dataset_with_sentiments.csv'
df = pd.read_csv(file_path)

# Show a small sample of the loaded rows under a header line
print("Dataset Preview:", df.head(), sep="\n")

Dataset Preview:
   rating                                              title  \
0     5.0                                         Five Stars   
1     5.0                                         Five Stars   
2     3.0                       Some decent moments...but...   
3     4.0  Decent Depiction of Lower-Functioning Autism, ...   
4     5.0                                    What Love Is...   

                                                text images        asin  \
0           Amazon, please buy the show! I'm hooked!     []  B013488XFS   
1                         My Kiddos LOVE this show!!     []  B00CB6VTDS   
2  Annabella Sciorra did her character justice wi...     []  B096Z8Z3R6   
3  ...there should be more of a range of characte...     []  B09M14D9FZ   
4  ...isn't always how you expect it to be, but w...     []  B001H1SVZC   

  parent_asin                       user_id      timestamp  helpful_vote  \
0  B013488XFS  AGGZ357AO26RQZVRLGU4D4N52DZQ  1440385637000             

#### Generate the Ground Truth Labels Using EMPATH

In [3]:
# Define a function to generate the ground truth label based on the text
def assign_ground_truth_label(text):
    """Derive a sentiment label for one review from its EMPATH emotion scores.

    Parameters
    ----------
    text : object
        The review text. Anything that is not a non-blank string (None, the
        float NaN pandas uses for missing CSV cells, numbers, empty or
        whitespace-only strings) is treated as missing.

    Returns
    -------
    str or None
        'positive' when the positive_emotion score is higher, 'negative' when
        the negative_emotion score is higher, 'neutral' when they are equal,
        and None when there is no usable text to score.
    """
    # NaN is truthy, so a plain `if not text` check lets missing cells slip
    # through and hands them to the lexicon as if they were text. Checking the
    # type covers None, NaN and any other non-string value in one go.
    if not isinstance(text, str) or not text.strip():
        return None

    analysis = lexicon.analyze(text, categories=['positive_emotion', 'negative_emotion'])
    positive_score = analysis.get('positive_emotion', 0)
    negative_score = analysis.get('negative_emotion', 0)

    if positive_score > negative_score:
        return 'positive'
    elif negative_score > positive_score:
        return 'negative'
    else:
        return 'neutral'

# Score every review text and store the result as 'ground_truth_label'
ground_truth = df['text'].apply(assign_ground_truth_label)
df['ground_truth_label'] = ground_truth

# Keep only the rows for which a label could be assigned
has_label = df['ground_truth_label'].notna()
df = df[has_label]

# Preview the labelled dataset under a header line
print("Updated Dataset with Ground Truth Labels:", df.head(), sep="\n")

Updated Dataset with Ground Truth Labels:
   rating                                              title  \
0     5.0                                         Five Stars   
1     5.0                                         Five Stars   
2     3.0                       Some decent moments...but...   
3     4.0  Decent Depiction of Lower-Functioning Autism, ...   
4     5.0                                    What Love Is...   

                                                text images        asin  \
0           Amazon, please buy the show! I'm hooked!     []  B013488XFS   
1                         My Kiddos LOVE this show!!     []  B00CB6VTDS   
2  Annabella Sciorra did her character justice wi...     []  B096Z8Z3R6   
3  ...there should be more of a range of characte...     []  B09M14D9FZ   
4  ...isn't always how you expect it to be, but w...     []  B001H1SVZC   

  parent_asin                       user_id      timestamp  helpful_vote  \
0  B013488XFS  AGGZ357AO26RQZVRLGU4D4N52DZQ  1

#### Map Sentiment Labels to Numeric Values

In [4]:
# Define mappings for both predicted and ground truth sentiments
sentiment_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}


def _encode_sentiment(labels):
    """Convert a Series of sentiment names into their numeric codes.

    Values that already are one of the numeric codes are passed through
    unchanged. This keeps the cell idempotent: 'ground_truth_label' is
    overwritten with codes below, so on a second run the column no longer
    holds names, and a plain .map() would turn every label into NaN (and the
    dropna that follows would then empty the frame). Anything that is neither
    a known name nor a known code becomes NaN.
    """
    codes = labels.map(sentiment_mapping)
    already_encoded = labels.isin(list(sentiment_mapping.values()))
    # Keep the freshly mapped code unless the value was a code to begin with.
    return codes.where(~already_encoded, labels)


# Map the sentiments to numeric labels
df['predicted_label'] = _encode_sentiment(df['predicted_sentiment'])
df['ground_truth_label'] = _encode_sentiment(df['ground_truth_label'])

# Drop rows with invalid mappings (if any). A column that contained NaN is
# float-typed, so cast back to integers once the NaN rows are gone; the class
# labels are then always 0/1/2 rather than 0.0/1.0/2.0.
label_columns = ['predicted_label', 'ground_truth_label']
df = df.dropna(subset=label_columns).astype({column: 'int64' for column in label_columns})

# Display the numeric labels
print("Mapped Sentiment Labels:")
print(df[label_columns].head())

Mapped Sentiment Labels:
   predicted_label  ground_truth_label
0                1                   1
1                2                   1
2                0                   2
3                2                   2
4                1                   2


#### Calculate Evaluation Metrics

In [5]:
# Reference labels and model predictions, both as numeric class codes
y_true, y_pred = df['ground_truth_label'], df['predicted_label']

# Overall share of predictions that match the reference label
accuracy = accuracy_score(y_true, y_pred)

# Support-weighted averages of the per-class precision, recall and F1
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')

# Report every metric rounded to two decimals
print("Model Evaluation Metrics:")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    print(f"{metric_name}: {metric_value:.2f}")

Model Evaluation Metrics:
Accuracy: 0.80
Precision: 0.80
Recall: 0.80
F1-Score: 0.80


#### Calculate Accuracy and Metrics Per Class

In [6]:
# Numeric class codes and their display names, in matching order
# (0: negative, 1: neutral, 2: positive). Defined once so the metric call,
# the accuracy loop and the printed table cannot drift apart.
class_ids = [0, 1, 2]
class_labels = ['negative', 'neutral', 'positive']

# Use precision_recall_fscore_support to get metrics for each class
class_metrics = precision_recall_fscore_support(y_true, y_pred, average=None, labels=class_ids)

# Extract precision, recall, F1-score, and support for each class
precision_per_class, recall_per_class, f1_per_class, support_per_class = class_metrics

# Calculate accuracy per class
# Accuracy per class = Correct predictions / Total samples in that class
# NOTE: with this definition the value is the same quantity as the class's
# recall (true positives / support), so the two columns will agree.
is_correct = y_true == y_pred  # computed once, reused for every class
class_accuracies = []
for cls, class_total in zip(class_ids, support_per_class):
    # Vectorised .sum() instead of the builtin sum(), which would walk the
    # Series element by element in Python for every class.
    correct_predictions = (is_correct & (y_true == cls)).sum()
    class_accuracy = correct_predictions / class_total if class_total > 0 else 0
    class_accuracies.append(class_accuracy)

# Display per-class metrics
print(f"{'Class':<10}{'Precision':<10}{'Recall':<10}{'F1-Score':<10}{'Support':<10}{'Accuracy':<10}")
for i, label in enumerate(class_labels):
    print(f"{label:<10}{precision_per_class[i]:<10.2f}{recall_per_class[i]:<10.2f}{f1_per_class[i]:<10.2f}{support_per_class[i]:<10}{class_accuracies[i]:<10.2f}")

Class     Precision Recall    F1-Score  Support   Accuracy  
negative  0.60      0.66      0.63      213       0.66      
neutral   0.88      0.80      0.84      767       0.80      
positive  0.78      0.84      0.81      520       0.84      


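The precision, recall, and F1 scores are all mediocre, most likely because the model received only limited training. With more compute and RAM available, the model could be fine-tuned further to achieve better performance metrics. Note that the reference labels used here were generated with the EMPATH lexicon rather than by human annotation, so these scores measure agreement with EMPATH.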