# ***Quantitative Evaluation***

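This notebook quantitatively evaluates the zero-, one- and few-shot prompting results for hate detection and hate target detection: it computes precision, recall and F1-score, plots confusion matrices, and looks at how the prompts perform on implicit hate text.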

### ***Import packages***

Before we begin, let's import all the necessary packages for this notebook:

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as s_metrics
from matplotlib import pyplot as plt

# Set plotting theme: apply seaborn's "whitegrid" style once, up front, so the
# figures drawn in the later cells share the same look.
sns.set_theme(style="whitegrid")

### ***Read data***

Next, let's read the data:

In [None]:
# Read the results csv
results = pd.read_csv("results.csv")

# Collect the distinct ground-truth hate target groups; rows without a target
# label are skipped. The array is used later as the confusion-matrix labels.
hate_target_groups = results["hate_target_label"].dropna().unique()

# Add "other" as an extra label (presumably a category that only shows up in
# the predictions - confirm against the prompts). Only append it when it is not
# already a ground-truth label, otherwise it would appear twice on the
# confusion-matrix axes.
if "other" not in set(hate_target_groups):
  hate_target_groups = np.append(hate_target_groups, "other")

### ***Get precision, recall and f1-score***

Next, let's get precision, recall and f1-score for hate detection:

In [None]:
# Loop over each prompt type
for prompt_type in ["Zero", "One", "Few"]:
  # Predictions of this prompt type; "yes" is treated as the positive class
  predictions = results[f"{prompt_type}-Shot Hate Detection"]
  # Get all three binary metrics from a single call. zero_division=0 is applied
  # to every metric (previously only to recall), so a prompt type that never
  # predicts "yes" reports 0 without emitting an undefined-metric warning.
  precision, recall, f1, _ = s_metrics.precision_recall_fscore_support(
      results["hate_label"], predictions,
      average="binary", pos_label="yes", zero_division=0
  )
  # Round for display
  precision, recall, f1 = (
      round(float(value), 3) for value in (precision, recall, f1)
  )
  # Print metrics
  print(
      f"{prompt_type}-Shot: precision({precision}), recall({recall}), F1({f1})",
      end="\n\n"
  )

Next, let's get precision, recall and f1-score for hate target detection:

In [None]:
# Get only hate records: target detection is only scored on texts whose
# ground-truth hate label is "yes"
hate_results = results[results.hate_label == "yes"]
# Loop over each prompt type
for prompt_type in ["Zero", "One", "Few"]:
  # Predicted target group of this prompt type
  predictions = hate_results[f"{prompt_type}-Shot Hate Target Detection"]
  # Get all three micro-averaged metrics from a single call, with the same
  # zero_division=0 handling for each of them (previously only recall had it).
  # NOTE(review): for single-label multiclass data, micro-averaged precision,
  # recall and F1 are all equal to accuracy, so the three printed numbers will
  # coincide - consider average="macro" if per-class behaviour matters.
  precision, recall, f1, _ = s_metrics.precision_recall_fscore_support(
      hate_results["hate_target_label"], predictions,
      average="micro", zero_division=0
  )
  # Round for display
  precision, recall, f1 = (
      round(float(value), 3) for value in (precision, recall, f1)
  )
  # Print metrics
  print(
      f"{prompt_type}-Shot: precision({precision}), recall({recall}), F1({f1})",
      end="\n\n"
  )


### ***Plot Confusion matrices***

Next, let's plot confusion matrices for hate detection:

In [None]:
# Define confusion matrix labels
labels = ["yes", "no"]

# Upper end of the colour scale, shared by all three plots so their colours are
# comparable. A confusion-matrix cell can never exceed the number of records of
# its true class, so the largest true-class count is used instead of a
# hard-coded number (at least 1 to keep the scale non-degenerate).
true_label_counts = results["hate_label"].value_counts().reindex(
    labels, fill_value=0
)
color_max = max(int(true_label_counts.max()), 1)

# Loop over each prompt type
for prompt_type in ["Zero", "One", "Few"]:
  # Build the confusion matrix (rows: true label, columns: predicted label)
  col = f"{prompt_type}-Shot Hate Detection"
  cm = s_metrics.confusion_matrix(
      results["hate_label"], results[col], labels=labels
  )
  # Plot it as an annotated heatmap; fmt="d" prints the counts as plain
  # integers (the default format switches to scientific notation for counts
  # with three or more digits)
  ax = sns.heatmap(
      cm, annot=True, fmt="d", linewidths=0.5, annot_kws={"size": 22},
      xticklabels=labels, yticklabels=labels, vmin=0, vmax=color_max
  )
  ax.set_ylabel("True Label", fontsize=22)
  ax.set_xlabel("Predicted Label", fontsize=22)
  ax.set_title(col, fontsize=24)
  # Enlarge the colour bar tick labels to match the rest of the figure
  cbar = ax.collections[0].colorbar
  cbar.ax.tick_params(labelsize=22)
  plt.xticks(fontsize=22, rotation=90)
  plt.yticks(fontsize=22, rotation=0)
  plt.tight_layout()
  plt.show()

Next, let's plot confusion matrices for hate target detection:

In [None]:
# Get only hate records
hate_results = results[results.hate_label == "yes"]
# Define confusion matrix labels
labels = hate_target_groups

# Upper end of the colour scale, shared by all three plots so their colours are
# comparable. A confusion-matrix cell can never exceed the number of records of
# its true class, so the largest true-class count is used instead of a
# hard-coded number (at least 1 to keep the scale non-degenerate).
true_label_counts = hate_results["hate_target_label"].value_counts().reindex(
    labels, fill_value=0
)
color_max = max(int(true_label_counts.max()), 1)

# Loop over each prompt type
for prompt_type in ["Zero", "One", "Few"]:
  # Build the confusion matrix (rows: true label, columns: predicted label)
  col = f"{prompt_type}-Shot Hate Target Detection"
  cm = s_metrics.confusion_matrix(
      hate_results["hate_target_label"], hate_results[col], labels=labels
  )
  # Plot it as an annotated heatmap; fmt="d" prints the counts as plain
  # integers (the default format switches to scientific notation for counts
  # with three or more digits)
  ax = sns.heatmap(
      cm, annot=True, fmt="d", linewidths=0.5, annot_kws={"size": 22},
      vmin=0, vmax=color_max, xticklabels=labels, yticklabels=labels
  )
  ax.set_ylabel("True Label", fontsize=22, labelpad=20)
  ax.set_xlabel("Predicted Label", fontsize=22, labelpad=20)
  ax.set_title(col, fontsize=24, pad=20)
  # Enlarge the colour bar tick labels to match the rest of the figure
  cbar = ax.collections[0].colorbar
  cbar.ax.tick_params(labelsize=22)
  plt.xticks(fontsize=22, rotation=90)
  plt.yticks(fontsize=22, rotation=0)
  plt.tight_layout()
  plt.show()

### ***Implicit vs Explicit***

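Next, let's look at the implicit hate texts only and plot how many of them each prompt type gets right or wrong, first for hate detection and then for hate target detection: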

In [None]:
# Keep only the records labelled as hate that are also flagged as implicit hate
implicit_hate_mask = (results.hate_label == "yes") & (results.implicit_hate == "yes")
hate_implicit_results = results[implicit_hate_mask].copy()

# Draw one bar chart per prompt type
for prompt_type in ["Zero", "One", "Few"]:
  # Column holding this prompt type's hate detection predictions
  col = f"{prompt_type}-Shot Hate Detection"
  # Flag the predictions that agree with the ground-truth label, then count
  # the agreements and the disagreements
  is_match = hate_implicit_results[col].eq(hate_implicit_results["hate_label"])
  match_count = is_match.sum()
  mismatch_count = (~is_match).sum()
  # Bar chart of wrong vs. right predictions; the y axis is left unlabelled
  # and without ticks
  sns.barplot(x=["Mistake", "Correct"], y=[mismatch_count, match_count])
  plt.title(f"{prompt_type} Hate Detection Implicit Text")
  plt.ylabel("")
  plt.yticks([])
  plt.show()

In [None]:
# Get all implicit hate results
hate_implicit_results = results[(results.hate_label == "yes") & (results.implicit_hate == "yes")].copy()

# Loop over each prompt type
for prompt_type in ["Zero", "One", "Few"]:
  # Column holding this prompt type's hate target predictions
  col = f"{prompt_type}-Shot Hate Target Detection"
  # Get match and mismatch against the ground-truth target group
  match_count = (hate_implicit_results[col] == hate_implicit_results["hate_target_label"]).sum()
  mismatch_count = (hate_implicit_results[col] != hate_implicit_results["hate_target_label"]).sum()
  # Plot the wrong vs. right counts as a bar chart
  sns.barplot(y=[mismatch_count, match_count], x=["Mistake", "Correct"])
  # The title names the task evaluated here (hate *target* detection), so these
  # figures can be told apart from the hate detection ones
  plt.title(f"{prompt_type} Hate Target Detection Implicit Text")
  plt.ylabel("")
  plt.yticks([])
  plt.show()