# Evaluation and metrics

## Imports

In [1]:
import re
import docx
import numpy as np
import pandas as pd
from collections import defaultdict
from IPython.display import display, Markdown

## Load annotated documents

In [2]:
def extract_comments_and_labels(doc, is_chatgpt=False):
    """Split a commented document into unlabeled and labeled text pieces.

    Walks every run of every paragraph and returns a list of
    ``(text, labels)`` tuples in document order. ``labels`` is ``None`` for
    plain text and a list of label strings for commented text.

    NOTE(review): relies on runs exposing a ``comments`` attribute holding
    objects with a ``.text`` field; this looks like a python-docx fork,
    confirm against the installed package.

    Args:
        doc: Document object exposing ``paragraphs``, each with ``runs``.
        is_chatgpt: When true, the commented run's own text is the labeled
            text and a trailing `` (confidence: ...)`` suffix is removed from
            each label. When false, the labeled text is taken from the run
            that precedes the commented run in the same paragraph.

    Returns:
        List of ``(text, labels)`` tuples. A ``"\\n"`` is added to the running
        unlabeled text after each paragraph.
    """
    comments_and_labels = []
    current_text = ""

    for paragraph in doc.paragraphs:
        runs = paragraph.runs
        for i, run in enumerate(runs):
            if not run.comments:
                current_text += run.text
                continue

            if is_chatgpt:
                text = run.text
            else:
                # Without this guard, i == 0 would index runs[-1] and pick up
                # the *last* run of the paragraph as the labeled text.
                text = runs[i - 1].text if i > 0 else ""

            if current_text:
                # In ground-truth mode the labeled text was already appended
                # to the running buffer as a plain run, so take it back off.
                # Only strip when it really is the suffix: an empty `text`
                # would otherwise turn the slice into [:-0] and drop
                # everything accumulated so far.
                if not is_chatgpt and text and current_text.endswith(text):
                    current_text = current_text[:-len(text)]
                comments_and_labels.append((current_text, None))
                current_text = ""

            # A single comment may hold several comma-separated labels.
            labels = ', '.join(comment.text for comment in run.comments).split(', ')
            if is_chatgpt:
                labels = [re.sub(r" \(confidence:.*?\)", "", label) for label in labels]
            comments_and_labels.append((text, labels))
        current_text += "\n"

    if current_text:
        comments_and_labels.append((current_text, None))

    return comments_and_labels

In [3]:
def translate_labels_into_spans(comments_and_labels):
    """Convert ``(text, labels)`` pieces into labeled character spans.

    Args:
        comments_and_labels: Sequence of ``(text, labels)`` tuples in document
            order, where ``labels`` is ``None`` for unlabeled text and a list
            of label strings for labeled text.

    Returns:
        List of ``(label, (start, end))`` tuples, one per label of each
        labeled piece. Offsets count characters of all preceding pieces and
        ``end`` is exclusive.
    """
    spans = []
    offset = 0
    # Walk every piece instead of pairing even/odd positions: the input is
    # not guaranteed to alternate strictly between unlabeled and labeled
    # text (it may start with a labeled piece or contain two in a row).
    for text, labels in comments_and_labels:
        end = offset + len(text)
        if labels is not None:
            spans.extend((label, (offset, end)) for label in labels)
        offset = end
    return spans

In [4]:
# Annotated ground-truth documents, one per speaker.
input_files = [
    'trump_ground_truth.docx',
    'biden_ground_truth.docx',
    'putin_ground_truth.docx',
]

### Load ground truth

In [5]:
# Reference annotations, keyed by the filename prefix before the first "_".
ground_truth = {}

for input_file in input_files:
    speaker = input_file.split("_", 1)[0]
    ground_truth[speaker] = []
    original_doc = docx.Document(input_file)
    labs = extract_comments_and_labels(original_doc)
    ground_truth[speaker].extend(labs)
    # Replace the raw (text, labels) pieces with (label, (start, end)) spans.
    ground_truth[speaker] = translate_labels_into_spans(ground_truth[speaker])

### Load documents annotated by ChatGPT

In [6]:
# ChatGPT annotations, keyed by the same speaker names as the ground truth.
ai_labels = {}

for input_file in input_files:
    speaker = input_file.split("_", 1)[0]
    ai_labels[speaker] = []
    original_doc = docx.Document(speaker + "_chatgpt_final.docx")
    labs = extract_comments_and_labels(original_doc, is_chatgpt=True)
    ai_labels[speaker].extend(labs)
    # Replace the raw (text, labels) pieces with (label, (start, end)) spans.
    ai_labels[speaker] = translate_labels_into_spans(ai_labels[speaker])

## Evaluation function

In [7]:
def jaccard_similarity(a, b):
    """Return the overlap ratio of two ``(start, end)`` spans.

    The denominator is the distance from the smaller start to the larger
    end. For overlapping spans this equals the length of their union; for
    disjoint spans the numerator is 0, so the result is 0 either way.

    Args:
        a: First span as an indexable ``(start, end)`` pair.
        b: Second span as an indexable ``(start, end)`` pair.

    Returns:
        Overlap length divided by covered length, or ``0.0`` when the covered
        length is not positive (e.g. two empty spans at the same offset).
    """
    intersection = max(0, min(a[1], b[1]) - max(a[0], b[0]))
    union = max(a[1], b[1]) - min(a[0], b[0])
    # Two zero-length spans at the same position would divide by zero.
    if union <= 0:
        return 0.0
    return intersection / union

def evaluate_spans(ground_truth, ai_spans, threshold=0.5):
    """Compute per-label precision, recall and F1 for predicted spans.

    Each ground-truth span is matched to the first not-yet-used predicted
    span with the same label whose ``jaccard_similarity`` reaches
    ``threshold``. Matches count as true positives, unmatched ground-truth
    spans as false negatives and leftover predictions as false positives.

    Args:
        ground_truth: Iterable of ``(label, (start, end))`` reference spans.
        ai_spans: Iterable of ``(label, (start, end))`` predicted spans. It
            is not modified.
        threshold: Minimum similarity for a match (default 0.5).

    Returns:
        ``defaultdict`` mapping each label to a dict with the keys ``tp``,
        ``fp``, ``fn``, ``precision``, ``recall`` and ``f1_score``; the last
        three are percentages.
    """
    label_metrics = defaultdict(lambda: {'tp': 0, 'fp': 0, 'fn': 0})

    # Consume a private copy: removing matches from the caller's list would
    # make a second evaluation of the same data report different numbers.
    unmatched = list(ai_spans)

    for label, span in ground_truth:
        for index, (ai_label, ai_span) in enumerate(unmatched):
            if label == ai_label and jaccard_similarity(span, ai_span) >= threshold:
                label_metrics[label]['tp'] += 1
                # Each prediction may satisfy at most one reference span.
                del unmatched[index]
                break
        else:
            label_metrics[label]['fn'] += 1

    for ai_label, _ in unmatched:
        label_metrics[ai_label]['fp'] += 1

    for metrics in label_metrics.values():
        predicted = metrics['tp'] + metrics['fp']
        actual = metrics['tp'] + metrics['fn']
        precision = metrics['tp'] / predicted if predicted > 0 else 0
        recall = metrics['tp'] / actual if actual > 0 else 0
        f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
        metrics['precision'] = precision * 100
        metrics['recall'] = recall * 100
        metrics['f1_score'] = f1_score * 100

    return label_metrics

## Metrics calculation and display

In [8]:
def display_metrics(eval_result):
    """Turn a per-label metrics mapping into a table sorted by label.

    Args:
        eval_result: Mapping of label to a dict with the keys ``tp``, ``fp``,
            ``fn``, ``precision``, ``recall`` and ``f1_score``, in that order.

    Returns:
        DataFrame with one row per label, integer count columns and
        human-readable column names.
    """
    count_dtypes = {"tp": "int", "fp": "int", "fn": "int"}
    # Labels arrive as columns, so flip them into rows before sorting.
    table = pd.DataFrame(eval_result).T.astype(count_dtypes).sort_index()
    table.columns = ["True positive", "False positive", "False negative", "Precision", "Recall", "F1 Score"]
    return table

In [9]:
def weighted_average_metrics(df):
    """Average precision, recall and F1 across labels, weighted by support.

    Args:
        df: Table with the columns ``True positive``, ``False negative``,
            ``Precision``, ``Recall`` and ``F1 Score``.

    Returns:
        Tuple ``(weighted_precision, weighted_recall, weighted_f1_score)``.
    """
    # A label's support is its number of ground-truth occurrences.
    support = df['True positive'] + df['False negative']
    share = support / support.sum()

    return tuple(
        (df[column] * share).sum()
        for column in ('Precision', 'Recall', 'F1 Score')
    )

In [10]:
# Per-speaker results; each speaker entry is filled in by its own cell.
metrics = dict()

In [11]:
speaker = "trump"
metrics[speaker] = {}
# Build the per-label table first; the weighted averages are derived from it.
metrics[speaker].update({
    "by_labels": display_metrics(evaluate_spans(ground_truth[speaker], ai_labels[speaker])),
})
metrics[speaker].update({
    "average": weighted_average_metrics(metrics[speaker]["by_labels"]),
})

# "average" holds (precision, recall, F1 score), in that order.
display(Markdown(f'### {speaker.capitalize()} Text Metrics'))
display(metrics[speaker]["by_labels"])
display(Markdown(f'**Weighted Precision:** {metrics[speaker]["average"][0]}'))
display(Markdown(f'**Weighted Recall:** {metrics[speaker]["average"][1]}'))
display(Markdown(f'**Weighted F1 Score:** {metrics[speaker]["average"][2]}'))

### Trump Text Metrics

Unnamed: 0,True positive,False positive,False negative,Precision,Recall,F1 Score
accusation,0,0,2,0.0,0.0,0.0
ad hominem,0,1,0,0.0,0.0,0.0
appeal to a sense of community,0,0,4,0.0,0.0,0.0
appeal to authority,0,1,0,0.0,0.0,0.0
appeal to emotion,0,6,0,0.0,0.0,0.0
appeal to emotions,0,3,3,0.0,0.0,0.0
appeal to equality,0,0,2,0.0,0.0,0.0
appeal to fear,0,2,0,0.0,0.0,0.0
appeal to history,0,0,2,0.0,0.0,0.0
appeal to self-interest,0,1,0,0.0,0.0,0.0


**Weighted Precision:** 3.14654481321148

**Weighted Recall:** 5.555555555555555

**Weighted F1 Score:** 3.8852578068264343

In [12]:
speaker = "biden"
metrics[speaker] = {}
# Build the per-label table first; the weighted averages are derived from it.
metrics[speaker].update({
    "by_labels": display_metrics(evaluate_spans(ground_truth[speaker], ai_labels[speaker])),
})
metrics[speaker].update({
    "average": weighted_average_metrics(metrics[speaker]["by_labels"]),
})

# "average" holds (precision, recall, F1 score), in that order.
display(Markdown(f'### {speaker.capitalize()} Text Metrics'))
display(metrics[speaker]["by_labels"])
display(Markdown(f'**Weighted Precision:** {metrics[speaker]["average"][0]}'))
display(Markdown(f'**Weighted Recall:** {metrics[speaker]["average"][1]}'))
display(Markdown(f'**Weighted F1 Score:** {metrics[speaker]["average"][2]}'))

### Biden Text Metrics

Unnamed: 0,True positive,False positive,False negative,Precision,Recall,F1 Score
accusation,0,0,8,0.0,0.0,0.0
antithesis,0,1,0,0.0,0.0,0.0
appeal to a sense of community,0,0,7,0.0,0.0,0.0
appeal to authority,0,2,0,0.0,0.0,0.0
appeal to common folks,0,1,0,0.0,0.0,0.0
appeal to consequences,0,5,0,0.0,0.0,0.0
appeal to emotions,0,0,11,0.0,0.0,0.0
appeal to equality,0,0,2,0.0,0.0,0.0
appeal to fear,0,2,0,0.0,0.0,0.0
appeal to history,0,0,2,0.0,0.0,0.0


**Weighted Precision:** 1.222222222222222

**Weighted Recall:** 1.3333333333333335

**Weighted F1 Score:** 1.2753623188405796

In [13]:
speaker = "putin"
metrics[speaker] = {}
# Build the per-label table first; the weighted averages are derived from it.
metrics[speaker].update({
    "by_labels": display_metrics(evaluate_spans(ground_truth[speaker], ai_labels[speaker])),
})
metrics[speaker].update({
    "average": weighted_average_metrics(metrics[speaker]["by_labels"]),
})

# "average" holds (precision, recall, F1 score), in that order.
display(Markdown(f'### {speaker.capitalize()} Text Metrics'))
display(metrics[speaker]["by_labels"])
display(Markdown(f'**Weighted Precision:** {metrics[speaker]["average"][0]}'))
display(Markdown(f'**Weighted Recall:** {metrics[speaker]["average"][1]}'))
display(Markdown(f'**Weighted F1 Score:** {metrics[speaker]["average"][2]}'))

### Putin Text Metrics

Unnamed: 0,True positive,False positive,False negative,Precision,Recall,F1 Score
accusation,0,2,4,0.0,0.0,0.0
ad hominem,0,3,0,0.0,0.0,0.0
appeal to a sense of community,0,0,1,0.0,0.0,0.0
appeal to authority,0,2,0,0.0,0.0,0.0
appeal to consequences,0,2,0,0.0,0.0,0.0
appeal to emotion,0,2,0,0.0,0.0,0.0
appeal to emotions,0,2,2,0.0,0.0,0.0
appeal to equality,0,0,1,0.0,0.0,0.0
appeal to reason,0,1,0,0.0,0.0,0.0
appeal to tradition,0,2,0,0.0,0.0,0.0


**Weighted Precision:** 0.33333333333333337

**Weighted Recall:** 0.6666666666666667

**Weighted F1 Score:** 0.4444444444444444

In [14]:
# Unweighted mean, across speakers, of each speaker's weighted averages.
display(Markdown(f'**Average Precision:** {np.mean([entry["average"][0] for entry in metrics.values()])}'))
display(Markdown(f'**Average Recall:** {np.mean([entry["average"][1] for entry in metrics.values()])}'))
display(Markdown(f'**Average F1 Score:** {np.mean([entry["average"][2] for entry in metrics.values()])}'))

**Average Precision:** 1.5673667895890118

**Average Recall:** 2.5185185185185186

**Average F1 Score:** 1.8683548567038197