## Evaluate Classifiers

In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk, concatenate_datasets
from sklearn.metrics import accuracy_score
import random
from tqdm import tqdm
import numpy as np
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


## Set Random Seed for Reproducibility

In [2]:
# Single source of truth for every random number generator seeded below
SEED = 42

# Python's built-in `random` module
random.seed(SEED)

# NumPy's legacy global generator
np.random.seed(SEED)

# PyTorch: the default generator plus every CUDA device.
# `manual_seed_all` covers multi-GPU machines and is silently ignored when
# CUDA is not available.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and switch off its benchmark mode
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Define Parameters

In [3]:
# Run configuration. SAMPLING, SUFFIX, MAX_CONTENT_LENGTH and FEATURES are all
# interpolated into the path of the predictions dataset that is loaded further
# down, so they select which stored dataset gets evaluated.
SAMPLING = "random" # "random", "stratified", "clustered", "shared_domain"
SUFFIX = "_extended" #"", "_holdout", "_extended"
# Dataset splits that are evaluated, in the order in which they are reported
SPLITS = ['train', 'test', 'holdout', 'extended', 'holdout_url', 'extended_url']
MAX_CONTENT_LENGTH = 384 # 496, 192
# NOTE(review): OVERLAP is not referenced by any cell visible in this notebook;
# presumably it mirrors the chunk overlap used upstream — confirm or remove.
OVERLAP = 64
FEATURES = "url_and_content" # "url", "content", "url_and_content"

In [4]:
# Topic and model under evaluation. TOPIC and the part of MODEL after the "/"
# are interpolated into the dataset path; both appear in every printed
# metrics line.
TOPIC = "cannabis" # "cannabis", "kinder", "energie"
MODEL = "deepset/gbert-large"

## Helpers

In [5]:
from tabulate import tabulate
from IPython.display import display, HTML

def display_metrics_table(eval_results):
    """
    Render one table row per split with its metrics and example count.

    Parameters:
    eval_results (dict): Maps a split name (e.g., 'train', 'test') to a dictionary
                         holding a 'metrics' dictionary and a 'count'. Missing
                         metrics are shown as 0.0 and a missing count as 0.
    """
    column_titles = ["Split", "Accuracy", "Precision", "Recall", "F1 Score", "Count"]
    metric_keys = ("accuracy", "precision", "recall", "f1")

    # One row per split: name, the four metrics in column order, then the count
    table_rows = []
    for split_name, split_data in eval_results.items():
        split_metrics = split_data.get("metrics", {})
        metric_values = [split_metrics.get(key, 0.0) for key in metric_keys]
        table_rows.append([split_name, *metric_values, split_data.get("count", 0)])

    # Build the HTML once and hand it to the notebook's rich display
    display(HTML(tabulate(table_rows, headers=column_titles, tablefmt="html", floatfmt=".3f")))


## Evaluate Models

In [6]:
def calc_metrics(labels, preds):
    """
    Calculate accuracy, precision, recall, and F1 score for binary predictions.

    Parameters:
    labels: Ground-truth class labels.
    preds: Predicted class labels, aligned with `labels`.

    Returns:
    dict: The keys 'accuracy', 'precision', 'recall' and 'f1' mapped to their scores.
          Precision, recall and F1 are computed with `average='binary'`, i.e. for
          the positive class only.

    A score whose denominator is zero (e.g. recall on a split without any
    positive label) is reported as 0.0. This is the value scikit-learn already
    falls back to; passing `zero_division=0` makes the choice explicit and
    avoids an UndefinedMetricWarning for every such split.
    """
    # Shared settings so the three positive-class scores are always computed alike
    binary_kwargs = {"average": "binary", "zero_division": 0}

    metrics = {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds, **binary_kwargs),
        'recall': recall_score(labels, preds, **binary_kwargs),
        'f1': f1_score(labels, preds, **binary_kwargs),
    }

    return metrics

## Load Predictions

In [7]:
# Assemble the on-disk location of the dataset that already carries the model predictions
model_name = MODEL.split('/')[1]
dataset_variant = f"{TOPIC}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}"
dataset_path = f"../../data_ccu/tmp/processed_dataset_{dataset_variant}_with_urls_{model_name}_{FEATURES}_with_predictions"
dataset = load_from_disk(dataset_path)

In [8]:
# Show the splits, columns and row counts of the loaded dataset
dataset

DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 3815
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 507
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 33702
    })
    extended: Datas

In [9]:
# Peek at the first example of the test split to check the available fields
dataset["test"][0]

{'_id': '64a09474749484eec86957cf',
 'batch_id': 16,
 'domain': 'cheezburger.com',
 'view_url': 'amp.cheezburger.com/21165573/with-her-own-card-manager-makes-hotel-desk-worker-pay-for-a-guests-room-when-he-refuses-to',
 'lang': 'en',
 'text': "Log In Cheezburger Search Submit FAIL Blog Channels FAIL Blog After 12 Autocowrecks Dating Fails FAIL Nation Failbook Monday Thru Friday Music Parenting Poorly Dressed School of Fail There, I Fixed It Ugliest Tattoos WIN! Cheezburger Channels I Can Has FAIL Blog Memebase Animal Comedy Geek Universe CheezCake Loquillo FAIL Blog search email community favorite this article chev-right latest posts article list comments tags video article login twitter facebook menu pinterest whatsapp 'With HER own card': Manager makes hotel desk worker pay for a guest's room when he refuses to Businesses and their owners take on more of the risk and, in return, take more of the reward, paying workers a small portion of the value of their production and taking the re

## Get Chunk Level Predictions

In [10]:
# Per-split results, plus flat accumulators that pool every split
preds_and_labels_per_chunk = {}
labels_per_chunk, preds_per_chunk = [], []

for split in SPLITS:

    # Gold labels and model predictions of every chunk in this split
    labels, preds = dataset[split]["label"], dataset[split]["preds"]

    # Register the split and add its chunks to the pooled lists
    split_entry = {"preds": preds, "labels": labels, "count": len(labels)}
    preds_and_labels_per_chunk[split] = split_entry
    labels_per_chunk.extend(labels)
    preds_per_chunk.extend(preds)

    # Score the split and keep the result next to its predictions
    metrics = calc_metrics(labels, preds)
    split_entry["metrics"] = metrics
    print(f"Metrics for {MODEL} on {TOPIC} for split {split}:\n {metrics}\n")

Metrics for deepset/gbert-large on cannabis for split train:
 {'accuracy': 0.9955439056356488, 'precision': 0.9925373134328358, 'recall': 0.99899849774662, 'f1': 0.9957574245071126}

Metrics for deepset/gbert-large on cannabis for split test:
 {'accuracy': 0.9842209072978304, 'precision': 0.9763313609467456, 'recall': 1.0, 'f1': 0.9880239520958084}

Metrics for deepset/gbert-large on cannabis for split holdout:
 {'accuracy': 0.9857871936383598, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Metrics for deepset/gbert-large on cannabis for split extended:
 {'accuracy': 0.9970187374575615, 'precision': 0.2591362126245847, 'recall': 0.9957446808510638, 'f1': 0.4112478031634446}

Metrics for deepset/gbert-large on cannabis for split holdout_url:
 {'accuracy': 0.9990859232175503, 'precision': 0.8, 'recall': 1.0, 'f1': 0.8888888888888888}

Metrics for deepset/gbert-large on cannabis for split extended_url:
 {'accuracy': 0.999518060064935, 'precision': 0.9348837209302325, 'recall': 0.9757281553398058, 'f1': 0.9548693586698337}



In [11]:
# Pooled chunk-level metrics over the accumulated labels and predictions.
# NOTE(review): `labels_per_chunk` / `preds_per_chunk` are re-assigned by a
# later cell, so this is only "all splits in SPLITS" when cells run top to bottom.
metrics = calc_metrics(labels_per_chunk, preds_per_chunk)
print(f"Metrics for {MODEL} on {TOPIC} for all splits:\n {metrics}\n")

Metrics for deepset/gbert-large on cannabis for all splits:
 {'accuracy': 0.9960630310704005, 'precision': 0.699746835443038, 'recall': 0.9971139971139971, 'f1': 0.82237429336507}



In [12]:
# Chunk-level summary table with one row per split
display_metrics_table(preds_and_labels_per_chunk)

Split,Accuracy,Precision,Recall,F1 Score,Count
train,0.996,0.993,0.999,0.996,3815
test,0.984,0.976,1.0,0.988,507
holdout,0.986,0.0,0.0,0.0,33702
extended,0.997,0.259,0.996,0.411,224737
holdout_url,0.999,0.8,1.0,0.889,1094
extended_url,1.0,0.935,0.976,0.955,39424


In [13]:
# Pool the labels of every split except train
labels_per_chunk_except_train = [
    label
    for split in SPLITS
    if split != 'train'
    for label in preds_and_labels_per_chunk[split].get("labels", [])
]

# Pool the predictions of every split except train (same split order as the labels)
preds_per_chunk_except_train = [
    prediction
    for split in SPLITS
    if split != 'train'
    for prediction in preds_and_labels_per_chunk[split].get("preds", [])
]

# Score the pooled evaluation data. The message names the excluded split so
# the output cannot be mistaken for the metrics that include train.
metrics = calc_metrics(labels_per_chunk_except_train, preds_per_chunk_except_train)
print(f"Metrics for {MODEL} on {TOPIC} for all splits except train:\n {metrics}\n")

Metrics for deepset/gbert-large on cannabis for all splits:
 {'accuracy': 0.9960696444313841, 'precision': 0.39639175257731957, 'recall': 0.9922580645161291, 'f1': 0.5664825046040516}



In [14]:
# Sanity check: list the splits that have chunk-level results
preds_and_labels_per_chunk.keys()

dict_keys(['train', 'test', 'holdout', 'extended', 'holdout_url', 'extended_url'])

In [15]:
# Which original splits feed each merged evaluation set, in concatenation order.
# NOTE(review): "test" stays a set of its own and is also part of "holdout".
chunk_merge_plan = {
    "test": ["test"],
    #### Unbalanced test set ####
    "holdout": ["holdout_url", "holdout", "test"],
    #### Extended test set ####
    "extended": ["extended_url", "extended"],
}

# Build every merged entry from brand-new dicts and lists. A shallow
# `dict.copy()` would share the inner per-split dicts with
# `preds_and_labels_per_chunk`, so assigning the merged lists would overwrite
# the original "holdout"/"extended" entries and make them grow on every re-run
# of this cell.
preds_and_labels_per_chunk_merged = {}
for merged_split, source_splits in chunk_merge_plan.items():
    merged_entry = {
        field: [
            value
            for source_split in source_splits
            for value in preds_and_labels_per_chunk[source_split][field]
        ]
        for field in ("preds", "labels")
    }
    merged_entry["count"] = len(merged_entry["labels"])
    preds_and_labels_per_chunk_merged[merged_split] = merged_entry

print(preds_and_labels_per_chunk_merged.keys())

dict_keys(['test', 'holdout', 'extended'])


In [16]:
# Start the pooled lists afresh for the merged evaluation sets
labels_per_chunk = []
preds_per_chunk = []

for split, split_entry in preds_and_labels_per_chunk_merged.items():

    # Labels and predictions that make up this merged set
    labels = split_entry["labels"]
    preds = split_entry["preds"]
    labels_per_chunk.extend(labels)
    preds_per_chunk.extend(preds)

    # Score the set and store metrics and size alongside its data
    metrics = calc_metrics(labels, preds)
    split_entry["metrics"] = metrics
    split_entry["count"] = len(labels)
    print(f"Metrics for {MODEL} on {TOPIC} for split {split}:\n {metrics}\n")

Metrics for deepset/gbert-large on cannabis for split test:
 {'accuracy': 0.9842209072978304, 'precision': 0.9763313609467456, 'recall': 1.0, 'f1': 0.9880239520958084}

Metrics for deepset/gbert-large on cannabis for split holdout:
 {'accuracy': 0.9861768121689375, 'precision': 0.40632603406326034, 'recall': 1.0, 'f1': 0.5778546712802768}

Metrics for deepset/gbert-large on cannabis for split extended:
 {'accuracy': 0.997391742157245, 'precision': 0.389087656529517, 'recall': 0.9863945578231292, 'f1': 0.5580500320718409}



In [17]:

# Pool the labels of all merged evaluation sets. They are read from the merged
# dict itself; looking the merged keys up in `preds_and_labels_per_chunk` only
# gives the merged data if both dicts happen to share their inner entries.
# NOTE(review): the "holdout" set is built to include the test chunks, so
# pooling "test" and "holdout" counts those chunks twice — confirm this is intended.
labels_per_chunk_except_merged = [
    label
    for split_data in preds_and_labels_per_chunk_merged.values()
    for label in split_data.get("labels", [])
]

# Pool the predictions in the same set order as the labels
preds_per_chunk_except_merged = [
    prediction
    for split_data in preds_and_labels_per_chunk_merged.values()
    for prediction in split_data.get("preds", [])
]

# Score the pooled merged sets
metrics = calc_metrics(labels_per_chunk_except_merged, preds_per_chunk_except_merged)
print(f"Metrics for {MODEL} on {TOPIC} for all merged splits:\n {metrics}\n")

Metrics for deepset/gbert-large on cannabis for all splits:
 {'accuracy': 0.9960496181297526, 'precision': 0.48244073748902544, 'recall': 0.9945701357466064, 'f1': 0.6497191841560744}



In [18]:
# Chunk-level summary table for the merged evaluation sets
display_metrics_table(preds_and_labels_per_chunk_merged)

Split,Accuracy,Precision,Recall,F1 Score,Count
test,0.984,0.976,1.0,0.988,507
holdout,0.986,0.406,1.0,0.578,35303
extended,0.997,0.389,0.986,0.558,264161


In [19]:
# # Identify all topics (assuming all models are evaluated on the same topics)
# topics = list(next(iter(eval_results.values())).keys())

# # Prepare headers for the table: each topic will have four metrics
# headers = ["Model"] + [f"{topic} {metric}" for topic in topics for metric in ["Acc.", "Prec.", "Rec.", "F1"]]

# # Prepare rows: one row per model, containing metrics for each topic
# rows = []
# for model, topics_metrics in eval_results.items():
#     row = [model]  # Start with the model name
#     for topic in topics:
#         metrics = topics_metrics.get(topic, {})
#         row.extend([metrics.get('accuracy',0.0), metrics.get('precision',0.0), metrics.get('recall',0.0), metrics.get('f1',0.0)])
#     rows.append(row)

# # Generate the HTML table
# table_html = tabulate(rows, headers=headers, tablefmt="html", showindex="never", floatfmt=".3f")

In [20]:
# from IPython.display import display, HTML
# display(HTML(table_html))


## Get Page Level Predictions

In [21]:
from collections import Counter

In [22]:
def majority_voting(answers):
    """Return the most frequent answer; ties go to the largest label, empty input gives 0."""
    if not answers:
        return 0  # Nothing to vote on

    votes = Counter(answers)
    top_count = max(votes.values())

    # Every class that reached the highest vote count; `max` settles ties
    # and simply returns the sole winner when there is no tie
    winners = [cls for cls, n_votes in votes.items() if n_votes == top_count]
    return max(winners)

majority_voting([1, 1, 2, 2, 2, 3])

2

In [23]:
# Per-split page-level results, plus flat accumulators that pool every split
preds_and_labels_per_page = {}
labels_per_page = []
preds_per_page = []

# Iterate over the splits
for split in SPLITS:

    # Group the chunks of a page by its URL, falling back to the domain when no URL is set
    grouped_dataset = {}
    for example in tqdm(dataset[split], desc=split):
        url = example.get("view_url") or example.get("domain")
        example_filtered = {k: example[k] for k in ["text", "domain", "preds", "label", "category", "annotation_type", "lang"]}
        grouped_dataset.setdefault(url, []).append(example_filtered)

    # Derive one label and one prediction per page in a single pass over the groups
    labels = []
    predictions = []
    for chunks in grouped_dataset.values():
        # Page label: the highest chunk label, so one positive chunk makes the page positive
        labels.append(max(chunk["label"] for chunk in chunks))

        # Page prediction: majority vote among the positive chunk predictions,
        # or 0 when no chunk was predicted positive
        positive_preds = [chunk["preds"] for chunk in chunks if chunk["preds"] > 0]
        predictions.append(majority_voting(positive_preds) if positive_preds else 0)

    # Store the predictions and labels for the current split
    preds_and_labels_per_page[split] = {"preds": predictions, "labels": labels, "count": len(labels)}
    labels_per_page.extend(labels)
    preds_per_page.extend(predictions)

    # Score the page-level predictions; the split name is printed so the
    # outputs of the different splits can be told apart
    metrics = calc_metrics(labels, predictions)
    preds_and_labels_per_page[split]["metrics"] = metrics
    print(f"Metrics for {MODEL} on {TOPIC} for split {split}: {metrics}")
    

100%|██████████| 3815/3815 [00:00<00:00, 6240.26it/s]


Metrics for deepset/gbert-large on cannabis: {'accuracy': 0.9796954314720813, 'precision': 0.9593908629441624, 'recall': 1.0, 'f1': 0.9792746113989638}


100%|██████████| 507/507 [00:00<00:00, 6526.05it/s]


Metrics for deepset/gbert-large on cannabis: {'accuracy': 0.9534883720930233, 'precision': 0.9090909090909091, 'recall': 1.0, 'f1': 0.9523809523809523}


100%|██████████| 33702/33702 [00:05<00:00, 6366.52it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Metrics for deepset/gbert-large on cannabis: {'accuracy': 0.9514675966288869, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}


100%|██████████| 224737/224737 [00:36<00:00, 6209.41it/s]


Metrics for deepset/gbert-large on cannabis: {'accuracy': 0.9932232487745375, 'precision': 0.0856269113149847, 'recall': 0.9655172413793104, 'f1': 0.15730337078651685}


100%|██████████| 1094/1094 [00:00<00:00, 5688.45it/s]


Metrics for deepset/gbert-large on cannabis: {'accuracy': 0.9990850869167429, 'precision': 0.8, 'recall': 1.0, 'f1': 0.8888888888888888}


100%|██████████| 39424/39424 [00:07<00:00, 5374.73it/s]


Metrics for deepset/gbert-large on cannabis: {'accuracy': 0.9995144762732221, 'precision': 0.9342723004694836, 'recall': 0.9754901960784313, 'f1': 0.9544364508393285}


In [24]:
# Pooled page-level metrics over the accumulated labels and predictions.
# NOTE(review): `labels_per_page` / `preds_per_page` are re-assigned by a
# later cell, so this is only "all splits in SPLITS" when cells run top to bottom.
metrics = calc_metrics(labels_per_page, preds_per_page)
print(f"Metrics for {MODEL} on {TOPIC} for all splits:\n {metrics}\n")

Metrics for deepset/gbert-large on cannabis for all splits:
 {'accuracy': 0.9943761103504464, 'precision': 0.47261009667024706, 'recall': 0.9865470852017937, 'f1': 0.6390704429920117}



In [25]:
# Page-level summary table with one row per split
display_metrics_table(preds_and_labels_per_page)

Split,Accuracy,Precision,Recall,F1 Score,Count
train,0.98,0.959,1.0,0.979,394
test,0.953,0.909,1.0,0.952,43
holdout,0.951,0.0,0.0,0.0,3441
extended,0.993,0.086,0.966,0.157,44269
holdout_url,0.999,0.8,1.0,0.889,1093
extended_url,1.0,0.934,0.975,0.954,39133


In [26]:
# Sanity check: list the splits that have page-level results
preds_and_labels_per_page.keys()

dict_keys(['train', 'test', 'holdout', 'extended', 'holdout_url', 'extended_url'])

In [27]:
# Pool the page labels of every split except train
labels_per_page_except_train = [
    label
    for split in SPLITS
    if split != 'train'
    for label in preds_and_labels_per_page[split].get("labels", [])
]

# Pool the page predictions of every split except train (same split order as the labels)
preds_per_page_except_train = [
    prediction
    for split in SPLITS
    if split != 'train'
    for prediction in preds_and_labels_per_page[split].get("preds", [])
]

# Score the pooled evaluation data. The message names the excluded split so
# the output cannot be mistaken for the metrics that include train.
metrics = calc_metrics(labels_per_page_except_train, preds_per_page_except_train)
print(f"Metrics for {MODEL} on {TOPIC} for all splits except train:\n {metrics}\n")

Metrics for deepset/gbert-large on cannabis for all splits:
 {'accuracy': 0.9944418554427761, 'precision': 0.3419618528610354, 'recall': 0.9766536964980544, 'f1': 0.5065590312815338}



In [28]:
# Which original splits feed each merged evaluation set, in concatenation order.
# NOTE(review): "test" stays a set of its own and is also part of "holdout".
page_merge_plan = {
    "test": ["test"],
    #### Unbalanced test set ####
    "holdout": ["holdout_url", "holdout", "test"],
    #### Extended test set ####
    "extended": ["extended_url", "extended"],
}

# Build every merged entry from brand-new dicts and lists. A shallow
# `dict.copy()` would share the inner per-split dicts with
# `preds_and_labels_per_page`, so assigning the merged lists would overwrite
# the original "holdout"/"extended" entries and make them grow on every re-run
# of this cell.
preds_and_labels_per_page_merged = {}
for merged_split, source_splits in page_merge_plan.items():
    merged_entry = {
        field: [
            value
            for source_split in source_splits
            for value in preds_and_labels_per_page[source_split][field]
        ]
        for field in ("preds", "labels")
    }
    merged_entry["count"] = len(merged_entry["labels"])
    preds_and_labels_per_page_merged[merged_split] = merged_entry

print(preds_and_labels_per_page_merged.keys())

dict_keys(['test', 'holdout', 'extended'])


In [29]:
# Start the pooled lists afresh for the merged evaluation sets
labels_per_page = []
preds_per_page = []

for split, split_entry in preds_and_labels_per_page_merged.items():

    # Page labels and predictions that make up this merged set
    labels = split_entry["labels"]
    preds = split_entry["preds"]
    labels_per_page.extend(labels)
    preds_per_page.extend(preds)

    # Score the set and store metrics and size alongside its data
    metrics = calc_metrics(labels, preds)
    split_entry["metrics"] = metrics
    split_entry["count"] = len(labels)
    print(f"Metrics for {MODEL} on {TOPIC} for split {split}:\n {metrics}\n")

Metrics for deepset/gbert-large on cannabis for split test:
 {'accuracy': 0.9534883720930233, 'precision': 0.9090909090909091, 'recall': 1.0, 'f1': 0.9523809523809523}

Metrics for deepset/gbert-large on cannabis for split holdout:
 {'accuracy': 0.9628577670963513, 'precision': 0.12371134020618557, 'recall': 1.0, 'f1': 0.22018348623853212}

Metrics for deepset/gbert-large on cannabis for split extended:
 {'accuracy': 0.9961751516750198, 'precision': 0.4203703703703704, 'recall': 0.9742489270386266, 'f1': 0.5873221216041398}



In [30]:

# Pool the page labels of all merged evaluation sets. They are read from the
# merged dict itself; looking the merged keys up in `preds_and_labels_per_page`
# only gives the merged data if both dicts happen to share their inner entries.
# NOTE(review): the "holdout" set is built to include the test pages, so
# pooling "test" and "holdout" counts those pages twice — confirm this is intended.
labels_per_page_merged = [
    label
    for split_data in preds_and_labels_per_page_merged.values()
    for label in split_data.get("labels", [])
]

# Pool the page predictions in the same set order as the labels
preds_per_page_merged = [
    prediction
    for split_data in preds_and_labels_per_page_merged.values()
    for prediction in split_data.get("preds", [])
]

# Score the pooled merged sets
metrics = calc_metrics(labels_per_page_merged, preds_per_page_merged)
print(f"Metrics for {MODEL} on {TOPIC} for all merged splits:\n {metrics}\n")

Metrics for deepset/gbert-large on cannabis for all splits:
 {'accuracy': 0.9944218490831838, 'precision': 0.3584656084656085, 'recall': 0.9783393501805054, 'f1': 0.5246853823814134}



In [31]:
# Page-level summary table for the merged evaluation sets
display_metrics_table(preds_and_labels_per_page_merged)

Split,Accuracy,Precision,Recall,F1 Score,Count
test,0.953,0.909,1.0,0.952,43
holdout,0.963,0.124,1.0,0.22,4577
extended,0.996,0.42,0.974,0.587,83402


In [32]:
# # Identify all topics (assuming all models are evaluated on the same topics)
# topics = list(next(iter(eval_results_pages.values())).keys())

# # Prepare headers for the table: each topic will have four metrics
# headers = ["Model"] + [f"{topic} {metric}" for topic in topics for metric in ["Acc.", "Prec.", "Rec.", "F1"]]

# # Prepare rows: one row per model, containing metrics for each topic
# rows = []
# for model, topics_metrics in eval_results_pages.items():
#     row = [model]  # Start with the model name
#     for topic in topics:
#         metrics = topics_metrics.get(topic, {})
#         row.extend([metrics.get('accuracy',0.0), metrics.get('precision',0.0), metrics.get('recall',0.0), metrics.get('f1',0.0)])
#     rows.append(row)

# # Generate the HTML table
# table_html = tabulate(rows, headers=headers, tablefmt="html", showindex="never", floatfmt=".3f")

In [33]:
#display(HTML(table_html))