## Evaluate Multiple Classifiers

In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from IPython.display import display, HTML
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk, concatenate_datasets
from sklearn.metrics import accuracy_score
import random
from tqdm import tqdm
import numpy as np
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


## Set Random Seed for Reproducibility

In [2]:
# Single seed shared by every random number generator seeded below, so the
# value only has to be changed in one place.
SEED = 42

# Seed Python's built-in random module
random.seed(SEED)

# Seed NumPy's global random state
np.random.seed(SEED)

# Seed PyTorch, including the explicit CUDA seeding call
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# cuDNN flags: request deterministic mode and switch benchmark mode off
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Define Parameters

In [3]:
# Run configuration. process_topic passes every value below except OVERLAP to
# load_and_merge_datasets, which interpolates them into the on-disk dataset path.
SAMPLING = "random" # alternatives: "random", "stratified", "clustered", "shared_domain"
SUFFIX = "_extended" # alternatives: "", "_holdout", "_extended"
SPLITS = ["test", "holdout", "extended"] # any of: "train", "test", "holdout", "extended"
MAX_CONTENT_LENGTH = 384 # other values noted by the author: 496, 192
OVERLAP = 64 # NOTE(review): not referenced by any other cell shown in this notebook
FEATURES = "url_and_content" # alternatives: "url", "content", "url_and_content"

In [4]:
# Topics to evaluate; every topic is processed for every entry of MODELS.
TOPICS = ["cannabis", "kinder", "energie"]
# Single-topic alternative (disabled):
#TOPICS = ["cannabis"]

In [5]:
# Model identifiers to evaluate, written as "<organisation>/<model>".
# The dataset loaders use the part after the "/" when building paths.
MODELS = [
    # "distilbert/distilbert-base-multilingual-cased",  (disabled)
    "google-bert/bert-base-multilingual-cased",
    "FacebookAI/xlm-roberta-base",
    "FacebookAI/xlm-roberta-large",
    "dbmdz/bert-base-german-uncased",
    "deepset/gelectra-large",
    "deepset/gelectra-base",
    "deepset/gbert-large",
    "deepset/gbert-base",
]

## Helpers

In [6]:
def calc_metrics(labels, preds):
    """
    Score ``preds`` against ``labels`` and return the results as a dictionary.

    The returned dict has the keys 'accuracy', 'precision', 'recall' and 'f1'
    (in that order). Precision, recall and F1 are computed with
    ``average='binary'``.
    """
    # Scorers that share the same call signature and averaging mode.
    binary_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )

    metrics = {'accuracy': accuracy_score(labels, preds)}
    for metric_name, scorer in binary_scorers:
        metrics[metric_name] = scorer(labels, preds, average='binary')

    return metrics

In [7]:
def sample_random_from_dataset(dataset, n=5, subset='test'):
    """
    Draw up to ``n`` distinct random examples from ``dataset[subset]``.

    If the subset holds fewer than ``n`` examples, all of them are returned
    (in random order). The indices are drawn with the ``random`` module and the
    result is whatever ``dataset[subset].select(indices)`` returns.
    """
    split = dataset[subset]
    split_size = len(split)
    sample_size = min(n, split_size)
    chosen_indices = random.sample(range(split_size), sample_size)
    return split.select(chosen_indices)

## Page Level Predictions

In [8]:
from collections import Counter
from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt
import numpy as np

In [9]:
from datasets import DatasetDict, concatenate_datasets, load_from_disk

def load_and_merge_datasets(topic, model_name, sampling, suffix, max_content_length, features, splits):
    """Load each requested split from disk and concatenate them into one dataset.

    For every entry of ``splits`` a dataset is loaded from
    ``../../data_ccu/tmp/<name>/<name>``, where ``<name>`` is built from the
    arguments and the second "/"-separated component of ``model_name``. The
    loaded object is expected to contain the split under its own name.
    Load errors and missing splits are reported with ``print`` and skipped.

    Returns the concatenation of all splits that were found, or ``None`` (after
    printing a message) when none could be collected.
    """
    collected = []

    for split in splits:
        model_short = model_name.split('/')[1]
        # The dataset directory is nested inside a directory of the same name.
        dataset_name = (
            f"processed_dataset_{topic}_buffed_chunkified_{sampling}{suffix}"
            f"_{max_content_length}_s_{model_short}_{features}_{split}"
        )
        path = f"../../data_ccu/tmp/{dataset_name}/{dataset_name}"
        try:
            dataset = load_from_disk(path)
            if split not in dataset:
                print(f"Warning: Split '{split}' not found in the loaded dataset from {path}.")
            else:
                collected.append(dataset[split])
        except Exception as e:
            print(f"Error loading split '{split}' from path '{path}': {e}")

    if not collected:
        print("No valid splits provided for merging.")
        return None

    return concatenate_datasets(collected)

def load_dataset(topic, model_name, sampling, suffix, max_content_length, features, split):
    """Load one processed dataset directory from ``../../data/tmp``.

    The directory name is assembled from the arguments; only the second
    "/"-separated component of ``model_name`` is used. Returns whatever
    ``load_from_disk`` returns for that path.

    NOTE(review): this definition rebinds the name ``load_dataset`` that the
    first cell imports from ``datasets``.
    """
    model_short = model_name.split('/')[1]
    directory_name = (
        f"processed_dataset_{topic}_buffed_chunkified_{sampling}{suffix}"
        f"_{max_content_length}_s_{model_short}_{features}_{split}"
    )
    return load_from_disk(f"../../data/tmp/{directory_name}")

def merge_dataset_splits(dataset_dict, splits):
    """Concatenate the requested splits of ``dataset_dict`` into one dataset.

    Splits that are not present in ``dataset_dict`` are reported with a warning
    and skipped. Returns the concatenated dataset, or ``None`` (after printing
    a message) when none of the requested splits exist.
    """
    available = []

    for split_name in splits:
        if split_name not in dataset_dict:
            print(f"Warning: Split '{split_name}' not found in dataset_dict.")
            continue
        available.append(dataset_dict[split_name])

    if not available:
        print("No valid splits provided for merging.")
        return None

    return concatenate_datasets(available)


from collections import defaultdict
from tqdm import tqdm

def group_dataset_by_url(dataset):
    """Group chunk-level examples by the page they belong to.

    Each example is keyed by its ``view_url``; when that is missing or empty
    the ``domain`` value is used instead. Only a fixed set of fields is kept
    per chunk.

    Args:
        dataset: Iterable of dict-like examples. Every example must provide
            "text", "domain", "preds", "label", "category", "annotation_type"
            and "lang"; a missing one raises ``KeyError``. "probas" is optional.

    Returns:
        A plain dict mapping each URL (or domain) to the list of its filtered
        chunk dicts, in the order they were encountered.
    """
    required_keys = ("text", "domain", "preds", "label", "category", "annotation_type", "lang")
    # merge_probabilities reads chunk["probas"], so the field has to survive
    # the filtering. It is copied only when present, so datasets without
    # probabilities produce exactly the same chunks as before.
    optional_keys = ("probas",)

    grouped_dataset = defaultdict(list)

    for example in dataset:
        url = example.get("view_url") or example.get("domain")
        chunk = {key: example[key] for key in required_keys}
        chunk.update({key: example[key] for key in optional_keys if key in example})
        grouped_dataset[url].append(chunk)

    return dict(grouped_dataset)


def extract_labels(grouped_dataset):
    """Return one label per group: the maximum ``label`` over the group's chunks.

    The result follows the iteration order of ``grouped_dataset``.
    """
    return [
        max([chunk["label"] for chunk in chunks])
        for chunks in grouped_dataset.values()
    ]

def merge_predictions(grouped_dataset):
    """Return one prediction per group: the maximum ``preds`` over the group's chunks.

    The result follows the iteration order of ``grouped_dataset``.
    """
    return [
        max([chunk["preds"] for chunk in chunks])
        for chunks in grouped_dataset.values()
    ]

def merge_probabilities(grouped_dataset):
    """Return one value per group: the maximum ``probas`` over the group's chunks.

    Every chunk must carry a "probas" entry. The result follows the iteration
    order of ``grouped_dataset``.
    """
    return [
        max(chunk["probas"] for chunk in chunks)
        for chunks in grouped_dataset.values()
    ]

def plot_precision_recall_curve(recall, precision, pr_auc, model_name, topic, splits):
    """Plot a precision-recall curve, save it as a PNG and show it.

    The curve is drawn on a new figure with ``recall`` on the x-axis and
    ``precision`` on the y-axis; ``pr_auc`` appears in the legend with two
    decimals. The image is written to the current working directory at 300 dpi
    under a name built from the second "/"-separated component of
    ``model_name``, ``topic`` and the joined ``splits`` (spaces become
    underscores).
    """
    plt.figure()
    plt.plot(recall, precision, label=f'PR AUC = {pr_auc:.2f}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'P-R Curve {model_name} - {topic}')
    plt.legend(loc='best')

    # Assemble the output file name only after plotting, as before.
    model_short = model_name.split("/")[1]
    split_tag = "_".join(splits)
    raw_name = f'precision_recall_curve_{model_short}_{topic}_{split_tag}.png'
    filename = raw_name.replace(' ', '_')

    plt.savefig(filename, dpi=300)
    plt.show()

**PR-Curves per Model:**

In [10]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed

# Processed data, filled by the executor loop below as
# data_dict[model][topic] = (labels, predictions).
data_dict = defaultdict(dict)

def process_topic(MODEL, topic):
    """Load the data for one (model, topic) pair and reduce it to page level.

    The splits configured in ``SPLITS`` are loaded and merged, the chunks are
    grouped per URL, and one label and one prediction are derived per group.

    Args:
        MODEL: Model identifier in the form "<organisation>/<model>".
        topic: Topic name used in the dataset path.

    Returns:
        ``(MODEL, topic, labels, predictions)`` on success, or
        ``(MODEL, topic, None, None)`` if anything went wrong. Failures are
        printed rather than raised so that one bad pair does not abort the
        other worker tasks.
    """
    print(f"\n\n###### Loading and processing data for {topic} and model {MODEL} ###### \n\n")

    try:
        # Load and merge datasets
        dataset = load_and_merge_datasets(topic, MODEL, SAMPLING, SUFFIX, MAX_CONTENT_LENGTH, FEATURES, SPLITS)
        if dataset is None:
            # load_and_merge_datasets returns None when no split could be
            # collected; report that explicitly instead of failing later with
            # "'NoneType' object is not iterable".
            raise ValueError(f"none of the splits {SPLITS} could be loaded")

        # Group dataset by URL
        grouped_dataset = group_dataset_by_url(dataset)

        # Derive one label and one prediction per URL
        labels = extract_labels(grouped_dataset)
        predictions = merge_predictions(grouped_dataset)

        print(f"Finished processing {MODEL} on {topic}.")
        return (MODEL, topic, labels, predictions)

    except Exception as e:
        print(f"An error occurred while processing {MODEL} on {topic}: {e}")
        return (MODEL, topic, None, None)

# Submit one worker task per (model, topic) pair and store each successful
# result in data_dict as soon as its task finishes.
with ProcessPoolExecutor(max_workers=24) as executor:
    future_to_topic = {
        executor.submit(process_topic, MODEL, topic): (MODEL, topic)
        for MODEL in MODELS
        for topic in TOPICS
    }
    for future in as_completed(future_to_topic):
        MODEL, topic = future_to_topic[future]
        # process_topic returns (MODEL, topic, labels, predictions).
        labels, predictions = future.result()[2:4]
        if labels is None or predictions is None:
            continue
        data_dict[MODEL][topic] = (labels, predictions)

#print(data_dict)




###### Loading and processing data for cannabis and model google-bert/bert-base-multilingual-cased ###### 



###### Loading and processing data for energie and model google-bert/bert-base-multilingual-cased ###### 



###### Loading and processing data for cannabis and model FacebookAI/xlm-roberta-base ###### 



###### Loading and processing data for kinder and model FacebookAI/xlm-roberta-base ###### 



###### Loading and processing data for kinder and model google-bert/bert-base-multilingual-cased ###### 




###### Loading and processing data for energie and model FacebookAI/xlm-roberta-base ###### 



###### Loading and processing data for cannabis and model FacebookAI/xlm-roberta-large ###### 



###### Loading and processing data for kinder and model FacebookAI/xlm-roberta-large ###### 



###### Loading and processing data for energie and model FacebookAI/xlm-roberta-large ###### 




###### Loading and processing data for cannabis and model dbmdz/bert-base-german-uncased #

In [11]:
# Show the model names currently present as keys of data_dict.
data_dict.keys()

dict_keys(['FacebookAI/xlm-roberta-base', 'deepset/gelectra-large', 'dbmdz/bert-base-german-uncased', 'FacebookAI/xlm-roberta-large', 'deepset/gbert-large', 'deepset/gelectra-base', 'google-bert/bert-base-multilingual-cased', 'deepset/gbert-base'])

In [12]:
import numpy as np
from matplotlib import cm

In [13]:
from collections import defaultdict
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import pandas as pd

# Mapping of topics to legend texts
legend_texts = {
    'kinder': 'Children',
    'energie': 'Energy',
    'cannabis': 'Cannabis'
}

# Desired order of topics for calculation
ordered_topics = ['kinder', 'energie', 'cannabis']

# Dictionary to store evaluation results: evaluation_results[model][topic] -> metrics dict
evaluation_results = defaultdict(dict)

# Iterate over each model
for MODEL in MODELS:

    for topic in ordered_topics:
        # Pairs that failed to load were never stored in data_dict; skip them.
        if topic not in data_dict[MODEL]:
            continue

        labels, predictions = data_dict[MODEL][topic]

        try:
            # Threshold the page-level predictions at 0.5. The thresholded
            # values were previously computed but never used. For predictions
            # that are already 0/1 this is a no-op, so the metrics do not
            # change, but score-like predictions are now handled as well.
            predictions_binary = [1 if p >= 0.5 else 0 for p in predictions]

            # calc_metrics returns accuracy, precision, recall and f1 using the
            # same scikit-learn scorers this cell used to call directly.
            evaluation_results[MODEL][topic] = calc_metrics(labels, predictions_binary)

            print(f"Metrics for {MODEL} on {topic}: {evaluation_results[MODEL][topic]}")

        except Exception as e:
            print(f"An error occurred while processing {MODEL} on {topic}: {e}")

# Convert the evaluation results to a nested dictionary
evaluation_results = dict(evaluation_results)

# Display evaluation results
print(evaluation_results)


Metrics for google-bert/bert-base-multilingual-cased on kinder: {'accuracy': 0.9904928829403017, 'precision': 0.10552763819095477, 'recall': 0.9545454545454546, 'f1': 0.19004524886877827}
Metrics for google-bert/bert-base-multilingual-cased on energie: {'accuracy': 0.9117582527176623, 'precision': 0.011404293381037567, 'recall': 0.9444444444444444, 'f1': 0.022536456031816175}
Metrics for google-bert/bert-base-multilingual-cased on cannabis: {'accuracy': 0.9966494251670052, 'precision': 0.2318840579710145, 'recall': 0.9795918367346939, 'f1': 0.375}
Metrics for FacebookAI/xlm-roberta-base on kinder: {'accuracy': 0.9898909425678069, 'precision': 0.09984152139461172, 'recall': 0.9545454545454546, 'f1': 0.18077474892395984}
Metrics for FacebookAI/xlm-roberta-base on energie: {'accuracy': 0.9605465243841628, 'precision': 0.02564102564102564, 'recall': 0.9629629629629629, 'f1': 0.049951969260326606}
Metrics for FacebookAI/xlm-roberta-base on cannabis: {'accuracy': 0.9975708332460788, 'precisi

In [14]:
import json

# Output path (relative to the working directory) for the metrics JSON;
# the file name embeds the FEATURES setting.
file_path = f"eval_results_{FEATURES}_all_pages.json"

In [15]:
# Write evaluation_results to file_path in JSON format
with open(file_path, mode="w") as file:
    json.dump(evaluation_results, fp=file)