## Evaluate Multiple Classifiers

In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk, concatenate_datasets
from sklearn.metrics import accuracy_score
import random
from tqdm import tqdm
import numpy as np
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


## Set Random Seed for Reproducibility

In [2]:
# Single source of truth for every random number generator used in this notebook.
SEED = 42

# Python's built-in RNG (random.sample is used by sample_random_from_dataset)
random.seed(SEED)

# NumPy's global RNG
np.random.seed(SEED)

# PyTorch: the default generator plus *every* visible GPU, not only the current one.
# manual_seed_all is silently ignored when CUDA is unavailable.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and switch off its autotuner
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Define Parameters

In [3]:
# Sampling strategy; part of the dataset directory name that is loaded below.
# Options: "random", "stratified", "clustered", "shared_domain"
SAMPLING = "random"
# Suffix appended to the dataset directory name. Options: "", "_holdout", "_extended"
SUFFIX = "_extended"
# Dataset split to evaluate. Options: "train", "test", "holdout", "extended"
# With "holdout" the evaluation loop additionally appends the test split to the holdout split.
SPLIT = "test"
# Part of the chunkified dataset directory name (other values used: 496, 192).
# NOTE(review): presumably the chunk length in tokens - confirm against the preprocessing notebook.
MAX_CONTENT_LENGTH = 384
# NOTE(review): not referenced by any cell visible in this notebook; presumably the chunk overlap
# used during preprocessing - confirm or remove.
OVERLAP = 64
# Which inputs are fed to the tokenizer; also part of the model directory name.
# Options: "url", "content", "url_and_content"
FEATURES = "url_and_content"

In [4]:
# Topics to evaluate; the topic name is part of both the dataset directory and the model directory.
TOPICS = ["cannabis", "kinder", "energie"]
# Single-topic variant for quick runs:
#TOPICS = ["cannabis"]

In [5]:
# Hugging Face model ids to evaluate. Each id is used as-is to load the tokenizer and,
# with "/" replaced by "_", to locate the fine-tuned checkpoint under ../../models/.
# Commented-out entries are skipped.
MODELS = ["distilbert/distilbert-base-multilingual-cased",
          "google-bert/bert-base-multilingual-cased", 
          #"FacebookAI/xlm-roberta-base", 
          #"FacebookAI/xlm-roberta-large", 
          #"dbmdz/bert-base-german-uncased", 
          #"deepset/gelectra-large",
          #"deepset/gelectra-base",
          #"deepset/gbert-large",
          #"deepset/gbert-base",
          ]

In [6]:
# Index of the CUDA device the model is moved to when a GPU is available.
CUDA_ID = 0

**Extract URL-path:**

In [7]:
from urllib.parse import urlparse, urlunparse

def extract_url_path(example):
    """Add a 'url_path' field holding everything after the host of example['view_url'].

    The path, params, query and fragment are kept; scheme and host are dropped and
    leading slashes are stripped. URLs without a scheme are parsed as if they started
    with "http://" so that the host is not mistaken for part of the path.

    Args:
        example: Mapping with a 'view_url' entry (a string, or None/empty when the URL
            is unknown). It is modified in place.

    Returns:
        The same mapping with the additional 'url_path' string ('' when there is no URL).

    Raises:
        KeyError: If 'view_url' is not present at all.
    """
    # A missing URL (None) is handled like an empty one instead of raising a TypeError.
    view_url = example['view_url'] or ""

    # "://" only marks a scheme when nothing before it already belongs to the path, query or
    # fragment. Otherwise a scheme-less URL such as "example.com/r?u=http://x" would skip the
    # prefix below and its host would end up inside url_path.
    head, separator, _ = view_url.partition("://")
    has_scheme = bool(separator) and not any(ch in head for ch in "/?#")
    if not has_scheme:
        view_url = "http://" + view_url  # Assume http if no protocol specified

    parsed_url = urlparse(view_url)
    # Re-assemble the URL with scheme and host blanked out
    new_url = urlunparse(('', '', parsed_url.path, parsed_url.params, parsed_url.query, parsed_url.fragment))
    example['url_path'] = new_url.lstrip('/')  # Store the result in a new field
    return example


# Quick sanity check on a URL with a query string
extract_url_path({"view_url": "https://www.google.com/search?q=python+url+path"})

{'view_url': 'https://www.google.com/search?q=python+url+path',
 'url_path': 'search?q=python+url+path'}

## Evaluate Models

In [8]:
def get_predictions(tokenized_datasets, tokenizer, model, device, features, split="test"):
    """Run the classifier over one split, row by row, and collect its outputs.

    Args:
        tokenized_datasets: Mapping from split name to an iterable of rows. Every row must
            provide "label" plus "text" and/or "url_path", depending on `features`.
        tokenizer: Callable turning one string (or a pair of strings) into model inputs;
            it is called with padding="max_length", truncation=True, return_tensors="pt".
        model: Sequence-classification model whose output exposes `.logits`.
        device: Device the encoded inputs are moved to before the forward pass.
        features: "content" (text only), "url" (URL path only) or "url_and_content"
            (URL path and text encoded as a pair).
        split: Name of the split to predict on.

    Returns:
        Tuple `(preds, labels, probabilities)` of equally long lists: the predicted class
        index, the row's "label", and the softmax probability of class 1 for every row.

    Raises:
        ValueError: If `features` is not one of the supported values.
    """
    # Validate the feature mode once, up front, so a typo is reported even for an empty
    # split and the choice is not re-evaluated for every row.
    if features == "content":
        text_columns = ("text",)
    elif features == "url":
        text_columns = ("url_path",)
    elif features == "url_and_content":
        text_columns = ("url_path", "text")
    else:
        raise ValueError("Invalid value for FEATURES. Expected 'content', 'url', or 'url_and_content'.")

    # Make sure dropout and similar layers are in inference mode regardless of what the caller did.
    model.eval()

    preds = []
    labels = []
    probabilities = []

    for row in tqdm(tokenized_datasets[split]):
        # Encode the text inputs (a single sequence, or a sequence pair for "url_and_content")
        texts = [row[column] for column in text_columns]
        inputs = tokenizer(*texts, padding="max_length", truncation=True, return_tensors="pt")

        with torch.no_grad():
            # Forward pass
            outputs = model(**inputs.to(device))
            # Apply softmax to logits to get probabilities
            class_probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Predicted class = most probable class of the single row in this batch
        preds.append(torch.argmax(class_probs, dim=-1).item())
        labels.append(row["label"])
        # Probability of class 1, which the notebook treats as the positive class
        probabilities.append(class_probs[0, 1].item())

    return preds, labels, probabilities

In [9]:
def calc_metrics(labels, preds):
    """
    Score predictions against the true labels.

    Returns a dictionary with the keys 'accuracy', 'precision', 'recall' and 'f1';
    precision, recall and F1 are computed with average='binary'.
    """
    binary_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )

    # Accuracy takes no averaging mode, so it is computed separately
    metrics = {'accuracy': accuracy_score(labels, preds)}
    for metric_name, scorer in binary_scorers:
        metrics[metric_name] = scorer(labels, preds, average='binary')

    return metrics

In [10]:
def sample_random_from_dataset(dataset, n=5, subset='test'):
    """
    Draw up to n distinct random rows from dataset[subset].

    If the subset holds fewer than n rows, all of them are returned (in random order).
    The rows are obtained through the subset's own `select` method.
    """
    split = dataset[subset]
    # Never ask for more rows than the subset contains
    sample_size = min(n, len(split))
    chosen_indices = random.sample(range(len(split)), sample_size)
    return split.select(chosen_indices)

**Get chunk level predictions:**

In [11]:
from collections import defaultdict
eval_results = defaultdict(dict)  # model name -> topic -> metrics dict

# The target device is the same for every topic/model pair, so resolve it once.
DEVICE = torch.device(f"cuda:{CUDA_ID}" if torch.cuda.is_available() else "cpu")

for topic in TOPICS: # ----------------------------------------------------------------------

    for model_name in MODELS: # -------------------------------------------------------------

        print(f"\n\n###### Evaluating model {model_name} on {topic} ###### \n\n")

        # The URL-only feature set reads the un-chunked dataset (mapped with 8 worker
        # processes); every other feature set reads the chunkified dataset.
        if FEATURES == "url":
            dataset_path = f"../../data/tmp/processed_dataset_{topic}_buffed_{SAMPLING}{SUFFIX}"
            map_kwargs = {"num_proc": 8}
        else:
            dataset_path = f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}"
            map_kwargs = {}

        # The dataset is reloaded for every model because prediction columns are added to it below.
        dataset = load_from_disk(dataset_path)

        # When evaluating the holdout split, the test rows are appended to it
        if SPLIT == "holdout":
            dataset["holdout"] = concatenate_datasets(
                [dataset["holdout"], dataset["test"]])

        # Extract the path from the URL
        dataset = dataset.map(extract_url_path, **map_kwargs)
        # Debug aid: evaluate on a handful of rows only
        # dataset['test'] = sample_random_from_dataset(dataset, n=5, subset='test')

        # Load model and tokenizer: the tokenizer comes from the base model id, the
        # fine-tuned weights from the local model directory.
        model_name_local = f"../../models/{model_name.replace('/','_')}_{topic}_model_{FEATURES}/"
        print(f"Loading model from {model_name_local}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name_local, num_labels=2, local_files_only=True)

        # Use multiple GPUs if available
        gpu_count = torch.cuda.device_count()
        if gpu_count > 1:
            print(f"Using {gpu_count} GPUs!")
            # DataParallel expects the module on device_ids[0]. List CUDA_ID first so that
            # moving the model to DEVICE below stays valid for any CUDA_ID, not just 0.
            # NOTE(review): get_predictions feeds one row per forward pass, so the extra
            # replicas presumably stay idle - confirm whether DataParallel is needed here.
            device_ids = [CUDA_ID] + [gpu for gpu in range(gpu_count) if gpu != CUDA_ID]
            model = torch.nn.DataParallel(model, device_ids=device_ids)

        # Move model to GPU if available
        model.to(DEVICE)

        # Use the trained model to make predictions on the selected split
        preds, labels, probas = get_predictions(dataset, tokenizer, model, DEVICE, FEATURES, split=SPLIT)
        metrics = calc_metrics(labels, preds)
        print(f"Metrics for {model_name} on {topic}: {metrics}")

        # Add the predictions to the dataset and store it for the page-level evaluation
        dataset[SPLIT] = dataset[SPLIT].add_column("preds", preds)
        dataset[SPLIT] = dataset[SPLIT].add_column("probas", probas)
        dataset.save_to_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}_s_{model_name.split('/')[1]}_{FEATURES}_{SPLIT}")

        # Update the eval_results dictionary
        eval_results[model_name][topic] = metrics

        # Clear GPU memory to avoid memory errors
        del model, tokenizer
        torch.cuda.empty_cache()



###### Evaluating model distilbert/distilbert-base-multilingual-cased on cannabis ###### 


Loading model from ../../models/distilbert_distilbert-base-multilingual-cased_cannabis_model_url_and_content/
Using 2 GPUs!


100%|██████████| 507/507 [00:09<00:00, 54.40it/s]


Metrics for distilbert/distilbert-base-multilingual-cased on cannabis: {'accuracy': 0.9960552268244576, 'precision': 0.9939759036144579, 'recall': 1.0, 'f1': 0.9969788519637462}


Saving the dataset (1/1 shards): 100%|██████████| 3815/3815 [00:00<00:00, 231292.38 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 507/507 [00:00<00:00, 71800.39 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 33702/33702 [00:00<00:00, 529002.83 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 224737/224737 [00:00<00:00, 573958.03 examples/s]




###### Evaluating model google-bert/bert-base-multilingual-cased on cannabis ###### 


Loading model from ../../models/google-bert_bert-base-multilingual-cased_cannabis_model_url_and_content/
Using 2 GPUs!


100%|██████████| 507/507 [00:15<00:00, 32.63it/s]


Metrics for google-bert/bert-base-multilingual-cased on cannabis: {'accuracy': 0.9960552268244576, 'precision': 0.9939759036144579, 'recall': 1.0, 'f1': 0.9969788519637462}


Saving the dataset (1/1 shards): 100%|██████████| 3815/3815 [00:00<00:00, 288326.75 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 507/507 [00:00<00:00, 72572.25 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 33702/33702 [00:00<00:00, 530252.99 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 224737/224737 [00:00<00:00, 595474.50 examples/s]




###### Evaluating model distilbert/distilbert-base-multilingual-cased on kinder ###### 


Loading model from ../../models/distilbert_distilbert-base-multilingual-cased_kinder_model_url_and_content/
Using 2 GPUs!


100%|██████████| 316/316 [00:04<00:00, 63.78it/s]


Metrics for distilbert/distilbert-base-multilingual-cased on kinder: {'accuracy': 0.9968354430379747, 'precision': 1.0, 'recall': 0.9950980392156863, 'f1': 0.9975429975429976}


Saving the dataset (1/1 shards): 100%|██████████| 3628/3628 [00:00<00:00, 286754.89 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 316/316 [00:00<00:00, 40845.64 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 33730/33730 [00:00<00:00, 547713.60 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 266322/266322 [00:00<00:00, 584014.93 examples/s]




###### Evaluating model google-bert/bert-base-multilingual-cased on kinder ###### 


Loading model from ../../models/google-bert_bert-base-multilingual-cased_kinder_model_url_and_content/
Using 2 GPUs!


100%|██████████| 316/316 [00:09<00:00, 32.37it/s]


Metrics for google-bert/bert-base-multilingual-cased on kinder: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}


Saving the dataset (1/1 shards): 100%|██████████| 3628/3628 [00:00<00:00, 288708.04 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 316/316 [00:00<00:00, 39063.93 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 33730/33730 [00:00<00:00, 552218.16 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 266322/266322 [00:00<00:00, 600956.67 examples/s]




###### Evaluating model distilbert/distilbert-base-multilingual-cased on energie ###### 


Loading model from ../../models/distilbert_distilbert-base-multilingual-cased_energie_model_url_and_content/
Using 2 GPUs!


100%|██████████| 579/579 [00:09<00:00, 64.30it/s]


Metrics for distilbert/distilbert-base-multilingual-cased on energie: {'accuracy': 0.9965457685664939, 'precision': 0.9924812030075187, 'recall': 1.0, 'f1': 0.9962264150943396}


Saving the dataset (1/1 shards): 100%|██████████| 4227/4227 [00:00<00:00, 297596.69 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 579/579 [00:00<00:00, 73486.31 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 39782/39782 [00:00<00:00, 517229.75 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 229661/229661 [00:00<00:00, 579267.89 examples/s]




###### Evaluating model google-bert/bert-base-multilingual-cased on energie ###### 


Loading model from ../../models/google-bert_bert-base-multilingual-cased_energie_model_url_and_content/
Using 2 GPUs!


100%|██████████| 579/579 [00:17<00:00, 32.79it/s]


Metrics for google-bert/bert-base-multilingual-cased on energie: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}


Saving the dataset (1/1 shards): 100%|██████████| 4227/4227 [00:00<00:00, 303158.63 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 579/579 [00:00<00:00, 71677.40 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 39782/39782 [00:00<00:00, 546716.26 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 229661/229661 [00:00<00:00, 598711.08 examples/s]


In [12]:
# Raw chunk-level results: model name -> topic -> metrics dict
print(eval_results)

defaultdict(<class 'dict'>, {'distilbert/distilbert-base-multilingual-cased': {'cannabis': {'accuracy': 0.9960552268244576, 'precision': 0.9939759036144579, 'recall': 1.0, 'f1': 0.9969788519637462}, 'kinder': {'accuracy': 0.9968354430379747, 'precision': 1.0, 'recall': 0.9950980392156863, 'f1': 0.9975429975429976}, 'energie': {'accuracy': 0.9965457685664939, 'precision': 0.9924812030075187, 'recall': 1.0, 'f1': 0.9962264150943396}}, 'google-bert/bert-base-multilingual-cased': {'cannabis': {'accuracy': 0.9960552268244576, 'precision': 0.9939759036144579, 'recall': 1.0, 'f1': 0.9969788519637462}, 'kinder': {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}, 'energie': {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}}})


### Save Chunk Level Metrics and Output Results

In [13]:
from IPython.display import display, HTML
from tabulate import tabulate
import json

In [14]:
# File the chunk-level metrics are written to; the name records the feature set and split used
file_path =f"eval_results_{FEATURES}_{SPLIT}_chunks.json"

In [15]:
# Persist the chunk-level metrics (model name -> topic -> metrics) as JSON
with open(file_path, "w") as file:
    json.dump(eval_results, file)

In [16]:
# Read the metrics back from disk; from here on eval_results is the plain dict returned by json.load
with open(file_path, "r") as file:
    eval_results = json.load(file)

In [17]:
# The topic columns are taken from the first model's results
# (assuming all models are evaluated on the same topics)
first_model_results = next(iter(eval_results.values()))
topics = list(first_model_results)

# Column label -> key in the metrics dict, in display order
metric_columns = [("Acc.", "accuracy"), ("Prec.", "precision"), ("Rec.", "recall"), ("F1", "f1")]

# Header row: the model name followed by four metric columns per topic
headers = ["Model"]
for topic in topics:
    headers += [f"{topic} {label}" for label, metric_key in metric_columns]

# One row per model; a topic or metric that is missing for a model is shown as 0.0
rows = [
    [model_id] + [
        per_topic.get(topic, {}).get(metric_key, 0.0)
        for topic in topics
        for label, metric_key in metric_columns
    ]
    for model_id, per_topic in eval_results.items()
]

# Generate the HTML table
table_html = tabulate(rows, headers=headers, tablefmt="html", showindex="never", floatfmt=".3f")

In [18]:
# display and HTML are already imported in the cell that imports tabulate and json; this repeat is harmless
from IPython.display import display, HTML
# Render the chunk-level metrics table built in the previous cell
display(HTML(table_html))


Model,cannabis Acc.,cannabis Prec.,cannabis Rec.,cannabis F1,kinder Acc.,kinder Prec.,kinder Rec.,kinder F1,energie Acc.,energie Prec.,energie Rec.,energie F1
distilbert/distilbert-base-multilingual-cased,0.996,0.994,1.0,0.997,0.997,1.0,0.995,0.998,0.997,0.992,1.0,0.996
google-bert/bert-base-multilingual-cased,0.996,0.994,1.0,0.997,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# Mean chunk-level F1 per model, averaged over all topics the model was evaluated on
average_f1_scores = {
    model_id: sum(topic_metrics["f1"] for topic_metrics in per_topic.values()) / len(per_topic)
    for model_id, per_topic in eval_results.items()
}

# Pick the model with the highest mean F1 (the first one wins on a tie)
best_model, best_model_average_f1 = max(average_f1_scores.items(), key=lambda entry: entry[1])

print(f"The best model is: {best_model}")
print(f"Average F1 score of the best model: {best_model_average_f1:.4f}")

The best model is: google-bert/bert-base-multilingual-cased
Average F1 score of the best model: 0.9990


## Page Level Predictions

In [20]:
from collections import Counter

In [21]:
def majority_voting(answers):
    """Return the most frequent answer; ties go to the largest label, empty input gives 0."""
    tally = Counter(answers)

    # Nothing to vote on
    if not tally:
        return 0

    # Collect every label that reached the top vote count and take the largest of them;
    # with a single winner this is simply that winner.
    top_votes = max(tally.values())
    return max(label for label, votes in tally.items() if votes == top_votes)

majority_voting([1, 1, 2, 2, 2, 3])

2

In [23]:
from collections import defaultdict
eval_results_pages = defaultdict(dict)

for topic in TOPICS: # ----------------------------------------------------------------------
    for model_name in MODELS: # -------------------------------------------------------------

        print(f"\n\n###### Evaluating model {model_name} on {topic} ###### \n\n")
        # Load the dataset that the chunk-level loop stored together with its predictions
        dataset = load_from_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}_s_{model_name.split('/')[1]}_{FEATURES}_{SPLIT}")

        print(dataset)

        # Collect the chunks of every page, keyed by URL (falling back to the domain)
        grouped_dataset = {}
        for example in tqdm(dataset[SPLIT]):
            page_key = example.get("view_url") or example.get("domain")
            if page_key not in grouped_dataset:
                grouped_dataset[page_key] = []
            grouped_dataset[page_key].append(
                {field: example[field] for field in ["text", "domain", "preds", "label", "category", "annotation_type", "lang"]}
            )

        # Page label = highest label among the page's chunks
        labels = [
            max(chunk["label"] for chunk in page_chunks)
            for page_chunks in grouped_dataset.values()
        ]

        # Page prediction = majority vote among the non-zero chunk predictions,
        # or 0 when no chunk of the page was predicted as non-zero
        predictions = []
        for page_chunks in grouped_dataset.values():
            positive_votes = [chunk["preds"] for chunk in page_chunks if chunk["preds"] > 0]
            if positive_votes:
                predictions.append(majority_voting(positive_votes))
            else:
                predictions.append(0)

        # Score the page-level predictions against the page-level labels
        metrics = calc_metrics(labels, predictions)
        print(f"Metrics for {model_name} on {topic}: {metrics}")

        # Update the eval_results_pages dictionary
        eval_results_pages[model_name][topic] = metrics
    



###### Evaluating model distilbert/distilbert-base-multilingual-cased on cannabis ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 3815
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 507
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_pa

100%|██████████| 507/507 [00:00<00:00, 4613.19it/s]

Metrics for distilbert/distilbert-base-multilingual-cased on cannabis: {'accuracy': 0.9767441860465116, 'precision': 0.9523809523809523, 'recall': 1.0, 'f1': 0.975609756097561}


###### Evaluating model google-bert/bert-base-multilingual-cased on cannabis ###### 







DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 3815
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 507
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 33702
    })
    extended: Dataset({
        features: ['_id', 'batch_

100%|██████████| 507/507 [00:00<00:00, 4553.02it/s]


Metrics for google-bert/bert-base-multilingual-cased on cannabis: {'accuracy': 0.9767441860465116, 'precision': 0.9523809523809523, 'recall': 1.0, 'f1': 0.975609756097561}


###### Evaluating model distilbert/distilbert-base-multilingual-cased on kinder ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 3628
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 316
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 

100%|██████████| 316/316 [00:00<00:00, 4643.20it/s]


Metrics for distilbert/distilbert-base-multilingual-cased on kinder: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}


###### Evaluating model google-bert/bert-base-multilingual-cased on kinder ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 3628
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 316
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'categor

100%|██████████| 316/316 [00:00<00:00, 5264.50it/s]


Metrics for google-bert/bert-base-multilingual-cased on kinder: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}


###### Evaluating model distilbert/distilbert-base-multilingual-cased on energie ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 4227
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 579
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'catego

100%|██████████| 579/579 [00:00<00:00, 6195.32it/s]


Metrics for distilbert/distilbert-base-multilingual-cased on energie: {'accuracy': 0.9782608695652174, 'precision': 0.9583333333333334, 'recall': 1.0, 'f1': 0.9787234042553191}


###### Evaluating model google-bert/bert-base-multilingual-cased on energie ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 4227
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 579
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text',

100%|██████████| 579/579 [00:00<00:00, 6128.77it/s]

Metrics for google-bert/bert-base-multilingual-cased on energie: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}





### Save Page Level Metrics and Output Results

In [24]:
# File the page-level metrics are written to; the name records the feature set and split used
file_path = f"eval_results_{FEATURES}_{SPLIT}_pages.json"

In [25]:
# Persist the page-level metrics (model name -> topic -> metrics) as JSON
with open(file_path, "w") as file:
    json.dump(eval_results_pages, file)

In [26]:
# Read the metrics back from disk; from here on eval_results_pages is the plain dict returned by json.load
with open(file_path, "r") as file:
    eval_results_pages = json.load(file)

In [27]:
# The topic columns are taken from the first model's results
# (assuming all models are evaluated on the same topics)
first_model_results = next(iter(eval_results_pages.values()))
topics = list(first_model_results)

# Column label -> key in the metrics dict, in display order
metric_columns = [("Acc.", "accuracy"), ("Prec.", "precision"), ("Rec.", "recall"), ("F1", "f1")]

# Header row: the model name followed by four metric columns per topic
headers = ["Model"]
for topic in topics:
    headers += [f"{topic} {label}" for label, metric_key in metric_columns]

# One row per model; a topic or metric that is missing for a model is shown as 0.0
rows = [
    [model_id] + [
        per_topic.get(topic, {}).get(metric_key, 0.0)
        for topic in topics
        for label, metric_key in metric_columns
    ]
    for model_id, per_topic in eval_results_pages.items()
]

# Generate the HTML table
table_html = tabulate(rows, headers=headers, tablefmt="html", showindex="never", floatfmt=".3f")

In [28]:
# Render the page-level metrics table built in the previous cell
display(HTML(table_html))

Model,cannabis Acc.,cannabis Prec.,cannabis Rec.,cannabis F1,kinder Acc.,kinder Prec.,kinder Rec.,kinder F1,energie Acc.,energie Prec.,energie Rec.,energie F1
distilbert/distilbert-base-multilingual-cased,0.977,0.952,1.0,0.976,1.0,1.0,1.0,1.0,0.978,0.958,1.0,0.979
google-bert/bert-base-multilingual-cased,0.977,0.952,1.0,0.976,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
# Mean page-level F1 per model, averaged over all topics the model was evaluated on
average_f1_scores = {
    model_id: sum(topic_metrics["f1"] for topic_metrics in per_topic.values()) / len(per_topic)
    for model_id, per_topic in eval_results_pages.items()
}

# Pick the model with the highest mean F1 (the first one wins on a tie)
best_model, best_model_average_f1 = max(average_f1_scores.items(), key=lambda entry: entry[1])

print(f"The best model is: {best_model}")
print(f"Average F1 score of the best model: {best_model_average_f1:.4f}")

The best model is: google-bert/bert-base-multilingual-cased
Average F1 score of the best model: 0.9919
