## Evaluate Multiple Sampling Strategies

In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk, concatenate_datasets
from sklearn.metrics import accuracy_score
import random
from tqdm import tqdm
import numpy as np
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


## Set Random Seed for Reproducibility

In [2]:
# Single source of truth for the seed so that every RNG below stays in sync.
SEED = 42

# Seed Python's built-in RNG (random.sample is used when sub-sampling datasets).
random.seed(SEED)

# Seed NumPy's global RNG.
np.random.seed(SEED)

# Seed PyTorch's default generator and, explicitly, every CUDA device: the
# evaluation below may spread over several GPUs, so seeding only the current
# device is not enough. The CUDA call is a no-op on machines without CUDA.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and switch off its benchmark auto-tuner,
# which may otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Define Parameters

In [3]:
TOPICS = ["cannabis", "kinder", "energie"]  # topics to evaluate; one model is loaded per (topic, strategy) pair
MODEL = "deepset/gelectra-large"  # Hugging Face model id: source of the tokenizer and part of the model/output path names
STRATEGIES = ["random", "stratified" , "clustered"]  # sampling strategies to compare ("shared_domain" is currently disabled)
SUFFIX = "_extended"  # suffix in the dataset directory name; options: "", "_holdout", "_extended"
SPLIT = "holdout"  # dataset split to evaluate; options: "train", "test", "holdout", "extended"
MAX_CONTENT_LENGTH = 384  # appears in the chunkified dataset directory name; values used so far: 496, 192, 384
OVERLAP = 64  # NOTE(review): not referenced in the visible cells; presumably the chunk overlap used during preprocessing - confirm
FEATURES = "url_and_content"  # model inputs; options: "url", "content", "url_and_content"


In [4]:
from torch import nn

**Extract URL-path:**

In [5]:
from urllib.parse import urlparse, urlunparse

def extract_url_path(example):
    """Add a ``url_path`` field: the example's URL without scheme and host.

    Reads ``example['view_url']``, keeps path, params, query string and
    fragment, strips leading slashes and stores the result under
    ``example['url_path']``. The example is modified in place and returned,
    which makes the function usable with ``Dataset.map``.
    """
    raw_url = example['view_url']

    # Without a scheme, urlparse would treat the host as part of the path,
    # so fall back to http when no protocol is given.
    full_url = raw_url if "://" in raw_url else "http://" + raw_url

    # Blank out scheme and netloc, keep the remaining four components as parsed.
    components = urlparse(full_url)
    relative_url = urlunparse(('', '') + components[2:])

    example['url_path'] = relative_url.lstrip('/')
    return example


# Quick sanity check: scheme and host are dropped, path and query string are kept.
extract_url_path({"view_url": "https://www.google.com/search?q=python+url+path"})

{'view_url': 'https://www.google.com/search?q=python+url+path',
 'url_path': 'search?q=python+url+path'}

## Evaluate Models

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [7]:
def get_predictions(tokenized_datasets, tokenizer, model, device, features, split="test"):
    """Run the model over one dataset split, one example at a time.

    Args:
        tokenized_datasets: Mapping from split name to an iterable of rows. Every
            row must provide "label" plus "text" and/or "url_path", depending
            on `features`.
        tokenizer: Callable that encodes one text (or a text pair) and returns an
            object supporting `.to(device)` and `**` unpacking.
        model: Callable whose output exposes `.logits` with one row per example.
            Its train/eval mode is not changed here, so pass it in the mode you
            want to evaluate in.
        device: Device the encoded inputs are moved to before the forward pass.
        features: Which columns are fed to the model: "content" (text only),
            "url" (URL path only) or "url_and_content" (URL path and text as a
            sequence pair).
        split: Name of the split to predict on.

    Returns:
        Tuple `(preds, labels, probabilities)` of equally long lists: the
        predicted class index, the row's "label" value and the softmax
        probability of class index 1 for every row.

    Raises:
        ValueError: If `features` is not one of the supported values. The check
            runs before any row is processed, so it also fires for an empty split.
    """
    # Resolve the input columns once, up front, instead of re-checking per row.
    if features == "content":
        input_columns = ("text",)
    elif features == "url":
        input_columns = ("url_path",)
    elif features == "url_and_content":
        input_columns = ("url_path", "text")
    else:
        raise ValueError("Invalid value for FEATURES. Expected 'content', 'url', or 'url_and_content'.")

    preds = []
    labels = []
    probabilities = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for row in tqdm(tokenized_datasets[split]):
            # Encode the inputs; two columns are passed as (text, text_pair).
            inputs = tokenizer(
                *(row[column] for column in input_columns),
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )

            # Forward pass, then softmax over the class dimension.
            outputs = model(**inputs.to(device))
            class_probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

            # Each batch holds exactly one example, hence row index 0.
            preds.append(class_probs.argmax(dim=-1)[0].item())
            labels.append(row["label"])
            probabilities.append(class_probs[0, 1].item())  # probability of the positive class

    return preds, labels, probabilities

In [8]:
def calc_metrics(labels, preds):
    """
    Score predictions against gold labels.

    Returns a dictionary with the keys 'accuracy', 'precision', 'recall' and
    'f1'. Precision, recall and F1 are computed with binary averaging.
    """
    # Scorers that share the same binary-averaging call signature.
    binary_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )

    metrics = {'accuracy': accuracy_score(labels, preds)}
    for metric_name, scorer in binary_scorers:
        metrics[metric_name] = scorer(labels, preds, average='binary')

    return metrics


In [9]:
def sample_random_from_dataset(dataset, n=5, subset='test'):
    """
    Draw up to n distinct random rows from dataset[subset].

    The sample size is capped at the size of the subset. Row indices are drawn
    with random.sample and the rows are returned via the subset's select().
    """
    pool = dataset[subset]
    pool_size = len(pool)
    sample_size = min(n, pool_size)
    chosen_indices = random.sample(range(pool_size), sample_size)
    return pool.select(chosen_indices)

**Get chunk level predictions:**

In [10]:
from collections import defaultdict

# Chunk-level metrics, indexed as eval_results[strategy][topic].
eval_results = defaultdict(dict)

# The device does not depend on the model, so pick it once.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for topic in TOPICS:
    for strategie in STRATEGIES:

        # This cell only evaluates already fine-tuned models; nothing is trained here.
        print(f"Evaluating on {topic} dataset using {strategie} strategy")

        if FEATURES == "url":
            # URL-only runs load the dataset directory without the "chunkified" part.
            dataset = load_from_disk(
                f"../../data/tmp/processed_dataset_{topic}_buffed_{strategie}")
        else:
            dataset = load_from_disk(
                f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_{strategie}{SUFFIX}_{MAX_CONTENT_LENGTH}")

            if SPLIT == "holdout":
                # Evaluate on the holdout rows and the test rows together.
                dataset["holdout"] = concatenate_datasets([dataset["holdout"], dataset["test"]])

        # Add the url_path column (URL without scheme and host) to every split.
        dataset = dataset.map(extract_url_path)
        # For a quick smoke test, sub-sample the evaluated split first, e.g.:
        # dataset['test'] = sample_random_from_dataset(dataset, n=5, subset='test')

        # The tokenizer comes from the base model id, the weights from the local
        # fine-tuned checkpoint of this (strategy, topic, features) combination.
        model_name_local = f"../models_ccu/{MODEL.split('/')[0]}_sampling_{strategie.replace('/','_')}_{topic}_model_{FEATURES}"
        print(f"Loading model from {model_name_local}")
        tokenizer = AutoTokenizer.from_pretrained(MODEL)
        model = AutoModelForSequenceClassification.from_pretrained(model_name_local, num_labels=2, local_files_only=True)

        # Use multiple GPUs if available
        if torch.cuda.device_count() > 1:
            print(f"Using {torch.cuda.device_count()} GPUs!")
            model = torch.nn.DataParallel(model)

        # Move model to GPU if available
        model.to(DEVICE)

        # Predict on the selected split and score the chunk-level predictions.
        preds, labels, probas = get_predictions(dataset, tokenizer, model, DEVICE, FEATURES, split=SPLIT)
        metrics = calc_metrics(labels, preds)
        # Name the strategy as well, otherwise the runs of one topic look identical.
        print(f"Metrics for {MODEL} ({strategie}) on {topic}: {metrics}")

        # Store predictions next to the data; the page-level evaluation reads them back.
        dataset[SPLIT] = dataset[SPLIT].add_column("preds", preds)
        dataset[SPLIT] = dataset[SPLIT].add_column("probas", probas)
        dataset.save_to_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_{strategie}{SUFFIX}_{MAX_CONTENT_LENGTH}_sampling_{MODEL.split('/')[1]}_{FEATURES}_{SPLIT}")

        # Update the eval_results dictionary
        eval_results[strategie][topic] = metrics

        # Clear GPU memory to avoid memory errors when the next model is loaded
        del model, tokenizer
        torch.cuda.empty_cache()


Training on cannabis dataset using random strategy
Loading model from ../models_ccu/deepset_sampling_random_cannabis_model_url_and_content
Using 2 GPUs!


  1%|          | 265/34209 [00:23<49:40, 11.39it/s] 


KeyboardInterrupt: 

In [None]:
# Raw chunk-level metrics, nested as {strategy: {topic: {metric: value}}}.
print(eval_results)

defaultdict(<class 'dict'>, {'random': {'cannabis': {'accuracy': 0.9960552268244576, 'precision': 0.9939759036144579, 'recall': 1.0, 'f1': 0.9969788519637462}, 'kinder': {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}, 'energie': {'accuracy': 0.998272884283247, 'precision': 1.0, 'recall': 0.9962121212121212, 'f1': 0.9981024667931688}}, 'stratified': {'cannabis': {'accuracy': 0.9960552268244576, 'precision': 0.9939759036144579, 'recall': 1.0, 'f1': 0.9969788519637462}, 'kinder': {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}, 'energie': {'accuracy': 0.998272884283247, 'precision': 1.0, 'recall': 0.9962121212121212, 'f1': 0.9981024667931688}}, 'clustered': {'cannabis': {'accuracy': 0.9960552268244576, 'precision': 0.9939759036144579, 'recall': 1.0, 'f1': 0.9969788519637462}, 'kinder': {'accuracy': 0.990506329113924, 'precision': 0.9855072463768116, 'recall': 1.0, 'f1': 0.9927007299270073}, 'energie': {'accuracy': 0.998272884283247, 'precision': 1.0, 'recall':

### Save Chunk Level Predictions and Output Results

In [None]:
from IPython.display import display, HTML
from tabulate import tabulate
import json

In [None]:
# Output file for the chunk-level metrics; the name encodes the feature set and the evaluated split.
file_path =f"eval_results_sampling_{FEATURES}_{SPLIT}_chunks.json"

In [None]:
# Save the chunk-level metrics dictionary to disk as JSON
with open(file_path, "w") as file:
    json.dump(eval_results, file)

# To reuse previously saved metrics instead of re-running the evaluation, load them back:
# with open(file_path, "r") as file:
#     eval_results = json.load(file)

In [None]:
# Topics become column groups. They are taken from the first strategy's results
# (all strategies are assumed to be evaluated on the same topics). An empty
# result dictionary yields an empty topic list instead of raising StopIteration.
topics = list(next(iter(eval_results.values()), {}))

# Metric keys as stored by calc_metrics, and their column labels, in matching order.
metric_keys = ["accuracy", "precision", "recall", "f1"]
metric_labels = ["Acc.", "Prec.", "Rec.", "F1"]

# Headers: the outer keys of eval_results are sampling strategies, so label the
# first column accordingly; each topic contributes four metric columns.
headers = ["Strategy"] + [f"{topic} {label}" for topic in topics for label in metric_labels]

# Rows: one per strategy. Metrics missing for a topic are shown as 0.0.
rows = []
for strategy_name, topics_metrics in eval_results.items():
    row = [strategy_name]
    for topic in topics:
        topic_metrics = topics_metrics.get(topic, {})
        row.extend(topic_metrics.get(key, 0.0) for key in metric_keys)
    rows.append(row)

# Generate the HTML table
table_html = tabulate(rows, headers=headers, tablefmt="html", showindex="never", floatfmt=".3f")

In [None]:
from IPython.display import display, HTML
# Render the chunk-level metrics table as HTML in the notebook.
display(HTML(table_html))


Model,cannabis Acc.,cannabis Prec.,cannabis Rec.,cannabis F1,kinder Acc.,kinder Prec.,kinder Rec.,kinder F1,energie Acc.,energie Prec.,energie Rec.,energie F1
random,0.996,0.994,1.0,0.997,1.0,1.0,1.0,1.0,0.998,1.0,0.996,0.998
stratified,0.996,0.994,1.0,0.997,1.0,1.0,1.0,1.0,0.998,1.0,0.996,0.998
clustered,0.996,0.994,1.0,0.997,0.991,0.986,1.0,0.993,0.998,1.0,0.996,0.998


## Page Level Predictions

In [None]:
from collections import Counter

In [None]:
def majority_voting(answers):
    """Return the most frequent answer in `answers`.

    Ties between the most frequent answers are resolved in favour of the
    largest label. Empty input yields 0.
    """
    tally = Counter(answers)

    # Nothing to vote on.
    if not tally:
        return 0

    # Collect every label that reaches the highest vote count.
    top_votes = max(tally.values())
    winners = [label for label, votes in tally.items() if votes == top_votes]

    # A single winner is returned as is; a tie goes to the maximum label.
    return winners[0] if len(winners) == 1 else max(winners)

# Example: 2 occurs three times, more often than any other label, so it wins.
majority_voting([1, 1, 2, 2, 2, 3])

2

In [None]:
from collections import defaultdict

# Page-level metrics, indexed as eval_results_pages[strategy][topic].
eval_results_pages = defaultdict(dict)

for topic in TOPICS:
    for strategie in STRATEGIES:

        # Name the strategy too, otherwise the banners of one topic are identical.
        print(f"\n\n###### Evaluating model {MODEL} ({strategie} sampling) on {topic} ###### \n\n")

        # Load the dataset that the chunk-level evaluation saved together with its predictions.
        dataset = load_from_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_{strategie}{SUFFIX}_{MAX_CONTENT_LENGTH}_sampling_{MODEL.split('/')[1]}_{FEATURES}_{SPLIT}")

        print(dataset)

        # Group chunks by page URL; fall back to the domain when the URL is missing or empty.
        grouped_dataset = {}
        for example in tqdm(dataset[SPLIT]):
            url = example.get("view_url") or example.get("domain")
            example_filtered = {k: example[k] for k in ["text", "domain", "preds", "label", "category", "annotation_type", "lang"]}
            grouped_dataset.setdefault(url, []).append(example_filtered)

        # Collapse the chunks of every page into one gold label and one prediction.
        labels = []
        predictions = []
        for chunks in grouped_dataset.values():
            # Gold label of a page: the largest label among its chunks.
            labels.append(max(chunk["label"] for chunk in chunks))

            # Prediction of a page: majority vote among the chunks predicted as
            # non-zero; a page without any such chunk is predicted as 0.
            positive_preds = [chunk["preds"] for chunk in chunks if chunk["preds"] > 0]
            predictions.append(majority_voting(positive_preds) if positive_preds else 0)

        # Score the page-level predictions against the page-level labels.
        metrics = calc_metrics(labels, predictions)
        print(f"Metrics for {strategie} on {topic}: {metrics}")

        # Update the eval_results_pages dictionary
        eval_results_pages[strategie][topic] = metrics
    



###### Evaluating model deepset/gelectra-large on cannabis ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 3815
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 507
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows:

100%|██████████| 507/507 [00:00<00:00, 4137.54it/s]

Metrics for random on cannabis: {'accuracy': 0.9767441860465116, 'precision': 0.9523809523809523, 'recall': 1.0, 'f1': 0.975609756097561}


###### Evaluating model deepset/gelectra-large on cannabis ###### 







DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 4077
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 507
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 33440
    })
    extended: Dataset({
        features: ['_id', 'batch_

100%|██████████| 507/507 [00:00<00:00, 4622.97it/s]


Metrics for stratified on cannabis: {'accuracy': 0.9767441860465116, 'precision': 0.9523809523809523, 'recall': 1.0, 'f1': 0.975609756097561}


###### Evaluating model deepset/gelectra-large on cannabis ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 3832
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 507
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', '

100%|██████████| 507/507 [00:00<00:00, 4641.14it/s]


Metrics for clustered on cannabis: {'accuracy': 0.9767441860465116, 'precision': 0.9523809523809523, 'recall': 1.0, 'f1': 0.975609756097561}


###### Evaluating model deepset/gelectra-large on kinder ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 3628
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 316
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'goo

100%|██████████| 316/316 [00:00<00:00, 5460.68it/s]


Metrics for random on kinder: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}


###### Evaluating model deepset/gelectra-large on kinder ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 3393
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 316
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annota

100%|██████████| 316/316 [00:00<00:00, 4639.62it/s]


Metrics for stratified on kinder: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}


###### Evaluating model deepset/gelectra-large on kinder ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 3421
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 316
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'an

100%|██████████| 316/316 [00:00<00:00, 6000.73it/s]


Metrics for clustered on kinder: {'accuracy': 0.9767441860465116, 'precision': 0.9545454545454546, 'recall': 1.0, 'f1': 0.9767441860465116}


###### Evaluating model deepset/gelectra-large on energie ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 4227
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 579
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'goo

100%|██████████| 579/579 [00:00<00:00, 6225.01it/s]

Metrics for random on energie: {'accuracy': 0.9782608695652174, 'precision': 1.0, 'recall': 0.9565217391304348, 'f1': 0.9777777777777777}


###### Evaluating model deepset/gelectra-large on energie ###### 







DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 4169
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 579
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 39840
    })
    extended: Dataset({
        features: ['_id', 'batch_

100%|██████████| 579/579 [00:00<00:00, 6225.16it/s]


Metrics for stratified on energie: {'accuracy': 0.9782608695652174, 'precision': 1.0, 'recall': 0.9565217391304348, 'f1': 0.9777777777777777}


###### Evaluating model deepset/gelectra-large on energie ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path'],
        num_rows: 4231
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id', 'url_path', 'preds', 'probas'],
        num_rows: 579
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'g

100%|██████████| 579/579 [00:00<00:00, 4864.83it/s]

Metrics for clustered on energie: {'accuracy': 0.9782608695652174, 'precision': 1.0, 'recall': 0.9565217391304348, 'f1': 0.9777777777777777}





### Save Page Level Predictions and Output Results

In [None]:
# Output file for the page-level metrics; the name encodes the feature set and the evaluated split.
file_path =f"eval_results_sampling_{FEATURES}_{SPLIT}_pages.json"

In [None]:
# Save the page-level metrics dictionary to disk as JSON
with open(file_path, "w") as file:
    json.dump(eval_results_pages, file)

# To reuse previously saved metrics instead of re-running the evaluation, load them back:
# with open(file_path, "r") as file:
#     eval_results_pages = json.load(file)

In [None]:
# Topics become column groups. They are taken from the first strategy's results
# (all strategies are assumed to be evaluated on the same topics). An empty
# result dictionary yields an empty topic list instead of raising StopIteration.
topics = list(next(iter(eval_results_pages.values()), {}))

# Metric keys as stored by calc_metrics, and their column labels, in matching order.
metric_keys = ["accuracy", "precision", "recall", "f1"]
metric_labels = ["Acc.", "Prec.", "Rec.", "F1"]

# Headers: the outer keys of eval_results_pages are sampling strategies, so label
# the first column accordingly; each topic contributes four metric columns.
headers = ["Strategy"] + [f"{topic} {label}" for topic in topics for label in metric_labels]

# Rows: one per strategy. Metrics missing for a topic are shown as 0.0.
rows = []
for strategy_name, topics_metrics in eval_results_pages.items():
    row = [strategy_name]
    for topic in topics:
        topic_metrics = topics_metrics.get(topic, {})
        row.extend(topic_metrics.get(key, 0.0) for key in metric_keys)
    rows.append(row)

# Generate the HTML table
table_html = tabulate(rows, headers=headers, tablefmt="html", showindex="never", floatfmt=".3f")

In [None]:
# Render the page-level metrics table as HTML in the notebook.
display(HTML(table_html))

Model,cannabis Acc.,cannabis Prec.,cannabis Rec.,cannabis F1,kinder Acc.,kinder Prec.,kinder Rec.,kinder F1,energie Acc.,energie Prec.,energie Rec.,energie F1
random,0.977,0.952,1.0,0.976,1.0,1.0,1.0,1.0,0.978,1.0,0.957,0.978
stratified,0.977,0.952,1.0,0.976,1.0,1.0,1.0,1.0,0.978,1.0,0.957,0.978
clustered,0.977,0.952,1.0,0.976,0.977,0.955,1.0,0.977,0.978,1.0,0.957,0.978
