## Evaluate the Statistical Significance of Multiple Classifiers

In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk, concatenate_datasets
from sklearn.metrics import accuracy_score
import random
from tqdm import tqdm
import numpy as np
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


## Set Random Seed for Reproducibility

In [2]:
# One shared seed for the Python, NumPy and PyTorch random number generators
SEED = 42

random.seed(SEED)             # Python's built-in random module
np.random.seed(SEED)          # NumPy's global random state
torch.manual_seed(SEED)       # PyTorch
torch.cuda.manual_seed(SEED)  # PyTorch CUDA

# Ask cuDNN for deterministic behaviour and switch its benchmark mode off
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## Define Parameters

In [3]:
# The values below (except OVERLAP and the FOLDER_* entries) are interpolated
# into the dataset paths loaded further down in this notebook.

# Options: "random", "stratified", "clustered", "shared_domain"
SAMPLING = "random"
# Options: "", "_holdout", "_extended"
SUFFIX = "_extended"
# Options: "train", "test", "holdout", "extended"; also selects the dataset split that is evaluated
SPLIT = "extended"
# Other values noted by the author: 496, 192
MAX_CONTENT_LENGTH = 384
OVERLAP = 64
# Options: "url", "content", "url_and_content"
FEATURES = "url"
FOLDER_DATA = "data"
FOLDER_MODELS = "models_ccu"

In [4]:
# Topics to evaluate; shrink the list to ["cannabis"] for a quick single-topic run
TOPICS = [
    "cannabis",
    "kinder",
    "energie",
]

In [5]:
# Model identifiers whose predictions are evaluated.
# Currently left out: "distilbert/distilbert-base-multilingual-cased"
MODELS = [
    "google-bert/bert-base-multilingual-cased",
    "FacebookAI/xlm-roberta-base",
    "FacebookAI/xlm-roberta-large",
    "dbmdz/bert-base-german-uncased",
    "deepset/gelectra-large",
    "deepset/gelectra-base",
    "deepset/gbert-large",
    "deepset/gbert-base",
]

## Helpers

In [6]:
def calc_metrics(labels, preds):
    """
    Scores predictions against the reference labels.

    Returns a dictionary with the keys 'accuracy', 'precision', 'recall' and 'f1';
    precision, recall and F1 are computed with average='binary'.
    """
    accuracy = accuracy_score(labels, preds)

    # The three remaining scorers share the same call signature
    precision, recall, f1 = (
        scorer(labels, preds, average='binary')
        for scorer in (precision_score, recall_score, f1_score)
    )

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

In [7]:
from collections import Counter

def majority_voting(answers):
    """Return the most frequent answer; ties go to the largest label, empty input gives 0."""
    tally = Counter(answers)

    # Nothing to vote on
    if not tally:
        return 0

    # Every label that reached the highest vote count
    top_votes = max(tally.values())
    winners = [label for label, votes in tally.items() if votes == top_votes]

    # A single winner is returned as is; several winners are resolved by the largest label
    return winners[0] if len(winners) == 1 else max(winners)

majority_voting([1, 1, 2, 2, 2, 3])

2

## Get Baseline Predictions

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk, concatenate_datasets
from sklearn.metrics import accuracy_score
import random
from tqdm import tqdm
import numpy as np
import torch
import os

In [9]:
from collections import defaultdict
# Page-level metrics and raw page-level labels/predictions of the baseline, keyed by topic
eval_results_pages = defaultdict(dict)
predictions_per_topic_baseline = defaultdict(dict)

for topic in TOPICS:

    print(f"\n\n###### Evaluating on {topic} ###### \n\n")
    dataset = load_from_disk(f"../../data_ccu/tmp/processed_dataset_{topic}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}_s_SVM_{FEATURES}_{SPLIT}")

    print(dataset)

    # Collect the chunks of every page under its URL; the domain is the key when no URL is set
    grouped_dataset = {}
    for example in tqdm(dataset[SPLIT]):
        page_key = example.get("view_url") or example.get("domain")
        kept_fields = {field: example[field] for field in ["text", "domain", "preds", "label", "category", "annotation_type", "lang"]}
        if page_key in grouped_dataset:
            grouped_dataset[page_key].append(kept_fields)
        else:
            grouped_dataset[page_key] = [kept_fields]

    # Page-level label and prediction are the maximum over the page's chunks
    # (majority voting over the chunk predictions was tried as an alternative and is not used)
    labels = [max(chunk["label"] for chunk in page_chunks) for page_chunks in grouped_dataset.values()]
    predictions = [max(chunk["preds"] for chunk in page_chunks) for page_chunks in grouped_dataset.values()]

    # Score the stored predictions at page level
    metrics = calc_metrics(labels, predictions)
    print(f"Metrics for {topic}: {metrics}")

    # Keep the metrics and the raw page-level values for this topic
    eval_results_pages[topic] = metrics
    predictions_per_topic_baseline[topic] = {"labels": labels, "predictions": predictions}



###### Evaluating on cannabis ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 410
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 46
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 3448
    })
    extended: Dataset({
        features: ['_id', 'batch_id', 'domain', '

  0%|          | 0/44432 [00:00<?, ?it/s]

100%|██████████| 44432/44432 [00:07<00:00, 5720.96it/s]


Metrics for cannabis: {'accuracy': 0.994623777361133, 'precision': 0.10266159695817491, 'recall': 0.9310344827586207, 'f1': 0.18493150684931506}


###### Evaluating on kinder ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 384
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 44
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic

100%|██████████| 53253/53253 [00:09<00:00, 5911.75it/s]


Metrics for kinder: {'accuracy': 0.915424583760003, 'precision': 0.008680169152014245, 'recall': 0.8666666666666667, 'f1': 0.017188188629352136}


###### Evaluating on energie ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 408
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 46
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topi

100%|██████████| 45925/45925 [00:07<00:00, 5881.21it/s]


Metrics for energie: {'accuracy': 0.9073489384866631, 'precision': 0.00584932147870847, 'recall': 0.8064516129032258, 'f1': 0.011614401858304297}


In [10]:
#predictions_per_topic

## Calculate Significance

In [11]:
import numpy as np
from statsmodels.stats.contingency_tables import mcnemar

def compare_models(predictions_model, predictions_baseline, true_labels, alpha=0.05):
    """Compares a model's predictions with a baseline model's predictions using McNemar's test.

    The test only looks at the discordant pairs: samples that exactly one of the
    two models classifies correctly. It is run as the chi-square variant with
    continuity correction (exact=False, correction=True).

    Args:
        predictions_model: Predictions of the model (array-like).
        predictions_baseline: Predictions of the baseline (array-like), paired
            element by element with predictions_model.
        true_labels: Reference labels (array-like), paired element by element
            with both prediction sequences.
        alpha: Significance level the p-value is compared against.

    Returns:
        A dictionary with the keys "Chi-squared value" (float), "P-value" (float),
        "Significance" (bool) and "Interpretation" (str).

    Raises:
        ValueError: If the three inputs do not have the same shape.
    """
    # Plain Python lists compare as a whole with ==, so convert to arrays first
    # to get element-wise comparisons.
    predictions_model = np.asarray(predictions_model)
    predictions_baseline = np.asarray(predictions_baseline)
    true_labels = np.asarray(true_labels)

    if not (predictions_model.shape == predictions_baseline.shape == true_labels.shape):
        raise ValueError(
            "predictions_model, predictions_baseline and true_labels must have the same shape, got "
            f"{predictions_model.shape}, {predictions_baseline.shape} and {true_labels.shape}."
        )

    model_correct = predictions_model == true_labels
    baseline_correct = predictions_baseline == true_labels

    # Calculate b and c for the contingency table (the discordant pairs)
    b = int(np.sum(model_correct & ~baseline_correct))
    c = int(np.sum(~model_correct & baseline_correct))

    if b + c == 0:
        # No discordant pairs: the statistic would divide by zero and become NaN.
        # Both models are right and wrong on the same samples, so report no difference.
        statistic, pvalue = 0.0, 1.0
    else:
        # Only the off-diagonal cells enter the test, so the diagonal is left at 0
        contingency_table = np.array([[0, b], [c, 0]])

        # Perform McNemar's Test
        result = mcnemar(contingency_table, exact=False, correction=True)
        statistic, pvalue = float(result.statistic), float(result.pvalue)

    # Interpretation
    significance = bool(pvalue < alpha)
    interpretation = "The difference between the models is statistically significant." if significance else "The difference between the models is not statistically significant."

    return {
        "Chi-squared value": statistic,
        "P-value": pvalue,
        "Significance": significance,
        "Interpretation": interpretation
    }

# Example usage: the model matches the true labels everywhere, the baseline does not
true_labels = np.array([0, 1, 0, 0, 1, 0, 1, 0, 1, 1])
predictions_model = true_labels.copy()
predictions_baseline = np.array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0])

result = compare_models(predictions_model, predictions_baseline, true_labels)
print(result)


{'Chi-squared value': 5.142857142857143, 'P-value': 0.02334220201289086, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


In [12]:
from collections import defaultdict
# NOTE: this re-creates eval_results_pages, so the baseline metrics stored under
# the same name by the baseline cell are dropped here; the baseline predictions
# in predictions_per_topic_baseline are what this cell actually needs.
eval_results_pages = defaultdict(dict)
eval_results_significance = defaultdict(dict)

for topic in TOPICS: # ----------------------------------------------------------------------
    for model_name in MODELS: # -------------------------------------------------------------

        print(f"\n\n###### Evaluating model {model_name} on {topic} ###### \n\n")
        dataset = load_from_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}_s_{model_name.split('/')[1]}_{FEATURES}_{SPLIT}/")

        # Group dataset examples by URL, with a fallback to domain
        grouped_dataset = {}
        for example in tqdm(dataset[SPLIT]):
            url = example.get("view_url") or example.get("domain")
            example_filtered = {k: example[k] for k in ["text", "domain", "preds", "label", "category", "annotation_type", "lang"]}
            grouped_dataset.setdefault(url, []).append(example_filtered)

        # Page-level label: maximum over the labels of the page's chunks
        labels = []
        for url, chunks in grouped_dataset.items():
            chunk_labels = [chunk["label"] for chunk in chunks]
            labels.append(max(chunk_labels))

        # Page-level prediction: maximum over the chunk-level predictions
        predictions = []
        for url, chunks in grouped_dataset.items():
            preds = [chunk["preds"] for chunk in chunks]
            predictions.append(max(preds))

        # Score the predictions stored in the dataset at page level
        metrics = calc_metrics(labels, predictions)
        print(f"Metrics for {model_name} on {topic}: {metrics}")

        # Update the eval_results dictionary
        eval_results_pages[model_name][topic] = metrics

        predictions_model = np.array(predictions)
        predictions_baseline = np.array(predictions_per_topic_baseline[topic]["predictions"])
        true_labels = np.array(labels)

        # McNemar's test is a paired test: element i of the model predictions and
        # element i of the baseline predictions must describe the same page. The
        # baseline was built from a different dataset directory, so verify that
        # its page-level labels line up with the ones computed here before comparing.
        baseline_labels = np.array(predictions_per_topic_baseline[topic]["labels"])
        if not np.array_equal(true_labels, baseline_labels):
            raise ValueError(
                f"Page-level labels of {model_name} on {topic} do not match the baseline labels; "
                "the predictions are not paired and cannot be compared with McNemar's test."
            )

        result = compare_models(predictions_model, predictions_baseline, true_labels)
        print(result)
        eval_results_significance[model_name][topic] = result
        
    



###### Evaluating model google-bert/bert-base-multilingual-cased on cannabis ###### 




100%|██████████| 44432/44432 [00:07<00:00, 5810.14it/s]


Metrics for google-bert/bert-base-multilingual-cased on cannabis: {'accuracy': 0.9994352707312114, 'precision': 0.5384615384615384, 'recall': 0.9655172413793104, 'f1': 0.691358024691358}
{'Chi-squared value': 177.64426877470356, 'P-value': 1.5841283620116657e-40, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model FacebookAI/xlm-roberta-base on cannabis ###### 




100%|██████████| 44432/44432 [00:07<00:00, 5686.92it/s]


Metrics for FacebookAI/xlm-roberta-base on cannabis: {'accuracy': 0.9988931306331744, 'precision': 0.3684210526315789, 'recall': 0.9655172413793104, 'f1': 0.5333333333333333}
{'Chi-squared value': 124.8904593639576, 'P-value': 5.378316564684772e-29, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model FacebookAI/xlm-roberta-large on cannabis ###### 




100%|██████████| 44432/44432 [00:07<00:00, 5755.88it/s]


Metrics for FacebookAI/xlm-roberta-large on cannabis: {'accuracy': 0.9990738439991868, 'precision': 0.4117647058823529, 'recall': 0.9655172413793104, 'f1': 0.5773195876288659}
{'Chi-squared value': 139.69454545454545, 'P-value': 3.104631760241412e-32, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model dbmdz/bert-base-german-uncased on cannabis ###### 




100%|██████████| 44432/44432 [00:07<00:00, 5785.47it/s]


Metrics for dbmdz/bert-base-german-uncased on cannabis: {'accuracy': 0.9983284013643859, 'precision': 0.2727272727272727, 'recall': 0.9310344827586207, 'f1': 0.421875}
{'Chi-squared value': 86.82679738562092, 'P-value': 1.1845360287123875e-20, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model deepset/gelectra-large on cannabis ###### 




100%|██████████| 44432/44432 [00:07<00:00, 5668.30it/s]


Metrics for deepset/gelectra-large on cannabis: {'accuracy': 0.999457859901963, 'precision': 0.5490196078431373, 'recall': 0.9655172413793104, 'f1': 0.7}
{'Chi-squared value': 174.49615384615385, 'P-value': 7.713200274413404e-40, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model deepset/gelectra-base on cannabis ###### 




100%|██████████| 44432/44432 [00:07<00:00, 5771.31it/s]


Metrics for deepset/gelectra-base on cannabis: {'accuracy': 0.9960920734599833, 'precision': 0.06626506024096386, 'recall': 0.3793103448275862, 'f1': 0.11282051282051282}
{'Chi-squared value': 10.694516971279374, 'P-value': 0.0010745352524984533, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model deepset/gbert-large on cannabis ###### 




100%|██████████| 44432/44432 [00:07<00:00, 5705.25it/s]


Metrics for deepset/gbert-large on cannabis: {'accuracy': 0.9993675032189568, 'precision': 0.509090909090909, 'recall': 0.9655172413793104, 'f1': 0.6666666666666666}
{'Chi-squared value': 169.30620155038758, 'P-value': 1.0488195193611374e-38, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model deepset/gbert-base on cannabis ###### 




100%|██████████| 44432/44432 [00:07<00:00, 5899.32it/s]


Metrics for deepset/gbert-base on cannabis: {'accuracy': 0.9991642006821929, 'precision': 0.42, 'recall': 0.7241379310344828, 'f1': 0.5316455696202531}
{'Chi-squared value': 150.9433962264151, 'P-value': 1.0783504830038695e-34, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model google-bert/bert-base-multilingual-cased on kinder ###### 




100%|██████████| 53253/53253 [00:09<00:00, 5613.25it/s]


Metrics for google-bert/bert-base-multilingual-cased on kinder: {'accuracy': 0.9314673645086662, 'precision': 0.01149739939775527, 'recall': 0.9333333333333333, 'f1': 0.022714981070849107}
{'Chi-squared value': 92.80283337665713, 'P-value': 5.7772386086477775e-22, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model FacebookAI/xlm-roberta-base on kinder ###### 




100%|██████████| 53253/53253 [00:09<00:00, 5801.26it/s]


Metrics for FacebookAI/xlm-roberta-base on kinder: {'accuracy': 0.9763340539310502, 'precision': 0.03263403263403263, 'recall': 0.9333333333333333, 'f1': 0.06306306306306306}
{'Chi-squared value': 1907.2365889752127, 'P-value': 0.0, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model FacebookAI/xlm-roberta-large on kinder ###### 




100%|██████████| 53253/53253 [00:09<00:00, 5713.42it/s]


Metrics for FacebookAI/xlm-roberta-large on kinder: {'accuracy': 0.9452535366177419, 'precision': 0.014354066985645933, 'recall': 0.9333333333333333, 'f1': 0.0282733086502861}
{'Chi-squared value': 341.08819875776396, 'P-value': 3.697271745071062e-76, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model dbmdz/bert-base-german-uncased on kinder ###### 




100%|██████████| 53253/53253 [00:09<00:00, 5742.90it/s]


Metrics for dbmdz/bert-base-german-uncased on kinder: {'accuracy': 0.9476049607463876, 'precision': 0.014989293361884369, 'recall': 0.9333333333333333, 'f1': 0.029504741833508957}
{'Chi-squared value': 402.23968675709693, 'P-value': 1.7921995373498607e-89, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model deepset/gelectra-large on kinder ###### 




100%|██████████| 53253/53253 [00:09<00:00, 5697.04it/s]


Metrics for deepset/gelectra-large on kinder: {'accuracy': 0.9309553608677513, 'precision': 0.01141304347826087, 'recall': 0.9333333333333333, 'f1': 0.022550335570469798}
{'Chi-squared value': 86.3497225448445, 'P-value': 1.5076968099856985e-20, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model deepset/gelectra-base on kinder ###### 




100%|██████████| 53253/53253 [00:09<00:00, 5737.59it/s]


Metrics for deepset/gelectra-base on kinder: {'accuracy': 0.8637691053210452, 'precision': 0.0033393627382774455, 'recall': 0.5333333333333333, 'f1': 0.00663716814159292}
{'Chi-squared value': 667.9936036036036, 'P-value': 2.7285893609902625e-147, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model deepset/gbert-large on kinder ###### 




100%|██████████| 53253/53253 [00:09<00:00, 5654.77it/s]


Metrics for deepset/gbert-large on kinder: {'accuracy': 0.9376682974930785, 'precision': 0.012627781118460614, 'recall': 0.9333333333333333, 'f1': 0.024918421833283893}
{'Chi-squared value': 184.3984427439925, 'P-value': 5.310363761280684e-42, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model deepset/gbert-base on kinder ###### 




100%|██████████| 53253/53253 [00:09<00:00, 5652.46it/s]


Metrics for deepset/gbert-base on kinder: {'accuracy': 0.9175484507149088, 'precision': 0.009573740597219056, 'recall': 0.9333333333333333, 'f1': 0.01895306859205776}
{'Chi-squared value': 1.4594882729211087, 'P-value': 0.22701155942299367, 'Significance': False, 'Interpretation': 'The difference between the models is not statistically significant.'}


###### Evaluating model google-bert/bert-base-multilingual-cased on energie ###### 




100%|██████████| 45925/45925 [00:07<00:00, 5751.68it/s]


Metrics for google-bert/bert-base-multilingual-cased on energie: {'accuracy': 0.8711812738160044, 'precision': 0.0035431078117091276, 'recall': 0.6774193548387096, 'f1': 0.007049345417925478}
{'Chi-squared value': 335.10884105557585, 'P-value': 7.414816598581238e-75, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model FacebookAI/xlm-roberta-base on energie ###### 




100%|██████████| 45925/45925 [00:07<00:00, 5777.85it/s]


Metrics for FacebookAI/xlm-roberta-base on energie: {'accuracy': 0.9364833968426782, 'precision': 0.0081799591002045, 'recall': 0.7741935483870968, 'f1': 0.016188870151770656}
{'Chi-squared value': 291.70512402088775, 'P-value': 2.1136297615474224e-65, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model FacebookAI/xlm-roberta-large on energie ###### 




100%|██████████| 45925/45925 [00:08<00:00, 5656.14it/s]


Metrics for FacebookAI/xlm-roberta-large on energie: {'accuracy': 0.9143168209036473, 'precision': 0.007070707070707071, 'recall': 0.9032258064516129, 'f1': 0.014031571034828364}
{'Chi-squared value': 13.747770872737098, 'P-value': 0.00020906877802145257, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model dbmdz/bert-base-german-uncased on energie ###### 




100%|██████████| 45925/45925 [00:08<00:00, 5675.13it/s]


Metrics for dbmdz/bert-base-german-uncased on energie: {'accuracy': 0.9039738704409364, 'precision': 0.0056446150372544595, 'recall': 0.8064516129032258, 'f1': 0.011210762331838564}
{'Chi-squared value': 3.0339004733273636, 'P-value': 0.08154177683785487, 'Significance': False, 'Interpretation': 'The difference between the models is not statistically significant.'}


###### Evaluating model deepset/gelectra-large on energie ###### 




100%|██████████| 45925/45925 [00:08<00:00, 5632.03it/s]


Metrics for deepset/gelectra-large on energie: {'accuracy': 0.9832770821992379, 'precision': 0.030573248407643312, 'recall': 0.7741935483870968, 'f1': 0.058823529411764705}
{'Chi-squared value': 2772.5749486652976, 'P-value': 0.0, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model deepset/gelectra-base on energie ###### 




100%|██████████| 45925/45925 [00:07<00:00, 5762.84it/s]


Metrics for deepset/gelectra-base on energie: {'accuracy': 0.9039085465432771, 'precision': 0.0027235587834770767, 'recall': 0.3870967741935484, 'f1': 0.005409060175794456}
{'Chi-squared value': 3.4253752084491382, 'P-value': 0.06420165271935481, 'Significance': False, 'Interpretation': 'The difference between the models is not statistically significant.'}


###### Evaluating model deepset/gbert-large on energie ###### 




100%|██████████| 45925/45925 [00:08<00:00, 5702.90it/s]


Metrics for deepset/gbert-large on energie: {'accuracy': 0.9229831246597714, 'precision': 0.007584269662921348, 'recall': 0.8709677419354839, 'f1': 0.015037593984962405}
{'Chi-squared value': 73.52531464530892, 'P-value': 9.935373509181449e-18, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


###### Evaluating model deepset/gbert-base on energie ###### 




100%|██████████| 45925/45925 [00:08<00:00, 5735.42it/s]


Metrics for deepset/gbert-base on energie: {'accuracy': 0.9257267283614589, 'precision': 0.007575757575757576, 'recall': 0.8387096774193549, 'f1': 0.015015882183078255}
{'Chi-squared value': 103.80499561787906, 'P-value': 2.232415660573675e-24, 'Significance': True, 'Interpretation': 'The difference between the models is statistically significant.'}


In [13]:
# Show the nested results: model name -> topic -> result dictionary of compare_models
eval_results_significance

defaultdict(dict,
            {'google-bert/bert-base-multilingual-cased': {'cannabis': {'Chi-squared value': 177.64426877470356,
               'P-value': 1.5841283620116657e-40,
               'Significance': True,
               'Interpretation': 'The difference between the models is statistically significant.'},
              'kinder': {'Chi-squared value': 92.80283337665713,
               'P-value': 5.7772386086477775e-22,
               'Significance': True,
               'Interpretation': 'The difference between the models is statistically significant.'},
              'energie': {'Chi-squared value': 335.10884105557585,
               'P-value': 7.414816598581238e-75,
               'Significance': True,
               'Interpretation': 'The difference between the models is statistically significant.'}},
             'FacebookAI/xlm-roberta-base': {'cannabis': {'Chi-squared value': 124.8904593639576,
               'P-value': 5.378316564684772e-29,
               'Significanc

In [14]:
import numpy as np
from collections import defaultdict
from tabulate import tabulate


# Identify all topics (assuming all models are evaluated on the same topics);
# with no results at all this yields an empty topic list instead of raising StopIteration
topics = list(next(iter(eval_results_significance.values()), {}).keys())

# Prepare headers for the table: each topic will have two metrics
headers = ["Model"] + [f"{topic} {metric}" for topic in topics for metric in ["P-value", "Significance"]]

# Prepare rows: one row per model, containing metrics for each topic
rows = []
for model, topics_metrics in eval_results_significance.items():
    row = [model]  # Start with the model name
    for topic in topics:
        metrics = topics_metrics.get(topic, {})
        # A missing result must not look like a real one: a P-value of 0.0 would read as
        # "highly significant" (genuine results do reach 0.0), so use NaN / None instead.
        row.extend([metrics.get('P-value', float("nan")), metrics.get('Significance', None)])
    rows.append(row)

# Generate the HTML table
table_html = tabulate(rows, headers=headers, tablefmt="html", showindex="never")

# Show the table as cell output
display(table_html)

Model,cannabis P-value,cannabis Significance,kinder P-value,kinder Significance,energie P-value,energie Significance
google-bert/bert-base-multilingual-cased,1.58413e-40,1,5.77724e-22,1,7.41482e-75,1
FacebookAI/xlm-roberta-base,5.37832e-29,1,0.0,1,2.1136299999999998e-65,1
FacebookAI/xlm-roberta-large,3.1046300000000003e-32,1,3.6972700000000004e-76,1,0.000209069,1
dbmdz/bert-base-german-uncased,1.18454e-20,1,1.7921999999999998e-89,1,0.0815418,0
deepset/gelectra-large,7.713199999999999e-40,1,1.5077e-20,1,0.0,1
deepset/gelectra-base,0.00107454,1,2.72859e-147,1,0.0642017,0
deepset/gbert-large,1.04882e-38,1,5.3103599999999997e-42,1,9.935370000000001e-18,1
deepset/gbert-base,1.0783500000000001e-34,1,0.227012,0,2.23242e-24,1


In [15]:
# Path of the JSON file the significance results are written to;
# the file name contains the FEATURES and SPLIT settings
file_path = f"eval_results_{FEATURES}_{SPLIT}_pages_significance.json"

In [16]:
import json 
def convert_numpy_types(obj):
    """Convert numpy types to native Python types.

    Walks dictionaries (keys and values), lists and tuples recursively so the
    result can be passed to json.dump.

    Args:
        obj: Any object; numpy integers, floats, booleans and arrays are
            converted, containers are rebuilt, everything else is returned as is.

    Returns:
        The converted object. Dictionaries (including dict subclasses) come back
        as plain dicts, tuples (including tuple subclasses) as plain tuples.
    """
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        # Keys are converted as well: json.dump rejects numpy scalars as keys
        return {convert_numpy_types(k): convert_numpy_types(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_types(i) for i in obj]
    if isinstance(obj, tuple):
        # Tuples can carry numpy scalars just like lists
        return tuple(convert_numpy_types(i) for i in obj)
    return obj

# Swap the results for a copy that holds only native Python types
eval_results_significance = convert_numpy_types(eval_results_significance)

# Serialise to a JSON string and write it to the output file
with open(file_path, "w") as file:
    file.write(json.dumps(eval_results_significance))