## Evaluate Classifiers

In [1]:
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, matthews_corrcoef
from datasets import load_from_disk
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd

from os.path import join


# set the path to the data
data_dir = "../../data"

# pred_dir in data dir
pred_dir = join(data_dir, "predictions")

## Define Parameters

In [2]:
SAMPLING = "random" # "random", "stratified", "clustered", "shared_domain"
SUFFIX = "_extended" #"", "_holdout", "_extended"
SPLITS = ['train', 'test', 'holdout', 'extended', 'holdout_url', 'extended_url']
MAX_CONTENT_LENGTH = 384 # 496, 192
OVERLAP = 64
FEATURES = "url_and_content" # "url", "content", "url_and_content"

In [3]:

MODELS = ["gbert-large", 'gelectra-large']
TOPICS = ["kinder", "energie", "cannabis"]

## Load Predictions

In [4]:
# create an empty list to store the dataframes
dfs = []

for model in MODELS:

    for topic in TOPICS:
        # load the file in the pred_dir
        pred_file = join(pred_dir, f"processed_dataset_{topic}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}_with_urls_{model}_{FEATURES}_with_predictions")

        # load the predictions
        datasets = load_from_disk(pred_file)

        coi = ['_id', 'batch_id', 'view_url', 'topic', 'is_topic', 'chunk_id', 'label', 'preds', 'probas']


        # convert all the datasets to pandas dataframes indicating the name of the dataset
        for split in SPLITS:
            # Get the current split dataset
            current_dataset = datasets[split]
            
            # Determine which columns to remove (all columns not in 'coi')
            columns_to_remove = [col for col in current_dataset.column_names if col not in coi]
            
            # Remove unwanted columns
            reduced_dataset = current_dataset.remove_columns(columns_to_remove)
            
            # Convert to pandas DataFrame with specified columns
            df = reduced_dataset.to_pandas()

            # add the split column
            df['split'] = split

            # add the model column
            df['model'] = model

            # append to the list
            dfs.append(df)

# concatenate the dataframes
df = pd.concat(dfs)

## Metric function

In [5]:
# Define a function to calculate the metrics for a group
def calculate_metrics(group):
    preds = group['preds']
    labels = group['label']
    
    metrics = {
        'f1_score': f1_score(labels, preds, average='binary'),
        'recall': recall_score(labels, preds, average='binary'),
        'precision': precision_score(labels, preds, average='binary'),
        'accuracy': accuracy_score(labels, preds),
        'mcc': matthews_corrcoef(labels, preds),
        'fp': sum((labels == 0) & (preds == 1)),
        'fn': sum((labels == 1) & (preds == 0)),
        'tp': sum((labels == 1) & (preds == 1)),
        'tn': sum((labels == 0) & (preds == 0)),
        'count': len(group)
    }
    return pd.Series(metrics)

# Option A: One positive chunk is enough

In [6]:
base_cols = ['model', 'topic', 'split', 'view_url']

# Group by 'model', 'topic', 'split', 'view_url', and 'preds' to calculate the size and mean probability
preds_grouped = df.groupby(base_cols + ['preds']).agg(count=('preds', 'size'), mean_proba=('probas', 'mean')).reset_index()

# One positive chunk is enough (assume highest preds is positive, i.e., 1): 
# sort by 'model', 'topic', 'split', 'view_url', & 'preds' 
most_common_preds = preds_grouped.sort_values( base_cols + ['preds'],  ascending=[True, True, True, True, False]
).drop_duplicates(subset=base_cols, keep='first')

# Drop duplicates based on 'model', 'topic', 'split', 'view_url' for labels (no need to sort)
the_labels = df.drop_duplicates(subset=base_cols)[base_cols + ['label']]

# Merge the results
df_page = pd.merge( most_common_preds[base_cols + ['preds']], the_labels[base_cols + ['label']], 
                   on=base_cols, how='outer', indicator=True)

# check the indicator of the merge
if len(df_page['_merge'] == 'both') != df_page.shape[0]:
    print(df_page['_merge'].value_counts())
    raise Exception("Not all rows are in both dataframes")


# Group by 'topic' and 'split', then apply the metrics calculation function
results = df_page.groupby(['model', 'topic', 'split']).apply(calculate_metrics).reset_index()

# Sort the split values in the following order: train, test, holdout, extended, holdout_url, extended_url
results['split'] = pd.Categorical(results['split'], SPLITS)

# Set the order of the topics
results['topic'] = pd.Categorical(results['topic'], TOPICS)

# sort the dataframe
results = results.sort_values(['model', 'topic', 'split'])

# print the results
results[['model', 'topic', 'split', 'accuracy', 'precision', 'recall', 'f1_score', 'mcc', 'count', 'tp', 'tn', 'fp', 'fn']]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,model,topic,split,accuracy,precision,recall,f1_score,mcc,count,tp,tn,fp,fn
17,gbert-large,kinder,train,0.997361,1.0,0.994652,0.997319,0.994736,379.0,186.0,192.0,0.0,1.0
16,gbert-large,kinder,test,1.0,1.0,1.0,1.0,1.0,43.0,21.0,22.0,0.0,0.0
14,gbert-large,kinder,holdout,0.9884,0.0,0.0,0.0,0.0,3707.0,0.0,3664.0,43.0,0.0
12,gbert-large,kinder,extended,0.997705,0.2625,0.933333,0.409756,0.494327,52734.0,42.0,52571.0,118.0,3.0
15,gbert-large,kinder,holdout_url,0.995704,0.666667,1.0,0.8,0.814725,931.0,8.0,919.0,4.0,0.0
13,gbert-large,kinder,extended_url,0.999743,0.969613,0.997159,0.983193,0.983162,46680.0,351.0,46317.0,11.0,1.0
11,gbert-large,energie,train,0.997537,1.0,0.99505,0.997519,0.995086,406.0,201.0,204.0,0.0,1.0
10,gbert-large,energie,test,0.978261,0.958333,1.0,0.978723,0.957427,46.0,23.0,22.0,1.0,0.0
8,gbert-large,energie,holdout,0.935879,0.0,0.0,0.0,0.0,4164.0,0.0,3897.0,267.0,0.0
6,gbert-large,energie,extended,0.980359,0.030205,0.903226,0.058455,0.163191,45925.0,28.0,44995.0,899.0,3.0


# Option B: Majority voting (& proba as tie breaker)

In [7]:

# One positive chunk is enough (assume highest preds is positive, i.e., 1): 
# sort by 'model', 'topic', 'split', 'view_url', & 'preds' 
most_common_preds = preds_grouped.sort_values( base_cols + ['count', 'mean_proba', 'preds'],
    ascending=[True, True, True, True, False, False, False]
).drop_duplicates(subset=base_cols, keep='first')

# Drop duplicates based on 'model', 'topic', 'split', 'view_url' for labels (no need to sort)
the_labels = df.drop_duplicates(subset=base_cols)[base_cols + ['label']]

# Merge the results
df_page = pd.merge( most_common_preds[base_cols + ['preds']], the_labels[base_cols + ['label']], 
                   on=base_cols, how='outer', indicator=True)

# check the indicator of the merge
if len(df_page['_merge'] == 'both') != df_page.shape[0]:
    print(df_page['_merge'].value_counts())
    raise Exception("Not all rows are in both dataframes")


# Group by 'topic' and 'split', then apply the metrics calculation function
results = df_page.groupby(['model', 'topic', 'split']).apply(calculate_metrics).reset_index()

# Sort the split values in the following order: train, test, holdout, extended, holdout_url, extended_url
results['split'] = pd.Categorical(results['split'], SPLITS)

# Set the order of the topics
results['topic'] = pd.Categorical(results['topic'], TOPICS)

# sort the dataframe
results = results.sort_values(['model', 'topic', 'split'])

# print the results
results[['model', 'topic', 'split', 'accuracy', 'precision', 'recall', 'f1_score', 'mcc', 'count', 'tp', 'tn', 'fp', 'fn']]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,model,topic,split,accuracy,precision,recall,f1_score,mcc,count,tp,tn,fp,fn
17,gbert-large,kinder,train,0.997361,1.0,0.994652,0.997319,0.994736,379.0,186.0,192.0,0.0,1.0
16,gbert-large,kinder,test,1.0,1.0,1.0,1.0,1.0,43.0,21.0,22.0,0.0,0.0
14,gbert-large,kinder,holdout,0.993526,0.0,0.0,0.0,0.0,3707.0,0.0,3683.0,24.0,0.0
12,gbert-large,kinder,extended,0.999241,0.531646,0.933333,0.677419,0.704114,52734.0,42.0,52652.0,37.0,3.0
15,gbert-large,kinder,holdout_url,0.995704,0.666667,1.0,0.8,0.814725,931.0,8.0,919.0,4.0,0.0
13,gbert-large,kinder,extended_url,0.999743,0.969613,0.997159,0.983193,0.983162,46680.0,351.0,46317.0,11.0,1.0
11,gbert-large,energie,train,0.997537,1.0,0.99505,0.997519,0.995086,406.0,201.0,204.0,0.0,1.0
10,gbert-large,energie,test,0.978261,0.958333,1.0,0.978723,0.957427,46.0,23.0,22.0,1.0,0.0
8,gbert-large,energie,holdout,0.95317,0.0,0.0,0.0,0.0,4164.0,0.0,3969.0,195.0,0.0
6,gbert-large,energie,extended,0.988481,0.050542,0.903226,0.095726,0.212161,45925.0,28.0,45368.0,526.0,3.0
