## Setup and Import Libraries

In [1]:
# Import necessary libraries
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
from sklearn.metrics import classification_report, f1_score
import os, pandas as pd
import json

In [2]:
def get_notebook_environment():
    """Identify the hosting platform from environment variables.

    The marker variables are tested in a fixed order and the first one that
    is present in `os.environ` decides the result.

    Returns:
    str: 'Kaggle', 'GCC', 'Google Colab', or 'local' when no marker is set.
    """
    # (environment variable, platform name) pairs, highest priority first
    markers = (
        ('KAGGLE_URL_BASE', 'Kaggle'),
        ('GOOGLE_CLOUD_PROJECT', 'GCC'),
        ('COLAB_GPU', 'Google Colab'),
    )
    return next(
        (platform for variable, platform in markers if variable in os.environ),
        'local',
    )

# Detect the hosting platform once; the dataset location depends on it.
environment = get_notebook_environment()
print(f'You are running on: {environment}')

# Define the folder where the dataset is saved
# (left empty for a local run, in which case the working directory is kept)
folder=''
if environment == 'Kaggle':
    folder ='/kaggle/input/dataset-bioautex/'
elif environment == 'GCC':
    folder ='dataset-bioautex/'
elif environment == 'Google Colab':
    # google.colab is imported only on this branch, i.e. when Colab was detected
    from google.colab import drive
    drive.mount('/content/drive')
    folder='/content/drive/My Drive/Biomedicina/dataset-bioautex/'

# Make the dataset folder the working directory: the relative paths used
# later in the notebook are resolved against it.
if folder:
    try:
        os.chdir(folder)
    except OSError as chdir_error:
        # Best effort: keep running from the current directory, but report
        # the problem instead of hiding it.
        print(f'Could not change directory to {folder!r}: {chdir_error}')


You are running on: Google Colab
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Majority Voting Ensemble

In [3]:
def majority_voting(predictions):
    """
    Perform majority voting on the predictions from multiple models.

    The votes are counted along axis 0 (one row per model), and the label
    with the most votes wins. Ties are resolved in favour of the smallest
    label, because `argmax` returns the first maximum of the vote counts.

    Args:
    predictions (array-like): Non-negative integer class labels of shape (num_models, num_samples). Integer-valued floats (e.g. 0.0 / 1.0) are accepted and converted to integers.

    Returns:
    np.array: Array of shape (num_samples,) containing the final predictions.

    Raises:
    TypeError: If the labels are floats that are not whole numbers (raised by np.bincount).
    ValueError: If a label is negative (raised by np.bincount).
    """
    votes = np.asanyarray(predictions)

    # np.bincount rejects float input, even when every value is a whole
    # number (e.g. a label column that was parsed as float). Convert such
    # arrays; anything else is passed through so np.bincount reports it.
    if (
        votes.dtype.kind == 'f'
        and np.all(np.isfinite(votes))
        and np.all(votes == np.floor(votes))
    ):
        votes = votes.astype(np.int64)

    def _most_voted(model_votes):
        # model_votes holds the label chosen by every model for one sample
        return np.bincount(model_votes).argmax()

    return np.apply_along_axis(_most_voted, axis=0, arr=votes)

## Define paths to results

In [4]:
# Results folder of each ensemble member; edit the list to point at the
# desired results folders.
preds_path = [
    "../ensembler/results/bert-base-multilingual-cased",
    "../ensembler/results/gpt2",
    "../ensembler/results/xlm-roberta-base",
    "../ensembler/results/fast_detect_gpt",
    "../ensembler/results/mdeberta-v3-base",
    "../ensembler/results/medical_mT5/",
]

# Experiment ids to run (1 through 5)
experiments = list(range(1, 6))

# CSV file holding the predictions to test
pred_file_name = 'results_no_dataleak.csv'

# Selects which name the output files of the ensemble receive
all_models = True

# When enabled, make sure every output directory exists before anything is written
SAVE = True
if SAVE:
    for output_dir in (
        '../ensembler/predictions/',
        '../ensembler/scores/',
        '../ensembler/matrix/',
    ):
        os.makedirs(output_dir, exist_ok=True)


## Define functions for saving results

In [5]:
def save_ensemble_predictions(ensembler_pred, ensemble_file_name):
    """Write the ensemble predictions to a single-column CSV file.

    Args:
    ensembler_pred: Final predictions; stored under the column 'prediction'.
    ensemble_file_name (str): Base name (without extension) of the output file.

    The file goes to ../ensembler/predictions/<ensemble_file_name>.csv, is
    written without the index column, and a confirmation line is printed.
    """
    target_csv = f'../ensembler/predictions/{ensemble_file_name}.csv'
    pd.DataFrame(ensembler_pred, columns=['prediction']).to_csv(target_csv, index=False)
    print(f'{target_csv} saved')

def save_ensemble_scores(y_true, ensembler_pred, ensemble_file_name):
    """Build the classification report, print it, and store it as text.

    Args:
    y_true: Ground-truth labels.
    ensembler_pred: Predictions produced by the ensemble.
    ensemble_file_name (str): Base name (without extension) of the output file.

    The report is written to ../ensembler/scores/<ensemble_file_name>.txt and
    a confirmation line is printed afterwards.
    """
    report_path = f'../ensembler/scores/{ensemble_file_name}.txt'
    report_text = classification_report(y_true, ensembler_pred)
    print(report_text)
    with open(report_path, 'w') as report_file:
        report_file.write(report_text)
    print(f'{report_path} saved')

def save_confusion_matrix(y_true, ensembler_pred, ensemble_file_name):
    """Calculates and saves the confusion matrix to a CSV file.

    The row ('Actual <label>') and column ('Predicted <label>') names are
    built from the sorted labels that occur in `y_true` or `ensembler_pred`,
    so the table always matches the matrix size. For labels 0 and 1 the
    names are 'Actual 0', 'Actual 1', 'Predicted 0' and 'Predicted 1'.

    Args:
    y_true: Ground-truth labels.
    ensembler_pred: Predictions produced by the ensemble.
    ensemble_file_name (str): Base name of the output file; the matrix is written to ../ensembler/matrix/<ensemble_file_name>_matrix.csv.
    """
    # Sorted union of the labels seen on either side. It is passed explicitly
    # so that the matrix rows/columns and their names share one ordering.
    labels = np.unique(np.concatenate([np.ravel(y_true), np.ravel(ensembler_pred)]))
    cm = confusion_matrix(y_true, ensembler_pred, labels=labels)
    df_cm = pd.DataFrame(
        cm,
        index=[f'Actual {label}' for label in labels],
        columns=[f'Predicted {label}' for label in labels],
    )
    file_matrix = f'../ensembler/matrix/{ensemble_file_name}_matrix.csv'
    df_cm.to_csv(file_matrix)
    print(f'{file_matrix} saved')

# Test the ensembler

In [6]:
# For every experiment: load each model's predictions, combine them by
# majority vote and (when SAVE is enabled) store predictions, scores and the
# confusion matrix.
for experiment in experiments:
    model_preds = []
    y_true_loaded = None

    for path in preds_path:
        file_path = os.path.join(path, str(experiment), pred_file_name)
        df = pd.read_csv(file_path)

        # The votes are combined position by position, so every file must
        # describe the same samples in the same order. The first file's
        # labels are the reference; any mismatch would silently corrupt
        # both the vote and the scores.
        labels = df['label'].values
        if y_true_loaded is None:
            y_true_loaded = labels
        elif not np.array_equal(labels, y_true_loaded):
            raise ValueError(
                f"Labels in {file_path} do not match the labels of the first "
                f"model for experiment {experiment}"
            )

        model_preds.append(df['Prediction'].values)

    # One row per model, one column per sample
    all_predictions = np.array(model_preds)
    ensembler_pred = majority_voting(all_predictions)

    # NOTE(review): all_models only changes the output name here; preds_path
    # is not filtered by it — confirm the list matches the flag.
    ensemble_file_name = f"MV_ens_all_models_exp_{experiment}" if all_models else f"MV_ens_no_mT5_{experiment}"

    if SAVE:
        save_ensemble_predictions(ensembler_pred, ensemble_file_name)
        save_ensemble_scores(y_true_loaded, ensembler_pred, ensemble_file_name)
        save_confusion_matrix(y_true_loaded, ensembler_pred, ensemble_file_name)

../ensembler/predictions/MV_ens_all_models_exp_1.csv saved
              precision    recall  f1-score   support

           0       0.88      0.68      0.77      1250
           1       0.74      0.91      0.82      1250

    accuracy                           0.80      2500
   macro avg       0.81      0.80      0.79      2500
weighted avg       0.81      0.80      0.79      2500

../ensembler/scores/MV_ens_all_models_exp_1.txt saved
../ensembler/matrix/MV_ens_all_models_exp_1_matrix.csv saved
../ensembler/predictions/MV_ens_all_models_exp_2.csv saved
              precision    recall  f1-score   support

           0       0.87      0.64      0.74      1250
           1       0.72      0.91      0.80      1250

    accuracy                           0.77      2500
   macro avg       0.80      0.77      0.77      2500
weighted avg       0.80      0.77      0.77      2500

../ensembler/scores/MV_ens_all_models_exp_2.txt saved
../ensembler/matrix/MV_ens_all_models_exp_2_matrix.csv save