In [1]:
import json
import os
import numpy as np
import pandas as pd
# This is the base directory where your fold reports are located.
dataset_name = 'mtsamples'
base_dir = f'analysis/{dataset_name}/reports'  # Replace with your actual path

# Initialize dictionaries to hold all the metric values from each fold.
binary_metrics = {
    'O': {'precision': [], 'recall': [], 'f1-score': []},
    'I': {'precision': [], 'recall': [], 'f1-score': []},
    'macro avg': {'precision': [], 'recall': [], 'f1-score': []},
    'weighted avg': {'precision': [], 'recall': [], 'f1-score': []}
}

multiclass_metrics = {
    "macro avg": {'precision': [], 'recall': [], 'f1-score': []},
    "weighted avg": {'precision': [], 'recall': [], 'f1-score': []},
    "macro_wo_O": {'precision': [], 'recall': [], 'f1-score': []}

}

# Process each fold
for fold in range(5):
    fold_dir = os.path.join(base_dir, f'fold{fold}')

    # Load binary classification report
    with open(os.path.join(fold_dir, 'binary_classification_report.json'), 'r') as f:
        binary_report = json.load(f)
        for category in binary_metrics.keys():
            for metric in binary_metrics[category].keys():
                binary_metrics[category][metric].append(binary_report[category][metric])

    # Load multiclass classification report
    multi_df = pd.read_json(os.path.join(fold_dir, 'multiclass_classification_report.json'))
  
    # with open(os.path.join(fold_dir, 'multiclass_classification_report.json'), 'r') as f:
    multiclass_report = multi_df.T.to_dict()
    print(multiclass_report)

    for category in multiclass_metrics.keys():

        for metric in multiclass_metrics[category].keys():
            multiclass_metrics[category][metric].append(multiclass_report[category][metric])

# Calculate mean and standard deviation for binary metrics
binary_results = {}
for category, metrics in binary_metrics.items():
 
    binary_results[category] = {}
    for metric, values in metrics.items():
        binary_results[category][metric] = {
            'mean': round(np.mean(values), 3),
            'std': round(np.std(values), 3)
        }

# Do the same for multiclass metrics
multiclass_results = {}
for category, metrics in multiclass_metrics.items():
    multiclass_results[category] = {}
    for metric, values in metrics.items():
        multiclass_results[category][metric] = {
            # round to 3 decimal places
            'mean': round(np.mean(values), 3),
            'std': round(np.std(values), 3)
        }

# Now you have binary_results and multiclass_results with the mean and std dev of each metric.
print('Binary results:')
# dict to dataframe
binary_results = pd.DataFrame(binary_results)    
binary_results.T




{'I-Background': {'precision': 0.833, 'recall': 0.895, 'f1-score': 0.863, 'support': 95.0}, 'I-Other': {'precision': 0.969, 'recall': 0.5740000000000001, 'f1-score': 0.721, 'support': 54.0}, 'I-Problem': {'precision': 0.911, 'recall': 0.862, 'f1-score': 0.886, 'support': 629.0}, 'I-Test': {'precision': 0.792, 'recall': 0.623, 'f1-score': 0.6970000000000001, 'support': 61.0}, 'I-Treatment': {'precision': 0.755, 'recall': 0.552, 'f1-score': 0.638, 'support': 67.0}, 'O': {'precision': 0.91, 'recall': 0.961, 'f1-score': 0.935, 'support': 1450.0}, 'accuracy': {'precision': 0.902, 'recall': 0.902, 'f1-score': 0.902, 'support': 0.902}, 'macro avg': {'precision': 0.862, 'recall': 0.744, 'f1-score': 0.79, 'support': 2356.0}, 'weighted avg': {'precision': 0.901, 'recall': 0.902, 'f1-score': 0.899, 'support': 2356.0}, 'macro_wo_O': {'precision': 0.852, 'recall': 0.7010000000000001, 'f1-score': 0.761, 'support': 906.0}}
{'I-Background': {'precision': 0.857, 'recall': 0.8210000000000001, 'f1-score'

Unnamed: 0,precision,recall,f1-score
O,"{'mean': 0.922, 'std': 0.006}","{'mean': 0.962, 'std': 0.003}","{'mean': 0.941, 'std': 0.003}"
I,"{'mean': 0.934, 'std': 0.005}","{'mean': 0.869, 'std': 0.011}","{'mean': 0.9, 'std': 0.007}"
macro avg,"{'mean': 0.928, 'std': 0.004}","{'mean': 0.915, 'std': 0.006}","{'mean': 0.921, 'std': 0.005}"
weighted avg,"{'mean': 0.926, 'std': 0.004}","{'mean': 0.926, 'std': 0.004}","{'mean': 0.925, 'std': 0.005}"


In [2]:
print('Multiclass results:')
# dict to dataframe
multiclass_results = pd.DataFrame(multiclass_results)
multiclass_results.T

Multiclass results:


Unnamed: 0,precision,recall,f1-score
macro avg,"{'mean': 0.845, 'std': 0.012}","{'mean': 0.749, 'std': 0.011}","{'mean': 0.788, 'std': 0.005}"
weighted avg,"{'mean': 0.905, 'std': 0.002}","{'mean': 0.906, 'std': 0.002}","{'mean': 0.904, 'std': 0.003}"
macro_wo_O,"{'mean': 0.83, 'std': 0.015}","{'mean': 0.706, 'std': 0.014}","{'mean': 0.757, 'std': 0.006}"
