In [25]:
import json
import os
import numpy as np
import pandas as pd
# This is the base directory where your fold reports are located.
base_dir = 'analysis/phee/reports'  # Replace with your actual path

# Initialize dictionaries to hold all the metric values from each fold.
binary_metrics = {
    'O': {'precision': [], 'recall': [], 'f1-score': []},
    'I': {'precision': [], 'recall': [], 'f1-score': []},
    'macro avg': {'precision': [], 'recall': [], 'f1-score': []},
    'weighted avg': {'precision': [], 'recall': [], 'f1-score': []}
}

multiclass_metrics = {
    "macro avg": {'precision': [], 'recall': [], 'f1-score': []},
    "weighted avg": {'precision': [], 'recall': [], 'f1-score': []},
    "macro_wo_O": {'precision': [], 'recall': [], 'f1-score': []}

}

# Process each fold
for fold in range(5):
    fold_dir = os.path.join(base_dir, f'fold{fold}')

    # Load binary classification report
    with open(os.path.join(fold_dir, 'binary_classification_report.json'), 'r') as f:
        binary_report = json.load(f)
        for category in binary_metrics.keys():
            for metric in binary_metrics[category].keys():
                binary_metrics[category][metric].append(binary_report[category][metric])

    # Load multiclass classification report
    multi_df = pd.read_json(os.path.join(fold_dir, 'multiclass_classification_report.json'))
  
    # with open(os.path.join(fold_dir, 'multiclass_classification_report.json'), 'r') as f:
    multiclass_report = multi_df.T.to_dict()
    print(multiclass_report)

    for category in multiclass_metrics.keys():

        for metric in multiclass_metrics[category].keys():
            multiclass_metrics[category][metric].append(multiclass_report[category][metric])

# Calculate mean and standard deviation for binary metrics
binary_results = {}
for category, metrics in binary_metrics.items():
 
    binary_results[category] = {}
    for metric, values in metrics.items():
        binary_results[category][metric] = {
            'mean': round(np.mean(values), 3),
            'std': round(np.std(values), 3)
        }

# Do the same for multiclass metrics
multiclass_results = {}
for category, metrics in multiclass_metrics.items():
    multiclass_results[category] = {}
    for metric, values in metrics.items():
        multiclass_results[category][metric] = {
            # round to 3 decimal places
            'mean': round(np.mean(values), 3),
            'std': round(np.std(values), 3)
        }

# Now you have binary_results and multiclass_results with the mean and std dev of each metric.
print('Binary results:')
# dict to dataframe
binary_results = pd.DataFrame(binary_results)    
binary_results.T



{'I-Background': {'precision': 0.91, 'recall': 0.746, 'f1-score': 0.8200000000000001, 'support': 776.0}, 'I-Other': {'precision': 0.7030000000000001, 'recall': 0.759, 'f1-score': 0.73, 'support': 274.0}, 'I-Problem': {'precision': 0.872, 'recall': 0.887, 'f1-score': 0.879, 'support': 3889.0}, 'I-Test': {'precision': 0.5700000000000001, 'recall': 0.506, 'f1-score': 0.536, 'support': 160.0}, 'I-Treatment': {'precision': 0.833, 'recall': 0.89, 'f1-score': 0.86, 'support': 2680.0}, 'O': {'precision': 0.9380000000000001, 'recall': 0.929, 'f1-score': 0.933, 'support': 12477.0}, 'accuracy': {'precision': 0.903, 'recall': 0.903, 'f1-score': 0.903, 'support': 0.903}, 'macro avg': {'precision': 0.804, 'recall': 0.786, 'f1-score': 0.793, 'support': 20256.0}, 'weighted avg': {'precision': 0.904, 'recall': 0.903, 'f1-score': 0.903, 'support': 20256.0}, 'macro_wo_O': {'precision': 0.778, 'recall': 0.758, 'f1-score': 0.765, 'support': 7779.0}}
{'I-Background': {'precision': 0.854, 'recall': 0.724, 'f

Unnamed: 0,precision,recall,f1-score
O,"{'mean': 0.938, 'std': 0.002}","{'mean': 0.929, 'std': 0.006}","{'mean': 0.933, 'std': 0.004}"
I,"{'mean': 0.885, 'std': 0.009}","{'mean': 0.899, 'std': 0.004}","{'mean': 0.892, 'std': 0.006}"
macro avg,"{'mean': 0.911, 'std': 0.005}","{'mean': 0.914, 'std': 0.004}","{'mean': 0.912, 'std': 0.005}"
weighted avg,"{'mean': 0.918, 'std': 0.004}","{'mean': 0.917, 'std': 0.004}","{'mean': 0.917, 'std': 0.004}"


In [24]:
print('Multiclass results:')
# dict to dataframe
multiclass_results = pd.DataFrame(multiclass_results)
multiclass_results.T

Multiclass results:


Unnamed: 0,precision,recall,f1-score
macro avg,"{'mean': 0.806, 'std': 0.015}","{'mean': 0.803, 'std': 0.011}","{'mean': 0.803, 'std': 0.012}"
weighted avg,"{'mean': 0.904, 'std': 0.004}","{'mean': 0.903, 'std': 0.004}","{'mean': 0.903, 'std': 0.004}"
macro_wo_O,"{'mean': 0.78, 'std': 0.017}","{'mean': 0.778, 'std': 0.013}","{'mean': 0.777, 'std': 0.014}"
