# Performance evaluation 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [None]:
def compare_model_performance(metrics_json):
    """Plot and print a side-by-side performance comparison of 2 to 4 models.

    The top row of the figure holds one confusion-matrix heatmap per model;
    the bottom row holds one bar chart per overall score (accuracy, F1,
    Cohen's kappa, MCC). The same scores are also printed per model.

    Args:
        metrics_json: Mapping of model name to a dict with the keys
            ``'accuracy'``, ``'f1score'``, ``'confusion_matrix'``,
            ``'true_labels'`` and ``'predicted_labels'``. Accuracy, F1 and
            the confusion matrix are used as given; Cohen's kappa and MCC
            are computed here from the two label sequences.

    Raises:
        ValueError: If fewer than 2 or more than 4 models are supplied.
    """
    n_models = len(metrics_json)
    if n_models < 2 or n_models > 4:
        raise ValueError("Function supports comparison between 2 to 4 models.")

    model_names = list(metrics_json)

    # Overall scores collected per model for the comparison bar charts.
    accuracies = []
    f1scores = []
    kappas = []
    mccs = []

    # (title, y-label, values) for each bar chart on the bottom row.
    comparisons = [
        ('Accuracy Comparison', 'Accuracy', accuracies),
        ('F1 Score Comparison', 'F1 Score (Weighted)', f1scores),
        ("Cohen's Kappa Comparison", "Cohen's Kappa", kappas),
        ('MCC Comparison', 'MCC', mccs),
    ]

    # The bottom row always needs one column per score, so the grid must be
    # at least that wide even when fewer models are compared; sizing it by
    # n_models alone made axes[1, 2] / axes[1, 3] out of range for 2-3 models.
    n_cols = max(n_models, len(comparisons))
    fig, axes = plt.subplots(nrows=2, ncols=n_cols, figsize=(15, 10))

    for i, (model_name, metrics) in enumerate(metrics_json.items()):
        # Extract metrics
        accuracy = metrics['accuracy']
        f1score = metrics['f1score']
        cm = np.array(metrics['confusion_matrix'])
        true_labels = metrics['true_labels']
        predicted_labels = metrics['predicted_labels']
        kappa = cohen_kappa_score(true_labels, predicted_labels)
        mcc = matthews_corrcoef(true_labels, predicted_labels)

        accuracies.append(accuracy)
        f1scores.append(f1score)
        kappas.append(kappa)
        mccs.append(mcc)

        # Plot Confusion Matrix
        cm_ax = axes[0, i]
        sns.heatmap(cm, annot=True, fmt='d', ax=cm_ax, cmap='Blues')
        cm_ax.set_title(f'Confusion Matrix: {model_name}')
        cm_ax.set_xlabel('Predicted Labels')
        cm_ax.set_ylabel('True Labels')

        # Display metrics
        print(f"Model: {model_name}")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"F1 Score (Weighted): {f1score:.2f}")
        print(f"Cohen's Kappa: {kappa:.2f}")
        print(f"MCC: {mcc:.2f}")
        print("----------")

    # Hide top-row axes that have no model to show.
    for unused_col in range(n_models, n_cols):
        axes[0, unused_col].set_visible(False)

    # One comparison bar chart per overall score.
    for col, (title, ylabel, values) in enumerate(comparisons):
        bar_ax = axes[1, col]
        bar_ax.bar(model_names, values)
        bar_ax.set_title(title)
        bar_ax.set_ylabel(ylabel)

    # Adjust layout
    plt.tight_layout()
    plt.show()

# Sample usage:
# Each model's data is expected in the following format.
# metrics_json = {
#     'model1': {
#         'accuracy': 0.90,
#         'f1score': 0.91,
#         'confusion_matrix': [[13, 1, 1], [2, 15, 0], [0, 1, 17]],
#         'true_labels': [0, 0, 0, 1, 1, 1, 2, 2, 2],
#         'predicted_labels': [0, 0, 1, 1, 1, 1, 2, 2, 2]
#     },
#     'model2': {...},
#     'model3': {...},
#     'model4': {...}
# }
# compare_model_performance(metrics_json)
