In [1]:
import numpy as np

def _safe_divide(numerator, denominator):
    """Divide element-wise, yielding 0.0 wherever the denominator is not positive."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    # `where` skips the masked positions entirely, so no divide-by-zero
    # warning is raised and those slots keep the 0.0 from `out`.
    return np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator > 0,
    )


def calculate_metrics(cm, model_name):
    """Compute classification metrics from a confusion matrix.

    Rows of ``cm`` are treated as the actual class and columns as the
    predicted class: row sums are used as the per-class sample counts
    (support) and column sums as the per-class prediction counts.

    Args:
        cm: Square confusion matrix (``numpy`` array or nested sequence).
        model_name: Key under which the metrics are returned.

    Returns:
        ``{model_name: metrics}`` where ``metrics`` maps 'Accuracy',
        'Macro Precision', 'Macro Recall', 'Macro F1-score',
        'Weighted Precision', 'Weighted Recall' and 'Weighted F1-score'
        to their values. Weighted averages are weighted by support.
        Any ratio whose denominator is zero (a class with no predictions
        or no samples, or a matrix with no samples at all) is reported
        as 0 rather than NaN.

    Raises:
        ValueError: If ``cm`` is not a square 2-D matrix.
    """
    cm = np.asarray(cm)
    if cm.ndim != 2 or cm.shape[0] != cm.shape[1]:
        raise ValueError(
            f"confusion matrix must be square and 2-D, got shape {cm.shape}"
        )

    n_classes = cm.shape[0]
    total_samples = np.sum(cm)
    has_samples = total_samples > 0

    true_positives = np.diag(cm)
    samples_per_class = np.sum(cm, axis=1)    # TP + FN for each class
    predicted_per_class = np.sum(cm, axis=0)  # TP + FP for each class

    # Per-class metrics
    precision = _safe_divide(true_positives, predicted_per_class)
    recall = _safe_divide(true_positives, samples_per_class)
    f1_score = _safe_divide(2 * (precision * recall), precision + recall)

    def macro(per_class):
        # np.mean of an empty array is NaN; report 0 for a 0x0 matrix.
        return np.mean(per_class) if n_classes else 0.0

    def weighted(per_class):
        # Support-weighted mean; 0 when there are no samples to weight by.
        if not has_samples:
            return 0.0
        return np.sum(per_class * samples_per_class) / total_samples

    metrics = {
        'Accuracy': np.trace(cm) / total_samples if has_samples else 0.0,
        'Macro Precision': macro(precision),
        'Macro Recall': macro(recall),
        'Macro F1-score': macro(f1_score),
        'Weighted Precision': weighted(precision),
        'Weighted Recall': weighted(recall),
        'Weighted F1-score': weighted(f1_score),
    }

    return {model_name: metrics}

# --- Data from your request ---
# Confusion matrices
# One 3x3 confusion matrix per model. calculate_metrics sums each row to get
# the per-class sample count, so rows are the actual class and columns the
# predicted class.
cm_gpt_4_1_mini = np.array([
    [708, 788, 221],
    [110, 1262, 170],
    [24, 152, 27],
])
cm_gpt_4_1 = np.array([
    [602, 497, 618],
    [48, 1073, 421],
    [5, 126, 72],
])
cm_deepseek_chat = np.array([
    [564, 366, 787],
    [77, 764, 701],
    [12, 83, 108],
])
cm_deepseek_reasoner = np.array([
    [683, 514, 520],
    [131, 1055, 356],
    [24, 128, 51],
])

# --- Calculations ---
# Calculate metrics for all models
# Merge the single-entry {model_name: metrics} dict returned for each model
# into one mapping, keeping the models in the order listed here.
results = {
    name: scores
    for matrix, label in (
        (cm_gpt_4_1_mini, 'GPT 4.1-mini'),
        (cm_gpt_4_1, 'GPT 4.1'),
        (cm_deepseek_chat, 'Deepseek Chat'),
        (cm_deepseek_reasoner, 'Deepseek Reasoner'),
    )
    for name, scores in calculate_metrics(matrix, label).items()
}


# --- Output Generation ---
# Generate LaTeX table
# Metric keys in the same left-to-right order as the table header columns.
metric_keys = [
    'Accuracy',
    'Macro Precision',
    'Macro Recall',
    'Macro F1-score',
    'Weighted Precision',
    'Weighted Recall',
    'Weighted F1-score',
]

# Collect the table line by line; the lines are joined with newlines at the
# end, so the final line carries no trailing newline.
table_lines = [
    "\\begin{table}[h!]",
    "\\centering",
    "\\begin{tabular}{l|ccccccc}",
    "\\hline",
    "Model & Accuracy & Macro P & Macro R & Macro F1 & Weighted P & Weighted R & Weighted F1 \\\\ ",
    "\\hline",
]

# One row per model: the name followed by each metric to three decimals.
for model, metrics in results.items():
    cells = [f"{model}"] + [f"{metrics[key]:.3f}" for key in metric_keys]
    table_lines.append(" & ".join(cells) + " \\\\ ")

table_lines += [
    "\\hline",
    "\\end{tabular}",
    "\\caption{Model Performance Metrics (P: Precision, R: Recall)}",
    "\\label{tab:model_metrics}",
    "\\end{table}",
]

latex_table = "\n".join(table_lines)

print(latex_table)

\begin{table}[h!]
\centering
\begin{tabular}{l|ccccccc}
\hline
Model & Accuracy & Macro P & Macro R & Macro F1 & Weighted P & Weighted R & Weighted F1 \\ 
\hline
GPT 4.1-mini & 0.577 & 0.493 & 0.455 & 0.438 & 0.676 & 0.577 & 0.580 \\ 
GPT 4.1 & 0.505 & 0.539 & 0.467 & 0.427 & 0.741 & 0.505 & 0.553 \\ 
Deepseek Chat & 0.415 & 0.520 & 0.452 & 0.384 & 0.713 & 0.415 & 0.490 \\ 
Deepseek Reasoner & 0.517 & 0.497 & 0.444 & 0.425 & 0.684 & 0.517 & 0.561 \\ 
\hline
\end{tabular}
\caption{Model Performance Metrics (P: Precision, R: Recall)}
\label{tab:model_metrics}
\end{table}
