In [None]:
import pandas as pd
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training results JSON for the run stored under
# full_fine-tuning/alephbert-base/cls on the user's Desktop.
results_path = Path("~/Desktop/training_results/full_fine-tuning/alephbert-base/cls/training_results.json").expanduser()

# Pass the encoding explicitly: without it open() falls back to the platform
# locale, which can fail on non-ASCII content on some systems.
with open(results_path, 'r', encoding='utf-8') as f:
    results = json.load(f)

# Extract test metrics and print the headline scores to four decimals.
test_metrics = results['test_metrics']
print("Test Metrics:")
print(f"  F1 Score: {test_metrics['f1']:.4f}")
print(f"  Accuracy: {test_metrics['accuracy']:.4f}")
print(f"  Precision: {test_metrics['precision']:.4f}")
print(f"  Recall: {test_metrics['recall']:.4f}")

# Confusion matrix as a 2x2 grid: rows are the true class, columns the
# predicted class (matching the axis labels set below).
class_names = ['Literal', 'Figurative']
cm = [
    [test_metrics[key] for key in row_keys]
    for row_keys in (
        ('confusion_matrix_tn', 'confusion_matrix_fp'),
        ('confusion_matrix_fn', 'confusion_matrix_tp'),
    )
]

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,   # write the count inside every cell
    fmt='d',      # counts are formatted as integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - AlephBERT')
plt.show()

# Training history: plot the logged metrics against the 'epoch' column
# on a 2x2 grid of panels.
history_df = pd.DataFrame(results['training_history'])

fig, axes = plt.subplots(2, 2, figsize=(15, 10))


def _plot_history(ax, column, **plot_kwargs):
    """Plot history_df[column] against 'epoch' on ax, skipping missing rows."""
    # If train and eval values were logged on separate rows (TODO confirm the
    # history schema), the frame contains NaN gaps, and a marker-less line
    # through isolated points would not be drawn. Dropping the gaps per series
    # is a no-op when every row is complete.
    series = history_df[['epoch', column]].dropna()
    ax.plot(series['epoch'], series[column], **plot_kwargs)


# Loss
_plot_history(axes[0, 0], 'loss', label='Train Loss')
_plot_history(axes[0, 0], 'eval_loss', label='Val Loss')
axes[0, 0].set_title('Loss over Epochs')
axes[0, 0].legend()

# F1 Score (the curve is labelled so the legend names both lines)
_plot_history(axes[0, 1], 'eval_f1', label='Validation F1')
axes[0, 1].set_title('Validation F1 Score')
axes[0, 1].axhline(y=test_metrics['f1'], color='r', linestyle='--', label='Test F1')
axes[0, 1].legend()

# Accuracy
_plot_history(axes[1, 0], 'eval_accuracy')
axes[1, 0].set_title('Validation Accuracy')

# Learning Rate (if saved)
if 'learning_rate' in history_df.columns:
    _plot_history(axes[1, 1], 'learning_rate')
    axes[1, 1].set_title('Learning Rate Schedule')
else:
    # Nothing to draw: hide the panel instead of leaving an empty frame.
    axes[1, 1].set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
import glob

# Load every model's results: maps the model directory name to the parsed
# contents of <results_dir>/<model>/cls/training_results.json.
all_results = {}
results_dir = Path("~/Desktop/training_results/full_fine-tuning/").expanduser()

# glob() yields entries in filesystem order; sorting makes the load order
# reproducible between runs and machines.
for model_dir in sorted(results_dir.glob("*/cls/")):
    model_name = model_dir.parent.name
    results_file = model_dir / "training_results.json"

    # is_file() skips models without a results file and also paths that
    # exist but cannot be opened as a regular file.
    if results_file.is_file():
        # Explicit encoding so the read does not depend on the platform locale.
        with open(results_file, 'r', encoding='utf-8') as f:
            all_results[model_name] = json.load(f)

# Create comparison DataFrame: one row per loaded model, best F1 first.
comparison_columns = ['Model', 'F1', 'Accuracy', 'Precision', 'Recall', 'Training Time (s)']
comparison = []
# Loop names are deliberately distinct from `results` / `test_metrics` so this
# cell does not overwrite notebook globals of the same name.
for model_name, model_results in all_results.items():
    model_test_metrics = model_results['test_metrics']
    comparison.append({
        'Model': model_name,
        'F1': model_test_metrics['f1'],
        'Accuracy': model_test_metrics['accuracy'],
        'Precision': model_test_metrics['precision'],
        'Recall': model_test_metrics['recall'],
        # NaN (not a string) for a missing runtime keeps the column numeric.
        'Training Time (s)': model_results.get('training_metrics', {}).get('runtime', float('nan')),
    })

# Passing the columns explicitly means an empty `comparison` still yields a
# frame with an 'F1' column, so sort_values() does not raise KeyError.
df_comparison = pd.DataFrame(comparison, columns=comparison_columns).sort_values('F1', ascending=False)
print(df_comparison)

# Plot comparison: one group of four bars per model, in df_comparison order.
plt.figure(figsize=(12, 6))
x = range(len(df_comparison))
width = 0.2

# Each entry is (column to plot, offset from the group centre in bar widths).
bar_layout = (
    ('F1', -1.5),
    ('Accuracy', -0.5),
    ('Precision', 0.5),
    ('Recall', 1.5),
)
for metric, shift in bar_layout:
    positions = [i + width * shift for i in x]
    plt.bar(positions, df_comparison[metric], width, label=metric, alpha=0.8)

plt.title('Model Comparison - Task 1 (Sentence Classification)')
plt.ylabel('Score')
plt.xticks(x, df_comparison['Model'], rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.show()