In [3]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

## Load Predictions and History

In [None]:
import os

# Artifacts this notebook reads; both are checked before anything is loaded.
PREDICTIONS_PATH = 'preprocessed/predictions.pkl'
HISTORY_PATH = 'preprocessed/training_history.pkl'

# Report exactly which artifact(s) are absent instead of a generic message.
missing_files = [path for path in (PREDICTIONS_PATH, HISTORY_PATH) if not os.path.exists(path)]
if missing_files:
    print("❌ ERROR: Required files not found!")
    for path in missing_files:
        print(f"   - {path}")
    print("\nYou need to run the following notebooks first:")
    print("1. Run '1_preprocessing.ipynb' to generate 'preprocessed/preprocessed_eeg_data.pkl'")
    print(f"2. Run '2_model.ipynb' to generate '{PREDICTIONS_PATH}' and '{HISTORY_PATH}'")
    print("\nThese files are created during model training.")
    raise FileNotFoundError(
        f"Missing required files: {', '.join(missing_files)}. Please run notebooks 1 and 2 first."
    )

# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by your own runs.
with open(PREDICTIONS_PATH, 'rb') as f:
    predictions = pickle.load(f)

with open(HISTORY_PATH, 'rb') as f:
    history = pickle.load(f)

# Fail early, with a readable message, if the predictions file lacks an expected key.
missing_keys = [key for key in ('y_true', 'y_pred') if key not in predictions]
if missing_keys:
    raise KeyError(f"'{PREDICTIONS_PATH}' is missing expected keys: {missing_keys}")

# Coerce to arrays so later cells can rely on element-wise comparison and boolean masking
# even if the values were pickled as plain Python lists.
y_true = np.asarray(predictions['y_true'])
y_pred = np.asarray(predictions['y_pred'])

if len(y_true) != len(y_pred):
    raise ValueError(
        f"y_true and y_pred differ in length ({len(y_true)} vs {len(y_pred)}); "
        "the predictions file is inconsistent."
    )

print(f"Total samples evaluated: {len(y_true)}")

❌ ERROR: Required files not found!

You need to run the following notebooks first:
1. Run '1_preprocessing.ipynb' to generate 'preprocessed_eeg_data.pkl'
2. Run '2_model.ipynb' to generate 'predictions.pkl' and 'training_history.pkl'

These files are created during model training.


FileNotFoundError: Missing required files. Please run notebooks 1 and 2 first.

## Overall Metrics

In [None]:
# Headline scores over the whole evaluation set; these names are reused by later cells.
accuracy = accuracy_score(y_true, y_pred)
f1_macro, f1_weighted = (
    f1_score(y_true, y_pred, average=averaging) for averaging in ('macro', 'weighted')
)

# Build the banner once and emit it in a single print call.
rule = "=" * 50
summary_lines = [
    rule,
    "OVERALL PERFORMANCE METRICS",
    rule,
    f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)",
    f"F1-Score (Macro): {f1_macro:.4f}",
    f"F1-Score (Weighted): {f1_weighted:.4f}",
    rule,
]
print("\n".join(summary_lines))

## Per-Class Metrics

In [None]:
# Per-class results come back in the order of the labels passed in. Passing the sorted set of
# labels explicitly (the same set sklearn would pick by default) lets each row be printed with
# its real class value rather than its array position, which matters whenever a subject is
# absent from this split.
class_labels = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))
precision, recall, f1, support = precision_recall_fscore_support(
    y_true, y_pred, labels=class_labels, average=None, zero_division=0
)

n_shown = min(10, len(class_labels))

print("\nPer-Subject Performance (first 10 subjects):")
print("-" * 70)
print(f"{'Subject':<10} {'Precision':<12} {'Recall':<12} {'F1-Score':<12} {'Support':<10}")
print("-" * 70)

for i in range(n_shown):
    # Displayed subject number is class label + 1, matching the error-analysis cell.
    subject_number = int(class_labels[i]) + 1
    print(f"S{subject_number:03d}       {precision[i]:.4f}       {recall[i]:.4f}       {f1[i]:.4f}       {support[i]}")

# Only claim truncation when rows were actually left out.
if len(class_labels) > n_shown:
    print("\n... (showing first 10 subjects only)")

## Training History Visualization

In [None]:
# Two side-by-side panels: loss curves on the left axis, accuracy curves on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Epochs are numbered from 1 for display.
epochs = range(1, len(history['train_loss']) + 1)

# One row per panel: axis, (history key, legend label) for each curve, y-label, title.
panel_specs = (
    (ax1, (('train_loss', 'Train Loss'), ('val_loss', 'Val Loss')),
     'Loss', 'Training and Validation Loss'),
    (ax2, (('train_acc', 'Train Accuracy'), ('val_acc', 'Val Accuracy')),
     'Accuracy (%)', 'Training and Validation Accuracy'),
)

for panel_ax, curves, y_label, panel_title in panel_specs:
    for history_key, curve_label in curves:
        panel_ax.plot(epochs, history[history_key], label=curve_label, linewidth=2)
    panel_ax.set_xlabel('Epoch', fontsize=12)
    panel_ax.set_ylabel(y_label, fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('preprocessed/training_history.png', dpi=300, bbox_inches='tight')
plt.show()

print("Training history saved as 'preprocessed/training_history.png'")

## Error Analysis

In [None]:
# Work on array views so the comparison is element-wise and boolean masking works even if
# the labels were stored as plain Python lists.
true_labels = np.asarray(y_true)
pred_labels = np.asarray(y_pred)
if true_labels.shape != pred_labels.shape:
    raise ValueError(
        f"y_true and y_pred have different shapes: {true_labels.shape} vs {pred_labels.shape}"
    )

errors = true_labels != pred_labels
num_errors = int(np.sum(errors))
# Guard the division so an empty evaluation set reports 0 instead of NaN.
error_rate = num_errors / len(true_labels) if len(true_labels) else 0.0

print("\n" + "=" * 50)
print("ERROR ANALYSIS")
print("=" * 50)
print(f"Total errors: {num_errors}")
print(f"Error rate: {error_rate:.4f} ({error_rate*100:.2f}%)")
print()

# Count misclassified samples per true subject.
error_subjects = true_labels[errors]
unique_subjects, error_counts = np.unique(error_subjects, return_counts=True)
# Descending by count; the stable sort keeps ties in ascending subject order so the
# listing is the same on every run.
sorted_idx = np.argsort(-error_counts, kind='stable')

print("Top 10 subjects with most misclassifications:")
print("-" * 40)
for i in range(min(10, len(unique_subjects))):
    subject_id = unique_subjects[sorted_idx[i]]
    count = error_counts[sorted_idx[i]]
    # Displayed subject number is class label + 1.
    print(f"Subject S{subject_id+1:03d}: {count} errors")

## Best and Worst Performing Subjects

In [None]:
# `f1` / `support` are ordered by the sorted labels present in y_true and y_pred, which is not
# necessarily 0..N-1. Recover that label list so each score is reported under its real
# subject id instead of its array position.
class_labels = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))
if len(class_labels) != len(f1):
    raise ValueError(
        f"Expected one F1 score per class label ({len(class_labels)}), got {len(f1)}; "
        "re-run the per-class metrics cell."
    )

# Keep only subjects that actually have samples, ranked from best to worst F1.
subject_f1_with_support = [
    (int(label), f1_val, sup)
    for label, f1_val, sup in zip(class_labels, f1, support)
    if sup > 0
]
subject_f1_with_support.sort(key=lambda x: x[1], reverse=True)

print("\n" + "=" * 50)
print("BEST PERFORMING SUBJECTS (Top 10)")
print("=" * 50)
for subject_id, f1_val, sup in subject_f1_with_support[:10]:
    print(f"S{subject_id+1:03d} - F1: {f1_val:.4f} (samples: {sup})")

print("\n" + "=" * 50)
print("WORST PERFORMING SUBJECTS (Bottom 10)")
print("=" * 50)
# Tail of the descending ranking, so the weakest subject is printed last.
worst_start = max(0, len(subject_f1_with_support) - 10)
for subject_id, f1_val, sup in subject_f1_with_support[worst_start:]:
    print(f"S{subject_id+1:03d} - F1: {f1_val:.4f} (samples: {sup})")

## Model Strengths and Weaknesses

In [None]:
# Narrative summary built from `accuracy`, `f1`, `support` and `y_true` computed in earlier cells.
print("\n" + "=" * 70)
print("MODEL ANALYSIS: STRENGTHS AND WEAKNESSES")
print("=" * 70)

print("\n✓ STRENGTHS:")
print("-" * 70)
print(f"• Achieved {accuracy*100:.2f}% accuracy across 109 subjects")
print("• CNN effectively extracts spatial-frequency patterns from EEG")
print("• LSTM captures temporal dependencies in brain signals")
print(f"• Strong generalization with {len(y_true)} validation samples")
high_f1_count = np.sum(f1 > 0.8)
print(f"• {high_f1_count} subjects with F1-score > 0.8 (strong performance)")

print("\n✗ WEAKNESSES:")
print("-" * 70)
# Restrict the range to subjects that have samples (same filter as the low-F1 count below).
# Filtering on `f1 > 0` would hide evaluated subjects that scored exactly 0 and would raise
# on an empty selection.
evaluated_f1 = f1[support > 0]
if evaluated_f1.size:
    print(f"• Variable performance across subjects (F1 range: {evaluated_f1.min():.3f} - {evaluated_f1.max():.3f})")
low_f1_count = np.sum((f1 < 0.5) & (support > 0))
if low_f1_count > 0:
    print(f"• {low_f1_count} subjects with F1-score < 0.5 (poor performance)")
print("• Some subjects easily confused (check confusion matrix)")
print("• Model complexity may lead to overfitting with limited data per subject")
print("• Cross-session generalization not explicitly tested")

print("\n⚡ RECOMMENDATIONS:")
print("-" * 70)
print("• Collect more data for poorly performing subjects")
print("• Apply data augmentation (time shifts, noise injection)")
print("• Test cross-session validation (different recording days)")
print("• Experiment with attention mechanisms for better feature selection")
print("• Consider subject-specific fine-tuning for low-performing cases")
print("=" * 70)

## Summary Report

In [None]:
# Plain-text summary assembled from values computed in earlier cells
# (`y_true`, `accuracy`, `f1_macro`, `f1_weighted`, `num_errors`).
report = f"""
╔══════════════════════════════════════════════════════════════════╗
║         EEG PERSON IDENTIFICATION - PERFORMANCE SUMMARY          ║
╚══════════════════════════════════════════════════════════════════╝

DATASET:
  • PhysioNet EEG Motor Movement/Imagery Database
  • 109 subjects
  • Validation samples: {len(y_true)}

MODEL ARCHITECTURE:
  • CNN (3 conv layers) + LSTM (2 layers)
  • Time-frequency feature extraction
  • Multi-class classification (109 classes)

PERFORMANCE:
  • Accuracy:           {accuracy*100:.2f}%
  • F1-Score (Macro):   {f1_macro:.4f}
  • F1-Score (Weighted): {f1_weighted:.4f}
  • Total errors:       {num_errors}/{len(y_true)}

CONCLUSION:
  The CNN+RNN hybrid model successfully learns subject-specific EEG
  patterns for person identification. Performance varies by subject,
  suggesting individual differences in brain signal consistency.

═══════════════════════════════════════════════════════════════════
"""

print(report)

# The report contains box-drawing characters and bullets, so write it as UTF-8 explicitly;
# the platform default encoding (e.g. cp1252 on Windows) cannot encode them.
with open('preprocessed/performance_report.txt', 'w', encoding='utf-8') as f:
    f.write(report)

print("Performance report saved as 'preprocessed/performance_report.txt'")