# Custom OCR Model Evaluation Notebook

This notebook provides comprehensive evaluation and analysis of our trained custom OCR model.
We'll test the model's performance, analyze errors, and compare it with other OCR solutions.

## 🎯 Evaluation Objectives:
- Load and test trained custom OCR model
- Perform comprehensive error analysis
- Compare with pretrained OCR solutions (EasyOCR, Pytesseract)
- Analyze performance across different text characteristics
- Generate detailed evaluation reports

## 📊 Evaluation Metrics:
- **Accuracy**: Exact match percentage
- **CER**: Character Error Rate
- **WER**: Word Error Rate
- **BLEU Score**: Sequence similarity metric
- **Confidence Analysis**: Model prediction confidence

## 1. Import Libraries and Setup

Import all necessary libraries for model evaluation.

In [None]:
# Standard libraries
import os
import sys
import time
import json
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Data handling and visualization
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter

# Deep learning
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Progress tracking
from tqdm.notebook import tqdm

# Statistical analysis
from scipy import stats
import editdistance

# Add project root to path.
# The notebook is expected to live one directory below the project root, so the
# parent of the current working directory is used. The membership check keeps
# this cell idempotent: re-running it does not append duplicate sys.path entries.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import custom modules
from scripts.custom_model import create_model
from utils.dataset import CharacterMapping, OCRDataset, ctc_collate_fn
from utils.metrics import calculate_detailed_metrics, print_metrics_report
from scripts.predict import OCRPredictor

# Plot appearance: matplotlib's default style with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Environment summary, printed one line at a time
for status_line in (
    "✅ All libraries imported successfully!",
    f"📁 Project root: {project_root}",
    f"🔧 PyTorch version: {torch.__version__}",
    f"💻 CUDA available: {torch.cuda.is_available()}",
):
    print(status_line)

## 2. Load Trained Model and Configuration

Load our trained custom OCR model and its configuration.

In [None]:
# Define paths
model_path = project_root / 'models' / 'checkpoints' / 'best_model.pth'
test_csv = project_root / 'data' / 'test' / 'dataset.csv'
test_dir = project_root / 'data' / 'test'
results_dir = project_root / 'results'

# Create results directory if it doesn't exist (parents=True so a missing
# intermediate directory does not abort the cell)
results_dir.mkdir(parents=True, exist_ok=True)

# Check if model exists.
# Raise instead of calling exit(): inside a notebook exit() asks IPython to shut
# the kernel down, whereas an exception simply stops "Run All" at this cell and
# leaves the kernel (and its state) available for inspection.
if not model_path.exists():
    print(f"❌ Model not found at {model_path}")
    print("Please run the training notebook first to create the model.")
    raise FileNotFoundError(f"Model checkpoint not found: {model_path}")

# Load model checkpoint
print("🔄 Loading trained model...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): the checkpoint stores a `char_mapping` object alongside the
# weights. If that is a pickled project class, PyTorch versions whose torch.load
# defaults to weights_only=True may refuse to load it — confirm the installed
# version and pass weights_only=False for this trusted file if needed.
checkpoint = torch.load(model_path, map_location=device)

# Extract configuration and components saved with the checkpoint
config = checkpoint['config']
char_mapping = checkpoint['char_mapping']
best_val_accuracy = checkpoint['best_val_accuracy']
training_epoch = checkpoint['epoch']

# Rebuild the model from the stored hyper-parameters and restore its weights
model = create_model(
    num_classes=char_mapping.num_classes,
    img_height=config['img_height'],
    img_width=config['img_width'],
    lstm_hidden_size=config['lstm_hidden_size'],
    lstm_num_layers=config['lstm_num_layers']
)

model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
model.eval()  # inference mode for the rest of the notebook

print(f"✅ Model loaded successfully!")
# The stored epoch is reported as epoch + 1 — presumably it is 0-based; verify against the training script
print(f"📊 Model trained for {training_epoch + 1} epochs")
print(f"🎯 Best validation accuracy: {best_val_accuracy:.4f}")
print(f"🧠 Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"🔧 Using device: {device}")

# Initialize predictor (used later for the per-image comparison with pretrained OCR engines)
predictor = OCRPredictor(str(model_path))
print(f"🔮 OCR Predictor initialized")

## 3. Load Test Dataset

Load the test data for evaluation. If no test set is found, the validation split is used instead.

In [None]:
# Check if test dataset exists; fall back to the validation split otherwise
if not test_csv.exists():
    print("📊 Test dataset not found. Using validation dataset for evaluation...")
    test_csv = project_root / 'data' / 'val' / 'dataset.csv'
    test_dir = project_root / 'data' / 'val'
    
    if not test_csv.exists():
        print("❌ No evaluation dataset found. Please ensure you have validation or test data.")
        # Raise instead of exit(): exit() would ask the notebook kernel to shut
        # down, while an exception just stops execution at this cell.
        raise FileNotFoundError(f"No evaluation dataset found (last path tried: {test_csv})")

# Load test dataset
test_df = pd.read_csv(test_csv)
print(f"📊 Loaded test dataset with {len(test_df)} samples")

# Display sample data
print(f"\n📋 Sample test data:")
print(test_df.head())

# Create test dataset and dataloader using the image size the model was trained with
test_dataset = OCRDataset(
    csv_file=str(test_csv),
    image_dir=str(test_dir),
    char_mapping=char_mapping,
    img_height=config['img_height'],
    img_width=config['img_width'],
    is_training=False  # No augmentation for evaluation
)

# shuffle=False keeps the batch order deterministic across runs
test_loader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=2,
    collate_fn=ctc_collate_fn,
    pin_memory=True
)

print(f"🔄 Test dataloader created with {len(test_loader)} batches")

# Analyze test dataset characteristics.
# Text lengths use the labels as stored; the character set is computed on
# lower-cased labels.
text_lengths = [len(text) for text in test_df['label']]
unique_chars = set(''.join(test_df['label'].str.lower()))

print(f"\n📈 Test Dataset Statistics:")
print(f"  Total samples: {len(test_df)}")
print(f"  Average text length: {np.mean(text_lengths):.2f} characters")
print(f"  Min/Max text length: {min(text_lengths)}/{max(text_lengths)} characters")
print(f"  Unique characters: {len(unique_chars)}")
# Only the first 50 characters of the sorted set are shown, followed by an ellipsis when truncated
print(f"  Character set: {''.join(sorted(unique_chars))[:50]}{'...' if len(unique_chars) > 50 else ''}")

## 4. Model Performance Evaluation

Evaluate our custom model on the test dataset.

In [None]:
# Perform evaluation on test dataset
print("🔍 Evaluating model on test dataset...")

# Per-sample accumulators, filled in dataloader order
all_predictions = []
all_targets = []
all_confidences = []
prediction_times = []
detailed_results = []

model.eval()
with torch.no_grad():
    for batch_idx, batch in enumerate(tqdm(test_loader, desc="Evaluating")):
        images = batch['images'].to(device)
        texts = batch['texts']
        
        # Measure prediction time with a monotonic, high-resolution clock
        start_time = time.perf_counter()
        
        # Get model predictions.
        # NOTE(review): model(images) and model.predict(images) are both called;
        # if predict() runs its own forward pass the measured time covers two
        # passes — confirm against the model implementation.
        outputs = model(images)
        predictions = model.predict(images)
        
        # Calculate confidence scores (using max softmax probability).
        # Averaging over dim=0 assumes `outputs` is time-major (T, batch, classes)
        # so one value per sample remains — TODO confirm against create_model.
        probs = F.softmax(outputs, dim=-1)
        max_probs = torch.max(probs, dim=-1)[0]
        confidences = torch.mean(max_probs, dim=0).cpu().numpy()
        
        # The batch time is split evenly across its images
        prediction_time = (time.perf_counter() - start_time) / len(images)
        
        # Decode predictions
        for i, pred in enumerate(predictions):
            pred_text = char_mapping.ctc_decode(pred.cpu().numpy())
            target_text = texts[i]
            # Store a plain Python float: NumPy float32 scalars are not JSON
            # serialisable and would break the metrics summary written later.
            confidence = float(confidences[i])
            
            all_predictions.append(pred_text)
            all_targets.append(target_text)
            all_confidences.append(confidence)
            prediction_times.append(prediction_time)
            
            # Store detailed results; `correct` is a case-insensitive,
            # whitespace-trimmed exact match
            detailed_results.append({
                'batch_idx': batch_idx,
                'sample_idx': i,
                'target': target_text,
                'prediction': pred_text,
                'confidence': confidence,
                'prediction_time': prediction_time,
                'correct': pred_text.lower().strip() == target_text.lower().strip()
            })

# Calculate comprehensive metrics
test_metrics = calculate_detailed_metrics(all_predictions, all_targets)

print(f"\n📊 Test Set Evaluation Results:")
print("=" * 50)
print_metrics_report(test_metrics, "Custom OCR Model")

# Additional performance statistics
avg_prediction_time = np.mean(prediction_times)
avg_confidence = np.mean(all_confidences)

print(f"\n⚡ Performance Statistics:")
print(f"  Average prediction time: {avg_prediction_time*1000:.2f} ms per image")
print(f"  Average confidence score: {avg_confidence:.4f}")
print(f"  Total evaluation time: {sum(prediction_times):.2f} seconds")
print(f"  Throughput: {len(all_predictions)/sum(prediction_times):.1f} images/second")

## 5. Detailed Error Analysis

Analyze different types of errors and failure cases.

In [None]:
# Create detailed analysis DataFrame (one row per evaluated sample)
results_df = pd.DataFrame(detailed_results)

# Calculate additional metrics for each sample
results_df['target_length'] = results_df['target'].apply(len)
results_df['prediction_length'] = results_df['prediction'].apply(len)
results_df['length_diff'] = results_df['prediction_length'] - results_df['target_length']
results_df['edit_distance'] = results_df.apply(
    lambda row: editdistance.eval(row['target'].lower(), row['prediction'].lower()), axis=1
)
# Per-sample CER. The denominator is clipped to 1 so an empty target yields a
# finite value (the number of spurious characters) instead of inf/NaN, which
# would otherwise poison the means, bins and histograms computed from this column.
results_df['cer'] = results_df['edit_distance'] / results_df['target_length'].clip(lower=1)

# Error analysis by text length (five equal-width bins over the observed lengths)
length_bins = pd.cut(results_df['target_length'], bins=5, labels=['Very Short', 'Short', 'Medium', 'Long', 'Very Long'])
results_df['length_category'] = length_bins

print("📊 Error Analysis by Text Length:")
# observed=False is explicit so empty bins are kept regardless of the pandas default
length_analysis = results_df.groupby('length_category', observed=False).agg({
    'correct': ['count', 'sum', 'mean'],
    'cer': 'mean',
    'confidence': 'mean'
}).round(4)

length_analysis.columns = ['Total', 'Correct', 'Accuracy', 'Avg_CER', 'Avg_Confidence']
print(length_analysis)

# Confidence vs. Accuracy analysis (five equal-width bins over the observed confidences)
confidence_bins = pd.cut(results_df['confidence'], bins=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
results_df['confidence_category'] = confidence_bins

print(f"\n📊 Error Analysis by Confidence:")
confidence_analysis = results_df.groupby('confidence_category', observed=False).agg({
    'correct': ['count', 'sum', 'mean'],
    'cer': 'mean'
}).round(4)

confidence_analysis.columns = ['Total', 'Correct', 'Accuracy', 'Avg_CER']
print(confidence_analysis)

# Character-level error analysis
char_errors = defaultdict(int)                                # target char -> times it was wrong
char_substitutions = defaultdict(lambda: defaultdict(int))    # target char -> predicted char -> count
char_total = defaultdict(int)                                 # target char -> occurrences

for _, row in results_df.iterrows():
    target = row['target'].lower()
    prediction = row['prediction'].lower()
    
    # Count character occurrences and errors
    for char in target:
        char_total[char] += 1
    
    # Simple character alignment for error analysis: characters are compared
    # position by position, so one insertion/deletion shifts every later
    # character and is counted as a run of substitutions.
    min_len = min(len(target), len(prediction))
    
    for i in range(min_len):
        if target[i] != prediction[i]:
            char_errors[target[i]] += 1
            char_substitutions[target[i]][prediction[i]] += 1
    
    # Handle length differences: target characters beyond the prediction's end count as errors
    if len(target) > len(prediction):
        for i in range(min_len, len(target)):
            char_errors[target[i]] += 1

# Most problematic characters
char_error_rates = {}
for char in char_total:
    if char_total[char] >= 10:  # Only consider frequently occurring characters
        error_rate = char_errors[char] / char_total[char]
        char_error_rates[char] = error_rate

# Sort by error rate, highest first
sorted_char_errors = sorted(char_error_rates.items(), key=lambda x: x[1], reverse=True)

print(f"\n❌ Most Problematic Characters (Error Rate):")
for char, error_rate in sorted_char_errors[:10]:
    total_count = char_total[char]
    error_count = char_errors[char]
    print(f"  '{char}': {error_rate:.3f} ({error_count}/{total_count})")

# Most common character substitutions
print(f"\n🔄 Most Common Character Substitutions:")
all_substitutions = []
for source_char, targets in char_substitutions.items():
    for target_char, count in targets.items():
        all_substitutions.append((f"'{source_char}' → '{target_char}'", count))

all_substitutions.sort(key=lambda x: x[1], reverse=True)
for substitution, count in all_substitutions[:10]:
    print(f"  {substitution}: {count} times")

## 6. Visualization of Results

Create comprehensive visualizations of the evaluation results.

In [None]:
# Create comprehensive visualization: a 3x2 grid of diagnostic plots
fig, axes = plt.subplots(3, 2, figsize=(15, 18))

# 1. Accuracy by text length
# observed=False is explicit so every length bin gets a bar slot regardless of the pandas default
length_acc = results_df.groupby('length_category', observed=False)['correct'].mean()
axes[0, 0].bar(range(len(length_acc)), length_acc.values, alpha=0.7, color='skyblue')
axes[0, 0].set_xticks(range(len(length_acc)))
axes[0, 0].set_xticklabels(length_acc.index, rotation=45)
axes[0, 0].set_title('Accuracy by Text Length Category')
axes[0, 0].set_ylabel('Accuracy')
axes[0, 0].grid(True, alpha=0.3)

# Add value labels on bars
for i, v in enumerate(length_acc.values):
    axes[0, 0].text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')

# 2. Confidence vs Accuracy
axes[0, 1].scatter(results_df['confidence'], results_df['correct'], alpha=0.6)
axes[0, 1].set_xlabel('Confidence Score')
axes[0, 1].set_ylabel('Correct (1) / Incorrect (0)')
axes[0, 1].set_title('Confidence vs Accuracy')
axes[0, 1].grid(True, alpha=0.3)

# Add trend line (degree-1 least-squares fit of correctness on confidence)
z = np.polyfit(results_df['confidence'], results_df['correct'], 1)
p = np.poly1d(z)
axes[0, 1].plot(results_df['confidence'], p(results_df['confidence']), "r--", alpha=0.8)

# 3. Distribution of prediction times (seconds converted to milliseconds)
axes[1, 0].hist(np.array(prediction_times) * 1000, bins=30, alpha=0.7, color='lightgreen')
axes[1, 0].set_xlabel('Prediction Time (ms)')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_title('Distribution of Prediction Times')
axes[1, 0].axvline(avg_prediction_time * 1000, color='red', linestyle='--', 
                   label=f'Mean: {avg_prediction_time*1000:.1f} ms')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 4. CER distribution
# Only finite values are plotted: a sample with an empty target can produce an
# inf/NaN CER, and a histogram over a non-finite range raises an error.
cer_values = results_df['cer']
cer_finite = cer_values[np.isfinite(cer_values)]
axes[1, 1].hist(cer_finite, bins=30, alpha=0.7, color='orange')
axes[1, 1].set_xlabel('Character Error Rate')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('Distribution of Character Error Rates')
axes[1, 1].axvline(cer_finite.mean(), color='red', linestyle='--', 
                   label=f'Mean: {cer_finite.mean():.3f}')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

# 5. Length difference analysis
axes[2, 0].hist(results_df['length_diff'], bins=30, alpha=0.7, color='purple')
axes[2, 0].set_xlabel('Length Difference (Prediction - Target)')
axes[2, 0].set_ylabel('Frequency')
axes[2, 0].set_title('Distribution of Length Differences')
axes[2, 0].axvline(0, color='red', linestyle='--', label='Perfect Length Match')
axes[2, 0].legend()
axes[2, 0].grid(True, alpha=0.3)

# 6. Character error rates visualization (skipped when no character met the frequency threshold)
if sorted_char_errors:
    chars, error_rates = zip(*sorted_char_errors[:15])  # Top 15 problematic characters
    axes[2, 1].bar(range(len(chars)), error_rates, alpha=0.7, color='red')
    axes[2, 1].set_xticks(range(len(chars)))
    axes[2, 1].set_xticklabels([f"'{c}'" for c in chars], rotation=45)
    axes[2, 1].set_title('Character Error Rates (Top 15)')
    axes[2, 1].set_ylabel('Error Rate')
    axes[2, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(results_dir / 'evaluation_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Additional metrics summary
print(f"\n📊 Additional Analysis:")
print(f"  Samples with perfect predictions: {results_df['correct'].sum()}/{len(results_df)} ({results_df['correct'].mean():.2%})")
print(f"  Samples with CER = 0: {(results_df['cer'] == 0).sum()}/{len(results_df)} ({(results_df['cer'] == 0).mean():.2%})")
print(f"  Samples with CER > 0.5: {(results_df['cer'] > 0.5).sum()}/{len(results_df)} ({(results_df['cer'] > 0.5).mean():.2%})")
print(f"  Correlation between confidence and accuracy: {np.corrcoef(results_df['confidence'], results_df['correct'])[0,1]:.3f}")

## 7. Sample Predictions Visualization

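Visualize predictions for the first few test samples (correct predictions are titled in green, incorrect ones in red), then list the worst predictions by CER and the most confident correct predictions.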

In [None]:
# Get sample predictions for visualization (first batch of the deterministic test loader)
sample_batch = next(iter(test_loader))
sample_images = sample_batch['images'][:8]  # First 8 images
sample_texts = sample_batch['texts'][:8]

# Get predictions for sample images
with torch.no_grad():
    sample_images_gpu = sample_images.to(device)
    sample_predictions = model.predict(sample_images_gpu)
    
    # Get confidence scores (mean of the per-step max softmax probability).
    # Averaging over dim=0 assumes time-major outputs (T, batch, classes) — TODO confirm.
    outputs = model(sample_images_gpu)
    probs = F.softmax(outputs, dim=-1)
    max_probs = torch.max(probs, dim=-1)[0]
    sample_confidences = torch.mean(max_probs, dim=0).cpu().numpy()

# Decode predictions
sample_pred_texts = []
for pred in sample_predictions:
    pred_text = char_mapping.ctc_decode(pred.cpu().numpy())
    sample_pred_texts.append(pred_text)

# Visualize sample predictions on a 2x4 grid
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()

for i in range(len(sample_images)):
    # Convert tensor to numpy and denormalize
    # (assumes single-channel images normalised to [-1, 1] — verify against OCRDataset)
    img = sample_images[i].squeeze().cpu().numpy()
    img = (img * 0.5) + 0.5  # Denormalize from [-1,1] to [0,1]
    
    # Check if prediction is correct (case-insensitive, whitespace-trimmed)
    target = sample_texts[i]
    prediction = sample_pred_texts[i]
    confidence = sample_confidences[i]
    
    is_correct = prediction.lower().strip() == target.lower().strip()
    color = 'green' if is_correct else 'red'
    status = '✓' if is_correct else '✗'
    
    # Calculate CER for this sample (0 for an empty target to avoid dividing by zero)
    cer = editdistance.eval(target.lower(), prediction.lower()) / len(target) if len(target) > 0 else 0
    
    axes[i].imshow(img, cmap='gray')
    title = f"{status} Target: '{target}'\nPred: '{prediction}'\nConf: {confidence:.3f}, CER: {cer:.3f}"
    axes[i].set_title(title, color=color, fontsize=9)
    axes[i].axis('off')

# Hide grid cells that received no image (first batch smaller than the 2x4 grid),
# so the saved figure does not show empty framed axes.
for unused_ax in axes[len(sample_images):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.suptitle('Sample Model Predictions', fontsize=16, y=1.02)
plt.savefig(results_dir / 'sample_predictions_detailed.png', dpi=300, bbox_inches='tight')
plt.show()

# Show worst predictions
worst_predictions = results_df.nlargest(5, 'cer')
print(f"\n❌ Worst Predictions (Highest CER):")
print("=" * 70)
for idx, row in worst_predictions.iterrows():
    print(f"Target: '{row['target']}'")
    print(f"Prediction: '{row['prediction']}'")
    print(f"CER: {row['cer']:.3f}, Confidence: {row['confidence']:.3f}")
    print("-" * 50)

# Show best predictions with high confidence (correct and confidence above 0.9)
best_predictions = results_df[(results_df['correct'] == True) & (results_df['confidence'] > 0.9)].nlargest(5, 'confidence')
print(f"\n✅ Best Predictions (High Confidence & Correct):")
print("=" * 70)
for idx, row in best_predictions.iterrows():
    print(f"Target: '{row['target']}'")
    print(f"Prediction: '{row['prediction']}'")
    print(f"Confidence: {row['confidence']:.3f}")
    print("-" * 50)

## 8. Comparison with Pretrained Models

Compare our custom model with EasyOCR and Pytesseract on a small subset of the test images. This step is skipped if either library is not installed.

In [None]:
# Comparison with pretrained models.
# easyocr and pytesseract are optional: they are imported inside the try block so
# a missing package only skips this section instead of failing the notebook.
try:
    import easyocr
    import pytesseract
    from PIL import Image as PILImage
    
    print("🔄 Initializing pretrained OCR models...")
    easy_reader = easyocr.Reader(['en'])
    
    # Select subset of test images for comparison (to save time)
    comparison_indices = range(0, min(30, len(test_df)), 2)  # Every 2nd image, max 15 images
    comparison_df = test_df.iloc[comparison_indices].copy()
    
    print(f"🔍 Comparing models on {len(comparison_df)} test images...")
    
    # Get predictions from all models
    comparison_results = []
    
    for idx, row in tqdm(comparison_df.iterrows(), total=len(comparison_df), desc="Comparing models"):
        image_path = test_dir / row['imagename']
        target_text = row['label']
        
        # Images listed in the CSV but missing on disk are skipped
        if not image_path.exists():
            continue
        
        # Each engine is best-effort: a failure is reported and scored as an
        # empty prediction. `except Exception` (not a bare except) is used so
        # KeyboardInterrupt still stops the loop.
        
        # Custom model prediction
        try:
            custom_result = predictor.predict_single(str(image_path))
            custom_pred = custom_result['text']
            custom_conf = custom_result['confidence']
        except Exception as err:
            print(f"⚠️ Custom model failed on {row['imagename']}: {err}")
            custom_pred = ""
            custom_conf = 0.0
        
        # EasyOCR prediction: detected text fragments are joined with spaces and
        # their confidences averaged
        try:
            easy_results = easy_reader.readtext(str(image_path))
            easy_pred = ' '.join([result[1] for result in easy_results])
            easy_conf = np.mean([result[2] for result in easy_results]) if easy_results else 0.0
        except Exception as err:
            print(f"⚠️ EasyOCR failed on {row['imagename']}: {err}")
            easy_pred = ""
            easy_conf = 0.0
        
        # Pytesseract prediction; the image is opened in a context manager so the
        # file handle is released after each sample
        try:
            with PILImage.open(image_path) as img_pil:
                tesseract_pred = pytesseract.image_to_string(img_pil, lang='eng').strip()
        except Exception as err:
            print(f"⚠️ Pytesseract failed on {row['imagename']}: {err}")
            tesseract_pred = ""
        # image_to_string returns no confidence. Record NaN rather than a made-up
        # constant so the comparison table does not report a fabricated average.
        tesseract_conf = np.nan
        
        comparison_results.append({
            'image': row['imagename'],
            'target': target_text,
            'custom_pred': custom_pred,
            'custom_conf': custom_conf,
            'easy_pred': easy_pred.strip(),
            'easy_conf': easy_conf,
            'tesseract_pred': tesseract_pred,
            'tesseract_conf': tesseract_conf
        })
    
    # Without any usable image the DataFrame below would have no columns and fail
    # with an opaque KeyError; raise a clear error instead (reported by the
    # generic handler at the end of this cell).
    if not comparison_results:
        raise RuntimeError(f"None of the {len(comparison_df)} selected images exist under {test_dir}")
    
    # Create comparison DataFrame
    comp_df = pd.DataFrame(comparison_results)
    
    # Calculate metrics for each model
    custom_metrics_comp = calculate_detailed_metrics(comp_df['custom_pred'].tolist(), comp_df['target'].tolist())
    easy_metrics_comp = calculate_detailed_metrics(comp_df['easy_pred'].tolist(), comp_df['target'].tolist())
    tesseract_metrics_comp = calculate_detailed_metrics(comp_df['tesseract_pred'].tolist(), comp_df['target'].tolist())
    
    # Create detailed comparison table (one row per model)
    comparison_table = pd.DataFrame({
        'Model': ['Custom CRNN', 'EasyOCR', 'Pytesseract'],
        'Accuracy': [custom_metrics_comp['accuracy'], easy_metrics_comp['accuracy'], tesseract_metrics_comp['accuracy']],
        'CER': [custom_metrics_comp['cer'], easy_metrics_comp['cer'], tesseract_metrics_comp['cer']],
        'WER': [custom_metrics_comp['wer'], easy_metrics_comp['wer'], tesseract_metrics_comp['wer']],
        'BLEU': [custom_metrics_comp['bleu_score'], easy_metrics_comp['bleu_score'], tesseract_metrics_comp['bleu_score']],
        'Avg_Confidence': [comp_df['custom_conf'].mean(), comp_df['easy_conf'].mean(), comp_df['tesseract_conf'].mean()]
    })
    
    print(f"\n🏆 Model Comparison Results ({len(comp_df)} samples):")
    print("=" * 80)
    print(comparison_table.to_string(index=False, float_format='%.4f'))
    
    # Visualize comparison: one bar chart per metric
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    
    metrics_to_plot = ['Accuracy', 'CER', 'WER', 'BLEU']
    colors = ['blue', 'green', 'orange']
    
    for i, metric in enumerate(metrics_to_plot):
        ax = axes[i//2, i%2]
        values = comparison_table[metric].values
        bars = ax.bar(comparison_table['Model'], values, color=colors, alpha=0.7)
        
        ax.set_title(f'{metric} Comparison')
        ax.set_ylabel(metric)
        
        # Add value labels
        for bar, value in zip(bars, values):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                   f'{value:.3f}', ha='center', va='bottom')
        
        # Highlight best performance: higher is better for Accuracy/BLEU, lower for CER/WER
        best_idx = np.argmax(values) if metric in ['Accuracy', 'BLEU'] else np.argmin(values)
        bars[best_idx].set_color('gold')
        bars[best_idx].set_edgecolor('black')
        bars[best_idx].set_linewidth(2)
    
    plt.tight_layout()
    plt.savefig(results_dir / 'model_comparison_detailed.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # Save detailed comparison results
    comp_df.to_csv(results_dir / 'detailed_model_comparison.csv', index=False)
    print(f"\n💾 Detailed comparison saved to: {results_dir / 'detailed_model_comparison.csv'}")
    
    # Performance summary
    best_accuracy_model = comparison_table.loc[comparison_table['Accuracy'].idxmax(), 'Model']
    best_cer_model = comparison_table.loc[comparison_table['CER'].idxmin(), 'Model']
    
    print(f"\n🏅 Performance Summary:")
    print(f"  Best Accuracy: {best_accuracy_model} ({comparison_table['Accuracy'].max():.4f})")
    print(f"  Best CER: {best_cer_model} ({comparison_table['CER'].min():.4f})")
    
    # Show some example comparisons
    print(f"\n📝 Example Comparisons:")
    print("=" * 80)
    for i in range(min(3, len(comp_df))):
        row = comp_df.iloc[i]
        print(f"Target: '{row['target']}'")
        print(f"Custom:    '{row['custom_pred']}' (conf: {row['custom_conf']:.3f})")
        print(f"EasyOCR:   '{row['easy_pred']}' (conf: {row['easy_conf']:.3f})")
        print(f"Tesseract: '{row['tesseract_pred']}'")
        print("-" * 60)
    
except ImportError as e:
    print(f"⚠️ Cannot compare with pretrained models: {e}")
    print("Install easyocr and pytesseract to enable comparison:")
    print("pip install easyocr pytesseract")
except Exception as e:
    print(f"❌ Error during comparison: {e}")

## 9. Generate Comprehensive Evaluation Report

Create a detailed evaluation report with all findings.

In [None]:
# Generate comprehensive evaluation report.
# The report is assembled as one Markdown string from the results of the earlier
# cells, then written to disk together with the per-sample CSV and a JSON summary.
report_content = f"""# Custom OCR Model Evaluation Report

**Evaluation Date**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
**Model Path**: {model_path}
**Test Dataset**: {test_csv}

## Model Information
- **Architecture**: CNN + LSTM + CTC Loss
- **Training Epochs**: {training_epoch + 1}
- **Model Parameters**: {sum(p.numel() for p in model.parameters()):,}
- **Image Input Size**: {config['img_height']}x{config['img_width']}
- **Character Classes**: {char_mapping.num_classes}
- **Device**: {device}

## Test Dataset Statistics
- **Total Samples**: {len(test_df)}
- **Average Text Length**: {np.mean([len(text) for text in test_df['label']]):.2f} characters
- **Text Length Range**: {min([len(text) for text in test_df['label']])}-{max([len(text) for text in test_df['label']])} characters
- **Unique Characters**: {len(set(''.join(test_df['label'].str.lower())))}

## Performance Metrics

### Overall Performance
- **Accuracy**: {test_metrics['accuracy']:.4f} ({test_metrics['accuracy']*100:.2f}%)
- **Character Error Rate (CER)**: {test_metrics['cer']:.4f}
- **Word Error Rate (WER)**: {test_metrics['wer']:.4f}
- **BLEU Score**: {test_metrics['bleu_score']:.4f}

### Performance Statistics
- **Average Prediction Time**: {avg_prediction_time*1000:.2f} ms per image
- **Throughput**: {len(all_predictions)/sum(prediction_times):.1f} images/second
- **Average Confidence**: {avg_confidence:.4f}
- **Perfect Predictions**: {results_df['correct'].sum()}/{len(results_df)} ({results_df['correct'].mean():.2%})

## Error Analysis

### Performance by Text Length
"""

# Add length analysis to report.
# The row is bound to `length_stats` (not `stats`) so the scipy `stats` module
# imported at the top of the notebook is not shadowed.
for category in length_analysis.index:
    if pd.notna(category):
        length_stats = length_analysis.loc[category]
        report_content += f"- **{category}**: {length_stats['Accuracy']:.3f} accuracy, {length_stats['Avg_CER']:.3f} CER ({int(length_stats['Total'])} samples)\n"

report_content += f"""
### Most Problematic Characters
"""

# Add character error analysis
for char, error_rate in sorted_char_errors[:10]:
    total_count = char_total[char]
    error_count = char_errors[char]
    report_content += f"- **'{char}'**: {error_rate:.3f} error rate ({error_count}/{total_count})\n"

report_content += f"""
### Performance Distribution
- **Samples with CER = 0**: {(results_df['cer'] == 0).sum()}/{len(results_df)} ({(results_df['cer'] == 0).mean():.2%})
- **Samples with CER > 0.5**: {(results_df['cer'] > 0.5).sum()}/{len(results_df)} ({(results_df['cer'] > 0.5).mean():.2%})
- **Confidence-Accuracy Correlation**: {np.corrcoef(results_df['confidence'], results_df['correct'])[0,1]:.3f}

## Model Comparison"""

# Add comparison if available.
# An explicit if/else is used: when the comparison cell did not produce its
# tables, the "not performed" note must be written. (A try/except around the
# membership test never raises, so the note would be silently omitted.)
if 'comparison_table' in globals() and 'comp_df' in globals():
    report_content += f"""

### Comparison with Pretrained Models (on {len(comp_df)} samples)
| Model | Accuracy | CER | WER | BLEU | Avg Confidence |
|-------|----------|-----|-----|------|----------------|
"""
    for _, row in comparison_table.iterrows():
        report_content += f"| {row['Model']} | {row['Accuracy']:.4f} | {row['CER']:.4f} | {row['WER']:.4f} | {row['BLEU']:.4f} | {row['Avg_Confidence']:.4f} |\n"
    
    best_overall = comparison_table.loc[comparison_table['Accuracy'].idxmax(), 'Model']
    report_content += f"\n**Best Overall Performance**: {best_overall}\n"
else:
    report_content += "\n\n*Comparison with pretrained models not performed (libraries not available)*\n"

report_content += f"""
## Insights and Recommendations

### Strengths
"""

# Add insights based on performance (simple threshold rules)
if test_metrics['accuracy'] > 0.9:
    report_content += "- Excellent overall accuracy indicates strong model performance\n"
if avg_confidence > 0.8:
    report_content += "- High average confidence suggests reliable predictions\n"
if avg_prediction_time < 0.1:
    report_content += "- Fast inference speed suitable for real-time applications\n"

report_content += f"""
### Areas for Improvement
"""

if test_metrics['cer'] > 0.1:
    report_content += "- Character error rate could be improved with more training data or better architecture\n"
if len(length_analysis) > 1 and (length_analysis['Accuracy'].max() - length_analysis['Accuracy'].min()) > 0.3:
    report_content += "- Performance varies significantly with text length - consider balanced training data\n"
if np.corrcoef(results_df['confidence'], results_df['correct'])[0,1] < 0.3:
    report_content += "- Low confidence-accuracy correlation suggests need for better confidence calibration\n"

report_content += f"""
### Recommendations
1. **Data Augmentation**: Increase training data diversity with synthetic examples
2. **Architecture Improvements**: Consider attention mechanisms or transformer architectures
3. **Post-processing**: Implement language model-based correction
4. **Domain Adaptation**: Fine-tune on specific document types or fonts
5. **Ensemble Methods**: Combine multiple models for better accuracy

## Files Generated
- **Evaluation Report**: `results/evaluation_report.md`
- **Detailed Results**: `results/detailed_evaluation_results.csv`
- **Analysis Plots**: `results/evaluation_analysis.png`
- **Sample Predictions**: `results/sample_predictions_detailed.png`

---
*Report generated automatically by Custom OCR Evaluation Notebook*
"""

# Save evaluation report. UTF-8 is set explicitly because the report can contain
# non-ASCII characters from the dataset, which a platform-default encoding may
# be unable to write.
with open(results_dir / 'evaluation_report.md', 'w', encoding='utf-8') as f:
    f.write(report_content)

# Save detailed results
results_df.to_csv(results_dir / 'detailed_evaluation_results.csv', index=False)


def _json_default(value):
    """Convert NumPy scalars and arrays to plain Python values for json.dump.

    json cannot serialise NumPy types such as float32; any other unsupported
    type still raises TypeError, as json itself would.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Save metrics summary
metrics_summary = {
    'test_metrics': test_metrics,
    'performance_stats': {
        'avg_prediction_time_ms': avg_prediction_time * 1000,
        'avg_confidence': avg_confidence,
        'throughput_images_per_sec': len(all_predictions) / sum(prediction_times)
    },
    'dataset_stats': {
        'total_samples': len(test_df),
        'avg_text_length': np.mean([len(text) for text in test_df['label']]),
        'unique_characters': len(set(''.join(test_df['label'].str.lower())))
    }
}

with open(results_dir / 'metrics_summary.json', 'w', encoding='utf-8') as f:
    json.dump(metrics_summary, f, indent=2, default=_json_default)

print("📊 Evaluation completed successfully!")
print("=" * 50)
print(f"📄 Detailed report: {results_dir / 'evaluation_report.md'}")
print(f"📈 Results CSV: {results_dir / 'detailed_evaluation_results.csv'}")
print(f"📊 Metrics JSON: {results_dir / 'metrics_summary.json'}")
print(f"🖼️ Analysis plots: {results_dir / 'evaluation_analysis.png'}")

# Final summary
print(f"\n🎯 Final Evaluation Summary:")
print(f"   Accuracy: {test_metrics['accuracy']:.4f} ({test_metrics['accuracy']*100:.2f}%)")
print(f"   CER: {test_metrics['cer']:.4f}")
print(f"   WER: {test_metrics['wer']:.4f}")
print(f"   Average prediction time: {avg_prediction_time*1000:.2f} ms")
print(f"   Model confidence: {avg_confidence:.4f}")
print(f"\n🚀 Your custom OCR model evaluation is complete!")