# Template Performance Analysis

This notebook analyzes how different template variations affect model performance.

Key metrics analyzed:
- Success rate by template category
- Impact of token separation style
- Sequence length vs performance
- Template complexity vs accuracy
- Error pattern analysis

In [None]:
# Notebook setup: imports, project path, and plotting defaults.
# All statements sit at column 0; a cell whose later lines are indented under
# an unindented first line fails with "IndentationError: unexpected indent".
import sys
from pathlib import Path
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

# Add project root to path
# NOTE(review): assumes the kernel's working directory is one level below the
# project root (e.g. a notebooks/ folder) -- confirm for your launch setup.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    # Guarded so that re-running this cell does not pile up duplicate entries.
    sys.path.append(str(project_root))

# Set standard plotting parameters
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 in favour of
# 'seaborn-v0_8'; pick whichever name the installed matplotlib provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

# Import project modules (must come after project_root is on sys.path)
from src.data.data_loader import TemplateDataLoader, BatchConfig, DataStats

## Data Loading

In [None]:
# Directory holding the processed template-variation data, resolved against
# the project root defined in the setup cell.
data_dir = project_root.joinpath("data", "processed", "template_variations")

# Batching options passed to the loader.
batch_config = BatchConfig(batch_size=32, max_length=512, similar_length_tolerance=50, shuffle=True)

# Build the loader over data_dir using split ratios (0.8, 0.1, 0.1).
loader = TemplateDataLoader(data_dir=data_dir, batch_config=batch_config, split_ratios=(0.8, 0.1, 0.1))

# Report dataset-level statistics; `stats` is reused by the plotting cell.
stats = loader.get_stats()
print(f"Total examples: {stats.total_examples}")
print(f"Average sequence length: {stats.avg_sequence_length:.2f}")

# Materialise every training batch so the batches can be counted.
train_batches = [*loader.train_batches()]
print(f"Number of training batches: {len(train_batches)}")

## Analysis

In [None]:
# Directory that receives the saved figures. It is created up front because
# savefig raises FileNotFoundError when the target directory does not exist.
figure_dir = project_root / 'results' / 'token_analysis'
figure_dir.mkdir(parents=True, exist_ok=True)

# Plot template category distribution
template_dist = pd.Series(stats.template_distribution)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=template_dist.index, y=template_dist.values, ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Distribution of Template Categories')
fig.tight_layout()
fig.savefig(figure_dir / 'template_distribution.png')
plt.close(fig)  # the figure is written to disk only, not left open

# Plot sequence length distribution
# presumably stats.length_distribution maps sequence length -> count;
# verify against DataStats.
length_dist = pd.Series(stats.length_distribution)
fig, ax = plt.subplots(figsize=(12, 6))
# The lengths are passed as `x`, not `data`: seaborn treats a bare `data`
# vector as wide-form input and raises ValueError when `weights` is also given.
sns.histplot(
    x=length_dist.index.to_numpy(),
    weights=length_dist.values,
    bins=30,
    ax=ax,
)
ax.set_title('Distribution of Sequence Lengths')
ax.set_xlabel('Sequence Length')
ax.set_ylabel('Count')
fig.tight_layout()
fig.savefig(figure_dir / 'length_distribution.png')
plt.close(fig)