# GPT-2 Token Extraction Validation

This notebook validates the extracted multi-character, letter-based tokens from the GPT-2 vocabulary and creates visualizations of their distribution.

In [None]:
import json
import random
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pathlib import Path

# Treat the current working directory as the project root.
# NOTE(review): this assumes the notebook is launched from the repository
# root, since data and results paths are resolved relative to it — confirm.
project_root = Path.cwd()

# Set style for better visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names, so plt.style.use('seaborn')
# raises OSError there. Prefer the new name and fall back to the legacy one
# on older Matplotlib versions that do not ship it.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

# Load the extracted tokens.
# The encoding is passed explicitly: JSON text is UTF-8, while open()'s
# default encoding depends on the platform locale and could mis-decode
# non-ASCII content (or fail) on some systems.
tokens_file = project_root / 'data' / 'processed' / 'gpt2_letter_tokens.json'
with open(tokens_file, encoding='utf-8') as f:
    tokens_data = json.load(f)

# Convert to DataFrame for easier analysis
# NOTE(review): the statistics below read a 'length' column, so the JSON is
# presumably a list of records that each carry a 'length' field — verify
# against the extraction step that wrote the file.
df = pd.DataFrame(tokens_data)

# Calculate basic statistics (row count plus mean/median/min/max of 'length')
print(f"Total number of tokens: {len(df)}")
print(f"Average token length: {df['length'].mean():.2f} characters")
print(f"Median token length: {df['length'].median()} characters")
print(f"Min token length: {df['length'].min()} characters")
print(f"Max token length: {df['length'].max()} characters")

# Make sure the output folder for the token-analysis artifacts exists
# (missing parent directories are created; an existing folder is fine).
results_dir = project_root.joinpath('results', 'token_analysis')
results_dir.mkdir(parents=True, exist_ok=True)

# Histogram of the 'length' column: draw it on a fresh 12x6 figure,
# then apply the title and axis labels in a single call.
plt.figure(figsize=(12, 6))
sns.histplot(df, x='length', bins=30)
plt.gca().set(
    title='Distribution of Token Lengths',
    xlabel='Token Length (characters)',
    ylabel='Count',
)

# Write the PNG into the results folder first, then display the figure.
plt.savefig(results_dir / 'length_distribution.png')
plt.show()