In [1]:
# Mount Google Drive for saving models and results
from google.colab import drive
import os

# Attach the user's Google Drive so models and reports outlive the Colab VM
drive.mount('/content/drive')

# Project root on Drive, followed by one subfolder per artifact type.
# The empty suffix creates the root itself before its children.
project_drive_path = '/content/drive/MyDrive/khmer_ocr_training'
for drive_subdir in ('', '/models', '/results', '/logs'):
    os.makedirs(f'{project_drive_path}{drive_subdir}', exist_ok=True)

print("✅ Google Drive mounted successfully!")
print(f"📁 Project directory: {project_drive_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive mounted successfully!
📁 Project directory: /content/drive/MyDrive/khmer_ocr_training


In [2]:
# Check GPU availability
import torch
import sys

# Report interpreter and framework versions first
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")

# Query CUDA once and branch on the answer
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if not cuda_ready:
    print("⚠️ GPU not available, will use CPU (training will be slower)")
else:
    # Device 0 is the only device inspected; total_memory is converted from bytes to GiB
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")


Python version: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version: 12.4
GPU device: Tesla T4
GPU memory: 14.7 GB


In [3]:
# Clone the repository
import os
import subprocess

# Change to content directory
os.chdir('/content')

# Clone repository - REPLACE THIS URL WITH YOUR ACTUAL REPOSITORY URL
repo_url = "https://github.com/kunthet/khmer-ocr-digits.git"  # Replace with actual URL
repo_name = 'khmer-ocr-digits'
repo_path = f'/content/{repo_name}'

if os.path.isdir(os.path.join(repo_path, '.git')):
    # A real checkout is recognised by its .git directory; the folder merely
    # existing is not enough, because the fallback below creates an empty one.
    print(f"✅ Repository already exists at /content/{repo_name}")
elif os.path.isdir(repo_path) and os.listdir(repo_path):
    # Non-empty folder without .git (e.g. the manual demo structure):
    # git refuses to clone into it, so keep working with what is there.
    print(f"⚠️ /content/{repo_name} exists but is not a git checkout - using it as-is")
else:
    print("📥 Cloning repository...")
    clone_error = None
    try:
        # An explicit destination lets git reuse an empty placeholder folder
        result = subprocess.run(
            ['git', 'clone', repo_url, repo_path],
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            clone_error = result.stderr.strip()
    except (OSError, subprocess.SubprocessError) as e:
        # OSError covers a missing git executable
        clone_error = str(e)

    if clone_error is None:
        print("✅ Repository cloned successfully!")
    else:
        print(f"❌ Clone failed: {clone_error}")
        print("Creating directory structure manually for demo...")
        os.makedirs(repo_path, exist_ok=True)

# Change to repository directory
os.chdir(repo_path)
print(f"📁 Current directory: {os.getcwd()}")


✅ Repository already exists at /content/khmer-ocr-digits
📁 Current directory: /content/khmer-ocr-digits


In [4]:
# Install dependencies
# This cell mixes plain Python with IPython `%pip` magics, so it only runs inside a notebook kernel.
print("📦 Installing compatible PyTorch with CUDA support...")

# Use the pre-installed PyTorch in Colab or install compatible version
import sys
import subprocess

# Check if running in Colab: importing google.colab succeeds only when that package is installed
try:
    import google.colab
    IN_COLAB = True
    print("✅ Running in Google Colab - using pre-installed PyTorch")
except ImportError:
    IN_COLAB = False
    print("🔧 Not in Colab - installing PyTorch manually")

if not IN_COLAB:
    # Only install PyTorch if not in Colab
    # NOTE(review): the cu118 wheel index is hard-coded and the pip exit status is not checked.
    import subprocess
    subprocess.run([sys.executable, "-m", "pip", "install", "torch", "torchvision", "torchaudio", "--extra-index-url", "https://download.pytorch.org/whl/cu118"])
else:
    # In Colab, use the pre-installed PyTorch or reinstall compatible version
    try:
        import torch
        print(f"Using pre-installed PyTorch {torch.__version__}")
    except ImportError:
        # torch is missing even though we are in Colab: install the default (unpinned) wheels
        import subprocess
        subprocess.run([sys.executable, "-m", "pip", "install", "torch", "torchvision", "torchaudio"])

# The remaining packages are installed unpinned through the %pip magic,
# which targets the environment of the running kernel.
print("📦 Installing core dependencies...")
%pip install efficientnet_pytorch opencv-python Pillow numpy scipy pandas h5py

print("📦 Installing visualization and utilities...")
%pip install matplotlib seaborn tensorboard PyYAML omegaconf tqdm click

# Skip wandb for now to avoid conflicts
print("📦 Installing font and text processing libraries...")
%pip install fonttools freetype-py unicodedata2 scikit-learn scikit-image

print("📦 Installing Jupyter widgets...")
%pip install ipywidgets --quiet

# NOTE(review): this message is printed unconditionally; no install result is inspected above.
print("✅ All dependencies installed successfully!")

# Verify PyTorch CUDA installation
import torch
print(f"🔥 PyTorch version: {torch.__version__}")
print(f"🔥 PyTorch CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Details are reported for device 0 only
    print(f"🔥 CUDA version: {torch.version.cuda}")
    print(f"🔥 cuDNN version: {torch.backends.cudnn.version()}")
    print(f"🔥 GPU device: {torch.cuda.get_device_name(0)}")
else:
    print("⚠️ CUDA not available - will use CPU (slower training)")


📦 Installing compatible PyTorch with CUDA support...
✅ Running in Google Colab - using pre-installed PyTorch
Using pre-installed PyTorch 2.6.0+cu124
📦 Installing core dependencies...
📦 Installing visualization and utilities...
📦 Installing font and text processing libraries...
📦 Installing Jupyter widgets...
✅ All dependencies installed successfully!
🔥 PyTorch version: 2.6.0+cu124
🔥 PyTorch CUDA available: True
🔥 CUDA version: 12.4
🔥 cuDNN version: 90100
🔥 GPU device: Tesla T4


In [5]:
# Fix cuDNN compatibility issues in Google Colab
# Imports come first so nothing below depends on modules imported by earlier cells.
import os
import torch

print("🔧 Checking and fixing cuDNN compatibility...")

# True only when a convolution actually ran on the GPU without raising
cuda_usable = False

try:
    # Test CUDA functionality
    if torch.cuda.is_available():
        # A tiny convolution on the GPU is used as the cuDNN smoke test
        test_tensor = torch.randn(1, 1, 10, 10).cuda()
        test_conv = torch.nn.Conv2d(1, 1, 3).cuda()
        _ = test_conv(test_tensor)
        cuda_usable = True
        print("✅ cuDNN working correctly!")
    else:
        print("⚠️ CUDA not available, using CPU")

except Exception as e:
    # Best-effort probe: any failure here means "do not use the GPU", never a crash
    if "cudnnGetLibConfig" in str(e):
        print("🚨 cuDNN compatibility issue detected!")
        print("🔄 Applying fix...")

        # Restart runtime and reinstall PyTorch with compatible version
        print("Please restart the runtime (Runtime -> Restart runtime) and run this cell again.")
        print("If the issue persists, use CPU training by setting device='cpu' in the configuration.")

        # Alternative: Force CPU usage
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        print("⚠️ Forcing CPU usage due to cuDNN issues")
    else:
        print(f"❌ Unexpected error: {e}")

# Set device preference.
# Only an *empty* CUDA_VISIBLE_DEVICES means "GPU disabled" (that is what this
# notebook writes when forcing CPU); a value such as "0" is a normal GPU selection.
gpu_disabled_by_env = os.environ.get('CUDA_VISIBLE_DEVICES') == ''
device_preference = 'cuda' if cuda_usable and not gpu_disabled_by_env else 'cpu'
print(f"🎯 Device preference set to: {device_preference}")

# Update the configuration to use CPU if CUDA has issues
if device_preference == 'cpu':
    print("📝 Updating configuration for CPU training...")
    # We'll adjust batch sizes and other settings for CPU later


🔧 Checking and fixing cuDNN compatibility...
✅ cuDNN working correctly!
🎯 Device preference set to: cuda


In [6]:
# Alternative cuDNN fix - Run this cell only if you're still having issues

# Menu of the manual remedies offered by this cell
for menu_line in (
    "🔧 Alternative cuDNN fixes:",
    "1. Restart runtime and try again",
    "2. Use specific PyTorch version",
    "3. Force CPU training",
):
    print(menu_line)


# Option 1: Install specific PyTorch version that's known to work with Colab
def fix_cudnn_option1():
    """Run pip to install torch 2.0.1 / torchvision 0.15.2 / torchaudio 2.0.2 from the cu118 index.

    The pip exit status is not inspected.
    """
    import subprocess
    import sys

    print("🔄 Installing PyTorch 2.0.1 with CUDA 11.8...")
    pinned_packages = ["torch==2.0.1", "torchvision==0.15.2", "torchaudio==2.0.2"]
    pip_command = [sys.executable, "-m", "pip", "install"]
    pip_command += pinned_packages
    pip_command += ["--index-url", "https://download.pytorch.org/whl/cu118"]
    subprocess.run(pip_command)


# Option 2: Force CPU training
def fix_cudnn_option2():
    """Blank CUDA_VISIBLE_DEVICES and set the notebook-level device_preference to 'cpu'."""
    global device_preference
    import os

    print("🔄 Forcing CPU training...")
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    device_preference = 'cpu'


# Option 3: Use CPU-only PyTorch
def fix_cudnn_option3():
    """Run pip to install unpinned torch / torchvision / torchaudio from the CPU wheel index.

    The pip exit status is not inspected.
    """
    import subprocess
    import sys

    print("🔄 Installing CPU-only PyTorch...")
    pip_command = [sys.executable, "-m", "pip", "install"]
    pip_command += ["torch", "torchvision", "torchaudio"]
    pip_command += ["--index-url", "https://download.pytorch.org/whl/cpu"]
    subprocess.run(pip_command)


# Uncomment one of these lines if needed:
# fix_cudnn_option1()  # Try specific PyTorch version
# fix_cudnn_option2()  # Force CPU training
# fix_cudnn_option3()  # Install CPU-only PyTorch

for hint_line in (
    "💡 If none of these work, restart the runtime and run all cells again.",
    "💡 The notebook will automatically adapt to CPU training if GPU fails.",
):
    print(hint_line)


🔧 Alternative cuDNN fixes:
1. Restart runtime and try again
2. Use specific PyTorch version
3. Force CPU training
💡 If none of these work, restart the runtime and run all cells again.
💡 The notebook will automatically adapt to CPU training if GPU fails.


In [7]:
# Create essential project directories and files
import os
import yaml

# Project layout, relative to the current working directory.
# The three sub-packages under src/modules are expanded from their names.
module_packages = ('data_utils', 'synthetic_data_generator', 'trainers')
directories = (
    ['src/models']
    + [f'src/modules/{package}' for package in module_packages]
    + ['src/fonts', 'config', 'generated_data', 'training_output', 'docs']
)

# exist_ok makes the cell safe to re-run; the message is printed either way
for directory in directories:
    os.makedirs(directory, exist_ok=True)
    print(f"📁 Created directory: {directory}")

print("✅ Project structure created!")


📁 Created directory: src/models
📁 Created directory: src/modules/data_utils
📁 Created directory: src/modules/synthetic_data_generator
📁 Created directory: src/modules/trainers
📁 Created directory: src/fonts
📁 Created directory: config
📁 Created directory: generated_data
📁 Created directory: training_output
📁 Created directory: docs
✅ Project structure created!


In [8]:
# Create model configuration file
# Everything below is written verbatim to config/model_config.yaml.
model_config = {
    'model': {
        'name': 'khmer_digits_ocr',
        'architecture': 'cnn_rnn_attention',
        'input': {
            # NOTE(review): presumably [width, height], matching the 128x64
            # images produced by the data generator - confirm with the consumer.
            'image_size': [128, 64],
            'channels': 3,
            'normalization': {
                'mean': [0.485, 0.456, 0.406],
                'std': [0.229, 0.224, 0.225]
            }
        },
        'characters': {
            'khmer_digits': ["០", "១", "២", "៣", "៤", "៥", "៦", "៧", "៨", "៩"],
            'special_tokens': ["<EOS>", "<PAD>", "<BLANK>"],
            # 10 digits + 3 special tokens
            'total_classes': 13,
            'max_sequence_length': 8
        },
        'cnn': {
            'type': 'resnet18',
            'pretrained': True,
            'feature_size': 512
        },
        'rnn': {
            'encoder': {
                'type': 'bidirectional_lstm',
                'hidden_size': 256,
                'num_layers': 2,
                'dropout': 0.1
            },
            'decoder': {
                'type': 'lstm',
                'hidden_size': 256,
                'num_layers': 1,
                'dropout': 0.1
            },
            'attention': {
                'type': 'bahdanau',
                'hidden_size': 256
            }
        }
    }
}

# Save model configuration.
# allow_unicode=True emits the Khmer digits as raw characters, so the file is
# opened explicitly as UTF-8 instead of relying on the platform default encoding.
with open('config/model_config.yaml', 'w', encoding='utf-8') as f:
    yaml.dump(model_config, f, default_flow_style=False, allow_unicode=True)

print("✅ Model configuration created!")


✅ Model configuration created!


In [9]:
# Create adaptive hyperparameter tuning configuration
# The device chosen earlier in the notebook decides batch sizes, workers and epochs.
try:
    use_gpu = device_preference == 'cuda'
except NameError:
    # device_preference was never defined: derive it from CUDA availability
    use_gpu = torch.cuda.is_available()
    device_preference = 'cuda' if use_gpu else 'cpu'

if use_gpu:
    batch_size_multiplier, num_workers = 4, 2
else:
    # CPU: unscaled batch sizes and no DataLoader worker processes
    batch_size_multiplier, num_workers = 1, 0

print(f"🎯 Configuring for {device_preference.upper()} training")
print(f"📊 Batch size multiplier: {batch_size_multiplier}x")


def _build_experiment(name, model_name, base_batch_size, learning_rate, weight_decay,
                      gpu_epochs, cpu_epochs, label_smoothing, scheduler):
    """Assemble one experiment entry; fresh dicts/lists are created on every call."""
    return {
        'experiment_name': name,
        'model': {
            'name': model_name,
            'config_path': 'config/model_config.yaml'
        },
        'training': {
            'batch_size': base_batch_size * batch_size_multiplier,
            'learning_rate': learning_rate,
            'weight_decay': weight_decay,
            'num_epochs': gpu_epochs if use_gpu else cpu_epochs,
            'loss_type': 'crossentropy',
            'label_smoothing': label_smoothing
        },
        'optimizer': {
            'type': 'adamw',
            'betas': [0.9, 0.999]
        },
        'scheduler': scheduler
    }


# (name, model size, base batch, lr, weight decay, GPU epochs, CPU epochs, label smoothing, scheduler)
experiment_specs = [
    ('baseline_gpu_optimized', 'medium', 32, 0.002, 0.0001, 30, 15, 0.1,
     {'type': 'cosine', 'warmup_epochs': 3, 'min_lr': 1e-6}),
    ('aggressive_learning_gpu', 'medium', 64, 0.003, 0.0002, 25, 12, 0.15,
     {'type': 'steplr', 'step_size': 8, 'gamma': 0.5}),
    ('large_model_gpu', 'large', 16, 0.0008, 0.0005, 25, 10, 0.2,
     {'type': 'cosine', 'warmup_epochs': 2, 'min_lr': 1e-6}),
]

hyperparameter_config = {
    # Settings shared by every experiment
    'base_config': {
        'data': {
            'metadata_path': 'generated_data/metadata.yaml',
            'train_split': 'train',
            'val_split': 'val',
            'num_workers': num_workers,
            'pin_memory': use_gpu,
            'augmentation': True
        },
        'training': {
            'device': device_preference,
            'mixed_precision': use_gpu and torch.cuda.is_available(),
            'gradient_clip_norm': 1.0,
            'log_every_n_steps': 25,
            'save_every_n_epochs': 5,
            'keep_n_checkpoints': 3,
            'use_tensorboard': True
        },
        'early_stopping': {
            'patience': 8,
            'min_delta': 0.001,
            'monitor': 'val_char_accuracy',
            'mode': 'max'
        }
    },
    # Per-experiment overrides, keyed by experiment name (insertion order preserved)
    'experiments': {spec[0]: _build_experiment(*spec) for spec in experiment_specs}
}

# Save hyperparameter configuration
with open('config/phase3_colab_configs.yaml', 'w') as f:
    yaml.dump(hyperparameter_config, f, default_flow_style=False, allow_unicode=True)

print("✅ Hyperparameter configuration created!")
print(f"📊 Number of experiments: {len(hyperparameter_config['experiments'])}")


🎯 Configuring for CUDA training
📊 Batch size multiplier: 4x
✅ Hyperparameter configuration created!
📊 Number of experiments: 3


In [10]:
# Download Khmer fonts for data generation
import urllib.request
import os

# Create fonts directory
os.makedirs('src/fonts', exist_ok=True)

# Download a basic Khmer font (you may need to add more fonts)
font_urls = {
    'KhmerOS.ttf': 'https://github.com/google/fonts/raw/main/ofl/khmeros/KhmerOS.ttf'
}

print("📝 Downloading Khmer fonts...")
for font_name, url in font_urls.items():
    font_path = f'src/fonts/{font_name}'
    if os.path.exists(font_path):
        print(f"✅ Font already exists: {font_name}")
        continue

    # Download to a side file and rename only on success, so an interrupted
    # transfer can never be mistaken for a usable font on the next run.
    partial_path = f'{font_path}.part'
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, font_path)
        print(f"✅ Downloaded {font_name}")
    except (OSError, ValueError) as e:
        # OSError covers URLError/HTTPError; ValueError covers a malformed URL
        print(f"❌ Failed to download {font_name}: {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
        # No placeholder *.ttf is written: a fake font file would block every
        # later retry (the exists-check above) and cannot be loaded as a font.
        print(f"⚠️ Skipping {font_name}; re-run this cell to retry the download")

# List fonts
fonts = os.listdir('src/fonts')
print(f"📝 Available fonts: {fonts}")


📝 Downloading Khmer fonts...
✅ Font already exists: KhmerOS.ttf
📝 Available fonts: ['KhmerOSsiemreap.ttf', 'KhmerOSbokor.ttf', 'KhmerOSmuollight.ttf', 'KhmerOSmuol.ttf', 'KhmerOSfasthand.ttf', 'KhmerOS.ttf', 'KhmerOSbattambang.ttf', 'KhmerOSmetalchrieng.ttf']


In [11]:
# Create essential __init__.py files
import os

init_files = [
    'src/__init__.py',
    'src/models/__init__.py',
    'src/modules/__init__.py',
    'src/modules/data_utils/__init__.py',
    'src/modules/synthetic_data_generator/__init__.py',
    'src/modules/trainers/__init__.py'
]

for init_file in init_files:
    # Make the cell self-sufficient: the package folder may not exist yet
    os.makedirs(os.path.dirname(init_file), exist_ok=True)

    # Never truncate an __init__.py that is already there (for example one
    # that came with the cloned repository); only fill in the missing ones.
    if os.path.exists(init_file):
        print(f"✅ Kept existing {init_file}")
        continue

    with open(init_file, 'w') as f:
        f.write('"""Module initialization."""\n')
    print(f"✅ Created {init_file}")

print("✅ Module structure created!")


✅ Created src/__init__.py
✅ Created src/models/__init__.py
✅ Created src/modules/__init__.py
✅ Created src/modules/data_utils/__init__.py
✅ Created src/modules/synthetic_data_generator/__init__.py
✅ Created src/modules/trainers/__init__.py
✅ Module structure created!


In [12]:
# Simplified data generation for Colab
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import random
import json
import yaml
from pathlib import Path

class SimplifiedDataGenerator:
    """Render random Khmer digit strings to PNG images plus a YAML metadata file.

    Args:
        fonts_dir: Folder searched for ``*.ttf`` fonts.
        output_dir: Folder that receives the images and ``metadata.yaml``;
            created (including missing parents) if necessary.
    """

    def __init__(self, fonts_dir='src/fonts', output_dir='generated_data'):
        self.fonts_dir = Path(fonts_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Khmer digits
        self.khmer_digits = ["០", "១", "២", "៣", "៤", "៥", "៦", "៧", "៨", "៩"]
        self.special_tokens = ["<EOS>", "<PAD>", "<BLANK>"]

        # Create character mappings: digits take indices 0-9, special tokens follow
        all_chars = self.khmer_digits + self.special_tokens
        self.char_to_idx = {char: idx for idx, char in enumerate(all_chars)}
        self.idx_to_char = {idx: char for char, idx in self.char_to_idx.items()}

        # Fonts are loaded once per point size instead of once per image
        self._font_cache = {}

        print(f"✅ Data generator initialized")
        print(f"📊 Character set size: {len(all_chars)}")

    def _load_font(self, font_size=24):
        """Return the first loadable ``*.ttf`` font (sorted by path), else PIL's default font."""
        if font_size not in self._font_cache:
            font = None
            # Sorting makes the choice deterministic; unreadable or corrupt
            # files raise OSError and are skipped in favour of the next one.
            for font_file in sorted(self.fonts_dir.glob('*.ttf')):
                try:
                    font = ImageFont.truetype(str(font_file), font_size)
                    break
                except OSError:
                    continue
            if font is None:
                font = ImageFont.load_default()
            self._font_cache[font_size] = font
        return self._font_cache[font_size]

    def generate_sample_image(self, text, size=(128, 64)):
        """Draw ``text`` in black, centred on a white RGB image of ``size``.

        Args:
            text: String to render.
            size: ``(width, height)`` of the image in pixels.

        Returns:
            The rendered ``PIL.Image.Image``.
        """
        # Create image with white background
        img = Image.new('RGB', size, 'white')
        draw = ImageDraw.Draw(img)

        font = self._load_font(24)

        # Calculate text position (center) from the rendered bounding box
        bbox = draw.textbbox((0, 0), text, font=font)
        text_width = bbox[2] - bbox[0]
        text_height = bbox[3] - bbox[1]
        x = (size[0] - text_width) // 2
        y = (size[1] - text_height) // 2

        # Draw text
        draw.text((x, y), text, fill='black', font=font)

        return img

    def generate_dataset(self, num_samples=1000, train_split=0.8):
        """Generate ``num_samples`` images and write ``metadata.yaml``.

        Each sample is a random string of 1-5 Khmer digits drawn with the
        module-level ``random`` generator. The first ``train_split`` fraction
        of the samples forms the train split, the remainder the val split.

        Args:
            num_samples: Number of images to create.
            train_split: Fraction of samples assigned to the train split.

        Returns:
            The metadata dict that was written to disk.
        """
        print(f"🔄 Generating {num_samples} samples...")

        samples = []

        for i in range(num_samples):
            # Generate random sequence length (1-5 digits)
            seq_length = random.randint(1, 5)

            # Generate random digit sequence
            digits = [random.choice(self.khmer_digits) for _ in range(seq_length)]
            text = ''.join(digits)

            # Generate image
            img = self.generate_sample_image(text)

            # Save image
            img_filename = f"sample_{i:06d}.png"
            img_path = self.output_dir / img_filename
            img.save(img_path)

            # Create sample metadata
            sample = {
                'image_path': str(img_path),
                'text': text,
                'char_indices': [self.char_to_idx[char] for char in text],
                'sequence_length': len(text)
            }
            samples.append(sample)

            if (i + 1) % 100 == 0:
                print(f"  Generated {i + 1}/{num_samples} samples")

        # Split data
        split_idx = int(len(samples) * train_split)
        train_samples = samples[:split_idx]
        val_samples = samples[split_idx:]

        # Create metadata
        metadata = {
            'dataset_info': {
                'total_samples': len(samples),
                'train_samples': len(train_samples),
                'val_samples': len(val_samples),
                'char_to_idx': self.char_to_idx,
                'idx_to_char': self.idx_to_char,
                # default=0 keeps an empty dataset (num_samples=0) from raising
                'max_sequence_length': max((s['sequence_length'] for s in samples), default=0)
            },
            'splits': {
                'train': train_samples,
                'val': val_samples
            }
        }

        # Save metadata
        metadata_path = self.output_dir / 'metadata.yaml'
        with open(metadata_path, 'w', encoding='utf-8') as f:
            yaml.dump(metadata, f, default_flow_style=False, allow_unicode=True)

        print(f"✅ Dataset generated successfully!")
        print(f"📊 Train samples: {len(train_samples)}")
        print(f"📊 Validation samples: {len(val_samples)}")
        print(f"📄 Metadata saved to: {metadata_path}")

        return metadata

# Generate dataset
# Seed the module-level RNG used by the generator so a re-run reproduces the
# same digit strings (and therefore the same train/val split).
DATA_SEED = 42
random.seed(DATA_SEED)

generator = SimplifiedDataGenerator()
metadata = generator.generate_dataset(num_samples=2000, train_split=0.8)

print("✅ Data generation completed!")


✅ Data generator initialized
📊 Character set size: 13
🔄 Generating 2000 samples...
  Generated 100/2000 samples
  Generated 200/2000 samples
  Generated 300/2000 samples
  Generated 400/2000 samples
  Generated 500/2000 samples
  Generated 600/2000 samples
  Generated 700/2000 samples
  Generated 800/2000 samples
  Generated 900/2000 samples
  Generated 1000/2000 samples
  Generated 1100/2000 samples
  Generated 1200/2000 samples
  Generated 1300/2000 samples
  Generated 1400/2000 samples
  Generated 1500/2000 samples
  Generated 1600/2000 samples
  Generated 1700/2000 samples
  Generated 1800/2000 samples
  Generated 1900/2000 samples
  Generated 2000/2000 samples
✅ Dataset generated successfully!
📊 Train samples: 1600
📊 Validation samples: 400
📄 Metadata saved to: generated_data/metadata.yaml
✅ Data generation completed!


In [13]:
# Simplified OCR Model for Colab
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import yaml

class SimpleOCRModel(nn.Module):
    """CNN -> bidirectional LSTM -> per-step linear classifier for Khmer digits.

    The CNN output is pooled to a fixed 4x8 grid, so ``forward`` always emits
    32 time steps; ``max_sequence_length`` is stored but does not affect that.

    Args:
        vocab_size: Number of output classes per time step.
        max_sequence_length: Kept as an attribute only.
        model_size: ``'small'``, ``'medium'`` or ``'large'``.
    """

    def __init__(self, vocab_size, max_sequence_length, model_size='medium'):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_sequence_length = max_sequence_length

        # Every preset uses one width for both the last conv layer and the LSTM
        preset_width = {'small': 128, 'medium': 256, 'large': 512}
        width = preset_width[model_size]

        # Three conv stages; the first two halve the resolution, the last pools to 4x8
        conv_layers = [
            nn.Conv2d(3, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(128, width, 3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((4, 8)),
        ]
        self.cnn = nn.Sequential(*conv_layers)

        # Sequence model over the flattened grid positions
        self.rnn = nn.LSTM(
            input_size=width,
            hidden_size=width,
            batch_first=True,
            bidirectional=True,
        )

        # Forward and backward LSTM states are concatenated, hence 2 * width
        self.classifier = nn.Linear(2 * width, vocab_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Map an image batch ``[B, 3, H, W]`` to logits ``[B, 32, vocab_size]``."""
        feature_map = self.cnn(x)  # [B, C, 4, 8]

        # Collapse the spatial grid into one axis and move it in front of channels
        batch, channels = x.size(0), feature_map.size(1)
        sequence = feature_map.view(batch, channels, -1).transpose(1, 2)  # [B, 32, C]

        encoded, _ = self.rnn(sequence)  # [B, 32, 2 * hidden]

        return self.classifier(self.dropout(encoded))

class KhmerDataset(Dataset):
    """Dataset yielding ``(image, target, length)`` triples for Khmer digit OCR.

    Args:
        samples: Sequence of dicts with at least ``'image_path'`` and ``'text'``.
        char_to_idx: Mapping from character to class index; must contain
            ``'<EOS>'`` and ``'<PAD>'``.
        max_seq_len: Fixed target length (>= 1), counting the ``<EOS>`` token.
        transform: Callable applied to the RGB PIL image. When omitted (or
            falsy), ``ToTensor`` followed by ImageNet mean/std normalisation is used.

    Raises:
        ValueError: If ``max_seq_len`` is smaller than 1.
    """

    def __init__(self, samples, char_to_idx, max_seq_len, transform=None):
        if max_seq_len < 1:
            raise ValueError("max_seq_len must be at least 1 to hold the <EOS> token")
        self.samples = samples
        self.char_to_idx = char_to_idx
        self.max_seq_len = max_seq_len
        self.transform = transform or transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.samples)

    def _encode_target(self, text):
        """Encode ``text`` as ``max_seq_len`` class indices: characters, ``<EOS>``, then ``<PAD>``.

        Over-long text is cut to ``max_seq_len - 1`` characters so that
        ``<EOS>`` is always present and the reported length never exceeds
        the tensor length.

        Returns:
            ``(target, length)`` where ``target`` is a long tensor of shape
            ``[max_seq_len]`` and ``length`` counts the characters plus ``<EOS>``.

        Raises:
            KeyError: If ``text`` contains a character missing from ``char_to_idx``.
        """
        token_ids = [self.char_to_idx[char] for char in text][:self.max_seq_len - 1]
        token_ids.append(self.char_to_idx['<EOS>'])
        length = len(token_ids)

        # Pad sequence up to the fixed length
        token_ids.extend([self.char_to_idx['<PAD>']] * (self.max_seq_len - length))

        return torch.tensor(token_ids, dtype=torch.long), length

    def __getitem__(self, idx):
        sample = self.samples[idx]

        # Load image (self.transform is always set, see __init__).
        # `Image` is PIL's module, imported in the data-generation cell.
        image = Image.open(sample['image_path']).convert('RGB')
        image = self.transform(image)

        # Prepare target sequence
        target, length = self._encode_target(sample['text'])

        return image, target, length

def create_model(model_size, vocab_size, max_sequence_length):
    """Build a SimpleOCRModel for the given size preset, vocabulary and sequence length."""
    model = SimpleOCRModel(
        vocab_size=vocab_size,
        max_sequence_length=max_sequence_length,
        model_size=model_size,
    )
    return model

print("✅ Model and dataset classes created!")


✅ Model and dataset classes created!


In [14]:
# Simplified Trainer for Hyperparameter Tuning
import time
from datetime import datetime
import copy
import shutil

class SimpleTrainer:
    """Minimal train/validate loop used for the hyperparameter experiments.

    Args:
        model: ``nn.Module`` producing logits of shape ``[B, T, vocab]``.
        train_loader: Iterable of ``(images, targets, lengths)`` batches.
        val_loader: Iterable of ``(images, targets, lengths)`` batches.
        config: Experiment dict. Keys read here: ``optimizer.type`` (and
            ``optimizer.betas`` for adamw), ``scheduler.type`` (plus
            ``min_lr`` or ``step_size``/``gamma``), ``training.learning_rate``,
            ``training.weight_decay``, ``training.num_epochs``,
            ``experiment_name`` and ``early_stopping.patience``/``min_delta``.
        device: Device the model and batches are moved to.
        pad_idx: Class index ignored by the loss. When omitted, it is read
            from the notebook-level ``metadata`` dict (original behaviour).

    NOTE(review): ``training.label_smoothing``, ``training.loss_type`` and any
    gradient-clip setting in ``config`` are not consulted; the loss is plain
    cross entropy and the clip norm is fixed at 1.0.
    """

    def __init__(self, model, train_loader, val_loader, config, device, pad_idx=None):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.config = config
        self.device = device

        # Setup optimizer
        if config['optimizer']['type'] == 'adamw':
            self.optimizer = torch.optim.AdamW(
                model.parameters(),
                lr=config['training']['learning_rate'],
                weight_decay=config['training']['weight_decay'],
                betas=config['optimizer']['betas']
            )
        else:
            self.optimizer = torch.optim.Adam(
                model.parameters(),
                lr=config['training']['learning_rate'],
                weight_decay=config['training']['weight_decay']
            )

        # Setup scheduler
        if config['scheduler']['type'] == 'cosine':
            self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                self.optimizer,
                T_max=config['training']['num_epochs'],
                eta_min=config['scheduler']['min_lr']
            )
        elif config['scheduler']['type'] == 'steplr':
            self.scheduler = torch.optim.lr_scheduler.StepLR(
                self.optimizer,
                step_size=config['scheduler']['step_size'],
                gamma=config['scheduler']['gamma']
            )
        else:
            self.scheduler = None

        # Loss function: padded target positions do not contribute
        if pad_idx is None:
            # Backward-compatible fallback to the notebook global
            pad_idx = metadata['dataset_info']['char_to_idx']['<PAD>']
        self.criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

        # Training history (one entry per completed epoch)
        self.history = {
            'train_loss': [],
            'val_loss': [],
            'val_char_accuracy': [],
            'val_seq_accuracy': []
        }

        self.best_val_acc = 0.0
        self.best_model_state = None

    def calculate_accuracy(self, outputs, targets, lengths):
        """Calculate character and sequence accuracy over one batch.

        Only the first ``length`` positions of each sequence are compared.

        Args:
            outputs: Logits ``[B, T, vocab]``.
            targets: Class indices ``[B, T]``.
            lengths: Per-sample valid lengths (tensor or sequence of ints).

        Returns:
            ``(char_accuracy, seq_accuracy)`` as plain Python floats.
        """
        predictions = torch.argmax(outputs, dim=-1)

        char_correct = 0
        char_total = 0
        seq_correct = 0

        for pred, target, length in zip(predictions, targets, lengths):
            # Lengths arrive as 0-dim tensors from the DataLoader; convert so
            # the running totals (and the returned metrics) stay Python numbers.
            length = int(length)

            # Character accuracy
            pred_chars = pred[:length]
            target_chars = target[:length]
            char_correct += (pred_chars == target_chars).sum().item()
            char_total += length

            # Sequence accuracy
            if torch.equal(pred_chars, target_chars):
                seq_correct += 1

        num_sequences = len(lengths)
        char_accuracy = char_correct / char_total if char_total > 0 else 0.0
        seq_accuracy = seq_correct / num_sequences if num_sequences > 0 else 0.0

        return char_accuracy, seq_accuracy

    def train_epoch(self):
        """Train for one epoch and return the mean batch loss."""
        self.model.train()
        total_loss = 0
        num_batches = 0

        for images, targets, lengths in self.train_loader:
            images = images.to(self.device)
            targets = targets.to(self.device)

            self.optimizer.zero_grad()

            outputs = self.model(images)

            # Reshape for loss computation: one row of logits per target position
            outputs = outputs.view(-1, outputs.size(-1))
            targets = targets.view(-1)

            loss = self.criterion(outputs, targets)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

            self.optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        return total_loss / num_batches

    def validate(self):
        """Validate the model.

        Returns:
            ``(mean batch loss, mean per-batch char accuracy, mean per-batch
            sequence accuracy)``; batches are weighted equally.
        """
        self.model.eval()
        total_loss = 0
        all_char_acc = []
        all_seq_acc = []

        with torch.no_grad():
            for images, targets, lengths in self.val_loader:
                images = images.to(self.device)
                targets = targets.to(self.device)

                outputs = self.model(images)

                # Calculate loss
                outputs_flat = outputs.view(-1, outputs.size(-1))
                targets_flat = targets.view(-1)
                loss = self.criterion(outputs_flat, targets_flat)
                total_loss += loss.item()

                # Calculate accuracy
                char_acc, seq_acc = self.calculate_accuracy(outputs, targets, lengths)
                all_char_acc.append(char_acc)
                all_seq_acc.append(seq_acc)

        avg_loss = total_loss / len(self.val_loader)
        avg_char_acc = sum(all_char_acc) / len(all_char_acc)
        avg_seq_acc = sum(all_seq_acc) / len(all_seq_acc)

        return avg_loss, avg_char_acc, avg_seq_acc

    def train(self):
        """Full training loop; returns the history dict.

        After each epoch the best validation character accuracy and a deep
        copy of the corresponding ``state_dict`` are kept on the instance.
        Training stops early once the spread (max - min) of the last
        ``patience`` validation character accuracies falls below ``min_delta``.
        """
        print(f"🚀 Starting training: {self.config['experiment_name']}")

        for epoch in range(self.config['training']['num_epochs']):
            start_time = time.time()

            # Train
            train_loss = self.train_epoch()

            # Validate
            val_loss, val_char_acc, val_seq_acc = self.validate()

            # Update scheduler
            if self.scheduler:
                self.scheduler.step()

            # Update history
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)
            self.history['val_char_accuracy'].append(val_char_acc)
            self.history['val_seq_accuracy'].append(val_seq_acc)

            # Save best model
            if val_char_acc > self.best_val_acc:
                self.best_val_acc = val_char_acc
                self.best_model_state = copy.deepcopy(self.model.state_dict())

            epoch_time = time.time() - start_time

            print(f"Epoch {epoch+1}/{self.config['training']['num_epochs']} | "
                  f"Train Loss: {train_loss:.4f} | "
                  f"Val Loss: {val_loss:.4f} | "
                  f"Val Char Acc: {val_char_acc:.4f} | "
                  f"Val Seq Acc: {val_seq_acc:.4f} | "
                  f"Time: {epoch_time:.1f}s")

            # Early stopping check
            if len(self.history['val_char_accuracy']) >= self.config['early_stopping']['patience']:
                recent_accs = self.history['val_char_accuracy'][-self.config['early_stopping']['patience']:]
                if max(recent_accs) - min(recent_accs) < self.config['early_stopping']['min_delta']:
                    print(f"Early stopping triggered at epoch {epoch+1}")
                    break

        return self.history

print("✅ Trainer class created!")


✅ Trainer class created!


In [15]:
# Fixed Model and Trainer Classes
print("🔧 Creating fixed model and trainer classes...")

class FixedSimpleOCRModel(nn.Module):
    """CNN + bidirectional LSTM OCR model that emits a fixed number of time steps.

    The convolutional backbone ends in an adaptive average pool that collapses
    the height to 1 and resamples the width to ``max_sequence_length``, so the
    output always has exactly ``max_sequence_length`` positions regardless of
    the input image size.

    Args:
        vocab_size: Number of output classes per position.
        max_sequence_length: Number of positions (time steps) to emit.
        model_size: One of ``'small'``, ``'medium'`` or ``'large'``; selects the
            CNN channel count and LSTM hidden size. Any other value raises
            ``KeyError``.
    """

    def __init__(self, vocab_size, max_sequence_length, model_size='medium'):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_sequence_length = max_sequence_length

        # Model size configurations
        size_configs = {
            'small': {'cnn_features': 128, 'rnn_hidden': 128},
            'medium': {'cnn_features': 256, 'rnn_hidden': 256},
            'large': {'cnn_features': 512, 'rnn_hidden': 512}
        }
        config = size_configs[model_size]

        # CNN backbone: expects 3-channel input; two 2x2 max-pools downsample
        # by 4 before the adaptive pool fixes the output width.
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(128, config['cnn_features'], 3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, max_sequence_length))  # Force exact sequence length
        )

        # RNN for sequence modeling (bidirectional, so output width is 2 * hidden)
        self.rnn = nn.LSTM(
            config['cnn_features'],
            config['rnn_hidden'],
            batch_first=True,
            bidirectional=True
        )

        # Classification head
        self.classifier = nn.Linear(config['rnn_hidden'] * 2, vocab_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Map a batch of images to per-position class logits.

        Args:
            x: Image batch of shape ``[B, 3, H, W]``.

        Returns:
            Tensor of shape ``[B, max_sequence_length, vocab_size]``.
        """
        # CNN feature extraction
        features = self.cnn(x)  # [B, C, 1, max_seq_len]

        # Drop the singleton height axis and put time before channels for the
        # batch_first LSTM: [B, max_seq_len, C]
        features = features.squeeze(2).permute(0, 2, 1)

        # RNN
        rnn_out, _ = self.rnn(features)  # [B, max_seq_len, hidden*2]

        # Apply dropout and classification
        rnn_out = self.dropout(rnn_out)
        logits = self.classifier(rnn_out)  # [B, max_seq_len, vocab_size]

        return logits

class FixedSimpleTrainer:
    """Trainer for models whose output length equals the padded target length.

    Loss is computed by flattening the logits to ``[B*T, vocab]`` and the
    targets to ``[B*T]``, so the model must emit one set of logits per target
    position. Positions equal to ``char_to_idx['<PAD>']`` are ignored by the
    loss.

    Args:
        model: The ``nn.Module`` to train; it is moved to ``device``.
        train_loader: Iterable yielding ``(images, targets, lengths)`` batches.
        val_loader: Iterable yielding ``(images, targets, lengths)`` batches.
        config: Experiment configuration. Reads ``experiment_name``,
            ``training`` (``learning_rate``, ``weight_decay``, ``num_epochs``),
            ``optimizer`` (``type``, optional ``betas``), ``scheduler``
            (``type`` plus the keys of the chosen scheduler) and the optional
            ``early_stopping`` section (``patience``, ``min_delta``).
        device: Device the model and batches are moved to.
        char_to_idx: Character vocabulary; must contain ``'<PAD>'``.

    Note:
        ``training.label_smoothing``, ``training.loss_type`` and
        ``scheduler.warmup_epochs`` are not read by this trainer.
    """

    def __init__(self, model, train_loader, val_loader, config, device, char_to_idx):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.config = config
        self.device = device
        self.char_to_idx = char_to_idx

        # Setup optimizer. The configured betas apply to both optimizers;
        # previously plain Adam silently ignored them.
        optimizer_cfg = config['optimizer']
        training_cfg = config['training']
        betas = tuple(optimizer_cfg.get('betas', (0.9, 0.999)))
        if optimizer_cfg['type'] == 'adamw':
            optimizer_cls = torch.optim.AdamW
        else:
            optimizer_cls = torch.optim.Adam
        self.optimizer = optimizer_cls(
            self.model.parameters(),
            lr=training_cfg['learning_rate'],
            weight_decay=training_cfg['weight_decay'],
            betas=betas
        )

        # Setup scheduler. Both 'step' and 'steplr' select StepLR, so a config
        # that spells the type 'step' no longer falls through to "no scheduler".
        scheduler_cfg = config['scheduler']
        scheduler_type = scheduler_cfg['type']
        if scheduler_type == 'cosine':
            self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                self.optimizer,
                T_max=training_cfg['num_epochs'],
                eta_min=scheduler_cfg['min_lr']
            )
        elif scheduler_type in ('step', 'steplr'):
            self.scheduler = torch.optim.lr_scheduler.StepLR(
                self.optimizer,
                step_size=scheduler_cfg['step_size'],
                gamma=scheduler_cfg['gamma']
            )
        else:
            self.scheduler = None

        # Loss function: padded positions do not contribute.
        self.criterion = nn.CrossEntropyLoss(ignore_index=char_to_idx['<PAD>'])

        # Training history, one entry per completed epoch.
        self.history = {
            'train_loss': [],
            'val_loss': [],
            'val_char_accuracy': [],
            'val_seq_accuracy': []
        }

        self.best_val_acc = 0.0
        self.best_model_state = None

    def calculate_accuracy(self, outputs, targets, lengths):
        """Compute character-level and whole-sequence accuracy for one batch.

        Args:
            outputs: Logits of shape ``[B, T, vocab]``.
            targets: Target indices of shape ``[B, T]``.
            lengths: Per-sample number of valid (non-padding) positions.

        Returns:
            tuple[float, float]: ``(char_accuracy, seq_accuracy)`` as plain
            Python floats; both are ``0.0`` for an empty batch.
        """
        predictions = torch.argmax(outputs, dim=-1)

        char_correct = 0
        char_total = 0
        seq_correct = 0

        for pred, target, length in zip(predictions, targets, lengths):
            # `lengths` usually arrives as a tensor; convert each entry to int
            # so the running totals (and the returned accuracy) stay plain
            # Python numbers and remain JSON-serialisable.
            length = int(length)

            # Character accuracy (exclude padding)
            pred_chars = pred[:length]
            target_chars = target[:length]
            char_correct += (pred_chars == target_chars).sum().item()
            char_total += length

            # Sequence accuracy
            if torch.equal(pred_chars, target_chars):
                seq_correct += 1

        num_sequences = len(lengths)
        char_accuracy = char_correct / char_total if char_total > 0 else 0.0
        seq_accuracy = seq_correct / num_sequences if num_sequences > 0 else 0.0

        return char_accuracy, seq_accuracy

    def train_epoch(self):
        """Run one optimisation pass over ``train_loader``.

        Returns:
            float: Mean batch loss, or ``0.0`` if the loader yielded no batches.
        """
        self.model.train()
        total_loss = 0.0
        num_batches = 0

        for images, targets, lengths in self.train_loader:
            images = images.to(self.device)
            targets = targets.to(self.device)

            self.optimizer.zero_grad()

            outputs = self.model(images)  # [B, max_seq_len, vocab_size]

            # Outputs and targets share the same sequence length, so both can
            # be flattened position-by-position for the loss.
            flat_outputs = outputs.reshape(-1, outputs.size(-1))  # [B*max_seq_len, vocab_size]
            flat_targets = targets.reshape(-1)  # [B*max_seq_len]

            loss = self.criterion(flat_outputs, flat_targets)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

            self.optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        return total_loss / num_batches if num_batches > 0 else 0.0

    def validate(self):
        """Evaluate the model on ``val_loader`` without gradient tracking.

        Returns:
            tuple[float, float, float]: Mean batch loss, mean per-batch
            character accuracy and mean per-batch sequence accuracy. All three
            are ``0.0`` if the loader yielded no batches.
        """
        self.model.eval()
        total_loss = 0.0
        num_batches = 0
        all_char_acc = []
        all_seq_acc = []

        with torch.no_grad():
            for images, targets, lengths in self.val_loader:
                images = images.to(self.device)
                targets = targets.to(self.device)

                outputs = self.model(images)

                # Calculate loss
                outputs_flat = outputs.reshape(-1, outputs.size(-1))
                targets_flat = targets.reshape(-1)
                loss = self.criterion(outputs_flat, targets_flat)
                total_loss += loss.item()
                num_batches += 1

                # Calculate accuracy
                char_acc, seq_acc = self.calculate_accuracy(outputs, targets, lengths)
                all_char_acc.append(char_acc)
                all_seq_acc.append(seq_acc)

        avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
        avg_char_acc = sum(all_char_acc) / len(all_char_acc) if all_char_acc else 0.0
        avg_seq_acc = sum(all_seq_acc) / len(all_seq_acc) if all_seq_acc else 0.0

        return avg_loss, avg_char_acc, avg_seq_acc

    def train(self):
        """Run the full training loop and return the metric history.

        Each epoch trains, validates, steps the scheduler (if any), records the
        metrics, snapshots the weights when validation character accuracy
        improves, and prints a one-line report. When the config has an
        ``early_stopping`` section with a ``patience`` value, training stops
        once the spread of the last ``patience`` validation character
        accuracies is below ``min_delta``; without that section no early
        stopping is applied.

        Returns:
            dict: ``self.history`` with one entry per completed epoch.
        """
        print(f"🚀 Starting training: {self.config['experiment_name']}")

        for epoch in range(self.config['training']['num_epochs']):
            start_time = time.time()

            # Train
            train_loss = self.train_epoch()

            # Validate
            val_loss, val_char_acc, val_seq_acc = self.validate()

            # Update scheduler
            if self.scheduler is not None:
                self.scheduler.step()

            # Update history
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)
            self.history['val_char_accuracy'].append(val_char_acc)
            self.history['val_seq_accuracy'].append(val_seq_acc)

            # Save best model (deep copy so later updates do not alter it)
            if val_char_acc > self.best_val_acc:
                self.best_val_acc = val_char_acc
                self.best_model_state = copy.deepcopy(self.model.state_dict())

            epoch_time = time.time() - start_time

            print(f"Epoch {epoch+1}/{self.config['training']['num_epochs']} | "
                  f"Train Loss: {train_loss:.4f} | "
                  f"Val Loss: {val_loss:.4f} | "
                  f"Val Char Acc: {val_char_acc:.4f} | "
                  f"Val Seq Acc: {val_seq_acc:.4f} | "
                  f"Time: {epoch_time:.1f}s")

            # Early stopping check. The section is optional: configs without
            # it simply train for the full number of epochs.
            stop_cfg = self.config.get('early_stopping') or {}
            patience = stop_cfg.get('patience')
            accuracy_log = self.history['val_char_accuracy']
            if patience is not None and len(accuracy_log) >= patience:
                recent_accs = accuracy_log[-patience:]
                if max(recent_accs) - min(recent_accs) < stop_cfg.get('min_delta', 0.0):
                    print(f"Early stopping triggered at epoch {epoch+1}")
                    break

        return self.history

# Update create_model function
def create_model_fixed(model_size, vocab_size, max_sequence_length):
    """Build a FixedSimpleOCRModel for the requested size preset.

    Note the argument order differs from the model constructor: this factory
    takes the size preset first.
    """
    model = FixedSimpleOCRModel(
        vocab_size=vocab_size,
        max_sequence_length=max_sequence_length,
        model_size=model_size,
    )
    return model

# Confirmation message so the cell output shows the fixed definitions above were executed.
print("✅ Fixed model and trainer classes created!")


🔧 Creating fixed model and trainer classes...
✅ Fixed model and trainer classes created!


In [16]:
# Save results and generate summary.
# Requires `fixed_tuner` to exist already: run the experiment cell that creates
# it before this one, otherwise this cell fails with a NameError.
print("💾 Saving results to Google Drive...")
results_file = fixed_tuner.save_results()

# Generate summary report
summary = fixed_tuner.generate_summary()
print("\n📊 EXPERIMENT SUMMARY")
print("=" * 50)
for key, value in summary.items():
    # Durations are checked first: they are normally floats, so testing the
    # generic float branch first would print them as raw seconds and never
    # reach the minutes formatting.
    if key.endswith('_time') and isinstance(value, (int, float)) and not isinstance(value, bool):
        print(f"{key}: {value/60:.1f} minutes")
    elif isinstance(value, float):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")

print(f"\n📁 Results saved to Google Drive:")
print(f"  • Detailed results: {results_file}")

# Show final status
completed_experiments = [r for r in fixed_tuner.results if r.get('status') == 'completed']
failed_experiments = [r for r in fixed_tuner.results if r.get('status') == 'failed']

print(f"\n🎯 FINAL STATUS:")
print(f"✅ Completed experiments: {len(completed_experiments)}")
print(f"❌ Failed experiments: {len(failed_experiments)}")

if completed_experiments:
    print(f"\n🏆 BEST PERFORMING MODEL:")
    best = fixed_tuner.best_result
    if best is None:
        # The tuner's best_result was never set; fall back to the completed
        # run with the highest validation character accuracy.
        best = max(completed_experiments, key=lambda r: r['best_val_char_accuracy'])
    print(f"  • Experiment: {best['experiment_name']}")
    print(f"  • Character accuracy: {best['best_val_char_accuracy']:.4f}")
    print(f"  • Sequence accuracy: {best['best_val_seq_accuracy']:.4f}")
    print(f"  • Training time: {best['training_time']/60:.1f} minutes")
    print(f"  • Model saved at: {best.get('model_path', 'N/A')}")

print(f"\n✅ Hyperparameter tuning completed successfully!")


💾 Saving results to Google Drive...


NameError: name 'fixed_tuner' is not defined

In [None]:
# Quick fix for tensor serialization issue
def convert_tensors_to_python(obj):
    """Recursively convert tensors to Python types for JSON serialization.

    Args:
        obj: Any value, typically a nested structure of dicts and sequences
            holding tensors and plain Python scalars.

    Returns:
        A structure made only of dicts, lists and JSON-compatible scalars:
        - single-element tensors become Python scalars, larger tensors lists;
        - dict values are converted recursively (keys are left untouched);
        - lists and tuples become lists with converted items;
        - other objects exposing ``tolist()`` (e.g. numpy scalars and arrays)
          are converted through it;
        - anything else is replaced by ``str(obj)``.
    """
    if isinstance(obj, torch.Tensor):
        return obj.item() if obj.numel() == 1 else obj.tolist()
    if isinstance(obj, dict):
        return {key: convert_tensors_to_python(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        # Tuples are handled like lists so tensors nested inside them are
        # converted instead of the whole tuple being stringified.
        return [convert_tensors_to_python(item) for item in obj]
    if isinstance(obj, (int, float, str, bool, type(None))):
        return obj

    # Array-like objects (numpy scalars/arrays) know how to turn themselves
    # into plain Python values.
    to_list = getattr(obj, 'tolist', None)
    if callable(to_list):
        return convert_tensors_to_python(to_list())

    # For other types, fall back to the string representation
    return str(obj)

# Save results with tensor conversion.
# Requires `fixed_tuner` (created by the experiment cell) and `project_drive_path`
# (created by the Drive-mount cell) to exist already.
print("💾 Saving results to Google Drive (with tensor conversion)...")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Convert tensors to Python types for JSON serialization
serializable_results = convert_tensors_to_python({
    'timestamp': timestamp,
    'device': str(fixed_tuner.device),
    'dataset_info': fixed_tuner.metadata['dataset_info'],
    'best_result': fixed_tuner.best_result,
    'all_results': fixed_tuner.results,
    'summary': fixed_tuner.generate_summary()
})

# Save detailed results
results_file = f"{project_drive_path}/results/hyperparameter_tuning_results_{timestamp}.json"
with open(results_file, 'w') as f:
    json.dump(serializable_results, f, indent=2)

print(f"✅ Results saved successfully to: {results_file}")

# Generate summary report
summary = fixed_tuner.generate_summary()
print("\n📊 EXPERIMENT SUMMARY")
print("=" * 50)
for key, value in summary.items():
    # Durations are checked first: they are normally floats, so testing the
    # generic float branch first would print them as raw seconds and never
    # reach the minutes formatting.
    if key.endswith('_time') and isinstance(value, (int, float)) and not isinstance(value, bool):
        print(f"{key}: {value/60:.1f} minutes")
    elif isinstance(value, float):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")

print(f"\n📁 Results saved to Google Drive:")
print(f"  • Detailed results: {results_file}")

# Show final status
completed_experiments = [r for r in fixed_tuner.results if r.get('status') == 'completed']
failed_experiments = [r for r in fixed_tuner.results if r.get('status') == 'failed']

print(f"\n🎯 FINAL STATUS:")
print(f"✅ Completed experiments: {len(completed_experiments)}")
print(f"❌ Failed experiments: {len(failed_experiments)}")

if completed_experiments:
    print(f"\n🏆 BEST PERFORMING MODEL:")
    best = fixed_tuner.best_result
    if best is None:
        # The tuner's best_result was never set; fall back to the completed
        # run with the highest validation character accuracy.
        best = max(completed_experiments, key=lambda r: r['best_val_char_accuracy'])
    print(f"  • Experiment: {best['experiment_name']}")
    print(f"  • Character accuracy: {best['best_val_char_accuracy']:.4f}")
    print(f"  • Sequence accuracy: {best['best_val_seq_accuracy']:.4f}")
    print(f"  • Training time: {best['training_time']/60:.1f} minutes")
    print(f"  • Model saved at: {best.get('model_path', 'N/A')}")

    # List all completed experiments
    print(f"\n📋 ALL COMPLETED EXPERIMENTS:")
    for i, result in enumerate(completed_experiments, 1):
        print(f"  {i}. {result['experiment_name']}")
        print(f"     - Character accuracy: {result['best_val_char_accuracy']:.4f}")
        print(f"     - Sequence accuracy: {result['best_val_seq_accuracy']:.4f}")
        print(f"     - Training time: {result['training_time']/60:.1f} min")

if failed_experiments:
    print(f"\n❌ FAILED EXPERIMENTS:")
    for result in failed_experiments:
        print(f"  • {result['experiment_name']}: {result.get('error', 'Unknown error')}")

print(f"\n✅ Hyperparameter tuning completed successfully!")


In [None]:
# Test the fixed classes with a single experiment.
# Smoke test: builds a 32-sample dataset and a 'small' model, runs one forward
# pass on the first batch and checks that the flattened logits and targets can
# be fed to CrossEntropyLoss. No training step is performed.
print("🧪 Testing fixed classes with a single experiment...")

# Load metadata (path is relative to the current working directory)
with open('generated_data/metadata.yaml', 'r') as f:
    metadata = yaml.safe_load(f)

print(f"📊 Dataset info:")
print(f"  • Max sequence length: {metadata['dataset_info']['max_sequence_length']}")
print(f"  • Vocab size: {len(metadata['dataset_info']['char_to_idx'])}")
print(f"  • Training samples: {metadata['dataset_info']['train_samples']}")

# Create a small test dataset
# NOTE(review): the "+ 1" presumably reserves one extra position (e.g. for an
# end-of-sequence token) — confirm against KhmerDataset. The same value is
# passed to the model below so both agree on the padded length.
test_dataset = KhmerDataset(
    metadata['splits']['train'][:32],  # Use only 32 samples for quick test
    metadata['dataset_info']['char_to_idx'],
    metadata['dataset_info']['max_sequence_length'] + 1
)

test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Create fixed model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test_model = create_model_fixed(
    model_size='small',  # Use small model for quick test
    vocab_size=len(metadata['dataset_info']['char_to_idx']),
    max_sequence_length=metadata['dataset_info']['max_sequence_length'] + 1
)

print(f"🔥 Using device: {device}")
print(f"📏 Model max sequence length: {test_model.max_sequence_length}")

# Test forward pass (eval mode disables dropout; no_grad skips autograd bookkeeping)
test_model.to(device)
test_model.eval()

with torch.no_grad():
    for images, targets, lengths in test_loader:
        images = images.to(device)
        targets = targets.to(device)

        print(f"\n🔍 Testing batch:")
        print(f"  • Images shape: {images.shape}")
        print(f"  • Targets shape: {targets.shape}")
        print(f"  • Sequence lengths: {lengths.tolist()}")

        # Forward pass
        outputs = test_model(images)
        print(f"  • Model outputs shape: {outputs.shape}")

        # Test loss calculation (padding positions are excluded from the loss)
        pad_idx = metadata['dataset_info']['char_to_idx']['<PAD>']
        criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_idx)

        # Reshape for loss: one row of logits per target position
        outputs_flat = outputs.view(-1, outputs.size(-1))
        targets_flat = targets.view(-1)

        print(f"  • Flattened outputs shape: {outputs_flat.shape}")
        print(f"  • Flattened targets shape: {targets_flat.shape}")

        # This should work now!
        loss = criterion(outputs_flat, targets_flat)
        print(f"  ✅ Loss calculated successfully: {loss.item():.4f}")

        break  # Only test first batch

# NOTE: these messages are printed unconditionally, even if the loader yielded no batch.
print("\n✅ Fixed classes are working correctly!")
print("🚀 You can now run experiments with FixedHyperparameterTuner")

# Clean up: drop the test objects and release cached GPU memory
del test_model, test_dataset, test_loader
if torch.cuda.is_available():
    torch.cuda.empty_cache()


In [None]:
# 🚨 IMPORTANT: Use the FIXED tuner to avoid sequence length errors!
# This part of the cell creates `fixed_tuner` and the `experiments` dict, which
# maps each experiment name to its configuration.
print("🚀 Starting ALL experiments with FIXED classes...")
print("⚠️  This will use FixedHyperparameterTuner (not the old one)")

# Initialize the FIXED tuner (not the old one!)
fixed_tuner = FixedHyperparameterTuner()

# Define ALL experiment configurations with FIXED settings.
# Every entry has the same four sections: model, training, optimizer, scheduler.
# NOTE(review): none of the entries has an 'early_stopping' section — confirm
# the tuner supplies one if the trainer it uses requires it.
experiments = {
    # Medium model, AdamW with cosine schedule.
    'baseline_gpu_optimized_fixed': {
        'experiment_name': 'baseline_gpu_optimized_fixed',
        'model': {
            'name': 'medium',
            'config_path': 'config/model_config.yaml'
        },
        'training': {
            'batch_size': 128,
            'learning_rate': 0.002,
            'weight_decay': 0.0001,
            'num_epochs': 25,
            'loss_type': 'crossentropy',
            'label_smoothing': 0.1
        },
        'optimizer': {
            'type': 'adamw',
            'betas': [0.9, 0.999]
        },
        'scheduler': {
            'type': 'cosine',
            'warmup_epochs': 3,
            'min_lr': 1e-6
        }
    },

    # Medium model, larger batch and learning rate, Adam with step decay.
    'aggressive_learning_fixed': {
        'experiment_name': 'aggressive_learning_fixed',
        'model': {
            'name': 'medium',
            'config_path': 'config/model_config.yaml'
        },
        'training': {
            'batch_size': 256,
            'learning_rate': 0.003,
            'weight_decay': 0.00005,
            'num_epochs': 25,
            'loss_type': 'crossentropy',
            'label_smoothing': 0.05
        },
        'optimizer': {
            'type': 'adam',
            'betas': [0.9, 0.999]
        },
        # NOTE(review): confirm the trainer's scheduler dispatch recognises the
        # type name 'step'; an unrecognised name results in no scheduler.
        'scheduler': {
            'type': 'step',
            'step_size': 8,
            'gamma': 0.5
        }
    },

    # Large model, smaller batch and learning rate, more epochs.
    'large_model_gpu_fixed': {
        'experiment_name': 'large_model_gpu_fixed',
        'model': {
            'name': 'large',
            'config_path': 'config/model_config.yaml'
        },
        'training': {
            'batch_size': 64,
            'learning_rate': 0.0008,
            'weight_decay': 0.0003,
            'num_epochs': 30,
            'loss_type': 'crossentropy',
            'label_smoothing': 0.15
        },
        'optimizer': {
            'type': 'adamw',
            'betas': [0.9, 0.99]
        },
        'scheduler': {
            'type': 'cosine',
            'warmup_epochs': 5,
            'min_lr': 1e-7
        }
    }
}

# Announce the experiments that are about to run.
print(f"📋 Will run {len(experiments)} experiments:")
for name in experiments:
    print(f"  • {name}")

# Hand the whole batch to the FIXED tuner in a single call.
results = fixed_tuner.run_experiments(experiments)

print("\n🎉 All experiments completed!")
print(f"📊 Results summary:")
print(f"  • Total experiments: {len(results)}")
print(f"  • Successful: {len([r for r in results if r['status'] == 'completed'])}")
print(f"  • Failed: {len([r for r in results if r['status'] == 'failed'])}")

# Pick the completed run with the highest validation character accuracy
# (the earliest one wins on ties).
best_result = None
for result in results:
    if result['status'] == 'completed' and (
        best_result is None
        or result['best_val_char_accuracy'] > best_result['best_val_char_accuracy']
    ):
        best_result = result

if not best_result:
    print("\n❌ No successful experiments")
else:
    print(f"\n🏆 Best experiment: {best_result['experiment_name']}")
    print(f"  • Character accuracy: {best_result['best_val_char_accuracy']:.4f}")
    print(f"  • Sequence accuracy: {best_result['best_val_seq_accuracy']:.4f}")
    print(f"  • Training time: {best_result['training_time']:.1f} seconds")


In [None]:
# 🔧 CORRECTED APPROACH: Run experiments directly with fixed tuner
# This part of the cell creates `fixed_tuner` and the `experiments` dict, which
# maps each experiment name to its configuration.
print("🚀 Starting experiments with CORRECTED FixedHyperparameterTuner...")

# Initialize the fixed tuner
fixed_tuner = FixedHyperparameterTuner()

# Define experiment configurations directly (not using the config file).
# Every entry has the same four sections: model, training, optimizer, scheduler.
# NOTE(review): none of the entries has an 'early_stopping' section — confirm
# the tuner supplies one if the trainer it uses requires it.
experiments = {
    # Medium model, AdamW with cosine schedule.
    'baseline_gpu_optimized_fixed': {
        'experiment_name': 'baseline_gpu_optimized_fixed',
        'model': {
            'name': 'medium',
            'config_path': 'config/model_config.yaml'
        },
        'training': {
            'batch_size': 128,
            'learning_rate': 0.002,
            'weight_decay': 0.0001,
            'num_epochs': 25,
            'loss_type': 'crossentropy',
            'label_smoothing': 0.1
        },
        'optimizer': {
            'type': 'adamw',
            'betas': [0.9, 0.999]
        },
        'scheduler': {
            'type': 'cosine',
            'warmup_epochs': 3,
            'min_lr': 1e-6
        }
    },

    # Medium model, larger batch and learning rate, Adam with step decay.
    'aggressive_learning_fixed': {
        'experiment_name': 'aggressive_learning_fixed',
        'model': {
            'name': 'medium',
            'config_path': 'config/model_config.yaml'
        },
        'training': {
            'batch_size': 256,
            'learning_rate': 0.003,
            'weight_decay': 0.00005,
            'num_epochs': 25,
            'loss_type': 'crossentropy',
            'label_smoothing': 0.05
        },
        'optimizer': {
            'type': 'adam',
            'betas': [0.9, 0.999]
        },
        # NOTE(review): confirm the trainer's scheduler dispatch recognises the
        # type name 'step'; an unrecognised name results in no scheduler.
        'scheduler': {
            'type': 'step',
            'step_size': 8,
            'gamma': 0.5
        }
    },

    # Large model, smaller batch and learning rate, more epochs.
    'large_model_gpu_fixed': {
        'experiment_name': 'large_model_gpu_fixed',
        'model': {
            'name': 'large',
            'config_path': 'config/model_config.yaml'
        },
        'training': {
            'batch_size': 64,
            'learning_rate': 0.0008,
            'weight_decay': 0.0003,
            'num_epochs': 30,
            'loss_type': 'crossentropy',
            'label_smoothing': 0.15
        },
        'optimizer': {
            'type': 'adamw',
            'betas': [0.9, 0.99]
        },
        'scheduler': {
            'type': 'cosine',
            'warmup_epochs': 5,
            'min_lr': 1e-7
        }
    }
}

# Announce the experiments that are about to run.
print(f"📋 Will run {len(experiments)} experiments:")
for name in experiments:
    print(f"  • {name}")
print()

# Execute the experiments sequentially, reporting each outcome immediately.
results = []
for exp_name, exp_config in experiments.items():
    print(f"\n{'='*60}")
    print(f"🧪 Starting experiment: {exp_name}")
    print(f"{'='*60}")

    result = fixed_tuner.run_single_experiment(exp_name, exp_config)
    results.append(result)

    # Report the outcome of this run straight away.
    if result['status'] != 'completed':
        print(f"❌ {exp_name} failed: {result.get('error', 'Unknown error')}")
    else:
        print(f"✅ {exp_name} completed!")
        print(f"   Character accuracy: {result['best_val_char_accuracy']:.4f}")
        print(f"   Sequence accuracy: {result['best_val_seq_accuracy']:.4f}")
        print(f"   Training time: {result['training_time']/60:.1f} minutes")

    # Release cached GPU memory before the next experiment starts.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print(f"\n🎉 All experiments completed!")
print(f"📊 Results summary:")
print(f"  • Total experiments: {len(results)}")
print(f"  • Successful: {len([r for r in results if r['status'] == 'completed'])}")
print(f"  • Failed: {len([r for r in results if r['status'] == 'failed'])}")

# Pick the completed run with the highest validation character accuracy
# (the earliest one wins on ties).
best_result = None
for result in results:
    if result['status'] == 'completed' and (
        best_result is None
        or result['best_val_char_accuracy'] > best_result['best_val_char_accuracy']
    ):
        best_result = result

if not best_result:
    print("\n❌ No successful experiments")
else:
    print(f"\n🏆 Best experiment: {best_result['experiment_name']}")
    print(f"  • Character accuracy: {best_result['best_val_char_accuracy']:.4f}")
    print(f"  • Sequence accuracy: {best_result['best_val_seq_accuracy']:.4f}")
    print(f"  • Training time: {best_result['training_time']/60:.1f} minutes")
    print(f"  • Epochs completed: {best_result['epochs_trained']}")

# Save results to Google Drive.
# The results are stored on `fixed_tuner`, the tuner created at the top of this
# cell, so that later cells reading fixed_tuner.results / fixed_tuner.best_result
# see this run. The previous code wrote to `tuner`, a different object that this
# cell never creates.
print(f"\n💾 Saving results to Google Drive...")
fixed_tuner.results = results  # Update the tuner's results
if best_result:
    fixed_tuner.best_result = best_result
results_file = fixed_tuner.save_results()
print(f"✅ Results saved to: {results_file}")


In [None]:
# 🛠️ STANDALONE FIXED HYPERPARAMETER TUNER (No external configs needed!)
# The tuner class below still reads 'generated_data/metadata.yaml' in its constructor.
print("🚀 Creating standalone hyperparameter tuning system...")

class StandaloneHyperparameterTuner:
    """Run and record hyperparameter experiments from in-notebook config dicts.

    Experiment configurations are passed in as plain dicts; the only file read
    is the dataset metadata YAML. The class relies on notebook-level names
    defined in other cells: ``project_drive_path``, ``KhmerDataset``,
    ``DataLoader``, ``create_model_fixed`` and ``FixedSimpleTrainer``.

    Attributes:
        device: ``cuda`` when available, otherwise ``cpu``.
        results: List of result dicts. ``run_single_experiment`` does not
            append to it; the calling code is expected to.
        best_result: Best completed result so far, also set by the caller.
        metadata: Parsed contents of the metadata YAML.
    """

    def __init__(self, metadata_path='generated_data/metadata.yaml'):
        """Select the device and load the dataset metadata.

        Args:
            metadata_path: Path of the metadata YAML. Defaults to the location
                used by the rest of the notebook.

        Raises:
            OSError: If the metadata file cannot be opened.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.results = []
        self.best_result = None

        # Explicit UTF-8: the metadata holds the char_to_idx vocabulary, which
        # may contain non-ASCII characters, so do not depend on the locale.
        with open(metadata_path, 'r', encoding='utf-8') as f:
            self.metadata = yaml.safe_load(f)

        print(f"🔥 Using device: {self.device}")
        print(f"📊 Dataset loaded with {self.metadata['dataset_info']['total_samples']} samples")

    def run_single_experiment(self, experiment_name, experiment_config):
        """Train one model for one configuration and summarise the outcome.

        Args:
            experiment_name: Label used in log output and in the checkpoint
                file name.
            experiment_config: Dict with ``model``, ``training``, ``optimizer``
                and ``scheduler`` sections; it is also handed unchanged to the
                trainer.

        Returns:
            A result dict. On success ``status`` is ``'completed'`` and the
            dict carries accuracies, losses, the hyperparameters and the
            history, plus ``model_path`` when a checkpoint was written (or
            ``model_save_error`` when writing it failed). On any exception
            ``status`` is ``'failed'`` and ``error`` holds the message. The
            method itself does not raise for ``Exception`` subclasses.
        """
        print(f"🧪 Starting experiment: {experiment_name}")
        start_time = time.time()

        try:
            dataset_info = self.metadata['dataset_info']
            char_to_idx = dataset_info['char_to_idx']
            # Datasets and model share the same length; the +1 is presumably
            # room for an end-of-sequence token — TODO confirm.
            max_len = dataset_info['max_sequence_length'] + 1

            # Create datasets
            train_data = KhmerDataset(self.metadata['splits']['train'], char_to_idx, max_len)
            val_data = KhmerDataset(self.metadata['splits']['val'], char_to_idx, max_len)

            # Create data loaders with adaptive batch size
            batch_size = experiment_config['training']['batch_size']
            if self.device.type == 'cpu':
                batch_size = min(32, batch_size)  # Smaller batch for CPU
                print(f"⚠️ Using smaller batch size ({batch_size}) for CPU training")

            use_pinned_memory = self.device.type == 'cuda'
            train_loader = DataLoader(
                train_data,
                batch_size=batch_size,
                shuffle=True,
                pin_memory=use_pinned_memory
            )
            val_loader = DataLoader(
                val_data,
                batch_size=batch_size,
                shuffle=False,
                pin_memory=use_pinned_memory
            )

            print(f"📈 Training samples: {len(train_data)}")
            print(f"📊 Validation samples: {len(val_data)}")
            print(f"🔢 Batch size: {batch_size}")

            # Create fixed model
            model = create_model_fixed(
                model_size=experiment_config['model']['name'],
                vocab_size=len(char_to_idx),
                max_sequence_length=max_len
            )

            print(f"🧠 Model created: {experiment_config['model']['name']}")
            print(f"📏 Max sequence length: {model.max_sequence_length}")

            # Create trainer
            trainer = FixedSimpleTrainer(
                model=model,
                train_loader=train_loader,
                val_loader=val_loader,
                config=experiment_config,
                device=self.device,
                char_to_idx=char_to_idx
            )

            # Run training
            print(f"🚀 Starting training for {experiment_config['training']['num_epochs']} epochs...")
            history = trainer.train()

            # Wall-clock time covers dataset/model construction plus training.
            training_time = time.time() - start_time

            # Empty history lists fall back to neutral values instead of
            # raising from max() / [-1].
            result = {
                'experiment_name': experiment_name,
                'status': 'completed',
                'training_time': training_time,
                'best_val_char_accuracy': max(history['val_char_accuracy']) if history['val_char_accuracy'] else 0,
                'best_val_seq_accuracy': max(history['val_seq_accuracy']) if history['val_seq_accuracy'] else 0,
                'final_train_loss': history['train_loss'][-1] if history['train_loss'] else float('inf'),
                'final_val_loss': history['val_loss'][-1] if history['val_loss'] else float('inf'),
                'epochs_trained': len(history['train_loss']),
                'hyperparameters': {
                    'model_size': experiment_config['model']['name'],
                    'batch_size': batch_size,
                    'learning_rate': experiment_config['training']['learning_rate'],
                    'weight_decay': experiment_config['training']['weight_decay'],
                    'optimizer': experiment_config['optimizer']['type'],
                    'scheduler': experiment_config['scheduler']['type']
                },
                'history': self.convert_tensors_to_python(history)
            }

            # Save model to Google Drive if best model exists. A failed write
            # must not discard a finished training run, so it is reported on
            # the result instead of turning the experiment into a failure.
            if trainer.best_model_state:
                model_filename = f"{experiment_name}_best_model.pth"
                model_path = f"{project_drive_path}/models/{model_filename}"
                try:
                    torch.save({
                        'model_state_dict': trainer.best_model_state,
                        'config': experiment_config,
                        'metadata': self.metadata,
                        'result': result
                    }, model_path)
                except Exception as save_error:
                    result['model_save_error'] = str(save_error)
                    print(f"⚠️ Could not save model to {model_path}: {save_error}")
                else:
                    result['model_path'] = model_path
                    print(f"💾 Model saved to: {model_path}")

            print(f"✅ Experiment {experiment_name} completed successfully!")
            print(f"📊 Best character accuracy: {result['best_val_char_accuracy']:.4f}")
            print(f"📊 Best sequence accuracy: {result['best_val_seq_accuracy']:.4f}")
            print(f"⏱️ Training time: {training_time/60:.1f} minutes")

            return result

        except Exception as e:
            # Best-effort by design: one broken configuration should not stop
            # the remaining experiments, so report and return a failure record.
            print(f"❌ Experiment {experiment_name} failed: {str(e)}")
            import traceback
            traceback.print_exc()
            return {
                'experiment_name': experiment_name,
                'status': 'failed',
                'error': str(e),
                'training_time': time.time() - start_time
            }

    def convert_tensors_to_python(self, obj):
        """Recursively convert ``obj`` into JSON-serialisable Python values.

        Handles tensors (single-element tensors become scalars, others nested
        lists), dicts, lists, tuples and sets (the latter two become lists),
        and any object exposing ``tolist()`` such as NumPy scalars and arrays.
        Anything else falls back to ``str(obj)``.

        Args:
            obj: Arbitrary value, typically a result dict or training history.

        Returns:
            A structure made only of dict, list, str, int, float, bool, None.
        """
        basic_types = (int, float, str, bool, type(None))

        if isinstance(obj, torch.Tensor):
            return obj.item() if obj.numel() == 1 else obj.tolist()
        if isinstance(obj, dict):
            # json only accepts basic key types; stringify anything else.
            return {
                (key if isinstance(key, basic_types) else str(key)):
                    self.convert_tensors_to_python(value)
                for key, value in obj.items()
            }
        if isinstance(obj, (list, tuple, set, frozenset)):
            return [self.convert_tensors_to_python(item) for item in obj]
        if isinstance(obj, basic_types):
            return obj

        # Duck-typed so NumPy values keep their numeric type without this
        # class having to import numpy.
        to_list = getattr(obj, 'tolist', None)
        if callable(to_list):
            return self.convert_tensors_to_python(to_list())
        return str(obj)

    def save_results(self):
        """Write all results to a timestamped JSON file on Google Drive.

        Returns:
            Path of the JSON file that was written.
        """
        # Imported locally so the method does not depend on which notebook
        # cell happened to import json first.
        import json

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Convert tensors to Python types for JSON serialization
        serializable_results = self.convert_tensors_to_python({
            'timestamp': timestamp,
            'device': str(self.device),
            'dataset_info': self.metadata['dataset_info'],
            'best_result': self.best_result,
            'all_results': self.results,
            'summary': self.generate_summary()
        })

        # Save detailed results
        results_file = f"{project_drive_path}/results/hyperparameter_tuning_results_{timestamp}.json"
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(serializable_results, f, indent=2)

        print(f"💾 Results saved to: {results_file}")
        return results_file

    def generate_summary(self):
        """Aggregate ``self.results`` into a small summary dict.

        Returns:
            ``{}`` when there are no results, ``{'message': ...}`` when none
            completed, otherwise counts, best accuracies, the mean training
            time in seconds and the name of the best experiment.
        """
        if not self.results:
            return {}

        completed_results = [r for r in self.results if r.get('status') == 'completed']

        if not completed_results:
            return {'message': 'No completed experiments'}

        return {
            'total_experiments': len(self.results),
            'completed_experiments': len(completed_results),
            'failed_experiments': len(self.results) - len(completed_results),
            'best_char_accuracy': max(r['best_val_char_accuracy'] for r in completed_results),
            'best_seq_accuracy': max(r['best_val_seq_accuracy'] for r in completed_results),
            'average_training_time': sum(r['training_time'] for r in completed_results) / len(completed_results),
            'best_experiment': self.best_result['experiment_name'] if self.best_result else None
        }

print("✅ Standalone hyperparameter tuning system created!")


In [None]:
# 🎯 RUN ALL EXPERIMENTS WITH STANDALONE TUNER
print("🚀 Starting all experiments with standalone tuner...")

# Initialize the standalone tuner (no config files needed!)
# The constructor reads generated_data/metadata.yaml, so the dataset must
# already exist. This (re)binds the notebook-level `tuner`, which the
# reporting cells further down read their results from.
tuner = StandaloneHyperparameterTuner()

# Define experiment configurations with early_stopping configuration
def _make_experiment(name, *, model_size, batch_size, learning_rate, weight_decay,
                     num_epochs, label_smoothing, optimizer, betas, scheduler, patience):
    """Assemble one experiment config dict in the layout the tuner expects.

    Only the values that differ between experiments are parameters; the model
    config path, loss type and early-stopping monitor/min_delta are shared by
    every experiment and are filled in here.
    """
    return {
        'experiment_name': name,
        'model': {
            'name': model_size,
            'config_path': 'config/model_config.yaml'
        },
        'training': {
            'batch_size': batch_size,
            'learning_rate': learning_rate,
            'weight_decay': weight_decay,
            'num_epochs': num_epochs,
            'loss_type': 'crossentropy',
            'label_smoothing': label_smoothing
        },
        'optimizer': {
            'type': optimizer,
            'betas': list(betas)
        },
        'scheduler': dict(scheduler),
        'early_stopping': {
            'patience': patience,
            'min_delta': 0.001,
            'monitor': 'val_char_accuracy'
        }
    }


_experiment_list = [
    # Medium model, AdamW with cosine schedule: the reference configuration.
    _make_experiment(
        'baseline_gpu_optimized_fixed',
        model_size='medium', batch_size=128, learning_rate=0.002,
        weight_decay=0.0001, num_epochs=25, label_smoothing=0.1,
        optimizer='adamw', betas=(0.9, 0.999),
        scheduler={'type': 'cosine', 'warmup_epochs': 3, 'min_lr': 1e-6},
        patience=5,
    ),
    # Same model with a larger batch, higher learning rate and step decay.
    _make_experiment(
        'aggressive_learning_fixed',
        model_size='medium', batch_size=256, learning_rate=0.003,
        weight_decay=0.00005, num_epochs=25, label_smoothing=0.05,
        optimizer='adam', betas=(0.9, 0.999),
        scheduler={'type': 'step', 'step_size': 8, 'gamma': 0.5},
        patience=5,
    ),
    # Large model: smaller batch, lower learning rate, longer patience.
    _make_experiment(
        'large_model_gpu_fixed',
        model_size='large', batch_size=64, learning_rate=0.0008,
        weight_decay=0.0003, num_epochs=30, label_smoothing=0.15,
        optimizer='adamw', betas=(0.9, 0.99),
        scheduler={'type': 'cosine', 'warmup_epochs': 5, 'min_lr': 1e-7},
        patience=7,
    ),
]

# Keyed by experiment name, in the order the experiments should run.
experiments = {cfg['experiment_name']: cfg for cfg in _experiment_list}

print(f"📋 Will run {len(experiments)} experiments:")
for name in experiments:
    print(f"  • {name}")
print()

# Run experiments one by one, recording each outcome both in the local
# `results` list and on the tuner so its save/summary methods can see it.
results = []
for exp_name, exp_config in experiments.items():
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"🧪 Starting experiment: {exp_name}")
    print(f"{banner}")

    result = tuner.run_single_experiment(exp_name, exp_config)
    results.append(result)
    tuner.results.append(result)  # Add to tuner's results too

    # Track the best completed run by validation character accuracy; the
    # strict '>' keeps the earlier experiment on a tie.
    if result.get('status') == 'completed':
        current_best = tuner.best_result
        if current_best is None or result['best_val_char_accuracy'] > current_best['best_val_char_accuracy']:
            tuner.best_result = result

    # Immediate per-experiment feedback
    if result['status'] != 'completed':
        print(f"❌ {exp_name} failed: {result.get('error', 'Unknown error')}")
    else:
        print(f"✅ {exp_name} completed!")
        print(f"   Character accuracy: {result['best_val_char_accuracy']:.4f}")
        print(f"   Sequence accuracy: {result['best_val_seq_accuracy']:.4f}")
        print(f"   Training time: {result['training_time']/60:.1f} minutes")

    # Release cached GPU memory before the next experiment starts
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print(f"\n🎉 All experiments completed!")
print(f"📊 Results summary:")
print(f"  • Total experiments: {len(results)}")
print(f"  • Successful: {sum(1 for r in results if r['status'] == 'completed')}")
print(f"  • Failed: {sum(1 for r in results if r['status'] == 'failed')}")

# Report the best experiment tracked on the tuner during the loop above
if tuner.best_result:
    print(f"\n🏆 Best experiment: {tuner.best_result['experiment_name']}")
    print(f"  • Character accuracy: {tuner.best_result['best_val_char_accuracy']:.4f}")
    print(f"  • Sequence accuracy: {tuner.best_result['best_val_seq_accuracy']:.4f}")
    print(f"  • Training time: {tuner.best_result['training_time']/60:.1f} minutes")
    print(f"  • Epochs completed: {tuner.best_result['epochs_trained']}")
else:
    print("\n❌ No successful experiments")

# Save results to Google Drive
print(f"\n💾 Saving results to Google Drive...")
results_file = tuner.save_results()
print(f"✅ Results saved to: {results_file}")

# Display summary
summary = tuner.generate_summary()
print(f"\n📈 FINAL SUMMARY:")
for key, value in summary.items():
    # Durations must be checked before the generic float branch: they are
    # floats too, and would otherwise be printed as raw seconds with four
    # decimals instead of being converted to minutes.
    if key.endswith('_time') and isinstance(value, (int, float)):
        print(f"  • {key}: {value/60:.1f} minutes")
    elif isinstance(value, float):
        print(f"  • {key}: {value:.4f}")
    else:
        print(f"  • {key}: {value}")

print(f"\n🎊 Hyperparameter tuning completed successfully!")


In [None]:
# 🔧 QUICK TEST: Run one experiment with FIXED configuration
print("🧪 Testing one experiment with fixed early_stopping configuration...")

# A separate tuner instance keeps this smoke test apart from `tuner`.
test_tuner = StandaloneHyperparameterTuner()

# Deliberately cheap settings (small model, small batch, five epochs) so the
# check finishes quickly; includes the early_stopping section.
test_config = dict(
    experiment_name='test_fixed_config',
    model=dict(
        name='small',
        config_path='config/model_config.yaml',
    ),
    training=dict(
        batch_size=32,
        learning_rate=0.001,
        weight_decay=0.0001,
        num_epochs=5,
        loss_type='crossentropy',
        label_smoothing=0.1,
    ),
    optimizer=dict(type='adamw', betas=[0.9, 0.999]),
    scheduler=dict(type='cosine', warmup_epochs=1, min_lr=1e-6),
    early_stopping=dict(patience=3, min_delta=0.001, monitor='val_char_accuracy'),
)

print("📋 Test configuration:")
print(f"  • Model: {test_config['model']['name']}")
print(f"  • Batch size: {test_config['training']['batch_size']}")
print(f"  • Epochs: {test_config['training']['num_epochs']}")
print(f"  • Early stopping patience: {test_config['early_stopping']['patience']}")

# Run the test experiment
result = test_tuner.run_single_experiment(test_config['experiment_name'], test_config)

print("\n📊 Test result:")
print(f"  • Status: {result['status']}")
test_passed = result['status'] == 'completed'
if not test_passed:
    print(f"  • Error: {result.get('error', 'Unknown error')}")
    print("  ❌ There's still an issue with the configuration")
else:
    print(f"  • Character accuracy: {result['best_val_char_accuracy']:.4f}")
    print(f"  • Sequence accuracy: {result['best_val_seq_accuracy']:.4f}")
    print(f"  • Training time: {result['training_time']:.1f} seconds")
    print(f"  • Epochs completed: {result['epochs_trained']}")
    print("  ✅ SUCCESS! Configuration is working correctly!")

# Release cached GPU memory held by the test run
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Final verdict framed by divider lines
divider = "=" * 60
print("\n" + divider)
if test_passed:
    print("🎉 TEST PASSED! Configuration is working correctly.")
    print("💡 You can now run the full experiments in cell 42.")
else:
    print("❌ Test failed. Please check the error above.")
print(divider)


In [None]:
# Hyperparameter Tuning System
import json
import matplotlib.pyplot as plt
from IPython.display import display, clear_output

class HyperparameterTuner:
    """File-driven hyperparameter tuning system for Colab.

    Experiments are read from a YAML config that provides a ``base_config``
    section and an ``experiments`` mapping; each experiment is merged over the
    base config before training. Relies on notebook-level names defined in
    other cells: ``project_drive_path``, ``KhmerDataset``, ``DataLoader``,
    ``create_model`` and ``SimpleTrainer``.

    Attributes:
        config_file: Path of the experiment YAML.
        config: Parsed experiment YAML.
        metadata: Parsed dataset metadata YAML.
        results: One result dict per experiment run via ``run_experiments``.
        best_result: Completed result with the highest validation character
            accuracy, or ``None``.
        experiments_completed: Number of experiments attempted so far.
        device: ``cuda`` when available, otherwise ``cpu``.
    """

    def __init__(self, config_file='config/phase3_colab_configs.yaml'):
        """Load the experiment config and dataset metadata, pick the device.

        Args:
            config_file: Path of the YAML file holding ``base_config`` and
                ``experiments``.

        Raises:
            OSError: If either YAML file cannot be opened.
        """
        self.config_file = config_file
        self.results = []
        self.best_result = None
        self.experiments_completed = 0

        # Load configuration (explicit UTF-8 so parsing is locale-independent)
        with open(config_file, 'r', encoding='utf-8') as f:
            self.config = yaml.safe_load(f)

        # Load metadata
        with open('generated_data/metadata.yaml', 'r', encoding='utf-8') as f:
            self.metadata = yaml.safe_load(f)

        # Device setup
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"🔥 Using device: {self.device}")

    @staticmethod
    def _deep_merge(base, override):
        """Return a new dict: ``override`` merged recursively over ``base``.

        Nested dicts are merged key by key, so an override that sets only some
        keys of a section keeps the remaining base keys. Neither input is
        modified.
        """
        merged = copy.deepcopy(base)
        for key, value in override.items():
            if isinstance(value, dict) and isinstance(merged.get(key), dict):
                merged[key] = HyperparameterTuner._deep_merge(merged[key], value)
            else:
                merged[key] = copy.deepcopy(value)
        return merged

    def convert_tensors_to_python(self, obj):
        """Recursively convert ``obj`` into JSON-serialisable Python values.

        Tensors become scalars or nested lists; tuples and sets become lists;
        objects exposing ``tolist()`` (e.g. NumPy values) are converted through
        it; anything else unknown falls back to ``str(obj)``.
        """
        basic_types = (int, float, str, bool, type(None))

        if isinstance(obj, torch.Tensor):
            return obj.item() if obj.numel() == 1 else obj.tolist()
        if isinstance(obj, dict):
            return {
                (key if isinstance(key, basic_types) else str(key)):
                    self.convert_tensors_to_python(value)
                for key, value in obj.items()
            }
        if isinstance(obj, (list, tuple, set, frozenset)):
            return [self.convert_tensors_to_python(item) for item in obj]
        if isinstance(obj, basic_types):
            return obj
        to_list = getattr(obj, 'tolist', None)
        if callable(to_list):
            return self.convert_tensors_to_python(to_list())
        return str(obj)

    def create_data_loaders(self, batch_size):
        """Create train and validation data loaders.

        Args:
            batch_size: Batch size used for both loaders.

        Returns:
            ``(train_loader, val_loader)``; only the training loader shuffles.
        """
        char_to_idx = self.metadata['dataset_info']['char_to_idx']
        max_len = self.metadata['dataset_info']['max_sequence_length'] + 1
        use_pinned_memory = self.device.type == 'cuda'

        train_dataset = KhmerDataset(self.metadata['splits']['train'], char_to_idx, max_len)
        val_dataset = KhmerDataset(self.metadata['splits']['val'], char_to_idx, max_len)

        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=2,
            pin_memory=use_pinned_memory
        )

        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=2,
            pin_memory=use_pinned_memory
        )

        return train_loader, val_loader

    def run_single_experiment(self, experiment_name, experiment_config):
        """Run a single hyperparameter experiment.

        Args:
            experiment_name: Label used in logs and the checkpoint file name.
            experiment_config: Overrides applied on top of ``base_config``.

        Returns:
            A result dict whose ``status`` is ``'completed'`` (with metrics,
            hyperparameters, history and optionally ``model_path``) or
            ``'failed'`` (with ``error``). ``Exception`` subclasses raised
            while running are caught and reported, not propagated.
        """
        print(f"\n{'='*60}")
        print(f"🧪 Starting experiment: {experiment_name}")
        print(f"{'='*60}")

        start_time = time.time()

        try:
            # Merge base config with experiment config. The merge is
            # recursive: a plain dict.update() would replace whole sections
            # (e.g. 'training') and drop every base key the experiment did not
            # repeat.
            merged_config = self._deep_merge(self.config['base_config'], experiment_config)

            # Create data loaders
            train_loader, val_loader = self.create_data_loaders(
                merged_config['training']['batch_size']
            )

            # Create model
            model = create_model(
                model_size=merged_config['model']['name'],
                vocab_size=len(self.metadata['dataset_info']['char_to_idx']),
                max_sequence_length=self.metadata['dataset_info']['max_sequence_length'] + 1
            )

            # Initialize trainer
            trainer = SimpleTrainer(
                model=model,
                train_loader=train_loader,
                val_loader=val_loader,
                config=merged_config,
                device=self.device
            )

            # Run training
            history = trainer.train()

            # Calculate metrics
            training_time = time.time() - start_time

            # Create result
            result = {
                'experiment_name': experiment_name,
                'status': 'completed',
                'training_time': training_time,
                'best_val_char_accuracy': max(history['val_char_accuracy']),
                'best_val_seq_accuracy': max(history['val_seq_accuracy']),
                'final_train_loss': history['train_loss'][-1],
                'final_val_loss': history['val_loss'][-1],
                'epochs_trained': len(history['train_loss']),
                'hyperparameters': {
                    'model_size': merged_config['model']['name'],
                    'batch_size': merged_config['training']['batch_size'],
                    'learning_rate': merged_config['training']['learning_rate'],
                    'weight_decay': merged_config['training']['weight_decay'],
                    'optimizer': merged_config['optimizer']['type'],
                    'scheduler': merged_config['scheduler']['type']
                },
                # Stored as plain Python values so the history can be plotted
                # and written to JSON without further conversion.
                'history': self.convert_tensors_to_python(history)
            }

            # Save model to Google Drive
            if trainer.best_model_state:
                model_filename = f"{experiment_name}_best_model.pth"
                model_path = f"{project_drive_path}/models/{model_filename}"
                torch.save({
                    'model_state_dict': trainer.best_model_state,
                    'config': merged_config,
                    'metadata': self.metadata,
                    'result': result
                }, model_path)
                result['model_path'] = model_path
                print(f"💾 Model saved to: {model_path}")

            print(f"✅ Experiment {experiment_name} completed successfully!")
            print(f"📊 Best character accuracy: {result['best_val_char_accuracy']:.4f}")
            print(f"📊 Best sequence accuracy: {result['best_val_seq_accuracy']:.4f}")
            print(f"⏱️ Training time: {training_time/60:.1f} minutes")

            return result

        except Exception as e:
            # Best-effort by design: report the failure and let the remaining
            # experiments run.
            print(f"❌ Experiment {experiment_name} failed: {str(e)}")
            import traceback
            traceback.print_exc()
            return {
                'experiment_name': experiment_name,
                'status': 'failed',
                'error': str(e),
                'training_time': time.time() - start_time
            }

    def run_experiments(self, experiment_names=None):
        """Run all or specified experiments.

        Args:
            experiment_names: Optional collection of names; when given, only
                experiments from the config whose name is in it are run.
        """
        experiments = self.config['experiments']

        if experiment_names:
            experiments = {name: config for name, config in experiments.items()
                          if name in experiment_names}

        print(f"🎯 Starting hyperparameter tuning with {len(experiments)} experiments")
        print(f"📊 Total dataset size: {self.metadata['dataset_info']['total_samples']}")
        print(f"🏋️ Training samples: {self.metadata['dataset_info']['train_samples']}")
        print(f"🔬 Validation samples: {self.metadata['dataset_info']['val_samples']}")

        for exp_name, exp_config in experiments.items():
            result = self.run_single_experiment(exp_name, exp_config)
            self.results.append(result)

            # Update best result (strict '>' keeps the earlier run on a tie)
            if (result.get('status') == 'completed' and
                (self.best_result is None or
                 result['best_val_char_accuracy'] >
                 self.best_result['best_val_char_accuracy'])):
                self.best_result = result

            self.experiments_completed += 1

            # Clear memory
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    def save_results(self):
        """Save tuning results to Google Drive.

        Returns:
            ``(results_file, summary_file)``: the JSON path and the CSV path.
            The CSV is only written when at least one experiment completed.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Convert first: metric values may still be tensors, which the json
        # module cannot encode.
        payload = self.convert_tensors_to_python({
            'timestamp': timestamp,
            'device': str(self.device),
            'dataset_info': self.metadata['dataset_info'],
            'best_result': self.best_result,
            'all_results': self.results,
            'summary': self.generate_summary()
        })

        # Save detailed results
        results_file = f"{project_drive_path}/results/hyperparameter_tuning_results_{timestamp}.json"
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)

        # Save summary CSV
        summary_file = f"{project_drive_path}/results/summary_{timestamp}.csv"
        self.save_summary_csv(summary_file)

        print(f"💾 Results saved to: {results_file}")
        print(f"📊 Summary saved to: {summary_file}")

        return results_file, summary_file

    def generate_summary(self):
        """Generate experiment summary.

        Returns:
            ``{}`` when there are no results, ``{'message': ...}`` when none
            completed, otherwise counts, best accuracies, the mean training
            time in seconds and the best experiment's name.
        """
        if not self.results:
            return {}

        completed_results = [r for r in self.results if r.get('status') == 'completed']

        if not completed_results:
            return {'message': 'No completed experiments'}

        return {
            'total_experiments': len(self.results),
            'completed_experiments': len(completed_results),
            'failed_experiments': len(self.results) - len(completed_results),
            'best_char_accuracy': max(r['best_val_char_accuracy'] for r in completed_results),
            'best_seq_accuracy': max(r['best_val_seq_accuracy'] for r in completed_results),
            'average_training_time': sum(r['training_time'] for r in completed_results) / len(completed_results),
            'best_experiment': self.best_result['experiment_name'] if self.best_result else None
        }

    def save_summary_csv(self, filename):
        """Save one CSV row per completed experiment.

        Args:
            filename: Destination path. Nothing is written when no experiment
                completed.
        """
        import pandas as pd

        data = [
            {
                'experiment_name': result['experiment_name'],
                'char_accuracy': result['best_val_char_accuracy'],
                'seq_accuracy': result['best_val_seq_accuracy'],
                'training_time_min': result['training_time'] / 60,
                'epochs_trained': result['epochs_trained'],
                **result['hyperparameters']
            }
            for result in self.results
            if result.get('status') == 'completed'
        ]

        if data:
            df = pd.DataFrame(data)
            df.to_csv(filename, index=False)

    def plot_results(self):
        """Plot accuracy, training time and the best run's learning curves.

        Draws a 2x2 figure, saves it as a timestamped PNG under the project's
        results directory and shows it. Prints a message and returns when
        there is nothing to plot.
        """
        if not self.results:
            print("No results to plot")
            return

        completed_results = [r for r in self.results if r.get('status') == 'completed']

        if not completed_results:
            print("No completed experiments to plot")
            return

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Character accuracy
        exp_names = [r['experiment_name'] for r in completed_results]
        char_accs = [r['best_val_char_accuracy'] for r in completed_results]

        axes[0, 0].bar(exp_names, char_accs)
        axes[0, 0].set_title('Best Character Accuracy by Experiment')
        axes[0, 0].set_ylabel('Character Accuracy')
        axes[0, 0].tick_params(axis='x', rotation=45)

        # Sequence accuracy
        seq_accs = [r['best_val_seq_accuracy'] for r in completed_results]
        axes[0, 1].bar(exp_names, seq_accs)
        axes[0, 1].set_title('Best Sequence Accuracy by Experiment')
        axes[0, 1].set_ylabel('Sequence Accuracy')
        axes[0, 1].tick_params(axis='x', rotation=45)

        # Training time
        training_times = [r['training_time'] / 60 for r in completed_results]
        axes[1, 0].bar(exp_names, training_times)
        axes[1, 0].set_title('Training Time by Experiment')
        axes[1, 0].set_ylabel('Training Time (minutes)')
        axes[1, 0].tick_params(axis='x', rotation=45)

        # Learning curves for best experiment
        if self.best_result:
            history = self.best_result['history']
            epochs = range(1, len(history['train_loss']) + 1)

            axes[1, 1].plot(epochs, history['train_loss'], label='Train Loss')
            axes[1, 1].plot(epochs, history['val_loss'], label='Val Loss')
            axes[1, 1].plot(epochs, history['val_char_accuracy'], label='Val Char Acc')
            axes[1, 1].set_title(f"Learning Curves - {self.best_result['experiment_name']}")
            axes[1, 1].set_xlabel('Epoch')
            axes[1, 1].legend()

        plt.tight_layout()
        # Save before show(): the figure must still be current when written.
        plt.savefig(f"{project_drive_path}/results/experiment_plots_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png")
        plt.show()

print("✅ Hyperparameter tuning system created!")


In [None]:
# ⚠️ CELL DISABLED - References old tuner system
# Placeholder cell: it only prints where to find the saved artefacts and does
# not touch any tuner object. The Drive location printed below is the same
# directory tree used by project_drive_path at the top of the notebook.
print("❌ This cell is DISABLED!")
print("✅ Results are automatically saved by the StandaloneHyperparameterTuner!")
print("💾 Check Google Drive for saved results and models")
print("📁 Location: /content/drive/MyDrive/khmer_ocr_training/")

# OLD CODE DISABLED - references old broken tuner system


In [None]:
# ⚠️ CELL DISABLED - References old tuner system
print("❌ This cell is DISABLED!")
print("✅ Best models are automatically saved to Google Drive!")
print("📁 Check: /content/drive/MyDrive/khmer_ocr_training/models/")
print("💡 Download the best model and load it in a new notebook for testing")

# OLD CODE DISABLED - references old broken tuner system
# The check below calls tuner.create_data_loaders(), which only the
# HyperparameterTuner class provides, so enable it only when `tuner` is an
# instance of that class and has a best_result with a 'model_path'.
RUN_LEGACY_MODEL_CHECK = False

if RUN_LEGACY_MODEL_CHECK:
    print("🔄 Loading best model for testing...")

    # Load saved model
    checkpoint = torch.load(tuner.best_result['model_path'], map_location=tuner.device)

    # Create model
    best_model = create_model(
        model_size=checkpoint['config']['model']['name'],
        vocab_size=len(checkpoint['metadata']['dataset_info']['char_to_idx']),
        max_sequence_length=checkpoint['metadata']['dataset_info']['max_sequence_length'] + 1
    )

    # Load weights
    best_model.load_state_dict(checkpoint['model_state_dict'])
    best_model = best_model.to(tuner.device)
    best_model.eval()

    print(f"✅ Best model loaded: {tuner.best_result['experiment_name']}")
    print(f"📊 Character accuracy: {tuner.best_result['best_val_char_accuracy']:.4f}")
    print(f"📊 Sequence accuracy: {tuner.best_result['best_val_seq_accuracy']:.4f}")

    # Test on a few validation samples
    with torch.no_grad():
        val_loader = tuner.create_data_loaders(32)[1]
        images, targets, lengths = next(iter(val_loader))
        images = images[:5].to(tuner.device)  # Test on 5 samples
        targets = targets[:5]
        lengths = lengths[:5]

        outputs = best_model(images)
        predictions = torch.argmax(outputs, dim=-1)

        # NOTE(review): idx_to_char is indexed with string keys below —
        # confirm the metadata stores its keys as strings.
        idx_to_char = checkpoint['metadata']['dataset_info']['idx_to_char']

        print("\n🔍 Sample predictions:")
        for i in range(len(images)):
            # Get actual text
            actual_chars = [idx_to_char[str(idx.item())] for idx in targets[i][:lengths[i]-1]]  # -1 for EOS
            actual_text = ''.join([char for char in actual_chars if char not in ['<PAD>', '<EOS>', '<BLANK>']])

            # Get predicted text
            pred_chars = [idx_to_char[str(idx.item())] for idx in predictions[i][:lengths[i]-1]]
            pred_text = ''.join([char for char in pred_chars if char not in ['<PAD>', '<EOS>', '<BLANK>']])

            print(f"  Sample {i+1}: Actual='{actual_text}', Predicted='{pred_text}'")

else:
    # Previously this branch always announced that no best model was
    # available, contradicting the messages above; say what actually happened.
    print("⏭️ Legacy best-model check skipped (cell disabled)")


In [None]:
# Save results and generate summary
print("💾 Saving results to Google Drive...")
saved_paths = tuner.save_results()
# StandaloneHyperparameterTuner.save_results returns the JSON path, while
# HyperparameterTuner.save_results returns (json_path, csv_path); keep the
# JSON path in both cases so the message below stays readable.
results_file = saved_paths[0] if isinstance(saved_paths, tuple) else saved_paths

# Generate summary report
summary = tuner.generate_summary()
print("📋 Generating experiment summary...")

# Print summary results
completed_experiments = [r for r in tuner.results if r.get('status') == 'completed']
failed_experiments = [r for r in tuner.results if r.get('status') == 'failed']

print(f"\n📊 HYPERPARAMETER TUNING SUMMARY")
print(f"{'='*50}")
print(f"✅ Completed experiments: {len(completed_experiments)}")
print(f"❌ Failed experiments: {len(failed_experiments)}")

if tuner.best_result:
    best = tuner.best_result
    print(f"\n🏆 BEST EXPERIMENT: {best['experiment_name']}")
    print(f"  📊 Character accuracy: {best['best_val_char_accuracy']:.4f}")
    print(f"  📊 Sequence accuracy: {best['best_val_seq_accuracy']:.4f}")
    print(f"  ⏱️ Training time: {best['training_time']/60:.1f} minutes")
    print(f"  📈 Epochs trained: {best['epochs_trained']}")
    print(f"  🧠 Model size: {best['hyperparameters']['model_size']}")
    print(f"  📚 Batch size: {best['hyperparameters']['batch_size']}")
    print(f"  🎯 Learning rate: {best['hyperparameters']['learning_rate']}")

    # 'model_path' is only present when a checkpoint was written
    if 'model_path' in best:
        print(f"  💾 Model saved to: {best['model_path']}")

print(f"\n💾 Full results saved to: {results_file}")
print(f"📁 Check Google Drive: /content/drive/MyDrive/khmer_ocr_training/")

# Show detailed results for each experiment
print(f"\n📋 DETAILED RESULTS:")
print(f"{'='*80}")
for i, result in enumerate(completed_experiments, 1):
    print(f"\n{i}. {result['experiment_name']}")
    print(f"   Character Acc: {result['best_val_char_accuracy']:.4f}")
    print(f"   Sequence Acc:  {result['best_val_seq_accuracy']:.4f}")
    print(f"   Training Time: {result['training_time']/60:.1f} min")
    print(f"   Model: {result['hyperparameters']['model_size']}")
    print(f"   Batch: {result['hyperparameters']['batch_size']}")
    print(f"   LR: {result['hyperparameters']['learning_rate']}")

# Generate comprehensive final report. One timestamp is taken so the value
# inside the report and the one in the file name cannot disagree.
report_time = datetime.now()
final_report = {
    'timestamp': report_time.isoformat(),
    'environment': {
        'device': str(tuner.device),
        'cuda_available': torch.cuda.is_available(),
        'python_version': sys.version,
        'pytorch_version': torch.__version__
    },
    'dataset_info': tuner.metadata['dataset_info'],
    'experiment_summary': {
        'total_experiments': len(tuner.results),
        'completed': len(completed_experiments),
        'failed': len(failed_experiments),
        'success_rate': len(completed_experiments) / len(tuner.results) if tuner.results else 0
    },
    'best_result': tuner.best_result,
    'all_results': tuner.results,
    'summary': tuner.generate_summary()
}

# Save comprehensive report. Not every tuner class in this notebook defines
# convert_tensors_to_python, so use it when present and let default=str
# stringify anything json still cannot encode.
comprehensive_file = f"{project_drive_path}/results/comprehensive_report_{report_time.strftime('%Y%m%d_%H%M%S')}.json"
to_serializable = getattr(tuner, 'convert_tensors_to_python', None)
report_payload = to_serializable(final_report) if callable(to_serializable) else final_report
with open(comprehensive_file, 'w') as f:
    json.dump(report_payload, f, indent=2, default=str)

print(f"\n📋 Comprehensive report saved to: {comprehensive_file}")
print(f"🎉 Hyperparameter tuning completed successfully!")

# Plot results. Only HyperparameterTuner defines plot_results(); calling it
# unconditionally raised AttributeError with the standalone tuner.
if hasattr(tuner, 'plot_results'):
    tuner.plot_results()
else:
    print("ℹ️ Skipping plots: this tuner does not provide plot_results()")
