In [None]:
# 1. Check GPU
import torch

# Query CUDA once; everything later in the notebook trains on 'cuda', so stop
# here with a clear instruction if no GPU runtime is attached.
cuda_ready = torch.cuda.is_available()
print(f'CUDA: {cuda_ready}')
if not cuda_ready:
    raise RuntimeError('Enable GPU in Runtime > Change runtime type!')
print(f'GPU: {torch.cuda.get_device_name(0)}')

In [None]:
# 2. Install packages
!pip install -q nltk gensim pillow tqdm
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)
print('Packages installed!')

In [None]:
# 3. Mount Drive
# The project folder used by the later cells lives under this mount point
# (see the '/content/drive/MyDrive/...' paths below).
from google.colab import drive

_DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(_DRIVE_MOUNT_POINT)

In [None]:
# 4. Copy data (only processed data needed, not raw WikiArt)
import os
import shutil

DRIVE_PATH = '/content/drive/MyDrive/artemis-captioning'
_LOCAL_ROOT = '/content/artemis'

# Fail fast with a readable message: a shell `cp` that fails only prints to the
# cell output and the notebook would carry on until a confusing ImportError.
if not os.path.isdir(DRIVE_PATH):
    raise FileNotFoundError(
        f'Project folder not found on Drive: {DRIVE_PATH} (is Drive mounted?)'
    )

os.makedirs(_LOCAL_ROOT, exist_ok=True)

# Same layout as `cp -r <src> /content/artemis/`: each folder lands at
# /content/artemis/<folder>. dirs_exist_ok=True lets the cell be re-run.
for _folder in ('data', 'utils', 'models'):
    shutil.copytree(
        os.path.join(DRIVE_PATH, _folder),
        os.path.join(_LOCAL_ROOT, _folder),
        dirs_exist_ok=True,
    )
shutil.copy2(os.path.join(DRIVE_PATH, 'train.py'), _LOCAL_ROOT)

# Only reached when every copy above succeeded (failures raise).
print('Data copied!')

In [None]:
# 5. Setup paths
import sys

# Local copy of the project created by the previous cell. It is put first on
# sys.path so `utils`, `models` and `train` resolve to the copied sources, and
# made the working directory so relative paths ('data/...', 'checkpoints/...')
# resolve inside it.
_PROJECT_ROOT = '/content/artemis'
sys.path.insert(0, _PROJECT_ROOT)
os.chdir(_PROJECT_ROOT)
print(f'Working directory: {os.getcwd()}')

In [None]:
# 6. Imports
import gc
import json
import shutil
import time
import traceback

import numpy as np
import torch

from utils.text_preprocessing import TextPreprocessor
from utils.data_loader import create_data_loaders
from models.cnn_lstm import create_model as create_cnn_model
from models.vision_transformer import VisionTransformerCaptioning
from train import Trainer
print('All imports OK!')

In [None]:
# 7. Define all 4 training configurations
# Settings repeated across runs are factored into small base dicts; each entry
# below only spells out what distinguishes it. Every entry is a fresh dict, so
# editing one run's settings never leaks into another.
_COMMON = {'batch_size': 32, 'num_images': 15000}
_CNN_LSTM_BASE = {**_COMMON, 'model_type': 'cnn_lstm', 'epochs': 30}
_VIT_BASE = {
    **_COMMON,
    'model_type': 'vit',
    'epochs': 25,
    'embed_dim': 256,
    'num_heads': 8,
    'dropout': 0.1,
}

CONFIGS = {
    1: {
        **_CNN_LSTM_BASE,
        'name': 'colab_cnn_high_lr',
        'description': 'CNN+LSTM High LR - Best from training analysis',
        'learning_rate': 3e-4,
        'embed_dim': 512,
        'hidden_dim': 1024,
        'attention_dim': 512,
        'dropout': 0.4,
    },
    2: {
        **_CNN_LSTM_BASE,
        'name': 'colab_cnn_glove',
        'description': 'CNN+LSTM with GloVe Embeddings',
        'learning_rate': 2e-4,
        'embed_dim': 300,  # GloVe dimension
        'hidden_dim': 512,
        'attention_dim': 256,
        'dropout': 0.3,
        'use_glove': True,
    },
    3: {
        **_VIT_BASE,
        'name': 'colab_vit_standard',
        'description': 'Vision Transformer Standard (6 layers)',
        'learning_rate': 2e-4,
        'num_layers': 6,
        'ff_dim': 1024,
    },
    4: {
        **_VIT_BASE,
        'name': 'colab_vit_compact',
        'description': 'Vision Transformer Compact (4 layers, higher LR)',
        'learning_rate': 3e-4,
        'num_layers': 4,
        'ff_dim': 512,
    },
}

# One line per configuration, in key order, under a single header.
_config_lines = [
    f"  {number}: {cfg['name']} - {cfg['description']}"
    for number, cfg in CONFIGS.items()
]
print('\n'.join(['Configurations defined:', *_config_lines]))

In [None]:
# 8. Helper class for limiting batches
class LimitedLoader:
    """Wrap an iterable loader so one pass yields at most ``max_batches`` batches.

    ``batch_size`` and ``dataset`` are copied from the wrapped loader so the
    wrapper exposes the same two attributes (presumably because downstream
    training code reads them -- confirm against ``Trainer``).

    Args:
        loader: Object supporting ``iter()`` and ``len()`` and exposing
            ``batch_size`` and ``dataset`` attributes.
        max_batches: Upper bound on batches per pass. Zero or a negative value
            yields nothing and reports a length of 0.
    """

    def __init__(self, loader, max_batches):
        self.loader = loader
        self.max_batches = max_batches
        self.batch_size = loader.batch_size
        self.dataset = loader.dataset

    def __iter__(self):
        """Yield batches from the wrapped loader, stopping at the cap."""
        if self.max_batches <= 0:
            return
        for count, batch in enumerate(self.loader, start=1):
            yield batch
            # Check the cap *after* yielding so the wrapped loader is never
            # asked to produce a batch that would only be thrown away.
            if count >= self.max_batches:
                break

    def __len__(self):
        """Number of batches one pass yields (never negative)."""
        return max(0, min(len(self.loader), self.max_batches))

print('LimitedLoader defined')

In [None]:
# 9. Load vocabulary (shared across all models)
text_proc = TextPreprocessor()
text_proc.load_vocabulary('data/processed/vocabulary.json')
print(f'Vocabulary size: {text_proc.vocab_size}')

# Load GloVe embeddings for config 2
# The matrix is optional: when the file is absent glove_matrix stays None and
# only a warning is printed.
_glove_file = 'data/embeddings/glove_embeddings.npy'
if not os.path.exists(_glove_file):
    glove_matrix = None
    print('Warning: GloVe embeddings not found')
else:
    glove_matrix = np.load(_glove_file)
    print(f'GloVe embeddings loaded: {glove_matrix.shape}')

In [None]:
# 10. Training function
def _build_model(config, text_proc, glove_matrix=None):
    """Instantiate the model described by ``config`` (not yet moved to the GPU)."""
    model_type = config['model_type']

    if model_type == 'cnn_lstm':
        cnn_kwargs = dict(
            vocab_size=text_proc.vocab_size,
            embed_dim=config['embed_dim'],
            decoder_dim=config['hidden_dim'],
            attention_dim=config['attention_dim'],
            dropout=config['dropout'],
        )
        if config.get('use_glove'):
            if glove_matrix is not None:
                cnn_kwargs['embedding_matrix'] = glove_matrix
            else:
                # Make the fallback visible: the run would otherwise be
                # labelled "GloVe" while training without the matrix.
                print(f"WARNING: '{config['name']}' requests GloVe but no "
                      "embedding matrix was loaded; training without it")
        return create_cnn_model(**cnn_kwargs)

    if model_type == 'vit':
        return VisionTransformerCaptioning(
            vocab_size=text_proc.vocab_size,
            embed_dim=config['embed_dim'],
            num_heads=config['num_heads'],
            # The single 'num_layers' setting sizes both encoder and decoder.
            num_encoder_layers=config['num_layers'],
            num_decoder_layers=config['num_layers'],
            ff_dim=config['ff_dim'],
            # NOTE(review): fixed values; presumably they must agree with the
            # caption length / image size produced by the data loaders -- confirm.
            max_seq_len=30,
            dropout=config['dropout'],
            img_size=128,
            patch_size=16,
        )

    raise ValueError(f"Unknown model_type {model_type!r} in config '{config['name']}'")


def _backup_to_drive(name, drive_root):
    """Copy this run's checkpoint and output folders to Drive; return True on success."""
    try:
        for kind in ('checkpoints', 'outputs'):
            os.makedirs(f"{drive_root}/{kind}", exist_ok=True)
            # dirs_exist_ok=True merges into an existing folder, like `cp -r`.
            shutil.copytree(f"{kind}/{name}", f"{drive_root}/{kind}/{name}",
                            dirs_exist_ok=True)
    except OSError as err:  # shutil.Error is a subclass of OSError
        # Best effort: a failed backup must not discard a finished training
        # run, but it must not be reported as saved either.
        print(f'WARNING: could not save {name} to Drive: {err}')
        return False
    return True


def train_config(config_num, text_proc, glove_matrix=None):
    """Train a single configuration.

    Builds the data loaders, model and ``Trainer`` for ``CONFIGS[config_num]``,
    trains for ``config['epochs']`` epochs, copies the run's checkpoint and
    output folders to Drive, and releases its references to the model and
    loaders (also when training raises).

    Args:
        config_num: Key into ``CONFIGS``.
        text_proc: Text preprocessor; its ``vocab_size`` sizes the model and it
            is passed to ``create_data_loaders`` and ``Trainer``.
        glove_matrix: Optional embedding matrix, forwarded to
            ``create_cnn_model`` only when the config sets ``use_glove``.

    Returns:
        dict with ``name``, ``duration`` (minutes), ``best_bleu`` (max of
        ``history['val_bleu']``) and ``best_loss`` (min of
        ``history['val_loss']``).

    Raises:
        KeyError: if ``config_num`` is not in ``CONFIGS``.
        ValueError: if the config's ``model_type`` is not recognised.
    """
    config = CONFIGS[config_num]
    name = config['name']

    print(f"\n{'='*70}")
    print(f"STARTING CONFIG {config_num}: {name}")
    print(f"Description: {config['description']}")
    print(f"{'='*70}")

    # Pre-bind so the cleanup in `finally` is valid wherever a failure occurs.
    model = trainer = loaders = train_loader = val_loader = None
    try:
        # Create data loaders
        loaders = create_data_loaders(
            text_preprocessor=text_proc,
            batch_size=config['batch_size'],
            num_workers=2,
            splits=['train', 'val']
        )
        # 'num_images' is turned into a cap on training batches per epoch.
        train_loader = LimitedLoader(loaders['train'], config['num_images'] // config['batch_size'])
        val_loader = LimitedLoader(loaders['val'], 50)
        print(f'Train batches: {len(train_loader)}, Val batches: {len(val_loader)}')

        # Create model
        model = _build_model(config, text_proc, glove_matrix).to('cuda')
        print(f'Model params: {sum(p.numel() for p in model.parameters()):,}')

        # Create directories
        os.makedirs(f"checkpoints/{name}", exist_ok=True)
        os.makedirs(f"outputs/{name}", exist_ok=True)

        # Train
        trainer = Trainer(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            text_preprocessor=text_proc,
            learning_rate=config['learning_rate'],
            device='cuda',
            checkpoint_dir=f"checkpoints/{name}",
            output_dir=f"outputs/{name}"
        )

        t0 = time.time()
        history = trainer.train(num_epochs=config['epochs'])
        duration = (time.time() - t0) / 60

        # Results. float() keeps the returned dict JSON-serialisable should the
        # history hold numpy/torch scalars rather than plain floats.
        best_bleu = float(max(history['val_bleu']))
        best_loss = float(min(history['val_loss']))

        print(f"\n{'='*70}")
        print(f"COMPLETED: {name}")
        print(f"Duration: {duration:.1f} min")
        print(f"Best BLEU: {best_bleu:.4f}")
        print(f"Best Loss: {best_loss:.4f}")
        print(f"{'='*70}")

        # Save to Drive immediately
        drive_root = '/content/drive/MyDrive/artemis-captioning'
        if _backup_to_drive(name, drive_root):
            print(f'Saved to Drive: {drive_root}')

        return {'name': name, 'duration': duration, 'best_bleu': best_bleu, 'best_loss': best_loss}
    finally:
        # Cleanup GPU memory -- in `finally` so a failed run also drops its
        # references before the next configuration starts.
        del model, trainer, train_loader, val_loader, loaders
        gc.collect()
        torch.cuda.empty_cache()

print('Training function defined')

In [None]:
# 11. RUN ALL 4 CONFIGURATIONS SEQUENTIALLY
all_results = []
total_start = time.time()

for config_num in [1, 2, 3, 4]:
    try:
        result = train_config(config_num, text_proc, glove_matrix)
        all_results.append(result)
    except Exception as e:
        # One failing configuration must not abort the remaining runs, but the
        # message alone is rarely enough to diagnose an unattended failure:
        # show the full traceback and keep it alongside the error in the results.
        print(f"ERROR in config {config_num}: {e}")
        traceback.print_exc()
        all_results.append({
            'name': CONFIGS[config_num]['name'],
            'error': str(e),
            'traceback': traceback.format_exc(),
        })
    finally:
        # Best-effort release of memory held by the finished (or failed) run
        # before the next model is built.
        gc.collect()
        torch.cuda.empty_cache()

total_duration = (time.time() - total_start) / 60

print(f"\n\n{'='*70}")
print("ALL TRAINING COMPLETE!")
print(f"{'='*70}")
print(f"Total time: {total_duration:.1f} minutes ({total_duration/60:.1f} hours)")
print("\nResults Summary:")
print("-" * 50)
for r in all_results:
    if 'error' in r:
        print(f"  {r['name']}: ERROR - {r['error']}")
    else:
        print(f"  {r['name']}: BLEU={r['best_bleu']:.4f}, Loss={r['best_loss']:.4f}, Time={r['duration']:.1f}min")
print("-" * 50)

In [None]:
# 12. Save final summary to Drive
import json
DRIVE_PATH = '/content/drive/MyDrive/artemis-captioning'
_summary_file = f'{DRIVE_PATH}/colab_training_summary.json'

# Overall wall-clock time, the per-configuration result dicts collected by the
# run-all cell, and the moment this summary was written.
summary = dict(
    total_duration_minutes=total_duration,
    results=all_results,
    timestamp=time.strftime('%Y-%m-%d %H:%M:%S'),
)

with open(_summary_file, 'w') as out_file:
    json.dump(summary, out_file, indent=2)

print(f"Summary saved to {_summary_file}")