In [1]:
# 1. Check GPU
import torch
print(f'CUDA: {torch.cuda.is_available()}')
if torch.cuda.is_available(): print(f'GPU: {torch.cuda.get_device_name(0)}')
else: raise RuntimeError('Enable GPU in Runtime > Change runtime type!')

CUDA: True
GPU: Tesla T4


In [2]:
# 2. Install packages
!pip install -q nltk gensim pillow tqdm
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)
print('Packages installed!')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hPackages installed!


In [3]:
# 3. Mount Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# 4. Copy data (only processed data needed, not raw WikiArt)
import os
DRIVE_PATH = '/content/drive/MyDrive/artemis-captioning'
!mkdir -p /content/artemis
!cp -r "{DRIVE_PATH}/data" /content/artemis/
!cp -r "{DRIVE_PATH}/utils" /content/artemis/
!cp -r "{DRIVE_PATH}/models" /content/artemis/
!cp "{DRIVE_PATH}/train.py" /content/artemis/
print('Data copied!')

Data copied!


In [16]:
# 4b. CRITICAL FIX: Patch cnn_lstm.py to fix contiguous tensor issue
# This fixes "rnn: hx is not contiguous" error
import re

cnn_file = '/content/artemis/models/cnn_lstm.py'
with open(cnn_file, 'r') as f:
    content = f.read()

# Fix the non-contiguous hidden state issue
old_pattern = r'\(h\[:, :batch_size_t\], c\[:, :batch_size_t\]\)'
new_pattern = '(h[:, :batch_size_t].contiguous(), c[:, :batch_size_t].contiguous())'

if '.contiguous()' not in content:
    content = re.sub(old_pattern, new_pattern, content)
    with open(cnn_file, 'w') as f:
        f.write(content)
    print('✓ Patched cnn_lstm.py with contiguous() fix')
else:
    print('✓ cnn_lstm.py already has contiguous() fix')

✓ cnn_lstm.py already has contiguous() fix


In [31]:
# 5. Setup paths
import sys
os.chdir('/content/artemis')
sys.path.insert(0, '/content/artemis')
print(f'Working directory: {os.getcwd()}')

Working directory: /content/artemis


In [38]:
# Run this in Colab to diagnose the filename encoding issue
# Paste this into a cell and run it

import os
import json
from pathlib import Path

# Check what's in the JSON
splits_dir = Path('/content/artemis/data/processed/splits')
images_dir = Path('/content/artemis/data/processed/images')

print("=" * 70)
print("DIAGNOSING FILENAME ENCODING ISSUE")
print("=" * 70)

# 1. Sample filenames from JSON
print("\n1. SAMPLE NAMES FROM JSON (train.json):")
with open(splits_dir / 'train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Find some with special characters
special_char_paintings = []
for p in data['paintings']:
    name = p['painting']
    if any(ord(c) > 127 for c in name):
        special_char_paintings.append((p['style'], name))
        if len(special_char_paintings) >= 5:
            break

for style, name in special_char_paintings:
    print(f"  JSON: {style}/{name}.jpg")
    # Check if file exists
    path = images_dir / style / f"{name}.jpg"
    print(f"  Exists: {path.exists()}")

# 2. Sample actual filenames from disk
print("\n2. SAMPLE ACTUAL FILENAMES ON DISK:")
for style_dir in sorted(images_dir.iterdir())[:3]:
    if style_dir.is_dir():
        files = list(style_dir.glob('*.jpg'))[:3]
        for f in files:
            name = f.name
            # Show hex of any special chars
            if any(ord(c) > 127 for c in name):
                print(f"  DISK: {style_dir.name}/{name}")
                print(f"        Bytes: {name.encode('utf-8')[:50]}")

# 3. Find mismatches for one problematic artist
print("\n3. CHECKING JOAQUÍN SOROLLA FILES:")
target_style = "Impressionism"
target_prefix = "joaqu"

# What's in JSON?
json_names = []
for p in data['paintings']:
    if p['style'] == target_style and target_prefix in p['painting'].lower():
        json_names.append(p['painting'])

print(f"  Found {len(json_names)} in JSON with '{target_prefix}'")
if json_names:
    print(f"  Example: {json_names[0]}")
    print(f"  Bytes: {json_names[0].encode('utf-8')}")

# What's on disk?
disk_names = []
style_path = images_dir / target_style
if style_path.exists():
    for f in style_path.glob('*.jpg'):
        if target_prefix in f.stem.lower():
            disk_names.append(f.stem)

print(f"  Found {len(disk_names)} on disk with '{target_prefix}'")
if disk_names:
    print(f"  Example: {disk_names[0]}")
    print(f"  Bytes: {disk_names[0].encode('utf-8')}")

# 4. Compare byte representations
print("\n4. BYTE COMPARISON:")
if json_names and disk_names:
    j = json_names[0]
    d = disk_names[0]
    print(f"  JSON name bytes: {j.encode('utf-8')}")
    print(f"  DISK name bytes: {d.encode('utf-8')}")
    print(f"  Are they equal? {j == d}")


DIAGNOSING FILENAME ENCODING ISSUE

1. SAMPLE NAMES FROM JSON (train.json):
  JSON: Impressionism/joaquã­n-sorolla_the-tunny-catch-1919.jpg
  Exists: False
  JSON: Impressionism/joaquã­n-sorolla_children-on-the-seashore-1903.jpg
  Exists: False
  JSON: Impressionism/joaquã­n-sorolla_square-of-valencia.jpg
  Exists: False
  JSON: Impressionism/joaquã­n-sorolla_on-the-beach-at-valencia-1910.jpg
  Exists: False
  JSON: Romanticism/arnold-bã¶cklin_portrait-of-angela-b-cklin-in-red-fishnet.jpg
  Exists: False

2. SAMPLE ACTUAL FILENAMES ON DISK:

3. CHECKING JOAQUÍN SOROLLA FILES:
  Found 62 in JSON with 'joaqu'
  Example: joaquã­n-sorolla_the-tunny-catch-1919
  Bytes: b'joaqu\xc3\xa3\xc2\xadn-sorolla_the-tunny-catch-1919'
  Found 81 on disk with 'joaqu'
  Example: joaquã­n-sorolla_study-for-the-comeback-of-the-fisheries-1894
  Bytes: b'joaqua\xcc\x83\xc2\xadn-sorolla_study-for-the-comeback-of-the-fisheries-1894'

4. BYTE COMPARISON:
  JSON name bytes: b'joaqu\xc3\xa3\xc2\xadn-sorolla_the-

In [39]:
# 6. Imports (with forced reload to ensure fresh code)
import torch, json, time, gc
import numpy as np
import importlib

# Import modules
from utils import text_preprocessing, data_loader
from models import cnn_lstm, vision_transformer
import train

# Force reload to use the updated code (not cached versions)
importlib.reload(cnn_lstm)
importlib.reload(vision_transformer)
importlib.reload(train)

# Now import the classes/functions we need
from utils.text_preprocessing import TextPreprocessor
from utils.data_loader import create_data_loaders
from models.cnn_lstm import create_model as create_cnn_model
from models.vision_transformer import VisionTransformerCaptioning
from train import Trainer

print('All imports OK (with forced reload)!')

All imports OK (with forced reload)!


In [40]:
# 7. Define all 4 training configurations
CONFIGS = {
    1: {
        'name': 'colab_cnn_high_lr',
        'model_type': 'cnn_lstm',
        'description': 'CNN+LSTM High LR - Best from training analysis',
        'batch_size': 32,
        'num_images': 15000,
        'epochs': 30,
        'learning_rate': 3e-4,
        'embed_dim': 512,
        'hidden_dim': 1024,
        'attention_dim': 512,
        'dropout': 0.4,
    },
    2: {
        'name': 'colab_cnn_glove',
        'model_type': 'cnn_lstm',
        'description': 'CNN+LSTM with GloVe Embeddings',
        'batch_size': 32,
        'num_images': 15000,
        'epochs': 30,
        'learning_rate': 2e-4,
        'embed_dim': 300,  # GloVe dimension
        'hidden_dim': 512,
        'attention_dim': 256,
        'dropout': 0.3,
        'use_glove': True,
    },
    3: {
        'name': 'colab_vit_standard',
        'model_type': 'vit',
        'description': 'Vision Transformer Standard (6 layers)',
        'batch_size': 32,
        'num_images': 15000,
        'epochs': 25,
        'learning_rate': 2e-4,
        'embed_dim': 256,
        'num_heads': 8,
        'num_layers': 6,
        'ff_dim': 1024,
        'dropout': 0.1,
    },
    4: {
        'name': 'colab_vit_compact',
        'model_type': 'vit',
        'description': 'Vision Transformer Compact (4 layers, higher LR)',
        'batch_size': 32,
        'num_images': 15000,
        'epochs': 25,
        'learning_rate': 3e-4,
        'embed_dim': 256,
        'num_heads': 8,
        'num_layers': 4,
        'ff_dim': 512,
        'dropout': 0.1,
    },
}

print('Configurations defined:')
for k, v in CONFIGS.items():
    print(f"  {k}: {v['name']} - {v['description']}")

Configurations defined:
  1: colab_cnn_high_lr - CNN+LSTM High LR - Best from training analysis
  2: colab_cnn_glove - CNN+LSTM with GloVe Embeddings
  3: colab_vit_standard - Vision Transformer Standard (6 layers)
  4: colab_vit_compact - Vision Transformer Compact (4 layers, higher LR)


In [41]:
# 8. Helper class for limiting batches
class LimitedLoader:
    def __init__(self, loader, max_batches):
        self.loader = loader
        self.max_batches = max_batches
        self.batch_size = loader.batch_size
        self.dataset = loader.dataset
    def __iter__(self):
        for i, b in enumerate(self.loader):
            if i >= self.max_batches: break
            yield b
    def __len__(self): return min(len(self.loader), self.max_batches)

print('LimitedLoader defined')

LimitedLoader defined


In [42]:
# 9. Load vocabulary (shared across all models)
text_proc = TextPreprocessor()
text_proc.load_vocabulary('data/processed/vocabulary.json')
print(f'Vocabulary size: {text_proc.vocab_size}')

# Load GloVe embeddings for config 2 (convert to tensor)
glove_matrix = None
if os.path.exists('data/embeddings/glove_embeddings.npy'):
    glove_np = np.load('data/embeddings/glove_embeddings.npy')
    glove_matrix = torch.tensor(glove_np, dtype=torch.float32)
    print(f'GloVe embeddings loaded: {glove_matrix.shape}')
else:
    print('Warning: GloVe embeddings not found')


✓ Vocabulary loaded from: data/processed/vocabulary.json
  - Vocabulary size: 10000
  - Max caption length: 30
Vocabulary size: 10000
GloVe embeddings loaded: torch.Size([10000, 300])


In [43]:
# 10. Training function
def train_config(config_num, text_proc, glove_matrix=None):
    """Train a single configuration."""
    config = CONFIGS[config_num]
    name = config['name']

    print(f"\n{'='*70}")
    print(f"STARTING CONFIG {config_num}: {name}")
    print(f"Description: {config['description']}")
    print(f"{'='*70}")

    # Create data loaders
    loaders = create_data_loaders(
        text_preprocessor=text_proc,
        batch_size=config['batch_size'],
        num_workers=2,
        splits=['train', 'val']
    )
    train_loader = LimitedLoader(loaders['train'], config['num_images'] // config['batch_size'])
    val_loader = LimitedLoader(loaders['val'], 50)
    print(f'Train batches: {len(train_loader)}, Val batches: {len(val_loader)}')

    # Create model
    if config['model_type'] == 'cnn_lstm':
        if config.get('use_glove') and glove_matrix is not None:
            model = create_cnn_model(
                embedding_matrix=glove_matrix,
                vocab_size=text_proc.vocab_size,
                embed_dim=config['embed_dim'],
                decoder_dim=config['hidden_dim'],
                attention_dim=config['attention_dim'],
                dropout=config['dropout']
            )
        else:
            model = create_cnn_model(
                vocab_size=text_proc.vocab_size,
                embed_dim=config['embed_dim'],
                decoder_dim=config['hidden_dim'],
                attention_dim=config['attention_dim'],
                dropout=config['dropout']
            )
    else:  # vit
        model = VisionTransformerCaptioning(
            vocab_size=text_proc.vocab_size,
            embed_dim=config['embed_dim'],
            num_heads=config['num_heads'],
            encoder_layers=config['num_layers'],
            decoder_layers=config['num_layers'],
            mlp_ratio=4,
            max_length=30,
            dropout=config['dropout'],
            img_size=128,
            patch_size=16
        )

    model = model.to('cuda')
    print(f'Model params: {sum(p.numel() for p in model.parameters()):,}')

    # Create directories
    os.makedirs(f"checkpoints/{name}", exist_ok=True)
    os.makedirs(f"outputs/{name}", exist_ok=True)

    # Train
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        text_preprocessor=text_proc,
        learning_rate=config['learning_rate'],
        device='cuda',
        checkpoint_dir=f"checkpoints/{name}",
        output_dir=f"outputs/{name}"
    )

    t0 = time.time()
    history = trainer.train(num_epochs=config['epochs'])
    duration = (time.time() - t0) / 60

    # Results
    best_bleu = max(history['val_bleu'])
    best_loss = min(history['val_loss'])

    print(f"\n{'='*70}")
    print(f"COMPLETED: {name}")
    print(f"Duration: {duration:.1f} min")
    print(f"Best BLEU: {best_bleu:.4f}")
    print(f"Best Loss: {best_loss:.4f}")
    print(f"{'='*70}")

    # Save to Drive immediately
    DRIVE_PATH = '/content/drive/MyDrive/artemis-captioning'
    os.system(f'mkdir -p "{DRIVE_PATH}/checkpoints"')
    os.system(f'mkdir -p "{DRIVE_PATH}/outputs"')
    os.system(f'cp -r "checkpoints/{name}" "{DRIVE_PATH}/checkpoints/"')
    os.system(f'cp -r "outputs/{name}" "{DRIVE_PATH}/outputs/"')
    print(f'Saved to Drive: {DRIVE_PATH}')

    # Cleanup GPU memory
    del model, trainer, train_loader, val_loader, loaders
    gc.collect()
    torch.cuda.empty_cache()

    return {'name': name, 'duration': duration, 'best_bleu': best_bleu, 'best_loss': best_loss}

print('Training function defined')

Training function defined


In [44]:
# 11. RUN ALL 4 CONFIGURATIONS SEQUENTIALLY
all_results = []
total_start = time.time()

for config_num in [1, 2, 3, 4]:
    try:
        result = train_config(config_num, text_proc, glove_matrix)
        all_results.append(result)
    except Exception as e:
        print(f"ERROR in config {config_num}: {e}")
        all_results.append({'name': CONFIGS[config_num]['name'], 'error': str(e)})

total_duration = (time.time() - total_start) / 60

print(f"\n\n{'='*70}")
print("ALL TRAINING COMPLETE!")
print(f"{'='*70}")
print(f"Total time: {total_duration:.1f} minutes ({total_duration/60:.1f} hours)")
print("\nResults Summary:")
print("-" * 50)
for r in all_results:
    if 'error' in r:
        print(f"  {r['name']}: ERROR - {r['error']}")
    else:
        print(f"  {r['name']}: BLEU={r['best_bleu']:.4f}, Loss={r['best_loss']:.4f}, Time={r['duration']:.1f}min")
print("-" * 50)


STARTING CONFIG 1: colab_cnn_high_lr
Description: CNN+LSTM High LR - Best from training analysis

CREATING DATA LOADERS

TRAIN DataLoader:
  - Loaded 68108 image-caption pairs from train.json
  - Samples: 68,108
  - Paintings: 11,991
  - Batches: 2,128
  - Batch size: 32
  - Shuffle: True

VAL DataLoader:
  - Loaded 8575 image-caption pairs from val.json
  - Samples: 8,575
  - Paintings: 1,490
  - Batches: 268
  - Batch size: 32
  - Shuffle: False
Train batches: 468, Val batches: 50
Model params: 43,869,009

COMPLETED: colab_cnn_high_lr
Duration: 65.8 min
Best BLEU: 0.0197
Best Loss: 3.8454
Saved to Drive: /content/drive/MyDrive/artemis-captioning

STARTING CONFIG 2: colab_cnn_glove
Description: CNN+LSTM with GloVe Embeddings

CREATING DATA LOADERS

TRAIN DataLoader:
  - Loaded 68108 image-caption pairs from train.json
  - Samples: 68,108
  - Paintings: 11,991
  - Batches: 2,128
  - Batch size: 32
  - Shuffle: True

VAL DataLoader:
  - Loaded 8575 image-caption pairs from val.json
  -

In [45]:
# 12. Save final summary to Drive
import json
DRIVE_PATH = '/content/drive/MyDrive/artemis-captioning'

summary = {
    'total_duration_minutes': total_duration,
    'results': all_results,
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
}

with open(f'{DRIVE_PATH}/colab_training_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"Summary saved to {DRIVE_PATH}/colab_training_summary.json")

Summary saved to /content/drive/MyDrive/artemis-captioning/colab_training_summary.json
