# Siamese CNN Training for PhotoTriage
End-to-end training of a Siamese CNN (VGG16 or ResNet50 backbone) for pairwise image-quality ranking. This notebook configures and trains the VGG16 variant.

## Setup
This notebook:
1. Uses the existing PhotoTriage dataset on Kaggle (https://www.kaggle.com/datasets/ericwolter/triage)
2. Clones and installs sim_bench package
3. Trains Siamese CNN + MLP end-to-end
4. Saves results and plots

## Before Running
1. **Add the dataset**: Click "+ Add Data" → Search "triage" → Add the dataset by ericwolter
2. **Enable GPU**: Settings → Accelerator → GPU T4 x2
3. **Enable Internet**: Settings → Internet → On (to clone GitHub repo)


In [None]:
# Report the PyTorch build and whether a CUDA device is visible to this kernel
import torch

cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    # Only the first device (index 0) is named here
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")


## 1. Verify PhotoTriage Dataset


In [None]:
# Use existing PhotoTriage dataset from Kaggle
# Dataset: https://www.kaggle.com/datasets/ericwolter/triage
# Add it to your notebook: Click "+ Add Data" → Search "triage" → Add

from pathlib import Path
import os

# Check for the dataset
dataset_path = Path('/kaggle/input/triage')
if dataset_path.exists():
    print(f"✓ Dataset found: {dataset_path}")
    print("\nDataset contents:")
    !ls -lh {dataset_path}
    
    # Check subdirectories
    for subdir in ['train_val', 'test']:
        subdir_path = dataset_path / subdir
        if subdir_path.exists():
            img_count = len(list(subdir_path.rglob('*.JPG'))) + len(list(subdir_path.rglob('*.jpg')))
            print(f"\n{subdir}: {img_count} images")
else:
    print("❌ Dataset not found!")
    print("\nTo add the dataset:")
    print("1. Click '+ Add Data' in the right sidebar")
    print("2. Search for 'triage' or 'ericwolter/triage'")
    print("3. Click 'Add' on the PhotoTriage dataset")
    print("4. Re-run this cell")


## 2. Clone and Install sim_bench


In [None]:
# Clone the repository
!git clone https://github.com/YOUR_USERNAME/sim-bench.git /kaggle/working/sim-bench
%cd /kaggle/working/sim-bench


In [None]:
# Install dependencies
# `-e .` installs the project found in the current working directory in editable
# mode, so this cell must run after the `%cd /kaggle/working/sim-bench` cell.
%pip install -e .
# Additional packages installed explicitly; no versions are pinned here.
%pip install pandas pillow pyyaml matplotlib seaborn


## 3. Create Configuration


In [None]:
import yaml
from pathlib import Path

# Directory that receives the generated YAML config
config_dir = Path('/kaggle/working/configs')
config_dir.mkdir(parents=True, exist_ok=True)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# `torch` is imported by the GPU-check cell at the top of the notebook.
if torch.cuda.is_available():
    device_name = 'cuda'
else:
    device_name = 'cpu'

# VGG16 configuration (paper replication), assembled one section at a time
data_section = {
    'root_dir': '/kaggle/input/triage',
    'min_agreement': 0.7,
    'min_reviewers': 2,
    'split_mode': 'series_based',
    'quick_experiment': None,
}

model_section = {
    'cnn_backbone': 'vgg16',
    'pretrained': True,
    'use_paper_preprocessing': True,
    'padding_mean_color': [0.460, 0.450, 0.430],
    'mlp_hidden_dims': [128, 128],
    'dropout': 0.0,
    'activation': 'tanh',
}

training_section = {
    'batch_size': 32,
    'learning_rate': 0.001,
    'optimizer': 'sgd',
    'momentum': 0.9,
    'weight_decay': 0.0005,
    'max_epochs': 30,
    'early_stop_patience': 5,
}

vgg16_config = {
    'name': 'siamese_e2e_vgg16_kaggle',
    'data': data_section,
    'model': model_section,
    'training': training_section,
    'output_dir': '/kaggle/working/outputs/siamese_e2e_vgg16',
    'device': device_name,
    'seed': 42,
    'log_interval': 10,
}

# Serialize in block style and write the config in one step
config_path = config_dir / 'vgg16_kaggle.yaml'
config_path.write_text(yaml.dump(vgg16_config, default_flow_style=False))

print(f"Config saved to: {config_path}")
print(f"Device: {vgg16_config['device']}")


In [None]:
# 4. Quick test: run with 10% of data to verify everything works before the full run.
# Reads the YAML written by the configuration cell. The --output-dir flag
# presumably overrides the config's output_dir so this run lands in
# outputs/quick_test (confirm against the train_siamese_e2e CLI).
!python -m sim_bench.training.train_siamese_e2e --config /kaggle/working/configs/vgg16_kaggle.yaml --quick-experiment 0.1 --output-dir /kaggle/working/outputs/quick_test


## 5. Full Training


In [None]:
# Run full training
# No --output-dir is passed here. The config sets output_dir to
# /kaggle/working/outputs/siamese_e2e_vgg16, which is the directory the results
# cell later checks for results.json and best_model.pt.
!python -m sim_bench.training.train_siamese_e2e --config /kaggle/working/configs/vgg16_kaggle.yaml


## 6. Load and Visualize Results


In [None]:
import json
from pathlib import Path

# Artifacts expected from the full training run
output_dir = Path('/kaggle/working/outputs/siamese_e2e_vgg16')
results_file = output_dir / 'results.json'
model_file = output_dir / 'best_model.pt'

if not results_file.exists():
    print(f"Results not found: {results_file}")
else:
    # Parse the metrics file and report the test-set numbers
    results = json.loads(results_file.read_text())

    print("\n=== Final Results ===")
    print(f"Test Accuracy: {results['test_acc']:.3f}")
    print(f"Test Loss: {results['test_loss']:.4f}")

    # Report the saved checkpoint, if any. It is loaded onto the CPU so this
    # cell does not need a GPU.
    if model_file.exists():
        checkpoint = torch.load(model_file, map_location='cpu')
        # The stored epoch is shown 1-based here
        print(f"\nBest model from epoch: {checkpoint['epoch'] + 1}")
        print(f"Validation accuracy: {checkpoint['val_acc']:.3f}")


## 7. Package and Download Results


In [None]:
import shutil
import os

# Bundle everything under the outputs directory into a single zip archive.
# make_archive appends the '.zip' extension itself and returns the archive path.
output_zip = '/kaggle/working/siamese_training_results'
zip_file = shutil.make_archive(
    base_name=output_zip,
    format='zip',
    root_dir='/kaggle/working/outputs',
)

# Archive size converted from bytes to mebibytes for display
size_mb = os.path.getsize(zip_file) / (1024 * 1024)

print(f"Results packaged: {zip_file}")
print(f"File size: {size_mb:.2f} MB")
print("\nDownload this file from the Kaggle output section")
