In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms as T
from pytorch_metric_learning import losses, samplers
import pandas as pd
from PIL import Image
from pathlib import Path
import numpy as np
from tqdm import tqdm
from timm.data import resolve_data_config
import timm
from transformers import AutoImageProcessor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_device():
    """Return the torch device to run on, preferring CUDA over CPU.

    Prints which device was chosen; for CUDA it also prints the name and
    total memory (in GB, base 1e9) of device index 0.

    Returns:
        torch.device: ``cuda`` when ``torch.cuda.is_available()`` is true,
        otherwise ``cpu``.
    """
    # Guard clause: no CUDA means there is nothing to inspect.
    if not torch.cuda.is_available():
        chosen = torch.device("cpu")
        print("Using CPU")
        return chosen

    chosen = torch.device("cuda")
    gpu_name = torch.cuda.get_device_name(0)
    print(f"Using GPU: {gpu_name}")
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU Memory: {total_gb:.2f} GB")
    return chosen

# Dataset

In [3]:
class JaguarTrainDataset(Dataset):
    """Labelled training images described by a CSV file.

    The CSV is read with pandas and must provide a ``filename`` column
    (path relative to ``img_dir``) and a ``ground_truth`` column (identity
    name). Identity names are sorted and numbered from 0 to build
    ``label_map``.

    Attributes set by the constructor:
        df: the loaded DataFrame.
        img_dir: ``Path`` of the image directory.
        transform: optional callable applied to each PIL image.
        label_map: dict mapping identity name -> integer class index.
        num_classes: number of distinct identities.
    """

    def __init__(self, csv_path, img_dir, transform=None):
        self.df = pd.read_csv(csv_path)
        self.img_dir = Path(img_dir)
        self.transform = transform

        # Sorting makes the name -> index assignment independent of row order.
        identities = sorted(self.df['ground_truth'].unique())
        self.label_map = dict(zip(identities, range(len(identities))))
        self.num_classes = len(identities)

        print(f"Found {self.num_classes} unique jaguars")
        print(f"Total training images: {len(self.df)}")

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        The image is opened as RGB and passed through ``transform`` when one
        was supplied; the label is the integer index from ``label_map``.
        """
        record = self.df.iloc[idx]
        image = Image.open(self.img_dir / record['filename']).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, self.label_map[record['ground_truth']]


class JaguarTestDataset(Dataset):
    """Unlabelled images from a directory, used for embedding extraction.

    Only files matching ``*.png`` directly inside ``test_dir`` are picked up;
    they are kept in sorted path order in ``image_files``.
    """

    def __init__(self, test_dir, transform=None):
        self.test_dir = Path(test_dir)
        self.transform = transform

        # Sorted so that dataset indices are stable between runs.
        png_paths = list(self.test_dir.glob('*.png'))
        png_paths.sort()
        self.image_files = png_paths

        print(f"Found {len(self.image_files)} test images")

    def __len__(self):
        """Number of PNG files found in ``test_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``(image, file_name)`` for the ``idx``-th image.

        The image is opened as RGB and passed through ``transform`` when one
        was supplied; ``file_name`` is the bare file name without directory.
        """
        path = self.image_files[idx]
        image = Image.open(path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, path.name



# Transforms

In [4]:
def get_normalization_stats(model_name='dinov2_vitb14'):
    """Look up the image normalization statistics for a DINOv2 variant.

    ``model_name`` is translated to a Hugging Face repository id; any name
    that is not in the table falls back to ``facebook/dinov2-base``. The
    statistics are read from the ``image_mean`` / ``image_std`` attributes of
    the ``AutoImageProcessor`` loaded for that repository and are printed.

    Args:
        model_name: one of ``dinov2_vits14``, ``dinov2_vitb14``,
            ``dinov2_vitl14``.

    Returns:
        dict with keys ``'mean'`` and ``'std'``.
    """
    fallback_repo = 'facebook/dinov2-base'
    hub_to_hf = {
        'dinov2_vits14': 'facebook/dinov2-small',
        'dinov2_vitb14': fallback_repo,
        'dinov2_vitl14': 'facebook/dinov2-large',
    }

    if model_name in hub_to_hf:
        hf_name = hub_to_hf[model_name]
    else:
        hf_name = fallback_repo

    print(f"Loading normalization from {hf_name}...")
    processor = AutoImageProcessor.from_pretrained(hf_name)

    stats = {
        'mean': processor.image_mean,
        'std': processor.image_std,
    }

    print(f"  mean: {stats['mean']}")
    print(f"  std: {stats['std']}")

    return stats

In [5]:
def get_transforms(img_size=224, model_name='dinov2_vitb14'):
    """
    Build the training and evaluation image pipelines.

    Both pipelines resize to a square of side ``img_size``, convert to a
    tensor and normalize with the statistics returned by
    ``get_normalization_stats(model_name)``. The training pipeline also
    applies random flip, rotation, colour jitter and affine augmentation
    between the resize and the tensor conversion.

    Args:
        img_size: Side length of the square output image (default: 224)
        model_name: Pretrained model whose normalization should be matched

    Returns:
        train_transform, test_transform
    """
    stats = get_normalization_stats(model_name)
    mean, std = stats['mean'], stats['std']
    target_size = (img_size, img_size)

    # Random augmentations, applied only while training.
    augmentations = [
        T.RandomHorizontalFlip(p=0.5),
        T.RandomRotation(degrees=15),
        T.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.2, hue=0.1),
        T.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    ]

    train_steps = [T.Resize(target_size)]
    train_steps.extend(augmentations)
    train_steps.append(T.ToTensor())
    train_steps.append(T.Normalize(mean=mean, std=std))
    train_transform = T.Compose(train_steps)

    # Deterministic pipeline for evaluation / embedding extraction.
    test_steps = [
        T.Resize(target_size),
        T.ToTensor(),
        T.Normalize(mean=mean, std=std),
    ]
    test_transform = T.Compose(test_steps)

    return train_transform, test_transform

# Model

In [6]:
class JaguarReIDModel(nn.Module):
    """DINOv2 backbone followed by a linear + batch-norm projection head.

    Args:
        embedding_dim: width of the output embedding.
        backbone: ``'dinov2'`` (ViT-Base, 768 features) or
            ``'dinov2_small'`` (ViT-Small, 384 features).

    Raises:
        ValueError: if ``backbone`` is neither of the supported names.
    """

    def __init__(self, embedding_dim=512, backbone='dinov2'):
        super().__init__()

        # Resolve the variant first, then load it in one place.
        if backbone == 'dinov2':
            banner = "Loading DINOv2 ViT-Base..."
            hub_entry, backbone_dim = 'dinov2_vitb14', 768
        elif backbone == 'dinov2_small':
            banner = "Loading DINOv2 ViT-Small (faster for CPU/MPS)..."
            hub_entry, backbone_dim = 'dinov2_vits14', 384
        else:
            raise ValueError(f"Unknown backbone: {backbone}")

        print(banner)
        self.backbone = torch.hub.load('facebookresearch/dinov2', hub_entry)

        # Map backbone features to the requested embedding width.
        head_layers = [
            nn.Linear(backbone_dim, embedding_dim),
            nn.BatchNorm1d(embedding_dim),
        ]
        self.projection = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run ``x`` through the backbone, then the projection head."""
        return self.projection(self.backbone(x))

# Training

In [15]:
def train_model(train_csv, train_dir, num_epochs=20, batch_size=32,
                embedding_dim=512, lr=1e-4, device='cpu', img_size=224, backbone='dinov2_small'):
    """Train a ``JaguarReIDModel`` with ArcFace loss and return it.

    Args:
        train_csv: CSV path passed to ``JaguarTrainDataset``.
        train_dir: directory holding the training images.
        num_epochs: number of passes over the sampler; also used as
            ``T_max`` of the cosine learning-rate schedule.
        batch_size: DataLoader batch size.
        embedding_dim: width of the embedding produced by the model.
        lr: initial learning rate for both the model and the loss weights.
        device: device (string or ``torch.device``) to train on.
        img_size: side length images are resized to.
        backbone: backbone name understood by ``JaguarReIDModel``.

    Returns:
        The trained model. It is left in training mode; the ArcFace loss
        module (and its learned parameters) is not returned.
    """
    # Setup transforms
    train_transform, _ = get_transforms(img_size=img_size)

    # Create dataset
    train_dataset = JaguarTrainDataset(train_csv, train_dir, train_transform)
    num_classes = train_dataset.num_classes

    # Integer label per row, in row order, for the sampler. A vectorized map
    # replaces a Python-level iterrows() loop and yields the same list.
    labels = train_dataset.df['ground_truth'].map(train_dataset.label_map).tolist()

    # Balanced sampler (4 images per jaguar)
    sampler = samplers.MPerClassSampler(
        labels=labels,
        m=4,  # 4 images per class
        length_before_new_iter=len(train_dataset)
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=16,
        # Pinned host memory only helps host->GPU copies; requesting it
        # without CUDA just triggers a warning.
        pin_memory=torch.device(device).type == 'cuda'
    )

    # Create model
    model = JaguarReIDModel(embedding_dim=embedding_dim,
                            backbone=backbone).to(device)

    # ArcFace loss
    loss_func = losses.ArcFaceLoss(
        num_classes=num_classes,
        embedding_size=embedding_dim,
        margin=28.6,  # degrees
        scale=64
    ).to(device)

    # The loss module has parameters of its own, so both the model and the
    # loss are handed to the optimizer.
    optimizer = torch.optim.Adam([
        {'params': model.parameters(), 'lr': lr},
        {'params': loss_func.parameters(), 'lr': lr}
    ])

    # Learning rate scheduler (stepped once per epoch)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=num_epochs
    )

    # Training loop
    print(f"\nStarting training for {num_epochs} epochs...")
    model.train()

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        num_batches = 0

        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for images, labels_batch in pbar:
            images = images.to(device)
            labels_batch = labels_batch.to(device)

            # Forward pass
            embeddings = model(images)
            loss = loss_func(embeddings, labels_batch)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            epoch_loss += batch_loss
            num_batches += 1

            pbar.set_postfix({'loss': f'{batch_loss:.4f}'})

        # Guard against a loader that yielded no batches.
        avg_loss = epoch_loss / max(num_batches, 1)
        # Report the rate that was in effect during this epoch, then advance.
        current_lr = scheduler.get_last_lr()[0]
        print(f"Epoch {epoch+1}/{num_epochs}, Avg Loss: {avg_loss:.4f}, LR: {current_lr:.6f}")

        scheduler.step()

    return model


# Inference

In [16]:
def extract_test_embeddings(model, test_dir, device='cpu', img_size=224,
                            batch_size=32, num_workers=16):
    """Compute an L2-normalized embedding for every test image.

    Args:
        model: embedding network; it is switched to eval mode and is assumed
            to already live on ``device``.
        test_dir: directory scanned by ``JaguarTestDataset``.
        device: device (string or ``torch.device``) the image batches are
            moved to.
        img_size: side length images are resized to. Should match the size
            used during training; defaults to 224.
        batch_size: DataLoader batch size (default 32).
        num_workers: DataLoader worker processes (default 16).

    Returns:
        dict mapping image file name -> 1-D NumPy embedding of unit L2 norm.
    """
    _, test_transform = get_transforms(img_size=img_size)

    test_dataset = JaguarTestDataset(test_dir, test_transform)
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        # Pinned host memory only helps host->GPU copies.
        pin_memory=torch.device(device).type == 'cuda'
    )

    model.eval()
    embeddings_dict = {}

    print("\nExtracting test embeddings...")
    with torch.no_grad():
        for images, filenames in tqdm(test_loader):
            images = images.to(device)
            embeddings = model(images)

            # Normalize embeddings for cosine similarity
            embeddings = F.normalize(embeddings, p=2, dim=1)

            # One device->host transfer per batch instead of one per image.
            batch_np = embeddings.cpu().numpy()
            for fname, emb in zip(filenames, batch_np):
                embeddings_dict[fname] = emb

    return embeddings_dict


def create_submission(embeddings_dict, test_csv, output_path, expected_rows=137270):
    """Score every query/gallery pair and write the submission CSV.

    The pair list is read from ``test_csv`` (columns ``row_id``,
    ``query_image``, ``gallery_image``). Each pair is scored with the dot
    product of the two embeddings, rescaled from [-1, 1] to [0, 1] and
    clipped to that range. The dot product equals cosine similarity only if
    the embeddings are already unit-length.

    Args:
        embeddings_dict: mapping image file name -> 1-D embedding array; all
            embeddings must have the same length.
        test_csv: path of the CSV listing the pairs to score.
        output_path: where the submission CSV is written.
        expected_rows: required number of rows in the submission. Defaults
            to 137270 (the value previously hard-coded); pass ``None`` to
            skip the row-count check.

    Returns:
        pandas.DataFrame with columns ``row_id`` and ``similarity``.

    Raises:
        KeyError: if an image named in ``test_csv`` has no embedding.
        AssertionError: if the row count or the similarity range is invalid.
    """
    test_df = pd.read_csv(test_csv)

    print("\nComputing similarities...")

    query_names = test_df['query_image'].tolist()
    gallery_names = test_df['gallery_image'].tolist()

    # Distinct image names in first-seen order.
    image_names = list(dict.fromkeys(query_names + gallery_names))

    # Fail early, with context, instead of part-way through the scoring.
    missing = [name for name in image_names if name not in embeddings_dict]
    if missing:
        raise KeyError(
            f"No embedding found for {len(missing)} image(s), e.g. {missing[:5]}"
        )

    num_pairs = len(test_df)
    if image_names:
        row_of = {name: i for i, name in enumerate(image_names)}
        matrix = np.stack([np.asarray(embeddings_dict[name]) for name in image_names])
        query_idx = np.array([row_of[name] for name in query_names], dtype=np.intp)
        gallery_idx = np.array([row_of[name] for name in gallery_names], dtype=np.intp)

        # Row-wise dot products, computed in chunks so the gathered
        # (pairs x dim) operands stay small even for long pair lists.
        chunk = 8192
        raw = np.empty(num_pairs, dtype=matrix.dtype)
        for start in range(0, num_pairs, chunk):
            stop = start + chunk
            raw[start:stop] = np.einsum(
                'ij,ij->i',
                matrix[query_idx[start:stop]],
                matrix[gallery_idx[start:stop]],
            )
    else:
        raw = np.empty(0, dtype=np.float64)

    # Map from [-1, 1] to [0, 1], then clip away rounding overshoot.
    similarities = np.clip((raw + 1) / 2, 0.0, 1.0)

    # Create submission
    submission = pd.DataFrame({
        'row_id': test_df['row_id'],
        'similarity': similarities
    })

    # Validate
    if expected_rows is not None:
        assert len(submission) == expected_rows, f"Wrong number of rows: {len(submission)}"
    assert (submission['similarity'] >= 0).all(), "Found negative values"
    assert (submission['similarity'] <= 1).all(), "Found values > 1"

    # Save
    submission.to_csv(output_path, index=False)

    print(f"\n✓ Submission saved to {output_path}")
    print(f"  Rows: {len(submission):,}")
    print(f"  Similarity range: [{submission['similarity'].min():.4f}, {submission['similarity'].max():.4f}]")
    print(f"  Similarity mean: {submission['similarity'].mean():.4f}")

    return submission



# Execution

In [None]:
def main():
    """Run the whole pipeline: train, save weights, embed the test set, write the submission.

    All paths and hyperparameters are defined as local constants below.
    The global torch and NumPy random generators are seeded first so that
    repeated runs start from the same random state (this alone does not
    guarantee bit-identical results, e.g. with non-deterministic GPU kernels).
    """
    # Paths
    TRAIN_CSV = 'jaguar-re-id/train.csv'
    TRAIN_DIR = 'jaguar-re-id/train/train'
    TEST_CSV = 'jaguar-re-id/test.csv'
    TEST_DIR = 'jaguar-re-id/test/test'
    OUTPUT_CSV = 'submission.csv'
    MODEL_PATH = 'jaguar_reid_model.pth'

    # Hyperparameters
    NUM_EPOCHS = 200
    BATCH_SIZE = 64
    EMBEDDING_DIM = 512
    LEARNING_RATE = 1e-4
    IMG_SIZE = 224
    BACKBONE = 'dinov2'
    SEED = 42

    # Seed the random generators before any model/sampler is created.
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    # Get device
    DEVICE = get_device()

    # Train model
    model = train_model(
        train_csv=TRAIN_CSV,
        train_dir=TRAIN_DIR,
        num_epochs=NUM_EPOCHS,
        batch_size=BATCH_SIZE,
        embedding_dim=EMBEDDING_DIM,
        lr=LEARNING_RATE,
        device=DEVICE,
        img_size=IMG_SIZE,
        backbone=BACKBONE
    )

    # Save model weights (state_dict only)
    torch.save(model.state_dict(), MODEL_PATH)
    print(f"\n✓ Model saved to {MODEL_PATH}")

    # Extract test embeddings
    embeddings_dict = extract_test_embeddings(model, TEST_DIR, device=DEVICE)

    # Create submission (the returned frame is not needed here)
    create_submission(embeddings_dict, TEST_CSV, OUTPUT_CSV)

    print("\n✓ Done!")


# Entry point: run the full pipeline when this code is executed as the main program.
if __name__ == '__main__':
    main()

Using GPU: NVIDIA GeForce RTX 4090
GPU Memory: 25.25 GB
Loading normalization from facebook/dinov2-base...


  mean: [0.485, 0.456, 0.406]
  std: [0.229, 0.224, 0.225]
Found 31 unique jaguars
Total training images: 1895
Loading DINOv2 ViT-Base...


Using cache found in /root/.cache/torch/hub/facebookresearch_dinov2_main



Starting training for 200 epochs...


Epoch 1/200: 100%|██████████| 30/30 [00:37<00:00,  1.26s/it, loss=36.4938]


Epoch 1/200, Avg Loss: 34.7338, LR: 0.000100


Epoch 2/200: 100%|██████████| 30/30 [00:36<00:00,  1.22s/it, loss=34.9135]


Epoch 2/200, Avg Loss: 33.0621, LR: 0.000100


Epoch 3/200:  53%|█████▎    | 16/30 [00:21<00:04,  3.32it/s, loss=31.7813]