In [None]:
# Mount Google Drive at /content/drive (the project paths used later live under it).
# If already mounted this will show "Drive is already mounted" — that's fine.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install packages that are not pre-installed in Colab
# (torch, torchvision, numpy, Pillow, requests are already available)
!pip install -q git+https://github.com/openai/CLIP.git ftfy

# Download Uppsala Collection & Semantic Image Search with CLIP

In this notebook, you will:

1. **Download** a Swedish collection from Europeana at thumbnail resolution (for example Uppsala University; the configuration below defaults to the Museum of Gothenburg)
2. **Learn** how CLIP connects images and text
3. **Search** the collection using natural language (e.g., "waterbody", "portrait")
4. **Explore** how different queries find different images

---

## What is CLIP?

**CLIP** (Contrastive Language-Image Pre-training) is a neural network trained by OpenAI that learns to connect images and text. It can:

- **Understand images** by converting them into numerical representations (embeddings)
- **Understand text** by converting descriptions into the same embedding space
- **Match** images and text by measuring how similar their embeddings are

This allows us to search for images using natural language descriptions like:
- "a painting of a stormy sea"
- "winter landscape with snow"
- "flowers in a vase"

### How CLIP Works

```mermaid
flowchart LR
    subgraph Input
        IMG["🖼️ Image"]
        TXT["📝 Text\n'a river landscape'"]
    end
    
    subgraph CLIP["CLIP Model"]
        IE["Image\nEncoder"]
        TE["Text\nEncoder"]
    end
    
    subgraph Embeddings["Embedding Space"]
        IV["[0.12, -0.45, 0.78, ...]"]
        TV["[0.11, -0.42, 0.81, ...]"]
    end
    
    SIM["📊 Cosine\nSimilarity\n= 0.94"]
    
    IMG --> IE --> IV
    TXT --> TE --> TV
    IV --> SIM
    TV --> SIM
```

The key insight: **similar concepts end up close together** in embedding space, whether they come from images or text!

---

## Pre-calculated Embeddings

Computing image embeddings requires significant computational resources (ideally a GPU). For this workshop, we use **pre-calculated embeddings**:

- Image embeddings were computed beforehand by the instructor
- You only need to compute **text embeddings** (fast on any laptop)
- This makes the workshop accessible on any hardware!

If you want to compute your own embeddings, see **Notebook 04 (Advanced)**.

---

## Part 1: Setup and Imports

In [None]:
# Standard library imports
import os
import json
import time
from pathlib import Path

# External libraries
import numpy as np
import requests
from PIL import Image as PILImage
from IPython.display import display, Image, HTML

# Import CLIP. It is treated as optional here: if the import fails the cell
# still completes and CLIP_AVAILABLE records the outcome.
try:
    import torch
    import clip
    CLIP_AVAILABLE = True
    print("‚úì CLIP loaded successfully!")
except ImportError:
    CLIP_AVAILABLE = False
    print("‚ö†Ô∏è CLIP not installed.")
    print("  Install with: pip install git+https://github.com/openai/CLIP.git torch torchvision")

# Select compute device: CUDA GPU > Apple Silicon GPU > CPU.
# DEVICE is always bound; it stays None when CLIP/torch could not be imported,
# so later code can test it instead of failing with a NameError.
DEVICE = None
if CLIP_AVAILABLE:
    if torch.cuda.is_available():
        DEVICE = 'cuda'
        print(f"‚úì NVIDIA GPU (CUDA): {torch.cuda.get_device_name(0)}")
    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        # hasattr guard: torch.backends may not expose `mps` at all
        DEVICE = 'mps'
        print("‚úì Apple Silicon GPU (MPS) ‚Äî good performance!")
    else:
        DEVICE = 'cpu'
        print("‚ÑπÔ∏è No GPU detected. Using CPU ‚Äî fine for text search in this workshop.")

In [None]:
# Base locations on the mounted Google Drive
PROJECT_ROOT = Path("/content/drive/MyDrive") / "Distant_viewing"
DATA_DIR     = PROJECT_ROOT.joinpath("data")

# Echo the resolved paths so a wrong location is easy to spot
print(f"Project root: {PROJECT_ROOT}")
print(f"Data dir:     {DATA_DIR}")

---

## Part 2: Download from a Swedish collection


  1. Swedish National Heritage Board                             (1,414,618 items)
  2. Nordic Museum Foundation                                    (351,553 items)
  3. Malmö Museum                                                (288,308 items)
  4. Museum of Ethnography                                       (271,525 items)
  5. Museum of World Culture                                     (230,176 items)
  6. Upplands Museum                                             (218,928 items)
  7. Jamtli                                                      (173,095 items)
  8. Museum of Gothenburg                                        (171,960 items)
  9. Swedish National Museum of Science and Technology           (150,809 items)
 10. Swedish Railway Museum                                      (145,093 items)
 11. Naval Museum                                                (143,623 items)
 12. Bohuslän Museum                                             (140,521 items)
 13. Gävleborg County Museum                                     (133,207 items)
 14. Kulturen                                                    (130,732 items)
 15. Örebro County Museum                                        (125,272 items)
 16. Västergötlands Museum                                       (117,479 items)
 17. Army Museum                                                 (114,513 items)
 18. Sörmland Museum                                             (107,177 items)
 19. National Maritime Museum                                    (103,829 items)
 20. Vänersborgs museum                                          (103,249 items)
 21. The Museum of Mediterranean and Near Eastern Antiquities... (74,761 items)
 22. Museum of Far Eastern Antiquities                           (74,426 items)
 23. Uppsala University                                          (74,233 items)
 24. Hälsinglands Museum                                         (73,678 items)
 25. Swedish Centre for Architecture and Design                  (67,581 items)

In [None]:
# API setup (same as Notebook 01)
BASE_URL = "https://api.europeana.eu/record/v2"

# Candidate locations for a personal API key file, checked in this order
NOTEBOOK_DIR = PROJECT_ROOT
API_KEY_LOCATIONS = [
    NOTEBOOK_DIR / "api-key-europeana.txt",
    PROJECT_ROOT / "misc" / "api-key-europeana.txt",
]

# Start from the demo key; the first file holding a different, non-empty key wins
API_KEY = "api2demo"
for key_file in API_KEY_LOCATIONS:
    if not key_file.exists():
        continue
    custom_key = key_file.read_text().strip()
    if custom_key in ("", "api2demo"):
        continue
    API_KEY = custom_key
    print(f"‚úì API key loaded from {key_file}")
    break
else:
    # Reached only when the loop never hit `break`, i.e. no usable key file was found
    print("‚ÑπÔ∏è Using demo API key")

In [None]:
# API and helper functions

def search_europeana(query="*", rows=12, reusability="open", qf=None,
                     profile="standard", cursor=None, theme=None):
    """
    Query the Europeana search endpoint and return the decoded JSON response.

    `rows` is capped at 100. The optional filters (`reusability`, `qf`,
    `cursor`, `theme`) are only sent when they are truthy. On any
    `requests` error the error is printed and None is returned.
    """
    params = {
        "wskey": API_KEY,
        "query": query,
        "rows": min(rows, 100),
        "profile": profile,
    }
    # Optional filters, added in a fixed order and only when set
    optional_filters = (
        ("reusability", reusability),
        ("qf", qf),
        ("cursor", cursor),
        ("theme", theme),
    )
    params.update({name: value for name, value in optional_filters if value})

    try:
        response = requests.get(f"{BASE_URL}/search.json", params=params, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error: {e}")
        return None

    try:
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error: {e}")
        return None


def get_item_title(item):
    """Return the item's title (first entry when it is a list), or "Untitled" if missing/empty."""
    title = item.get('title')
    if not title:
        return "Untitled"
    if isinstance(title, list):
        return title[0]
    return title


def get_item_creator(item):
    """Return the item's creator (first entry when it is a list), or "Unknown" if missing/empty."""
    creator = item.get('dcCreator')
    if creator:
        return creator[0] if isinstance(creator, list) else creator
    return "Unknown"


def get_item_year(item):
    """Return the item's year (first entry when it is a list), or "n.d." if missing/empty."""
    year = item.get('year')
    if not year:
        return "n.d."
    return year[0] if isinstance(year, list) else year


def get_item_preview(item):
    """Return the item's preview URL from 'edmPreview' (first entry when it is a list), or None."""
    preview = item.get('edmPreview')
    if not preview:
        return None
    if isinstance(preview, list):
        return preview[0]
    return preview


def sanitize_filename(name):
    """
    Reduce `name` to characters that are safe in a file name.

    Keeps alphanumeric characters plus space, dot, underscore and hyphen,
    trims surrounding whitespace and limits the result to 80 characters.

    Returns "unknown" when `name` is empty/None or when nothing is left after
    filtering (e.g. a name made only of punctuation), so callers never get an
    empty file-name component.
    """
    if not name:
        return "unknown"
    safe = "".join(c for c in name if c.isalnum() or c in ' ._-')
    # Strip again after truncating: the cut at 80 characters can land right after a space.
    safe = safe.strip()[:80].rstrip()
    return safe or "unknown"


# Confirmation that this cell ran to the end, i.e. the helper functions above are defined.
print("‚úì Functions ready")

### Function definition: download

In [None]:
def download_collection(
    collection_name,
    theme=None,
    keyword=None,
    max_images=200,
    output_dir=None,
    delay=0.3
):
    """
    Download thumbnail images from a specific Europeana institution.

    Results are fetched page by page with the API cursor (at most 100 items per
    request). A file that already exists with non-zero size is not downloaded
    again, so the function can be re-run to resume. A failed download is counted
    and reported, and any partial file is removed so that the next run retries it
    instead of treating it as already downloaded.

    Parameters:
        collection_name : DATA_PROVIDER name in Europeana (e.g. "Museum of Gothenburg")
        theme           : Thematic collection filter (e.g. "art"), or None for all
        keyword         : Optional keyword to narrow results (e.g. "river"), or None
        max_images      : Maximum number of search results to process. Items without
                          a preview, files already on disk and failed downloads all
                          count towards this limit, so fewer new files may be written.
        output_dir      : Path to save images (auto-generated from params if None)
        delay           : Seconds to wait after each download attempt (be polite to the server)

    Returns:
        List of Path objects for all *.jpg files in the output directory after
        download, or an empty list if the initial API check fails.
    """
    # Build output directory name from params if not given explicitly
    if output_dir is None:
        parts = [collection_name.replace(' ', '_')]
        if theme:
            parts.append(theme)
        if keyword:
            parts.append(keyword)
        output_dir = DATA_DIR / "images" / "_".join(parts)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Build API query
    qf    = [f'DATA_PROVIDER:"{collection_name}"', "TYPE:IMAGE"]
    query = keyword or "*"

    print(f"üì• {collection_name}")
    if theme:   print(f"   theme   = {theme}")
    if keyword: print(f"   keyword = {keyword}")
    print(f"   max     = {max_images}")
    print(f"   folder  ‚Üí {output_dir.name}/")
    print("=" * 60)

    # Check total available before starting
    check = search_europeana(query=query, rows=1, qf=qf, theme=theme, reusability="open")
    if not check or not check.get('success'):
        print("‚ùå Could not reach API")
        return []
    # .get: a successful response without the field should not abort the download
    total_available = check.get('totalResults', 0)
    print(f"‚úì {total_available:,} images available")
    print()

    # Paginate and download
    cursor     = "*"
    processed  = 0     # search results handled so far (drives the max_images limit)
    new_dl     = 0     # files actually written during this call
    failed     = 0     # download attempts that raised
    last_error = None

    while processed < max_images:
        batch  = min(100, max_images - processed)
        result = search_europeana(
            query=query, rows=batch, qf=qf, theme=theme,
            reusability="open", cursor=cursor
        )
        if not result or not result.get('items'):
            break

        for item in result['items']:
            if processed >= max_images:
                break

            preview_url = get_item_preview(item)
            if not preview_url:
                processed += 1
                continue

            item_id    = item.get('id', 'unknown').replace('/', '_')
            safe_title = sanitize_filename(get_item_title(item))[:40]
            filepath   = output_dir / f"{item_id}_{safe_title}.jpg"

            # Resume support: skip files that are already present and non-empty
            if filepath.exists() and filepath.stat().st_size > 0:
                processed += 1
                continue

            try:
                resp = requests.get(preview_url, timeout=20)
                resp.raise_for_status()
                if not resp.content:
                    # An empty body would create a 0-byte "image"; count it as a failure
                    raise ValueError("empty response body")
                with open(filepath, 'wb') as f:
                    f.write(resp.content)
                new_dl += 1
                if new_dl % 50 == 0:
                    print(f"  {new_dl} new images downloaded...")
            except Exception as exc:
                # Best effort: one bad thumbnail must not abort the whole run,
                # but record it and drop any partial file so a later run retries.
                failed += 1
                last_error = exc
                try:
                    filepath.unlink()
                except OSError:
                    pass  # nothing was written, or the file could not be removed

            # Pause after every attempt, successful or not
            time.sleep(delay)
            processed += 1

        cursor = result.get('nextCursor')
        if not cursor:
            break

    all_files = list(output_dir.glob("*.jpg"))
    existing  = len(all_files) - new_dl
    print(f"\n‚úì Done: {new_dl} new  +  {existing} already existed  =  {len(all_files)} total")
    if failed:
        print(f"   {failed} download(s) failed (last error: {last_error})")
    print(f"   {output_dir}")
    return all_files

In [None]:
# ============================================================
# CONFIGURATION — edit these values to switch collection / filter
# ============================================================

COLLECTION_NAME = "Museum of Gothenburg"  # DATA_PROVIDER name in Europeana
THEME           = "art"                   # "art", "photography", etc. — or None for all
SEARCH_KEYWORD  = None                    # extra keyword filter, e.g. "river" — or None for all
MAX_DOWNLOAD    = 200                     # how many images to download
DO_DOWNLOAD     = True                    # True = download now; False = only report what is on disk

# ============================================================
# Folder name and file paths are derived from the values above.
# Examples:
#   Museum of Gothenburg + art          →  Museum_of_Gothenburg_art/
#   Museum of Gothenburg + art + river  →  Museum_of_Gothenburg_art_river/
# ============================================================

# Collection name first, then whichever optional filters are set
_parts = [COLLECTION_NAME.replace(' ', '_')]
_parts.extend(part for part in (THEME, SEARCH_KEYWORD) if part)

_folder_name = "_".join(_parts)

COLLECTION_IMAGES_DIR      = DATA_DIR.joinpath("images", _folder_name)
COLLECTION_EMBEDDINGS_FILE = DATA_DIR.joinpath(
    "embeddings", _folder_name, f"{_folder_name}_clip_embeddings.npz"
)

# Create both target folders up front (no error if they already exist)
COLLECTION_IMAGES_DIR.mkdir(parents=True, exist_ok=True)
COLLECTION_EMBEDDINGS_FILE.parent.mkdir(parents=True, exist_ok=True)

print(f"Collection : {COLLECTION_NAME}")
print(f"Theme      : {THEME      or '(all)'}")
print(f"Keyword    : {SEARCH_KEYWORD or '(all)'}")
print(f"Folder     : {_folder_name}/")
print(f"Images     : {COLLECTION_IMAGES_DIR}")
print(f"Embeddings : {COLLECTION_EMBEDDINGS_FILE}")

### Run Download function

In [None]:
# Run the download, or just report what is already on disk.
# downloaded_files is bound on both paths so it exists whichever branch ran.
if DO_DOWNLOAD:
    downloaded_files = download_collection(
        collection_name = COLLECTION_NAME,
        theme           = THEME,
        keyword         = SEARCH_KEYWORD,
        max_images      = MAX_DOWNLOAD,
        output_dir      = COLLECTION_IMAGES_DIR,
    )
else:
    print("‚ÑπÔ∏è  Download skipped ‚Äî set DO_DOWNLOAD = True in the config cell to download.")
    existing = list(COLLECTION_IMAGES_DIR.glob("*.jpg"))
    # Same meaning as the download path: the *.jpg files currently in the folder
    downloaded_files = existing
    if existing:
        print(f"   Found {len(existing)} existing images in {COLLECTION_IMAGES_DIR.name}/")
    else:
        print(f"   No images yet in {COLLECTION_IMAGES_DIR.name}/")

---

## Summary

In this notebook, you learned:

1. **How to explore** Swedish cultural institutions in Europeana
2. **How to configure** a flexible download (collection, theme, keyword)
3. **How folder names** are auto-generated from your configuration
4. **How cursor-based pagination** lets you retrieve more than 100 results

### Next Steps

In **Notebook 03**, you will:
- Load the CLIP model
- Compute embeddings for your downloaded images
- Search the collection using natural language
