In [12]:
# 04_retrieve_metadata_from_apis.ipynb

import os
import random
import time
from pathlib import Path
from urllib.parse import quote

import pandas as pd
import requests
from tqdm import tqdm

# 1. Set your Rijksmuseum API Key
# Prefer the RIJKSMUSEUM_API_KEY environment variable so the key does not have
# to live in the notebook. The literal below is kept only as a fallback so
# existing runs keep working.
# NOTE(review): a key committed to source should be treated as leaked --
# rotate it and drop the fallback once the environment variable is set up.
RIJKSMUSEUM_API_KEY = os.environ.get("RIJKSMUSEUM_API_KEY") or "PWJQNj1Q"

# 2. Define the API endpoint (English-language collection search)
RIJKSMUSEUM_ENDPOINT = "https://www.rijksmuseum.nl/api/en/collection"

# 3. Define a function to search artworks
def search_rijksmuseum(query, max_items=10):
    """Search the Rijksmuseum collection and return basic artwork metadata.

    A single page of up to 100 results is requested at a randomly chosen
    page index, so repeated calls with the same query return different
    samples.

    Args:
        query: Free-text search term, sent as the ``q`` parameter.
        max_items: Maximum number of artworks to keep from the fetched page.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``artist``,
        ``longTitle``, ``image_url`` and ``description``. ``description``
        is always ``None`` here; ``image_url`` is ``None`` when the object
        has no web image. The list is shorter than ``max_items`` (possibly
        empty) when the fetched page holds fewer results.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Randomly choose a page between 1 and 100 (inclusive).
    # NOTE(review): the API documentation caps p * ps at 10,000, which
    # 100 pages of 100 items just reaches -- confirm before raising either.
    random_page = random.randint(1, 100)
    params = {
        "key": RIJKSMUSEUM_API_KEY,
        "format": "json",
        "q": query,
        "ps": 100,         # Page size (max 100)
        "p": random_page,  # Apply the random page
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(RIJKSMUSEUM_ENDPOINT, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    # `or []` also covers an explicit null, not just a missing key.
    art_objects = data.get("artObjects") or []
    artworks = []

    for art_obj in tqdm(art_objects[:max_items]):
        # "webImage" can be present with a null value; dict.get's default
        # only applies to a missing key, so normalise None to {} first.
        web_image = art_obj.get("webImage") or {}
        artworks.append(
            {
                "id": art_obj.get("objectNumber"),
                "title": art_obj.get("title"),
                "artist": art_obj.get("principalOrFirstMaker"),
                "longTitle": art_obj.get("longTitle"),
                "image_url": web_image.get("url"),
                "description": None,  # will fill later if available
            }
        )

    return artworks

# 4. Optional: function to fetch detailed description (if available)
def fetch_artwork_details(object_number):
    """Fetch the label description of a single artwork.

    Args:
        object_number: Rijksmuseum object number identifying the artwork
            (the ``objectNumber`` value returned by the collection search).

    Returns:
        The ``artObject.label.description`` string from the details
        response, or ``None`` when any level of that path is missing or
        null.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Escape the identifier so characters such as spaces, slashes or
    # parentheses cannot alter the URL path.
    details_url = f"{RIJKSMUSEUM_ENDPOINT}/{quote(str(object_number), safe='')}"
    params = {
        "key": RIJKSMUSEUM_API_KEY,
        "format": "json",
    }
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(details_url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    # Each level may be present but null; dict.get's default would not
    # help there, so fall back to an empty dict explicitly.
    art_object = data.get("artObject") or {}
    label = art_object.get("label") or {}
    return label.get("description")

# 5. Collect artworks
search_query = "painting"  # You can change this later
artworks = search_rijksmuseum(search_query, max_items=10)

# 6. Remove duplicate artworks based on unique ID (first occurrence wins)
seen_ids = set()
unique_artworks = []

for art in artworks:
    if art["id"] not in seen_ids:
        unique_artworks.append(art)
        seen_ids.add(art["id"])

# Replace original list
artworks = unique_artworks

# 7. Fetch detailed descriptions if available
for art in tqdm(artworks):
    if not art["id"]:
        continue
    try:
        art["description"] = fetch_artwork_details(art["id"])
    except Exception as e:
        # Best effort: one failed lookup must not abort the whole run.
        print(f"Failed to fetch details for {art['id']}:", e)
    finally:
        # Be gentle to the API -- pause after failures too, since errors
        # are exactly when the service may be asking us to slow down.
        time.sleep(0.5)

# 8. Convert to DataFrame
df = pd.DataFrame(artworks)

# 9. Save to JSON (path is relative to the current working directory)
output_path = Path("data/real_museum_metadata.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_json(output_path, orient="records", indent=4)

# Report the path that was actually written.
print(f"✅ Successfully saved to '{output_path}'")



100%|████████████████████████████████████████| 10/10 [00:00<00:00, 84733.41it/s]
100%|███████████████████████████████████████████| 10/10 [00:09<00:00,  1.03it/s]

✅ Successfully saved to '../data/real_museum_metadata.json'



