In [None]:
!pip install azure-storage-blob requests tqdm
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from azure.storage.blob import BlobServiceClient
from tqdm import tqdm

# Configuration
# NOTE: the values below are placeholders; fill them in before running and
# avoid committing real credentials alongside the notebook.
# Free-text query sent to the API as the `q` parameter.
SEARCH_QUERY = "Your query"
# API key sent to the API as the `api_key` parameter.
SMITHSONIAN_API_KEY = "YOUR_SMITHSONIAN_API_KEY"
# Search endpoint queried by fetch_smithsonian_metadata().
API_URL = "https://api.si.edu/openaccess/api/v1.0/search"
# Page size: sent as `rows` and used as the `start` increment between pages.
RESULTS_PER_REQUEST = 1000
# Number of worker threads used by main() to process items in parallel.
MAX_WORKERS = 4

# Azure Configuration
# Connection string used to build the BlobServiceClient below.
connection_string = "your azure connection string"
# Container that upload_to_blob() writes every blob into.
container_name = "your azure blob container name"

# Initialize Azure client
# Created once at module level and shared by every upload_to_blob() call.
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

def sanitize_filename(filename, is_json=False):
    """Turn an arbitrary value into a blob name with a fixed extension.

    Spaces become underscores, every character other than ASCII letters,
    digits, ``_`` and ``-`` is dropped, and the stem is truncated to 150
    characters before the extension is appended.

    Args:
        filename: Value to derive the name from; converted with ``str()``.
        is_json: When true the name ends in ``.json``, otherwise ``.jpg``.

    Returns:
        The sanitized blob name, including its extension.
    """
    underscored = str(filename).replace(' ', '_')
    stem = re.sub(r'[^a-zA-Z0-9_-]', '', underscored)[:150]
    extension = "json" if is_json else "jpg"
    return f"{stem}.{extension}"

def upload_to_blob(data, filename, is_json=False):
    """Upload *data* to the configured container, overwriting any existing blob.

    The blob name is derived from *filename* via ``sanitize_filename``.

    Args:
        data: Payload handed to ``upload_blob``.
        filename: Raw name to sanitize into the blob name.
        is_json: Forwarded to ``sanitize_filename`` to pick the extension.

    Returns:
        True when the upload call completed, False when it raised (the error
        is printed, truncated to 200 characters).
    """
    target_name = sanitize_filename(filename, is_json)
    client = blob_service_client.get_blob_client(container=container_name, blob=target_name)
    try:
        client.upload_blob(data, overwrite=True)
    except Exception as err:
        print(f"Upload failed for {target_name}: {str(err)[:200]}")
        return False
    else:
        return True

def fetch_smithsonian_metadata():
    """Page through the search API and collect every returned row.

    Requests ``RESULTS_PER_REQUEST`` rows at a time, advancing ``start`` after
    each page and sleeping 0.5 s between requests. Paging stops when the
    reported ``rowCount`` (capped at 50,000) has been reached, when a short
    page is returned, when the response does not have the expected
    ``{"response": {"rows": [...]}}`` shape, or when a request fails.

    Returns:
        A list of row objects as returned by the API (at most 50,000). On
        error the rows collected so far are returned; the list may be empty.
    """
    max_items = 50000  # Safety cap on how many rows a single run may collect

    params = {
        'api_key': SMITHSONIAN_API_KEY,
        'q': SEARCH_QUERY,
        'rows': RESULTS_PER_REQUEST,
        'start': 0,
        'fq': 'online_media_type:Images AND media_usage:CC0'
    }

    all_items = []
    total_results = None

    with tqdm(desc="Fetching Smithsonian metadata") as pbar:
        while True:
            try:
                response = requests.get(API_URL, params=params, timeout=60)
                response.raise_for_status()
                data = response.json()

                # Validate the whole path explicitly: a missing or non-dict
                # `response`, or a missing `rows`, is a format problem and
                # should be reported as such rather than as a bare KeyError.
                payload = data.get('response') if isinstance(data, dict) else None
                items = payload.get('rows') if isinstance(payload, dict) else None
                if not isinstance(items, list):
                    print("Unexpected API response format")
                    break

                if total_results is None:
                    # Only the first page's rowCount sizes the progress bar.
                    total_results = min(payload.get('rowCount', 0), max_items)
                    pbar.total = total_results

                all_items.extend(items)
                pbar.update(len(items))

                # A short page means the server has nothing more to give.
                if len(all_items) >= total_results or len(items) < RESULTS_PER_REQUEST:
                    break

                params['start'] += RESULTS_PER_REQUEST
                time.sleep(0.5)  # Be polite to the API between pages

            except Exception as e:
                # Best effort: keep whatever was fetched before the failure.
                print(f"Error fetching data: {str(e)[:200]}")
                break

    # The final page may overshoot the cap; enforce it on the result.
    return all_items[:max_items]

def safe_get(data, *keys, default=None):
    """Follow *keys* through nested containers, returning *default* on a miss.

    Each key is applied with ``[]`` in turn, so both mapping keys and sequence
    indexes work. With no keys, *data* itself is returned.

    Args:
        data: Root object to navigate.
        *keys: Keys or indexes to apply in order.
        default: Value returned when any step cannot be resolved.

    Returns:
        The value found at the end of the path, or *default* when a step hits
        a missing key, an out-of-range index, or an object that does not
        support subscripting.
    """
    current = data
    try:
        for key in keys:
            current = current[key]
    except (TypeError, LookupError, AttributeError):
        # LookupError covers KeyError (dicts) and IndexError (lists), so an
        # out-of-range list index no longer escapes as an exception.
        return default
    return current

def _find_image_url(item):
    """Return the preferred CC0 image URL for *item*, or None if there is none.

    A URL containing ``ids.si.edu`` wins; otherwise the last non-empty
    candidate is used.
    """
    candidates = []

    # Flat layout: `content` is a list of dicts carrying `type`, `usage` and
    # `url` directly.
    flat_entries = safe_get(item, 'content', default=[])
    if isinstance(flat_entries, list):
        for entry in flat_entries:
            if (isinstance(entry, dict)
                    and entry.get('type') == 'Images'
                    and entry.get('usage') == 'CC0'):
                candidates.append(entry.get('url'))

    # Nested layout: content.descriptiveNonRepeating.online_media.media[],
    # each entry holding the URL under `content` (or `thumbnail`) and the
    # licence under usage.access.
    # NOTE(review): layout taken from the Smithsonian Open Access API docs;
    # confirm against a live response.
    media_entries = safe_get(
        item, 'content', 'descriptiveNonRepeating', 'online_media', 'media',
        default=[],
    )
    if isinstance(media_entries, list):
        for media in media_entries:
            if (isinstance(media, dict)
                    and media.get('type') == 'Images'
                    and safe_get(media, 'usage', 'access') == 'CC0'):
                candidates.append(media.get('content') or media.get('thumbnail'))

    # Drop empty/non-string values so a later blank entry cannot overwrite an
    # earlier usable URL.
    usable = [url for url in candidates if isinstance(url, str) and url]
    for url in usable:
        if 'ids.si.edu' in url:
            return url
    return usable[-1] if usable else None


def process_smithsonian_item(item):
    """Upload one item's metadata JSON and CC0 image to blob storage.

    The image is downloaded before anything is uploaded, so an item whose
    image cannot be fetched leaves no metadata-only blob behind. Blobs are
    named ``{id}_metadata`` and ``{id}_{title}`` (sanitized by
    ``upload_to_blob``).

    Args:
        item: One row from the search response; anything that is not a dict
            is rejected.

    Returns:
        True when both the metadata and the image were uploaded; False when
        the item has no usable image URL, the download fails, either upload
        fails, or any unexpected error occurs (errors are printed).
    """
    item_id = ''  # Bound up front so the outer handler can always report it
    try:
        if not isinstance(item, dict):
            return False

        item_id = safe_get(item, 'id', default='')
        title = safe_get(item, 'title', default='Untitled')

        # `name` may be a list of dicts or a single dict; take the first
        # display name either way.
        creator = ''
        name_data = safe_get(item, 'name', default=[])
        if isinstance(name_data, list) and name_data:
            creator = safe_get(name_data[0], 'display', default='')
        elif isinstance(name_data, dict):
            creator = safe_get(name_data, 'display', default='')

        # NOTE(review): these descriptive fields are read from the top level
        # of the row; verify that is where the API actually puts them.
        metadata = {
            "title": title,
            "creator": creator,
            "date": safe_get(item, 'date', default=''),
            "culture": safe_get(item, 'culture', default=''),
            "medium": safe_get(item, 'medium', default=''),
            "collection": safe_get(item, 'data_source', default=''),
            "object_type": safe_get(item, 'object_type', default=''),
            "credit_line": safe_get(item, 'credit_line', default=''),
            "url": f"https://www.si.edu/object/{item_id}"
        }

        image_url = _find_image_url(item)
        if not image_url:
            return False

        # Fetch the image first: it is the step most likely to fail, and
        # failing here avoids uploading metadata for an item with no image.
        try:
            response = requests.get(image_url, timeout=60)
            response.raise_for_status()
            image_bytes = response.content
        except Exception as e:
            print(f"Image download failed for {item_id}: {str(e)[:200]}")
            return False

        metadata_bytes = json.dumps(metadata, indent=2).encode('utf-8')
        if not upload_to_blob(metadata_bytes, f"{item_id}_metadata", is_json=True):
            return False

        return bool(upload_to_blob(image_bytes, f"{item_id}_{title}"))

    except Exception as e:
        print(f"Error processing {item_id}: {str(e)[:200]}")
        return False

def main():
    """Run the interactive fetch, confirm and upload pipeline.

    Fetches all matching rows, asks the user to confirm, then processes the
    rows on a pool of ``MAX_WORKERS`` threads and prints how many succeeded.
    Returns early (without uploading) when nothing was found or the user
    answers anything other than ``y``.
    """
    print(f"🔍 Searching Smithsonian for '{SEARCH_QUERY}'...")
    items = fetch_smithsonian_metadata()

    if not items:
        print("❌ No items found")
        return

    print(f"\n✅ Found {len(items)} items")
    confirm = input("Proceed with processing and upload? (y/n): ")

    # Tolerate stray whitespace around the answer (e.g. "y " or " Y").
    if confirm.strip().lower() != 'y':
        print("🚫 Operation cancelled")
        return

    print(f"⏳ Processing {len(items)} items...")
    success_count = 0

    # Process items in parallel; results are tallied as each future finishes
    # so the progress bar advances in completion order.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(process_smithsonian_item, item) for item in items]
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            # result() is True/False; bool() keeps the tally an exact count.
            success_count += bool(future.result())

    print(f"\n🏁 Completed! Successfully processed {success_count}/{len(items)} items")
    print(f"📦 Files uploaded to Azure Blob Storage container: {container_name}")

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

🔍 Searching Smithsonian for 'bust'...


Fetching Smithsonian metadata: 100%|██████████| 9289/9289 [01:02<00:00, 147.92it/s]



✅ Found 9289 items
Proceed with processing and upload? (y/n): y
⏳ Processing 9289 items...


Processing: 100%|██████████| 9289/9289 [00:00<00:00, 186640.78it/s]


🏁 Completed! Successfully processed 0/9289 items
📦 Files uploaded to Azure Blob Storage container: smithsonianbust



