In [10]:
import requests
import json
import time
from datetime import datetime

# Fetch the textile object-ID list from the search endpoint, retrying failed attempts,
# then persist the IDs to textile_object_ids.json.
# NOTE(review): `search_url` and `params` are not defined in this cell; they must already
# exist in the kernel from an earlier cell — confirm before a Restart & Run All.
max_retries = 5
retry_delay = 2  # seconds
request_timeout = 30  # seconds; without a timeout requests can wait on a stalled server forever

last_failure = "no attempt made"
for attempt in range(max_retries):
    try:
        response = requests.get(search_url, params=params, timeout=request_timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are retried just like a non-200 status.
        last_failure = f"error {exc}"
    else:
        if response.status_code == 200:
            break
        last_failure = f"status {response.status_code}"
    print(f"Attempt {attempt + 1} failed with {last_failure}. Retrying in {retry_delay} seconds...")
    time.sleep(retry_delay)
else:
    # Reached only when the loop never hit `break`, i.e. every attempt failed.
    raise RuntimeError(f"Failed to fetch data after {max_retries} attempts. Last failure: {last_failure}")

search_data = response.json()
# "objectIDs" may be missing or null (e.g. a search with no matches); treat both as an
# empty list so len() and the JSON dump below still work.
object_ids = search_data.get("objectIDs") or []
print(f"Found {len(object_ids)} textile objects")

with open("textile_object_ids.json", "w") as f:
    json.dump(object_ids, f)
print("Object IDs saved to textile_object_ids.json")


Attempt 1 failed with status 403. Retrying in 2 seconds...
Attempt 2 failed with status 403. Retrying in 2 seconds...
Attempt 3 failed with status 403. Retrying in 2 seconds...
Found 33437 textile objects
Object IDs saved to textile_object_ids.json


In [None]:
# Round-trip check: read the saved ID file back and compare it with the in-memory list.
with open("textile_object_ids.json", "r") as id_file:
    loaded_object_ids = json.load(id_file)

loaded_total = len(loaded_object_ids)
print(f"Loaded {loaded_total} object IDs from JSON.")

# `object_ids` is the list produced by the cell that ran the search.
ids_round_trip = loaded_object_ids == object_ids
print("Verification:", ids_round_trip)


Loaded 33437 object IDs from JSON.
Verification: True


In [13]:
# Collect the "objectID" of every record in the batch (order and duplicates preserved),
# then check whether the batch covers exactly the same set of IDs as the saved list.
_extracted = []
for record in batch_object_ids:
    _extracted.append(record["objectID"])
batch_object_ids_extracted = _extracted

print(f"Extracted {len(batch_object_ids_extracted)} object IDs from batch.")
same_id_set = set(batch_object_ids_extracted) == set(loaded_object_ids)
print("Match with loaded_object_ids:", same_id_set)


Extracted 22054 object IDs from batch.
Match with loaded_object_ids: False


In [16]:
import json

# Compare one downloaded batch file against the complete ID list and report progress.

# Load the complete object ID list
with open("textile_object_ids.json", "r") as f:
    all_object_ids = json.load(f)

total_objects = len(all_object_ids)
print(f"Total textile objects: {total_objects}")

# Load your downloaded JSON file
json_file = 'met_textiles_batch_22800_20250705_134702.json'

# Only the file read is guarded, and only against I/O and parse errors, so that a bug in
# the analysis below surfaces as a traceback instead of being reported as a load error.
try:
    with open(json_file, 'r', encoding='utf-8') as f:
        downloaded_data = json.load(f)
except (OSError, ValueError) as e:  # ValueError covers JSONDecodeError and bad UTF-8
    print(f"❌ Error loading JSON file: {e}")
else:
    downloaded_object_ids = set()
    objects_with_images = 0  # counts records, so a repeated record is counted again

    for obj in downloaded_data:
        # Skip anything that is not an object record (e.g. null placeholders).
        if isinstance(obj, dict) and 'objectID' in obj:
            downloaded_object_ids.add(obj['objectID'])
            if obj.get('primaryImage'):
                objects_with_images += 1

    print(f"Objects in your JSON file: {len(downloaded_object_ids)}")
    print(f"Objects with images: {objects_with_images}")

    # Calculate what's left
    all_object_ids_set = set(all_object_ids)
    remaining_ids = all_object_ids_set - downloaded_object_ids
    completed_ids = all_object_ids_set & downloaded_object_ids

    # An empty ID list would otherwise raise ZeroDivisionError.
    progress_pct = len(completed_ids) / total_objects * 100 if total_objects else 0.0

    print(f"\n=== PROGRESS SUMMARY ===")
    print(f"✅ Downloaded: {len(completed_ids)}")
    print(f"⏳ Remaining: {len(remaining_ids)}")
    print(f"📊 Progress: {progress_pct:.1f}%")

    if remaining_ids:
        print(f"\n🎯 You have {len(remaining_ids)} objects left to download")
        print(f"📋 Sample remaining IDs: {sorted(remaining_ids)[:10]}")
    else:
        print(f"\n🎉 All objects downloaded!")

Total textile objects: 33437
Objects in your JSON file: 22104
Objects with images: 17749

=== PROGRESS SUMMARY ===
✅ Downloaded: 22104
⏳ Remaining: 11333
📊 Progress: 66.1%

🎯 You have 11333 objects left to download
📋 Sample remaining IDs: [13737, 13740, 13748, 13795, 13798, 13801, 14054, 14056, 14081, 14086]


In [17]:
import json

# Combine the forward and reverse download files, report their overlap, and compute how
# much of the complete ID list has been downloaded.


def _load_batch(path, label):
    """Parse the JSON file at ``path``.

    On an I/O or parse error, print a message tagged with ``label`` and return None.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError) as e:  # ValueError covers JSONDecodeError and bad UTF-8
        print(f"❌ Error loading {label} file: {e}")
        return None


def _index_batch(records):
    """Return ``(ids, ids_with_images)`` as sets of objectIDs found in ``records``.

    Entries that are not dicts carrying an 'objectID' key are skipped; anything other
    than a list yields two empty sets. An ID is in ``ids_with_images`` when at least
    one of its records has a truthy 'primaryImage'.
    """
    ids, ids_with_images = set(), set()
    if not isinstance(records, list):
        return ids, ids_with_images
    for obj in records:
        if isinstance(obj, dict) and 'objectID' in obj:
            ids.add(obj['objectID'])
            if obj.get('primaryImage'):
                ids_with_images.add(obj['objectID'])
    return ids, ids_with_images


# Load the complete object ID list
with open("textile_object_ids.json", "r") as f:
    all_object_ids = json.load(f)

total_objects = len(all_object_ids)
print(f"Total textile objects: {total_objects}")

# Load both downloaded JSON files
forward_file = 'met_textiles_batch_22800_20250705_134702.json'
reverse_file = 'idun/met_textiles_batch_11988_20250705_134921.json'

print("\n=== CHECKING FORWARD DOWNLOAD ===")
forward_data = _load_batch(forward_file, "forward")
forward_ids, forward_image_ids = _index_batch(forward_data)
forward_count = len(forward_ids)
forward_images = len(forward_image_ids)
if forward_data is not None:
    print(f"✅ Forward file: {forward_count} objects, {forward_images} with images")

print("\n=== CHECKING REVERSE DOWNLOAD ===")
reverse_data = _load_batch(reverse_file, "reverse")
reverse_ids, reverse_image_ids = _index_batch(reverse_data)
reverse_count = len(reverse_ids)
reverse_images = len(reverse_image_ids)
if reverse_data is not None:
    print(f"✅ Reverse file: {reverse_count} objects, {reverse_images} with images")

# Set unions make every total a count of unique object IDs, so an object present in
# both files (or repeated inside one file) is never counted twice.
all_downloaded_ids = forward_ids | reverse_ids
total_objects_with_images = len(forward_image_ids | reverse_image_ids)

# Check for overlap/duplicates
# Always computed from this run's data, so the summary below cannot pick up an `overlap`
# variable left in the kernel by an earlier cell or run.
overlap = forward_ids & reverse_ids
if forward_count > 0 and reverse_count > 0:
    print(f"\n=== OVERLAP ANALYSIS ===")
    print(f"🔄 Duplicate objects: {len(overlap)}")
    if overlap:
        print(f"📋 Sample duplicates: {sorted(overlap)[:10]}")

# Calculate final progress
all_object_ids_set = set(all_object_ids)
remaining_ids = all_object_ids_set - all_downloaded_ids
completed_ids = all_object_ids_set & all_downloaded_ids

# An empty ID list would otherwise raise ZeroDivisionError.
progress_pct = len(completed_ids) / total_objects * 100 if total_objects else 0.0

print(f"\n=== FINAL PROGRESS SUMMARY ===")
print(f"📁 Forward file objects: {forward_count}")
print(f"📁 Reverse file objects: {reverse_count}")
print(f"🔄 Duplicate objects: {len(overlap)}")
print(f"✅ Total unique downloaded: {len(all_downloaded_ids)}")
print(f"🖼️  Total objects with images: {total_objects_with_images}")
print(f"⏳ Remaining to download: {len(remaining_ids)}")
print(f"📊 Overall progress: {progress_pct:.1f}%")

if remaining_ids:
    print(f"\n🎯 You still need {len(remaining_ids)} objects")
    print(f"📋 Sample remaining IDs: {sorted(remaining_ids)[:10]}")
else:
    print(f"\n🎉 ALL OBJECTS DOWNLOADED! 🎉")

Total textile objects: 33437

=== CHECKING FORWARD DOWNLOAD ===
✅ Forward file: 22104 objects, 17749 with images

=== CHECKING REVERSE DOWNLOAD ===
✅ Reverse file: 20497 objects, 16850 with images

=== OVERLAP ANALYSIS ===
🔄 Duplicate objects: 10143
📋 Sample duplicates: [13561, 13562, 13794, 13796, 13799, 13800, 13838, 14186, 21466, 21946]

=== FINAL PROGRESS SUMMARY ===
📁 Forward file objects: 22104
📁 Reverse file objects: 20497
🔄 Duplicate objects: 10143
✅ Total unique downloaded: 32458
🖼️  Total objects with images: 26624
⏳ Remaining to download: 979
📊 Overall progress: 97.1%

🎯 You still need 979 objects
📋 Sample remaining IDs: [21107, 21919, 22276, 22739, 22874, 24007, 24647, 25030, 28904, 30925]


In [18]:
import requests
import json
import time
from datetime import datetime

# Work out which object IDs are still missing: read the master ID list, gather every ID
# already present in the forward and reverse download files, and take the difference.

# Master list of every textile object ID
with open("textile_object_ids.json", "r") as id_list_handle:
    all_object_ids = json.load(id_list_handle)

# The two partial downloads produced so far
forward_file = 'met_textiles_batch_22800_20250705_134702.json'
reverse_file = 'idun/met_textiles_batch_11988_20250705_134921.json'

all_downloaded_ids = set()

# Forward file: keep only entries that are dicts carrying an 'objectID'
try:
    with open(forward_file, 'r', encoding='utf-8') as forward_handle:
        forward_data = json.load(forward_handle)
    all_downloaded_ids.update(
        entry['objectID']
        for entry in forward_data
        if isinstance(entry, dict) and 'objectID' in entry
    )
    print(f"Loaded {len(all_downloaded_ids)} IDs from forward file")
except Exception as e:
    print(f"Error loading forward file: {e}")

# Reverse file: same filter, merged into the same set so duplicates collapse
try:
    with open(reverse_file, 'r', encoding='utf-8') as reverse_handle:
        reverse_data = json.load(reverse_handle)
    all_downloaded_ids.update(
        entry['objectID']
        for entry in reverse_data
        if isinstance(entry, dict) and 'objectID' in entry
    )
    print(f"Total unique IDs after reverse: {len(all_downloaded_ids)}")
except Exception as e:
    print(f"Error loading reverse file: {e}")

# Everything in the master list that neither file contains
all_object_ids_set = set(all_object_ids)
remaining_ids = list(all_object_ids_set - all_downloaded_ids)

print(f"Found {len(remaining_ids)} remaining objects to download")

# Function to get object details with retry logic
def get_object_details(object_id, max_retries=5, timeout=30):
    """Fetch one object's record from the Met Collection API, retrying on failure.

    Args:
        object_id: ID interpolated into the ``/objects/{object_id}`` URL.
        max_retries: Maximum number of requests to make before giving up.
        timeout: Seconds handed to ``requests.get``; without it a stalled connection
            would block the whole download loop indefinitely.

    Returns:
        A ``(data, status)`` tuple: ``(parsed_json, "success")`` on HTTP 200,
        ``(None, "not_found")`` on HTTP 404, and ``(None, "failed_after_retries")``
        once every attempt has failed.
    """
    detail_url = f"https://collectionapi.metmuseum.org/public/collection/v1/objects/{object_id}"

    for attempt in range(max_retries):
        try:
            response = requests.get(detail_url, timeout=timeout)
            if response.status_code == 200:
                return response.json(), "success"
            elif response.status_code == 403:
                # This notebook treats 403 as rate limiting, so back off for longer.
                print(f"Rate limited on object {object_id}, waiting 10 seconds...")
                time.sleep(10)
                continue
            elif response.status_code == 404:
                # A missing object will not appear on retry; report it immediately.
                return None, "not_found"
            else:
                print(f"Failed to get object {object_id}: {response.status_code}")
                time.sleep(2)
                continue
        except (requests.RequestException, ValueError) as e:
            # Network errors/timeouts, or a 200 response whose body is not valid JSON.
            # Other exceptions are programming errors and are allowed to propagate.
            print(f"Error getting object {object_id}: {e}")
            time.sleep(2)

    return None, "failed_after_retries"

# Download remaining objects
# Walks `remaining_ids`, fetches each record with get_object_details(), sorts the outcome
# into downloaded / not-found / failed lists, and writes a per-object text log.
downloaded_objects = []
failed_ids = []
not_found_ids = []
batch_size = 50  # a checkpoint file is written after every `batch_size` processed IDs

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"remaining_download_log_{timestamp}.txt"

print(f"Starting download of {len(remaining_ids)} remaining objects...")
print(f"Log file: {log_file}")

start_time = time.time()

# The log lines contain emoji, so the encoding is pinned to UTF-8 instead of being left
# to the platform default, which may not be able to encode them.
with open(log_file, 'w', encoding='utf-8') as log:
    log.write(f"Download started at {datetime.now()}\n")
    log.write(f"Remaining objects to download: {len(remaining_ids)}\n\n")

    for i, object_id in enumerate(remaining_ids):
        # Progress line every 10 objects, flushed so the log is readable mid-run.
        if i % 10 == 0:
            elapsed = time.time() - start_time
            rate = i / elapsed if elapsed > 0 else 0
            progress_msg = f"Progress: {i+1}/{len(remaining_ids)} ({(i+1)/len(remaining_ids)*100:.1f}%) - Rate: {rate:.1f} req/sec"
            print(progress_msg)
            log.write(f"{progress_msg}\n")
            log.flush()

        object_data, status = get_object_details(object_id)

        if status == "success" and object_data:
            downloaded_objects.append(object_data)
            log.write(f"✅ {object_id}: Downloaded successfully\n")
        elif status == "not_found":
            not_found_ids.append(object_id)
            log.write(f"❌ {object_id}: Not found (404)\n")
        else:
            failed_ids.append(object_id)
            log.write(f"💥 {object_id}: Failed after retries\n")

        # Save progress every batch_size objects
        # Each checkpoint is cumulative: it holds everything downloaded so far.
        if (i + 1) % batch_size == 0:
            batch_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"remaining_textiles_batch_{i+1}_{batch_timestamp}.json"
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(downloaded_objects, f, indent=2, ensure_ascii=False)
            log.write(f"💾 Saved {len(downloaded_objects)} objects to {filename}\n")
            print(f"Saved batch to {filename}")

        # Slower rate for remaining objects
        time.sleep(0.2)  # 5 requests per second

    # Final summary
    total_time = time.time() - start_time
    # With nothing left to download the ratio would divide by zero; report 0% instead.
    success_rate = len(downloaded_objects) / len(remaining_ids) * 100 if remaining_ids else 0.0
    log.write(f"\n=== FINAL SUMMARY ===\n")
    log.write(f"Total time: {total_time/60:.1f} minutes\n")
    log.write(f"Successfully downloaded: {len(downloaded_objects)}\n")
    log.write(f"Not found (404): {len(not_found_ids)}\n")
    log.write(f"Failed after retries: {len(failed_ids)}\n")
    log.write(f"Success rate: {success_rate:.1f}%\n")

# Save final results
final_filename = f"remaining_textiles_complete_{timestamp}.json"
with open(final_filename, 'w', encoding='utf-8') as f:
    json.dump(downloaded_objects, f, indent=2, ensure_ascii=False)

# Save failed and not found IDs
with open(f"not_found_ids_{timestamp}.json", 'w') as f:
    json.dump(not_found_ids, f)

with open(f"failed_ids_{timestamp}.json", 'w') as f:
    json.dump(failed_ids, f)

# Final summary
print(f"\n🎉 REMAINING DOWNLOAD COMPLETE!")
print(f"⏱️  Total time: {(time.time() - start_time)/60:.1f} minutes")
print(f"✅ Successfully downloaded: {len(downloaded_objects)}")
print(f"❌ Not found (404): {len(not_found_ids)}")
print(f"💥 Failed after retries: {len(failed_ids)}")
print(f"📊 Success rate: {success_rate:.1f}%")
print(f"📁 Final file: {final_filename}")
print(f"📝 Log file: {log_file}")

# Create summary JSON
summary = {
    "timestamp": timestamp,
    "total_remaining": len(remaining_ids),
    "successfully_downloaded": len(downloaded_objects),
    "not_found_404": len(not_found_ids),
    "failed_after_retries": len(failed_ids),
    "success_rate_percent": success_rate,
    "download_time_minutes": (time.time() - start_time)/60,
    "files_created": {
        "main_data": final_filename,
        "log": log_file,
        "not_found_ids": f"not_found_ids_{timestamp}.json",
        "failed_ids": f"failed_ids_{timestamp}.json"
    }
}

with open(f"remaining_download_summary_{timestamp}.json", 'w') as f:
    json.dump(summary, f, indent=2)

print(f"📋 Summary saved: remaining_download_summary_{timestamp}.json")

Loaded 22104 IDs from forward file
Total unique IDs after reverse: 32458
Found 979 remaining objects to download
Starting download of 979 remaining objects...
Log file: remaining_download_log_20250705_135732.txt
Progress: 1/979 (0.1%) - Rate: 0.0 req/sec
Progress: 11/979 (1.1%) - Rate: 2.0 req/sec
Progress: 21/979 (2.1%) - Rate: 2.1 req/sec
Progress: 31/979 (3.2%) - Rate: 2.1 req/sec
Rate limited on object 226347, waiting 10 seconds...
Rate limited on object 226347, waiting 10 seconds...
Rate limited on object 226347, waiting 10 seconds...
Rate limited on object 226347, waiting 10 seconds...
Rate limited on object 223360, waiting 10 seconds...
Progress: 41/979 (4.2%) - Rate: 0.6 req/sec
Saved batch to remaining_textiles_batch_50_20250705_135846.json
Progress: 51/979 (5.2%) - Rate: 0.7 req/sec
Rate limited on object 841930, waiting 10 seconds...
Rate limited on object 841930, waiting 10 seconds...
Rate limited on object 841930, waiting 10 seconds...
Rate limited on object 841930, waitin

In [21]:
import requests
import time
from datetime import datetime
import json

def search_with_retry(search_params, search_name, max_retries=5, timeout=30):
    """Run one search against the Met Collection API, retrying on failure.

    Args:
        search_params: Query parameters sent to the search endpoint.
        search_name: Label used only in the progress/error messages.
        max_retries: Maximum number of requests to make before giving up.
        timeout: Seconds handed to ``requests.get``; without it a stalled connection
            would hang the cell indefinitely.

    Returns:
        ``(total, object_ids)`` from the response. ``object_ids`` is always a list: a
        missing or null "objectIDs" field becomes ``[]``. ``(0, [])`` is also returned
        when every attempt fails.
    """
    search_url = "https://collectionapi.metmuseum.org/public/collection/v1/search"

    for attempt in range(max_retries):
        try:
            response = requests.get(search_url, params=search_params, timeout=timeout)

            if response.status_code == 200:
                data = response.json()
                # `or` also normalizes an explicit null, which .get()'s default would not.
                return data.get("total") or 0, data.get("objectIDs") or []
            elif response.status_code == 403:
                # This notebook treats 403 as rate limiting, so back off for longer.
                print(f"Rate limited for '{search_name}', waiting 10 seconds...")
                time.sleep(10)
                continue
            else:
                print(f"Error {response.status_code} for '{search_name}', retrying...")
                time.sleep(2)
                continue

        except (requests.RequestException, ValueError) as e:
            # Network errors/timeouts, or a 200 response whose body is not valid JSON.
            print(f"Exception for '{search_name}': {e}, retrying...")
            time.sleep(2)

    print(f"Failed to get data for '{search_name}' after {max_retries} attempts")
    return 0, []

# Define search configurations
# Each search differs only in its display name and `medium` filter; the query is always
# "*". The list of dicts is built from (name, medium) pairs.
_medium_filters = [
    ("Textiles Only", "Textiles"),
    ("Tapestries Only", "Tapestries"),
    ("Textiles + Tapestries (Proper Way)", "Textiles|Tapestries"),  # Using | operator
    ("All Textile-related", "Textiles|Tapestries|Embroidery|Lace"),
]

searches = [
    {"name": label, "params": {"medium": medium, "q": "*"}}
    for label, medium in _medium_filters
]

# Run every configured search, compare the Textiles and Tapestries result sets with the
# combined "Textiles|Tapestries" search, and save a JSON summary of the findings.
print("=== PROPER TEXTILE SEARCH COMPARISON ===")
print(f"Search started at: {datetime.now()}")
print()

results = {}

for search_config in searches:
    search_name = search_config["name"]
    search_params = search_config["params"]

    print(f"🔍 Searching: {search_name}")
    print(f"   Parameters: {search_params}")

    count, object_ids = search_with_retry(search_params, search_name)
    results[search_name] = {
        "count": count,
        # The search helper may hand back None/empty for the ID list; store a set either way.
        "object_ids": set(object_ids) if object_ids else set(),
        "params": search_params
    }
    print(f"   ✅ Found: {count:,} objects")
    print()

    # Small delay between searches
    time.sleep(2)

print(f"=== RESULTS SUMMARY ===")
for search_name, data in results.items():
    print(f"{search_name}: {data['count']:,} objects")

# Calculate overlaps and differences
textiles_ids = results.get("Textiles Only", {}).get("object_ids", set())
tapestries_ids = results.get("Tapestries Only", {}).get("object_ids", set())
combined_ids = results.get("Textiles + Tapestries (Proper Way)", {}).get("object_ids", set())

# Start from empty sets so the saved summary below reads this run's values (or zeros)
# and never a same-named variable left in the kernel by another cell.
overlap = set()
textiles_only = set()
tapestries_only = set()
theoretical_union = set()

print(f"\n=== DETAILED OVERLAP ANALYSIS ===")

if textiles_ids and tapestries_ids:
    overlap = textiles_ids & tapestries_ids
    textiles_only = textiles_ids - tapestries_ids
    tapestries_only = tapestries_ids - textiles_ids

    print(f"📊 Textiles only: {len(textiles_only):,}")
    print(f"📊 Tapestries only: {len(tapestries_only):,}")
    print(f"📊 Overlap (objects in both): {len(overlap):,}")

    theoretical_union = textiles_ids | tapestries_ids
    print(f"📊 Theoretical union: {len(theoretical_union):,}")

    if combined_ids:
        print(f"📊 API combined search (Textiles|Tapestries): {len(combined_ids):,}")

        # Check if API combined search matches theoretical union
        # Membership is compared, not just size: two different ID sets can be equally long.
        if combined_ids == theoretical_union:
            print("✅ API combined search matches theoretical union perfectly!")
        else:
            difference = len(theoretical_union) - len(combined_ids)
            print(f"❓ Difference: {difference:,} objects")

            # Check which objects are missing/extra
            missing_from_api = theoretical_union - combined_ids
            extra_in_api = combined_ids - theoretical_union

            if missing_from_api:
                print(f"   Missing from API: {len(missing_from_api):,} objects")
                print(f"   Sample missing: {sorted(missing_from_api)[:5]}")

            if extra_in_api:
                print(f"   Extra in API: {len(extra_in_api):,} objects")
                print(f"   Sample extra: {sorted(extra_in_api)[:5]}")

    # Show sample overlapping objects
    if overlap:
        print(f"\n📋 Sample objects that are both Textiles AND Tapestries:")
        print(f"    {sorted(overlap)[:10]}")

# Verify your original textile count
# NOTE(review): hard-coded from the earlier search output; update if the list is re-fetched.
original_count = 33437
textiles_count = results.get("Textiles Only", {}).get("count", 0)
print(f"\n=== VERIFICATION ===")
print(f"Your original count: {original_count:,}")
print(f"Current API count: {textiles_count:,}")

count_gap = abs(textiles_count - original_count)
if count_gap == 0:
    print("✅ Counts match perfectly!")
elif count_gap < 100:
    print(f"⚠️  Small difference: {count_gap} objects")
else:
    print(f"❌ Significant difference: {count_gap} objects")

# Save detailed results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
detailed_results = {
    "timestamp": timestamp,
    "search_results": {
        name: {
            "count": data["count"],
            "parameters": data["params"]
        } for name, data in results.items()
    },
    "overlap_analysis": {
        # All zero when the overlap analysis above was skipped.
        "textiles_only": len(textiles_only),
        "tapestries_only": len(tapestries_only),
        "overlap": len(overlap),
        "theoretical_union": len(theoretical_union),
        "api_combined": len(combined_ids)
    },
    "verification": {
        "original_textile_count": original_count,
        "current_api_count": textiles_count,
        "difference": count_gap
    }
}

filename = f"proper_textile_search_analysis_{timestamp}.json"
with open(filename, "w") as f:
    json.dump(detailed_results, f, indent=2)

print(f"\n📁 Detailed results saved to: {filename}")
print(f"✅ Analysis complete at: {datetime.now()}")

=== PROPER TEXTILE SEARCH COMPARISON ===
Search started at: 2025-07-05 19:08:48.561694

🔍 Searching: Textiles Only
   Parameters: {'medium': 'Textiles', 'q': '*'}
   ✅ Found: 33,437 objects

🔍 Searching: Tapestries Only
   Parameters: {'medium': 'Tapestries', 'q': '*'}
   ✅ Found: 2,355 objects

🔍 Searching: Textiles + Tapestries (Proper Way)
   Parameters: {'medium': 'Textiles|Tapestries', 'q': '*'}
   ✅ Found: 2,204 objects

🔍 Searching: All Textile-related
   Parameters: {'medium': 'Textiles|Tapestries|Embroidery|Lace', 'q': '*'}
   ✅ Found: 0 objects

=== RESULTS SUMMARY ===
Textiles Only: 33,437 objects
Tapestries Only: 2,355 objects
Textiles + Tapestries (Proper Way): 2,204 objects
All Textile-related: 0 objects

=== DETAILED OVERLAP ANALYSIS ===
📊 Textiles only: 31,233
📊 Tapestries only: 151
📊 Overlap (objects in both): 2,204
📊 Theoretical union: 33,588
📊 API combined search (Textiles|Tapestries): 2,204
❓ Difference: 31,384 objects
   Missing from API: 31,384 objects
   Sample m

In [22]:
import requests
import time
from datetime import datetime
import json

def search_with_retry(search_params, search_name, max_retries=5):
    """Query the Met search endpoint, retrying up to ``max_retries`` times.

    Returns the response's ``(total, objectIDs)`` pair on HTTP 200 (defaulting to 0 and
    ``[]`` when a key is absent), or ``(0, [])`` once every attempt has failed.
    ``search_name`` is used only in the printed messages.
    """
    endpoint = "https://collectionapi.metmuseum.org/public/collection/v1/search"

    for _ in range(max_retries):
        try:
            reply = requests.get(endpoint, params=search_params)
            code = reply.status_code

            if code == 200:
                payload = reply.json()
                return payload.get("total", 0), payload.get("objectIDs", [])

            # Any other status is retried; 403 gets the longer pause.
            if code == 403:
                print(f"Rate limited for '{search_name}', waiting 10 seconds...")
                pause = 10
            else:
                print(f"Error {code} for '{search_name}', retrying...")
                pause = 2
            time.sleep(pause)

        except Exception as e:
            print(f"Exception for '{search_name}': {e}, retrying...")
            time.sleep(2)

    print(f"Failed to get data for '{search_name}' after {max_retries} attempts")
    return 0, []

def get_object_details(object_id, max_retries=5):
    """Request a single object's record, retrying up to ``max_retries`` times.

    Returns ``(json_body, "success")`` on HTTP 200, ``(None, "not_found")`` on HTTP 404,
    and ``(None, "failed_after_retries")`` when no attempt succeeds.
    """
    url = f"https://collectionapi.metmuseum.org/public/collection/v1/objects/{object_id}"

    for _ in range(max_retries):
        try:
            reply = requests.get(url)
            code = reply.status_code

            # Terminal outcomes first: success and "does not exist" end the loop.
            if code == 200:
                return reply.json(), "success"
            if code == 404:
                return None, "not_found"

            # Everything else is retried; 403 gets the longer pause.
            if code == 403:
                print(f"Rate limited on object {object_id}, waiting 10 seconds...")
                pause = 10
            else:
                print(f"Failed to get object {object_id}: {code}")
                pause = 2
            time.sleep(pause)
        except Exception as e:
            print(f"Error getting object {object_id}: {e}")
            time.sleep(2)

    return None, "failed_after_retries"

# Find tapestry objects that the "Textiles" search does not return, save their IDs, and
# download their records with a per-object log and periodic checkpoints.
print("=== FINDING MISSING TAPESTRY OBJECTS ===")
print(f"Started at: {datetime.now()}")

# Step 1: Get all textile and tapestry object IDs
print("\n🔍 Step 1: Getting Textile and Tapestry object IDs...")

textiles_count, textiles_ids = search_with_retry(
    {"medium": "Textiles", "q": "*"}, 
    "Textiles Only"
)
print(f"✅ Textiles: {textiles_count:,} objects")

tapestries_count, tapestries_ids = search_with_retry(
    {"medium": "Tapestries", "q": "*"}, 
    "Tapestries Only"
)
print(f"✅ Tapestries: {tapestries_count:,} objects")

# Convert to sets for easier manipulation
# The search helper may hand back None/empty for the ID list; use an empty set then.
textiles_set = set(textiles_ids) if textiles_ids else set()
tapestries_set = set(tapestries_ids) if tapestries_ids else set()

# Step 2: Find tapestries that are NOT in textiles
print("\n📊 Step 2: Analyzing overlap...")
tapestries_only = tapestries_set - textiles_set  # Tapestries NOT in textiles
overlap = textiles_set & tapestries_set  # Objects in both

print(f"📋 Total tapestries: {len(tapestries_set):,}")
print(f"📋 Tapestries also in textiles: {len(overlap):,}")
print(f"🎯 Tapestries ONLY (missing from textiles): {len(tapestries_only):,}")

# Save the missing tapestry IDs (this step is not numbered in the printed output)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
missing_ids_list = sorted(tapestries_only)

# Save missing IDs to file
missing_ids_file = f"missing_tapestry_ids_{timestamp}.json"
with open(missing_ids_file, 'w') as f:
    json.dump(missing_ids_list, f, indent=2)

print(f"\n💾 Saved {len(missing_ids_list)} missing tapestry IDs to: {missing_ids_file}")
print(f"📋 Sample missing IDs: {missing_ids_list[:10]}")

# Step 3 (as printed): Download details for missing tapestry objects
if len(missing_ids_list) > 0:
    print(f"\n🚀 Step 3: Downloading details for {len(missing_ids_list)} missing tapestry objects...")
    
    downloaded_objects = []
    failed_ids = []
    not_found_ids = []
    batch_size = 50  # a checkpoint file is written after every `batch_size` processed IDs
    
    log_file = f"missing_tapestries_download_log_{timestamp}.txt"
    start_time = time.time()
    
    # The log lines contain emoji, so the encoding is pinned to UTF-8 instead of being
    # left to the platform default, which may not be able to encode them.
    with open(log_file, 'w', encoding='utf-8') as log:
        log.write(f"Missing tapestries download started at {datetime.now()}\n")
        log.write(f"Objects to download: {len(missing_ids_list)}\n\n")
        
        for i, object_id in enumerate(missing_ids_list):
            # Progress line every 10 objects, flushed so the log is readable mid-run.
            if i % 10 == 0:
                elapsed = time.time() - start_time
                rate = i / elapsed if elapsed > 0 else 0
                progress_msg = f"Progress: {i+1}/{len(missing_ids_list)} ({(i+1)/len(missing_ids_list)*100:.1f}%) - Rate: {rate:.1f} req/sec"
                print(progress_msg)
                log.write(f"{progress_msg}\n")
                log.flush()
            
            object_data, status = get_object_details(object_id)
            
            if status == "success" and object_data:
                downloaded_objects.append(object_data)
                log.write(f"✅ {object_id}: Downloaded successfully\n")
            elif status == "not_found":
                not_found_ids.append(object_id)
                log.write(f"❌ {object_id}: Not found (404)\n")
            else:
                failed_ids.append(object_id)
                log.write(f"💥 {object_id}: Failed after retries\n")
            
            # Save progress every batch_size objects
            # Each checkpoint is cumulative: it holds everything downloaded so far.
            if (i + 1) % batch_size == 0:
                batch_filename = f"missing_tapestries_batch_{i+1}_{timestamp}.json"
                with open(batch_filename, 'w', encoding='utf-8') as f:
                    json.dump(downloaded_objects, f, indent=2, ensure_ascii=False)
                log.write(f"💾 Saved {len(downloaded_objects)} objects to {batch_filename}\n")
                print(f"💾 Saved batch to {batch_filename}")
            
            # Reasonable rate limiting
            time.sleep(0.2)  # 5 requests per second
        
        # Final summary in log
        total_time = time.time() - start_time
        # Computed once and reused by the log, the console summary and the JSON summary.
        # The list is non-empty inside this branch, so the division is safe.
        success_rate = len(downloaded_objects) / len(missing_ids_list) * 100
        log.write(f"\n=== DOWNLOAD SUMMARY ===\n")
        log.write(f"Total time: {total_time/60:.1f} minutes\n")
        log.write(f"Successfully downloaded: {len(downloaded_objects)}\n")
        log.write(f"Not found (404): {len(not_found_ids)}\n")
        log.write(f"Failed after retries: {len(failed_ids)}\n")
        log.write(f"Success rate: {success_rate:.1f}%\n")
    
    # Save final results
    final_filename = f"missing_tapestries_complete_{timestamp}.json"
    with open(final_filename, 'w', encoding='utf-8') as f:
        json.dump(downloaded_objects, f, indent=2, ensure_ascii=False)
    
    # Save failed and not found IDs (only when there is something to record)
    if not_found_ids:
        with open(f"missing_tapestries_not_found_{timestamp}.json", 'w') as f:
            json.dump(not_found_ids, f)
    
    if failed_ids:
        with open(f"missing_tapestries_failed_{timestamp}.json", 'w') as f:
            json.dump(failed_ids, f)
    
    # Count objects with images
    objects_with_images = sum(1 for obj in downloaded_objects if obj.get('primaryImage'))
    
    # Final summary
    print(f"\n🎉 MISSING TAPESTRIES DOWNLOAD COMPLETE!")
    print(f"⏱️  Total time: {(time.time() - start_time)/60:.1f} minutes")
    print(f"✅ Successfully downloaded: {len(downloaded_objects)}")
    print(f"🖼️  Objects with images: {objects_with_images}")
    print(f"❌ Not found (404): {len(not_found_ids)}")
    print(f"💥 Failed after retries: {len(failed_ids)}")
    print(f"📊 Success rate: {success_rate:.1f}%")
    print(f"📁 Main file: {final_filename}")
    print(f"📝 Log file: {log_file}")
    
    # Create comprehensive summary
    summary = {
        "timestamp": timestamp,
        "analysis": {
            "total_textiles": len(textiles_set),
            "total_tapestries": len(tapestries_set),
            "tapestries_in_textiles": len(overlap),
            "tapestries_missing_from_textiles": len(tapestries_only)
        },
        "download_results": {
            "total_missing": len(missing_ids_list),
            "successfully_downloaded": len(downloaded_objects),
            "objects_with_images": objects_with_images,
            "not_found_404": len(not_found_ids),
            "failed_after_retries": len(failed_ids),
            "success_rate_percent": success_rate,
            "download_time_minutes": (time.time() - start_time)/60
        },
        "files_created": {
            "missing_ids": missing_ids_file,
            "main_data": final_filename,
            "log": log_file,
            # None when the corresponding file was not written.
            "not_found_ids": f"missing_tapestries_not_found_{timestamp}.json" if not_found_ids else None,
            "failed_ids": f"missing_tapestries_failed_{timestamp}.json" if failed_ids else None
        }
    }
    
    summary_file = f"missing_tapestries_summary_{timestamp}.json"
    with open(summary_file, 'w') as f:
        json.dump(summary, f, indent=2)
    
    print(f"📋 Summary saved: {summary_file}")

else:
    print(f"\n✅ No missing tapestry objects found - all tapestries are already included in textiles!")

print(f"\n✅ Analysis complete at: {datetime.now()}")

=== FINDING MISSING TAPESTRY OBJECTS ===
Started at: 2025-07-05 19:19:09.003753

🔍 Step 1: Getting Textile and Tapestry object IDs...
✅ Textiles: 33,437 objects
✅ Tapestries: 2,355 objects

📊 Step 2: Analyzing overlap...
📋 Total tapestries: 2,355
📋 Tapestries also in textiles: 2,204
🎯 Tapestries ONLY (missing from textiles): 151

💾 Saved 151 missing tapestry IDs to: missing_tapestry_ids_20250705_191910.json
📋 Sample missing IDs: [237, 14147, 14148, 14149, 14150, 14151, 14152, 14153, 14154, 14155]

🚀 Step 3: Downloading details for 151 missing tapestry objects...
Progress: 1/151 (0.7%) - Rate: 0.0 req/sec
Progress: 11/151 (7.3%) - Rate: 2.2 req/sec
Progress: 21/151 (13.9%) - Rate: 2.2 req/sec
Rate limited on object 51485, waiting 10 seconds...
Rate limited on object 51485, waiting 10 seconds...
Rate limited on object 51485, waiting 10 seconds...
Rate limited on object 51485, waiting 10 seconds...
Progress: 31/151 (20.5%) - Rate: 0.6 req/sec
Rate limited on object 53714, waiting 10 secon

In [23]:
import requests
import json
import time
from datetime import datetime

def get_object_details(object_id, max_retries=5, timeout=30, rate_limit_wait=15, retry_wait=3):
    """Fetch one object record from the Met collection API, retrying on failure.

    Args:
        object_id: Object ID appended to the ``/objects/`` endpoint URL.
        max_retries: Maximum number of GET attempts before giving up.
        timeout: Seconds to wait on the server for each attempt. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze the whole download loop.
        rate_limit_wait: Seconds to pause after an HTTP 403, which this
            function treats as rate limiting.
        retry_wait: Seconds to pause after any other failed attempt
            (unexpected status code or raised exception).

    Returns:
        A ``(data, status)`` tuple: ``(parsed_json, "success")`` on HTTP 200,
        ``(None, "not_found")`` on HTTP 404, and
        ``(None, "failed_after_retries")`` once every attempt has failed.
    """
    detail_url = f"https://collectionapi.metmuseum.org/public/collection/v1/objects/{object_id}"

    for _ in range(max_retries):
        try:
            response = requests.get(detail_url, timeout=timeout)
            if response.status_code == 200:
                return response.json(), "success"
            if response.status_code == 404:
                # A missing object will not appear on retry; report it at once.
                return None, "not_found"
            if response.status_code == 403:
                print(f"Rate limited on object {object_id}, waiting {rate_limit_wait} seconds...")
                time.sleep(rate_limit_wait)  # Longer wait for rate limits
            else:
                print(f"Failed to get object {object_id}: {response.status_code}")
                time.sleep(retry_wait)
        except Exception as e:
            # Best effort: network errors, timeouts and malformed JSON all
            # count as one failed attempt and are retried.
            print(f"Error getting object {object_id}: {e}")
            time.sleep(retry_wait)

    return None, "failed_after_retries"

print("=== RETRY FAILED DOWNLOADS ===")
print(f"Started at: {datetime.now()}")

# ID lists written by a previous download run. The 404 list is included on
# purpose so that "not found" objects get a second chance as well.
failed_files = [
    "failed_ids_20250705_135732.json",
    "not_found_ids_20250705_135732.json",
]

all_retry_ids = []

# Gather IDs from whichever of the files exist; a missing or unreadable file
# is reported and skipped rather than aborting the cell.
for id_file in failed_files:
    try:
        with open(id_file, 'r') as handle:
            loaded_ids = json.load(handle)
        if not loaded_ids:
            continue  # empty file: nothing to add, nothing to report
        all_retry_ids.extend(loaded_ids)
        print(f"📁 Loaded {len(loaded_ids)} IDs from {id_file}")
    except FileNotFoundError:
        print(f"❌ File not found: {id_file}")
    except Exception as e:
        print(f"❌ Error loading {id_file}: {e}")

# De-duplicate across files and put the IDs in ascending order.
retry_ids = sorted(set(all_retry_ids))
print(f"\n🎯 Total unique IDs to retry: {len(retry_ids)}")

# Retry every collected ID once more, checkpointing recovered records and
# writing a log, the recovered data, the remaining failures and a summary.
if len(retry_ids) == 0:
    print("✅ No failed IDs found to retry!")
else:
    print(f"📋 Sample retry IDs: {retry_ids[:10]}")

    # Start retry process
    downloaded_objects = []    # records fetched successfully this time
    still_failed_ids = []      # IDs that exhausted every attempt again
    still_not_found_ids = []   # IDs that came back as "not_found" again
    batch_size = 25  # Smaller batches for retry

    # Every output path is derived once from the same timestamp so the files
    # written below and the names recorded in the summary cannot drift apart.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = f"retry_download_log_{timestamp}.txt"
    final_filename = f"retry_recovered_objects_{timestamp}.json"
    still_failed_file = f"still_failed_ids_{timestamp}.json"
    still_not_found_file = f"still_not_found_ids_{timestamp}.json"

    print(f"\n🚀 Starting retry download...")
    print(f"📝 Log file: {log_file}")

    start_time = time.time()

    # The log lines contain emoji, so the encoding must be explicit: with the
    # platform default (e.g. cp1252) the first status line written would
    # raise UnicodeEncodeError and abort the run.
    with open(log_file, 'w', encoding='utf-8') as log:
        log.write(f"Retry download started at {datetime.now()}\n")
        log.write(f"Objects to retry: {len(retry_ids)}\n\n")

        for i, object_id in enumerate(retry_ids):
            if i % 5 == 0:  # More frequent progress updates
                elapsed = time.time() - start_time
                rate = i / elapsed if elapsed > 0 else 0
                progress_msg = f"Progress: {i+1}/{len(retry_ids)} ({(i+1)/len(retry_ids)*100:.1f}%) - Rate: {rate:.1f} req/sec"
                print(progress_msg)
                log.write(f"{progress_msg}\n")
                log.flush()  # keep the log readable while the loop is still running

            object_data, status = get_object_details(object_id, max_retries=7)  # More retries

            if status == "success" and object_data:
                downloaded_objects.append(object_data)
                log.write(f"✅ {object_id}: Downloaded successfully on retry!\n")
                print(f"✅ Recovered object {object_id}")
            elif status == "not_found":
                still_not_found_ids.append(object_id)
                log.write(f"❌ {object_id}: Still not found (404)\n")
            else:
                still_failed_ids.append(object_id)
                log.write(f"💥 {object_id}: Still failed after retries\n")

            # Checkpoint every batch_size objects. Each checkpoint file holds
            # everything recovered so far, not just the latest batch.
            if (i + 1) % batch_size == 0 and downloaded_objects:
                batch_filename = f"retry_batch_{i+1}_{timestamp}.json"
                with open(batch_filename, 'w', encoding='utf-8') as f:
                    json.dump(downloaded_objects, f, indent=2, ensure_ascii=False)
                log.write(f"💾 Saved {len(downloaded_objects)} recovered objects to {batch_filename}\n")
                print(f"💾 Saved batch to {batch_filename}")

            # Conservative rate limiting for retries
            time.sleep(0.3)  # caps the loop at roughly 3.3 requests per second

        # retry_ids is non-empty in this branch, so the division is safe.
        recovery_rate = len(downloaded_objects) / len(retry_ids) * 100

        # Final summary in log
        total_time = time.time() - start_time
        log.write(f"\n=== RETRY SUMMARY ===\n")
        log.write(f"Total time: {total_time/60:.1f} minutes\n")
        log.write(f"Successfully recovered: {len(downloaded_objects)}\n")
        log.write(f"Still not found (404): {len(still_not_found_ids)}\n")
        log.write(f"Still failed: {len(still_failed_ids)}\n")
        log.write(f"Recovery rate: {recovery_rate:.1f}%\n")

    # Save final results
    if downloaded_objects:
        with open(final_filename, 'w', encoding='utf-8') as f:
            json.dump(downloaded_objects, f, indent=2, ensure_ascii=False)
        print(f"📁 Recovered objects saved to: {final_filename}")

    # Save still failed IDs
    if still_failed_ids:
        with open(still_failed_file, 'w') as f:
            json.dump(still_failed_ids, f)

    if still_not_found_ids:
        with open(still_not_found_file, 'w') as f:
            json.dump(still_not_found_ids, f)

    # Count objects with images
    objects_with_images = sum(1 for obj in downloaded_objects if obj.get('primaryImage'))

    # Final summary
    print(f"\n🎉 RETRY DOWNLOAD COMPLETE!")
    print(f"⏱️  Total time: {(time.time() - start_time)/60:.1f} minutes")
    print(f"✅ Successfully recovered: {len(downloaded_objects)}")
    print(f"🖼️  Recovered objects with images: {objects_with_images}")
    print(f"❌ Still not found (404): {len(still_not_found_ids)}")
    print(f"💥 Still failed: {len(still_failed_ids)}")
    print(f"📊 Recovery rate: {recovery_rate:.1f}%")

    if downloaded_objects:
        print(f"📁 Recovered objects file: {final_filename}")

    print(f"📝 Log file: {log_file}")

    # Create retry summary; a file name is recorded only if that file was written.
    retry_summary = {
        "timestamp": timestamp,
        "retry_results": {
            "total_retry_attempts": len(retry_ids),
            "successfully_recovered": len(downloaded_objects),
            "objects_with_images": objects_with_images,
            "still_not_found_404": len(still_not_found_ids),
            "still_failed": len(still_failed_ids),
            "recovery_rate_percent": recovery_rate,
            "retry_time_minutes": (time.time() - start_time)/60
        },
        "files_created": {
            "recovered_objects": final_filename if downloaded_objects else None,
            "log": log_file,
            "still_failed_ids": still_failed_file if still_failed_ids else None,
            "still_not_found_ids": still_not_found_file if still_not_found_ids else None
        }
    }

    summary_file = f"retry_summary_{timestamp}.json"
    with open(summary_file, 'w') as f:
        json.dump(retry_summary, f, indent=2)

    print(f"📋 Retry summary saved: {summary_file}")

print(f"\n✅ Retry process complete at: {datetime.now()}")

=== RETRY FAILED DOWNLOADS ===
Started at: 2025-07-05 22:06:44.719669
📁 Loaded 243 IDs from not_found_ids_20250705_135732.json

🎯 Total unique IDs to retry: 243
📋 Sample retry IDs: [21107, 72339, 77239, 77240, 77241, 77242, 212220, 212221, 212223, 212224]

🚀 Starting retry download...
📝 Log file: retry_download_log_20250705_220644.txt
Progress: 1/243 (0.4%) - Rate: 0.0 req/sec
Progress: 6/243 (2.5%) - Rate: 1.7 req/sec
Progress: 11/243 (4.5%) - Rate: 1.7 req/sec
Progress: 16/243 (6.6%) - Rate: 1.7 req/sec
Progress: 21/243 (8.6%) - Rate: 1.7 req/sec
Progress: 26/243 (10.7%) - Rate: 1.7 req/sec
Progress: 31/243 (12.8%) - Rate: 1.7 req/sec
Rate limited on object 212246, waiting 15 seconds...
Rate limited on object 212246, waiting 15 seconds...
Rate limited on object 212246, waiting 15 seconds...
Progress: 36/243 (14.8%) - Rate: 0.5 req/sec
Progress: 41/243 (16.9%) - Rate: 0.6 req/sec
Progress: 46/243 (18.9%) - Rate: 0.6 req/sec
Progress: 51/243 (21.0%) - Rate: 0.7 req/sec
Progress: 56/243

In [24]:
import json
import os
from datetime import datetime

def analyze_downloads():
    """Report how many textile and tapestry objects have been downloaded.

    Reads the expected textile IDs from ``textile_object_ids.json``, tallies
    the unique ``objectID`` values found in the known textile download files
    (plus any ``retry_recovered_objects_*`` file in the current directory)
    and in any ``missing_tapestries_complete_*`` file, prints a report, and
    writes it to ``final_download_summary_<timestamp>.json``.

    Returns:
        The summary dict that was written to disk, or ``None`` when
        ``textile_object_ids.json`` does not exist.
    """

    def _percent(part, whole):
        """Return part/whole as a percentage, or 0 when whole is not positive."""
        return part / whole * 100 if whole > 0 else 0

    def _tally(files):
        """Print per-file counts; return (unique objectIDs, number having a primaryImage)."""
        seen_ids = set()
        with_images = 0
        for file in files:
            try:
                with open(file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                file_ids = 0
                file_images = 0
                for obj in data:
                    if isinstance(obj, dict) and 'objectID' in obj:
                        # Only the first occurrence of an ID is counted, so
                        # overlapping files do not inflate the totals.
                        if obj['objectID'] not in seen_ids:
                            seen_ids.add(obj['objectID'])
                            file_ids += 1
                            if obj.get('primaryImage'):
                                with_images += 1
                                file_images += 1

                print(f"  📁 {file}: {file_ids} objects, {file_images} with images")

            except FileNotFoundError:
                print(f"  ❌ File not found: {file}")
            except Exception as e:
                print(f"  ❌ Error loading {file}: {e}")
        return seen_ids, with_images

    print("=== COMPREHENSIVE DOWNLOAD ANALYSIS ===")
    print(f"Analysis started at: {datetime.now()}")

    # 1. TEXTILES ANALYSIS
    print("\n🧵 === TEXTILES ANALYSIS ===")

    # Load original textile IDs
    try:
        with open("textile_object_ids.json", "r") as f:
            all_textile_ids = json.load(f)
        total_textiles = len(all_textile_ids)
        print(f"📊 Total Textiles Expected: {total_textiles:,}")
    except FileNotFoundError:
        print("❌ textile_object_ids.json not found")
        return

    # Load all textile downloads
    textile_files = [
        'met_textiles_batch_22800_20250705_134702.json',
        'idun/met_textiles_batch_11988_20250705_134921.json',
        'remaining_textiles_complete_20250705_135732.json'
    ]

    # Add any retry files if they exist. os.listdir() returns names in
    # arbitrary order, so sort to keep the report reproducible.
    retry_files = sorted(f for f in os.listdir('.') if f.startswith('retry_recovered_objects_'))
    textile_files.extend(retry_files)

    downloaded_textile_ids, textiles_with_images = _tally(textile_files)

    # Calculate textile errors
    textile_errors = total_textiles - len(downloaded_textile_ids)
    textile_rate = _percent(len(downloaded_textile_ids), total_textiles)

    print(f"\n📊 TEXTILES SUMMARY:")
    print(f"  Total Expected: {total_textiles:,}")
    print(f"  Successfully Downloaded: {len(downloaded_textile_ids):,}")
    print(f"  With Images: {textiles_with_images:,}")
    print(f"  Errors/Missing: {textile_errors:,}")
    # Guarded like the JSON summary below: an empty ID list must not crash.
    print(f"  Success Rate: {textile_rate:.1f}%")

    # 2. TAPESTRIES ANALYSIS
    print("\n🎨 === TAPESTRIES ANALYSIS ===")

    # Look for tapestry files
    tapestry_files = sorted(f for f in os.listdir('.') if f.startswith('missing_tapestries_complete_'))
    total_tapestries_expected = 0

    # Load tapestry summary to get expected count. File names end in a
    # YYYYmmdd_HHMMSS timestamp, so the last name in sorted order is the newest.
    summary_files = sorted(f for f in os.listdir('.') if f.startswith('missing_tapestries_summary_'))
    if summary_files:
        try:
            with open(summary_files[-1], 'r') as f:
                tapestry_summary = json.load(f)
            total_tapestries_expected = tapestry_summary.get('analysis', {}).get('tapestries_missing_from_textiles', 0)
            print(f"📊 Total Tapestries Expected (missing from textiles): {total_tapestries_expected:,}")
        except Exception as e:
            print(f"❌ Error loading tapestry summary: {e}")

    downloaded_tapestry_ids, tapestries_with_images = _tally(tapestry_files)

    # Calculate tapestry errors (unknown expectation => no errors reported)
    tapestry_errors = total_tapestries_expected - len(downloaded_tapestry_ids) if total_tapestries_expected > 0 else 0
    tapestry_rate = _percent(len(downloaded_tapestry_ids), total_tapestries_expected)

    print(f"\n📊 TAPESTRIES SUMMARY:")
    print(f"  Total Expected: {total_tapestries_expected:,}")
    print(f"  Successfully Downloaded: {len(downloaded_tapestry_ids):,}")
    print(f"  With Images: {tapestries_with_images:,}")
    print(f"  Errors/Missing: {tapestry_errors:,}")
    if total_tapestries_expected > 0:
        print(f"  Success Rate: {tapestry_rate:.1f}%")

    # 3. COMBINED ANALYSIS
    print("\n🔗 === TEXTILES + TAPESTRIES COMBINED ===")

    total_combined = len(downloaded_textile_ids) + len(downloaded_tapestry_ids)
    total_images_combined = textiles_with_images + tapestries_with_images
    total_expected_combined = total_textiles + total_tapestries_expected
    total_errors_combined = textile_errors + tapestry_errors
    combined_rate = _percent(total_combined, total_expected_combined)

    print(f"📊 COMBINED SUMMARY:")
    print(f"  Total Expected: {total_expected_combined:,}")
    print(f"  Successfully Downloaded: {total_combined:,}")
    print(f"  With Images: {total_images_combined:,}")
    print(f"  Errors/Missing: {total_errors_combined:,}")
    if total_expected_combined > 0:
        print(f"  Overall Success Rate: {combined_rate:.1f}%")

    # 4. CREATE FINAL SUMMARY JSON
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    final_summary = {
        "timestamp": timestamp,
        "textiles": {
            "total_expected": total_textiles,
            "successfully_downloaded": len(downloaded_textile_ids),
            "with_images": textiles_with_images,
            "errors_missing": textile_errors,
            "success_rate_percent": textile_rate
        },
        "tapestries": {
            "total_expected": total_tapestries_expected,
            "successfully_downloaded": len(downloaded_tapestry_ids),
            "with_images": tapestries_with_images,
            "errors_missing": tapestry_errors,
            "success_rate_percent": tapestry_rate
        },
        "combined": {
            "total_expected": total_expected_combined,
            "successfully_downloaded": total_combined,
            "with_images": total_images_combined,
            "errors_missing": total_errors_combined,
            "success_rate_percent": combined_rate
        },
        "files_analyzed": {
            "textile_files": textile_files,
            "tapestry_files": tapestry_files
        }
    }

    summary_filename = f"final_download_summary_{timestamp}.json"
    with open(summary_filename, 'w') as f:
        json.dump(final_summary, f, indent=2)

    print(f"\n📁 Final summary saved to: {summary_filename}")
    print(f"✅ Analysis complete at: {datetime.now()}")

    return final_summary

# Run the analysis
summary = analyze_downloads()

=== COMPREHENSIVE DOWNLOAD ANALYSIS ===
Analysis started at: 2025-07-05 22:21:43.642699

🧵 === TEXTILES ANALYSIS ===
📊 Total Textiles Expected: 33,437
  📁 met_textiles_batch_22800_20250705_134702.json: 22104 objects, 17749 with images
  📁 idun/met_textiles_batch_11988_20250705_134921.json: 10354 objects, 8875 with images
  📁 remaining_textiles_complete_20250705_135732.json: 736 objects, 647 with images

📊 TEXTILES SUMMARY:
  Total Expected: 33,437
  Successfully Downloaded: 33,194
  With Images: 27,271
  Errors/Missing: 243
  Success Rate: 99.3%

🎨 === TAPESTRIES ANALYSIS ===
📊 Total Tapestries Expected (missing from textiles): 151
  📁 missing_tapestries_complete_20250705_191910.json: 150 objects, 102 with images

📊 TAPESTRIES SUMMARY:
  Total Expected: 151
  Successfully Downloaded: 150
  With Images: 102
  Errors/Missing: 1
  Success Rate: 99.3%

🔗 === TEXTILES + TAPESTRIES COMBINED ===
📊 COMBINED SUMMARY:
  Total Expected: 33,588
  Successfully Downloaded: 33,344
  With Images: 27,3

In [25]:
import json
import os
from datetime import datetime

def analyze_all_downloads_including_reverse():
    """Report download coverage, including the "reverse complete" textile file.

    Reads the expected textile IDs from ``textile_object_ids.json``, tallies
    the unique ``objectID`` values found in the known textile download files
    (plus any ``retry_recovered_objects_*`` file in the current directory)
    and in any ``missing_tapestries_complete_*`` file, then reports how many
    objects of the reverse-complete file are absent from every other textile
    file. The report is printed and written to
    ``final_download_summary_with_reverse_<timestamp>.json``.

    Returns:
        The summary dict that was written to disk, or ``None`` when
        ``textile_object_ids.json`` does not exist.
    """
    reverse_file = 'idun/met_textiles_complete_reverse_20250705_222820.json'

    def _percent(part, whole):
        """Return part/whole as a percentage, or 0 when whole is not positive."""
        return part / whole * 100 if whole > 0 else 0

    def _tally(files, noun):
        """Print per-file counts; return (unique IDs, image count, {file: all IDs in that file})."""
        seen_ids = set()
        with_images = 0
        ids_by_file = {}
        for file in files:
            try:
                with open(file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                ids_in_file = ids_by_file.setdefault(file, set())
                file_ids = 0
                file_images = 0
                for obj in data:
                    if isinstance(obj, dict) and 'objectID' in obj:
                        object_id = obj['objectID']
                        ids_in_file.add(object_id)
                        # Only the first occurrence of an ID is counted, so
                        # overlapping files do not inflate the totals.
                        if object_id not in seen_ids:
                            seen_ids.add(object_id)
                            file_ids += 1
                            if obj.get('primaryImage'):
                                with_images += 1
                                file_images += 1

                print(f"  📁 {file}: {file_ids} {noun}, {file_images} with images")

            except FileNotFoundError:
                print(f"  ❌ File not found: {file}")
            except Exception as e:
                print(f"  ❌ Error loading {file}: {e}")
        return seen_ids, with_images, ids_by_file

    print("=== COMPREHENSIVE DOWNLOAD ANALYSIS (INCLUDING REVERSE COMPLETE) ===")
    print(f"Analysis started at: {datetime.now()}")

    # 1. TEXTILES ANALYSIS
    print("\n🧵 === TEXTILES ANALYSIS ===")

    # Load original textile IDs
    try:
        with open("textile_object_ids.json", "r") as f:
            all_textile_ids = json.load(f)
        total_textiles = len(all_textile_ids)
        print(f"📊 Total Textiles Expected: {total_textiles:,}")
    except FileNotFoundError:
        print("❌ textile_object_ids.json not found")
        return

    # Load all textile downloads INCLUDING the new reverse complete file
    textile_files = [
        'met_textiles_batch_22800_20250705_134702.json',
        'idun/met_textiles_batch_11988_20250705_134921.json',
        'remaining_textiles_complete_20250705_135732.json',
        reverse_file
    ]

    # Add any retry files if they exist. os.listdir() returns names in
    # arbitrary order, so sort to keep the report reproducible.
    retry_files = sorted(f for f in os.listdir('.') if f.startswith('retry_recovered_objects_'))
    textile_files.extend(retry_files)

    downloaded_textile_ids, textiles_with_images, textile_ids_by_file = _tally(textile_files, "unique objects")

    # Calculate textile errors
    textile_errors = total_textiles - len(downloaded_textile_ids)
    textile_rate = _percent(len(downloaded_textile_ids), total_textiles)

    print(f"\n📊 TEXTILES SUMMARY:")
    print(f"  Total Expected: {total_textiles:,}")
    print(f"  Successfully Downloaded: {len(downloaded_textile_ids):,}")
    print(f"  With Images: {textiles_with_images:,}")
    print(f"  Errors/Missing: {textile_errors:,}")
    # Guarded like the JSON summary below: an empty ID list must not crash.
    print(f"  Success Rate: {textile_rate:.1f}%")

    # 2. TAPESTRIES ANALYSIS
    print("\n🎨 === TAPESTRIES ANALYSIS ===")

    # Look for tapestry files
    tapestry_files = sorted(f for f in os.listdir('.') if f.startswith('missing_tapestries_complete_'))
    total_tapestries_expected = 0

    # Load tapestry summary to get expected count. File names end in a
    # YYYYmmdd_HHMMSS timestamp, so the last name in sorted order is the newest.
    summary_files = sorted(f for f in os.listdir('.') if f.startswith('missing_tapestries_summary_'))
    if summary_files:
        try:
            with open(summary_files[-1], 'r') as f:
                tapestry_summary = json.load(f)
            total_tapestries_expected = tapestry_summary.get('analysis', {}).get('tapestries_missing_from_textiles', 0)
            print(f"📊 Total Tapestries Expected (missing from textiles): {total_tapestries_expected:,}")
        except Exception as e:
            print(f"❌ Error loading tapestry summary: {e}")

    downloaded_tapestry_ids, tapestries_with_images, _ = _tally(tapestry_files, "objects")

    # Calculate tapestry errors (unknown expectation => no errors reported)
    tapestry_errors = total_tapestries_expected - len(downloaded_tapestry_ids) if total_tapestries_expected > 0 else 0
    tapestry_rate = _percent(len(downloaded_tapestry_ids), total_tapestries_expected)

    print(f"\n📊 TAPESTRIES SUMMARY:")
    print(f"  Total Expected: {total_tapestries_expected:,}")
    print(f"  Successfully Downloaded: {len(downloaded_tapestry_ids):,}")
    print(f"  With Images: {tapestries_with_images:,}")
    print(f"  Errors/Missing: {tapestry_errors:,}")
    if total_tapestries_expected > 0:
        print(f"  Success Rate: {tapestry_rate:.1f}%")

    # 3. COMBINED ANALYSIS
    print("\n🔗 === TEXTILES + TAPESTRIES COMBINED ===")

    total_combined = len(downloaded_textile_ids) + len(downloaded_tapestry_ids)
    total_images_combined = textiles_with_images + tapestries_with_images
    total_expected_combined = total_textiles + total_tapestries_expected
    total_errors_combined = textile_errors + tapestry_errors
    combined_rate = _percent(total_combined, total_expected_combined)

    print(f"📊 COMBINED SUMMARY:")
    print(f"  Total Expected: {total_expected_combined:,}")
    print(f"  Successfully Downloaded: {total_combined:,}")
    print(f"  With Images: {total_images_combined:,}")
    print(f"  Errors/Missing: {total_errors_combined:,}")
    if total_expected_combined > 0:
        print(f"  Overall Success Rate: {combined_rate:.1f}%")

    # 4. DETAILED ANALYSIS OF THE NEW REVERSE FILE
    print("\n🔄 === REVERSE COMPLETE FILE ANALYSIS ===")
    try:
        with open(reverse_file, 'r', encoding='utf-8') as f:
            reverse_complete_data = json.load(f)

        reverse_complete_ids = set()
        reverse_complete_images = 0

        for obj in reverse_complete_data:
            if isinstance(obj, dict) and 'objectID' in obj:
                reverse_complete_ids.add(obj['objectID'])
                if obj.get('primaryImage'):
                    reverse_complete_images += 1

        print(f"  📁 Reverse Complete File:")
        print(f"    Total objects: {len(reverse_complete_ids):,}")
        print(f"    Objects with images: {reverse_complete_images:,}")
        print(f"    Image percentage: {_percent(reverse_complete_images, len(reverse_complete_ids)):.1f}%")

        # Check if this file fills any gaps. The comparison must use the IDs
        # of the OTHER files only: downloaded_textile_ids already contains
        # this file's own IDs, so subtracting from it would make every object
        # of the reverse file look new.
        all_other_textile_ids = set().union(
            *(ids for name, ids in textile_ids_by_file.items() if name != reverse_file)
        )
        missing_filled = len(reverse_complete_ids - all_other_textile_ids)

        print(f"    New objects not in other files: {missing_filled:,}")

        if missing_filled > 0:
            print(f"    🎉 This file fills {missing_filled} gaps!")

    except Exception as e:
        print(f"  ❌ Error analyzing reverse complete file: {e}")

    # 5. CHECK FOR PERFECT COMPLETION
    if textile_errors == 0:
        print(f"\n🎉 PERFECT! ALL TEXTILE OBJECTS DOWNLOADED! 🎉")
    elif textile_errors < 10:
        print(f"\n✅ NEARLY PERFECT! Only {textile_errors} objects missing!")

    # 6. CREATE UPDATED FINAL SUMMARY JSON
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    final_summary = {
        "timestamp": timestamp,
        "textiles": {
            "total_expected": total_textiles,
            "successfully_downloaded": len(downloaded_textile_ids),
            "with_images": textiles_with_images,
            "errors_missing": textile_errors,
            "success_rate_percent": textile_rate
        },
        "tapestries": {
            "total_expected": total_tapestries_expected,
            "successfully_downloaded": len(downloaded_tapestry_ids),
            "with_images": tapestries_with_images,
            "errors_missing": tapestry_errors,
            "success_rate_percent": tapestry_rate
        },
        "combined": {
            "total_expected": total_expected_combined,
            "successfully_downloaded": total_combined,
            "with_images": total_images_combined,
            "errors_missing": total_errors_combined,
            "success_rate_percent": combined_rate
        },
        "files_analyzed": {
            "textile_files": textile_files,
            "tapestry_files": tapestry_files
        }
    }

    summary_filename = f"final_download_summary_with_reverse_{timestamp}.json"
    with open(summary_filename, 'w') as f:
        json.dump(final_summary, f, indent=2)

    print(f"\n📁 Updated final summary saved to: {summary_filename}")
    print(f"✅ Analysis complete at: {datetime.now()}")

    return final_summary

# Run the updated analysis
summary = analyze_all_downloads_including_reverse()

=== COMPREHENSIVE DOWNLOAD ANALYSIS (INCLUDING REVERSE COMPLETE) ===
Analysis started at: 2025-07-05 22:30:47.642878

🧵 === TEXTILES ANALYSIS ===
📊 Total Textiles Expected: 33,437
  📁 met_textiles_batch_22800_20250705_134702.json: 22104 unique objects, 17749 with images
  📁 idun/met_textiles_batch_11988_20250705_134921.json: 10354 unique objects, 8875 with images
  📁 remaining_textiles_complete_20250705_135732.json: 736 unique objects, 647 with images
  📁 idun/met_textiles_complete_reverse_20250705_222820.json: 0 unique objects, 0 with images

📊 TEXTILES SUMMARY:
  Total Expected: 33,437
  Successfully Downloaded: 33,194
  With Images: 27,271
  Errors/Missing: 243
  Success Rate: 99.3%

🎨 === TAPESTRIES ANALYSIS ===
📊 Total Tapestries Expected (missing from textiles): 151
  📁 missing_tapestries_complete_20250705_191910.json: 150 objects, 102 with images

📊 TAPESTRIES SUMMARY:
  Total Expected: 151
  Successfully Downloaded: 150
  With Images: 102
  Errors/Missing: 1
  Success Rate: 99.

In [26]:
import json
import os
from datetime import datetime

def analyze_all_downloads_with_correct_logic():
    print("=== CORRECTED COMPREHENSIVE DOWNLOAD ANALYSIS ===")
    print(f"Analysis started at: {datetime.now()}")
    
    # 1. TEXTILES ANALYSIS
    print("\n🧵 === TEXTILES ANALYSIS ===")
    
    # Load original textile IDs
    try:
        with open("textile_object_ids.json", "r") as f:
            all_textile_ids = json.load(f)
        total_textiles = len(all_textile_ids)
        print(f"📊 Total Textiles Expected: {total_textiles:,}")
    except FileNotFoundError:
        print("❌ textile_object_ids.json not found")
        return
    
    # Load all textile downloads INCLUDING the reverse complete file
    textile_files = [
        'met_textiles_batch_22800_20250705_134702.json',
        'idun/met_textiles_batch_11988_20250705_134921.json', 
        'remaining_textiles_complete_20250705_135732.json',
        'idun/met_textiles_complete_reverse_20250705_222820.json'
    ]
    
    # Add any retry files if they exist
    retry_files = [f for f in os.listdir('.') if f.startswith('retry_recovered_objects_')]
    textile_files.extend(retry_files)
    
    downloaded_textile_ids = set()
    textiles_with_images = 0
    
    # Track each file's contribution
    file_contributions = {}
    
    for file in textile_files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            file_ids = set()
            file_images = 0
            
            for obj in data:
                if isinstance(obj, dict) and 'objectID' in obj:
                    file_ids.add(obj['objectID'])
                    if obj.get('primaryImage'):
                        file_images += 1
            
            # Count NEW objects from this file
            new_objects = file_ids - downloaded_textile_ids
            new_images = 0
            
            # Count images in new objects only
            for obj in data:
                if isinstance(obj, dict) and 'objectID' in obj:
                    if obj['objectID'] in new_objects and obj.get('primaryImage'):
                        new_images += 1
            
            # Add new objects to total
            downloaded_textile_ids.update(new_objects)
            textiles_with_images += new_images
            
            file_contributions[file] = {
                'total_in_file': len(file_ids),
                'new_objects': len(new_objects),
                'new_images': new_images,
                'duplicates': len(file_ids) - len(new_objects)
            }
            
            print(f"  📁 {file}:")
            print(f"      Total in file: {len(file_ids):,}")
            print(f"      New objects: {len(new_objects):,}")
            print(f"      New with images: {new_images:,}")
            print(f"      Duplicates: {len(file_ids) - len(new_objects):,}")
            
        except FileNotFoundError:
            print(f"  ❌ File not found: {file}")
        except Exception as e:
            print(f"  ❌ Error loading {file}: {e}")
    
    # Calculate textile errors
    textile_errors = total_textiles - len(downloaded_textile_ids)
    
    print(f"\n📊 CORRECTED TEXTILES SUMMARY:")
    print(f"  Total Expected: {total_textiles:,}")
    print(f"  Successfully Downloaded: {len(downloaded_textile_ids):,}")
    print(f"  With Images: {textiles_with_images:,}")
    print(f"  Errors/Missing: {textile_errors:,}")
    print(f"  Success Rate: {len(downloaded_textile_ids)/total_textiles*100:.1f}%")
    
    # 2. TAPESTRIES ANALYSIS (unchanged)
    print("\n🎨 === TAPESTRIES ANALYSIS ===")
    
    tapestry_files = [f for f in os.listdir('.') if f.startswith('missing_tapestries_complete_')]
    
    downloaded_tapestry_ids = set()
    tapestries_with_images = 0
    total_tapestries_expected = 0
    
    summary_files = [f for f in os.listdir('.') if f.startswith('missing_tapestries_summary_')]
    if summary_files:
        try:
            with open(summary_files[0], 'r') as f:
                summary = json.load(f)
            total_tapestries_expected = summary.get('analysis', {}).get('tapestries_missing_from_textiles', 0)
            print(f"📊 Total Tapestries Expected (missing from textiles): {total_tapestries_expected:,}")
        except Exception as e:
            print(f"❌ Error loading tapestry summary: {e}")
    
    for file in tapestry_files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            file_ids = 0
            file_images = 0
            for obj in data:
                if isinstance(obj, dict) and 'objectID' in obj:
                    if obj['objectID'] not in downloaded_tapestry_ids:
                        downloaded_tapestry_ids.add(obj['objectID'])
                        file_ids += 1
                        if obj.get('primaryImage'):
                            tapestries_with_images += 1
                            file_images += 1
            
            print(f"  📁 {file}: {file_ids} objects, {file_images} with images")
            
        except Exception as e:
            print(f"  ❌ Error loading {file}: {e}")
    
    tapestry_errors = total_tapestries_expected - len(downloaded_tapestry_ids) if total_tapestries_expected > 0 else 0
    
    print(f"\n📊 TAPESTRIES SUMMARY:")
    print(f"  Total Expected: {total_tapestries_expected:,}")
    print(f"  Successfully Downloaded: {len(downloaded_tapestry_ids):,}")
    print(f"  With Images: {tapestries_with_images:,}")
    print(f"  Errors/Missing: {tapestry_errors:,}")
    if total_tapestries_expected > 0:
        print(f"  Success Rate: {len(downloaded_tapestry_ids)/total_tapestries_expected*100:.1f}%")
    
    # 3. COMBINED ANALYSIS
    print("\n🔗 === CORRECTED TEXTILES + TAPESTRIES COMBINED ===")
    
    total_combined = len(downloaded_textile_ids) + len(downloaded_tapestry_ids)
    total_images_combined = textiles_with_images + tapestries_with_images
    total_expected_combined = total_textiles + total_tapestries_expected
    total_errors_combined = textile_errors + tapestry_errors
    
    print(f"📊 CORRECTED COMBINED SUMMARY:")
    print(f"  Total Expected: {total_expected_combined:,}")
    print(f"  Successfully Downloaded: {total_combined:,}")
    print(f"  With Images: {total_images_combined:,}")
    print(f"  Errors/Missing: {total_errors_combined:,}")
    if total_expected_combined > 0:
        print(f"  Overall Success Rate: {total_combined/total_expected_combined*100:.1f}%")
    
    # 4. DETAILED FILE BREAKDOWN
    print(f"\n📋 === DETAILED FILE BREAKDOWN ===")
    for file, stats in file_contributions.items():
        print(f"  📁 {file}:")
        print(f"      Contributed {stats['new_objects']:,} unique objects")
        print(f"      Had {stats['duplicates']:,} duplicates")
        print(f"      Added {stats['new_images']:,} new images")
    
    # 5. CHECK FOR PERFECT COMPLETION
    if textile_errors == 0:
        print(f"\n🎉 PERFECT! ALL TEXTILE OBJECTS DOWNLOADED! 🎉")
    elif textile_errors < 10:
        print(f"\n✅ NEARLY PERFECT! Only {textile_errors} objects missing!")
    
    # 6. REVERSE FILE SPECIFIC ANALYSIS
    print(f"\n🔄 === REVERSE FILE SPECIFIC IMPACT ===")
    reverse_file = 'idun/met_textiles_complete_reverse_20250705_222820.json'
    if reverse_file in file_contributions:
        reverse_stats = file_contributions[reverse_file]
        print(f"  The reverse file contributed:")
        print(f"  📈 {reverse_stats['new_objects']:,} NEW unique objects")
        print(f"  🖼️  {reverse_stats['new_images']:,} NEW objects with images")
        print(f"  🔄 {reverse_stats['duplicates']:,} objects already downloaded")
        
        if reverse_stats['new_objects'] > 0:
            print(f"  🎉 The reverse file significantly improved coverage!")
        else:
            print(f"  ℹ️  The reverse file contained no new objects")
    
    return {
        "total_textiles": len(downloaded_textile_ids),
        "total_tapestries": len(downloaded_tapestry_ids),
        "total_combined": total_combined,
        "total_images": total_images_combined,
        "success_rate": total_combined/total_expected_combined*100 if total_expected_combined > 0 else 0,
        "file_contributions": file_contributions
    }

# Run the corrected analysis. The returned dict (download totals, combined
# success rate and per-file contributions) is kept in `corrected_summary`.
corrected_summary = analyze_all_downloads_with_correct_logic()

=== CORRECTED COMPREHENSIVE DOWNLOAD ANALYSIS ===
Analysis started at: 2025-07-05 22:33:27.586608

🧵 === TEXTILES ANALYSIS ===
📊 Total Textiles Expected: 33,437
  📁 met_textiles_batch_22800_20250705_134702.json:
      Total in file: 22,104
      New objects: 22,104
      New with images: 17,749
      Duplicates: 0
  📁 idun/met_textiles_batch_11988_20250705_134921.json:
      Total in file: 20,497
      New objects: 10,354
      New with images: 8,875
      Duplicates: 10,143
  📁 remaining_textiles_complete_20250705_135732.json:
      Total in file: 736
      New objects: 736
      New with images: 647
      Duplicates: 0
  📁 idun/met_textiles_complete_reverse_20250705_222820.json:
      Total in file: 32,147
      New objects: 0
      New with images: 0
      Duplicates: 32,147

📊 CORRECTED TEXTILES SUMMARY:
  Total Expected: 33,437
  Successfully Downloaded: 33,194
  With Images: 27,271
  Errors/Missing: 243
  Success Rate: 99.3%

🎨 === TAPESTRIES ANALYSIS ===
📊 Total Tapestries Expec

In [27]:
import json
from datetime import datetime

def find_missing_objects():
    """Identify expected textile objects that are absent from every download file.

    The expected IDs are read from ``textile_object_ids.json``; downloaded IDs
    are the ``objectID`` values found in the four textile download files. The
    missing IDs are then split by whether they also appear in the failed /
    not-found ID files, and the result is written to
    ``final_missing_objects_<timestamp>.json`` in the working directory.

    The tapestry section only prints counts. A problem reading the tapestry
    summary or the tapestry download file is reported and skipped, so it never
    prevents the textile result from being computed, saved and returned.

    Returns:
        dict: ``timestamp``, ``summary`` (three counts), ``missing_textile_ids``,
        ``missing_in_failed`` and ``missing_not_in_failed`` (sorted ID lists).

    Raises:
        OSError, ValueError: if ``textile_object_ids.json`` cannot be read or
            parsed, or if the output file cannot be written.
    """
    # The "244" in this banner is a fixed label, not a computed count.
    print("=== FINDING THE 244 MISSING OBJECTS ===")
    print(f"Analysis started at: {datetime.now()}")

    # 1. FIND MISSING TEXTILES
    print("\n🔍 === FINDING MISSING TEXTILES ===")

    # Load expected textile IDs. This file is required: without it there is
    # nothing to compare against, so a read error is allowed to propagate.
    with open("textile_object_ids.json", "r") as f:
        all_textile_ids = set(json.load(f))

    # Load all downloaded textile IDs
    textile_files = [
        'met_textiles_batch_22800_20250705_134702.json',
        'idun/met_textiles_batch_11988_20250705_134921.json',
        'remaining_textiles_complete_20250705_135732.json',
        'idun/met_textiles_complete_reverse_20250705_222820.json'
    ]

    downloaded_textile_ids = set()

    for file in textile_files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Entries that are not dicts, or that lack an objectID, are ignored.
            downloaded_textile_ids.update(
                obj['objectID'] for obj in data
                if isinstance(obj, dict) and 'objectID' in obj
            )
        except Exception as e:
            # Best effort: an unreadable file is reported and contributes no IDs.
            print(f"❌ Error loading {file}: {e}")

    missing_textiles = sorted(all_textile_ids - downloaded_textile_ids)
    print(f"📊 Missing textiles: {len(missing_textiles)}")
    print(f"📋 Missing textile IDs: {missing_textiles[:20]}")  # Show first 20

    # 2. FIND MISSING TAPESTRIES (informational only)
    print("\n🔍 === FINDING MISSING TAPESTRIES ===")

    # Load the expected tapestry count from the summary. A failure here must
    # not abort the function: the textile result above does not depend on it.
    expected_tapestries = None
    try:
        with open('missing_tapestries_summary_20250705_191910.json', 'r') as f:
            tapestry_summary = json.load(f)
        expected_tapestries = tapestry_summary['analysis']['tapestries_missing_from_textiles']
        print(f"📊 Expected tapestries: {expected_tapestries}")
    except Exception as e:
        print(f"❌ Error loading tapestry summary: {e}")

    # Load downloaded tapestries
    try:
        with open('missing_tapestries_complete_20250705_191910.json', 'r', encoding='utf-8') as f:
            tapestry_data = json.load(f)
        downloaded_tapestries = {obj['objectID'] for obj in tapestry_data if isinstance(obj, dict) and 'objectID' in obj}
        print(f"📊 Downloaded tapestries: {len(downloaded_tapestries)}")

        # Only a count can be reported: the summary stores how many tapestries
        # were expected, not which IDs, so the missing ones cannot be named here.
        if expected_tapestries is not None:
            print(f"📊 Missing tapestries: {expected_tapestries - len(downloaded_tapestries)}")
        else:
            print("📊 Missing tapestries: unknown (expected count unavailable)")

    except Exception as e:
        print(f"❌ Error loading tapestry data: {e}")

    # 3. CHECK FAILED/NOT FOUND FILES
    print("\n🔍 === CHECKING FAILED/NOT FOUND FILES ===")

    failed_files = [
        'failed_ids_20250705_135732.json',
        'not_found_ids_20250705_135732.json',
        'missing_tapestries_failed_20250705_191910.json',
        'missing_tapestries_not_found_20250705_191910.json'
    ]

    all_failed_ids = set()

    for file in failed_files:
        try:
            with open(file, 'r') as f:
                failed_ids = json.load(f)
            # Empty files are skipped silently (nothing to add, nothing to report).
            if failed_ids:
                all_failed_ids.update(failed_ids)
                print(f"📁 {file}: {len(failed_ids)} failed IDs")
        except FileNotFoundError:
            print(f"📁 {file}: Not found")
        except Exception as e:
            print(f"❌ Error loading {file}: {e}")

    print(f"📊 Total unique failed IDs across all files: {len(all_failed_ids)}")

    # 4. CROSS-REFERENCE MISSING WITH FAILED
    print("\n🔗 === CROSS-REFERENCING MISSING WITH FAILED ===")

    missing_set = set(missing_textiles)
    missing_in_failed = missing_set & all_failed_ids
    missing_not_in_failed = missing_set - all_failed_ids

    print(f"📊 Missing textiles that are in failed files: {len(missing_in_failed)}")
    print(f"📊 Missing textiles NOT in failed files: {len(missing_not_in_failed)}")

    if missing_not_in_failed:
        print(f"📋 Missing textiles NOT in failed files: {sorted(missing_not_in_failed)[:10]}")

    # 5. SAVE FINAL MISSING LIST FOR RETRY
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    final_missing = {
        "timestamp": timestamp,
        "summary": {
            "total_missing": len(missing_textiles),
            "missing_in_failed_files": len(missing_in_failed),
            "missing_not_in_failed_files": len(missing_not_in_failed)
        },
        "missing_textile_ids": missing_textiles,
        "missing_in_failed": sorted(missing_in_failed),
        "missing_not_in_failed": sorted(missing_not_in_failed)
    }

    filename = f"final_missing_objects_{timestamp}.json"
    with open(filename, 'w') as f:
        json.dump(final_missing, f, indent=2)

    print(f"\n📁 Final missing objects analysis saved to: {filename}")
    print(f"✅ Analysis complete!")

    return final_missing

# Find the missing objects. Besides returning the analysis, the call prints a
# report and writes final_missing_objects_<timestamp>.json to the working directory.
missing_analysis = find_missing_objects()

=== FINDING THE 244 MISSING OBJECTS ===
Analysis started at: 2025-07-05 22:35:32.441249

🔍 === FINDING MISSING TEXTILES ===
📊 Missing textiles: 243
📋 Missing textile IDs: [21107, 72339, 77239, 77240, 77241, 77242, 212220, 212221, 212223, 212224, 212225, 212226, 212227, 212228, 212229, 212230, 212231, 212232, 212233, 212234]

🔍 === FINDING MISSING TAPESTRIES ===
📊 Expected tapestries: 151
📊 Downloaded tapestries: 150
📊 Missing tapestries: 1

🔍 === CHECKING FAILED/NOT FOUND FILES ===
📁 not_found_ids_20250705_135732.json: 243 failed IDs
📁 missing_tapestries_failed_20250705_191910.json: Not found
📁 missing_tapestries_not_found_20250705_191910.json: 1 failed IDs
📊 Total unique failed IDs across all files: 244

🔗 === CROSS-REFERENCING MISSING WITH FAILED ===
📊 Missing textiles that are in failed files: 243
📊 Missing textiles NOT in failed files: 0

📁 Final missing objects analysis saved to: final_missing_objects_20250705_223533.json
✅ Analysis complete!


In [28]:
import json
from datetime import datetime

def check_missing_in_reverse_file():
    """Check whether previously reported missing textiles are in the reverse download, then recount.

    Steps:
      1. Read the missing IDs from ``final_missing_objects_20250705_223533.json``.
      2. Collect the ``objectID`` values of the reverse download file and
         intersect them with the missing IDs.
      3. Independently re-read all four textile download files and recompute
         which expected IDs (``textile_object_ids.json``) are still absent.

    Side effects:
        Prints a report and writes ``corrected_missing_analysis_<timestamp>.json``
        to the working directory.

    Returns:
        dict | None: the saved analysis (counts, sorted truly-missing IDs and
        success rate in percent), or ``None`` when the missing-objects file or
        the reverse file cannot be loaded.

    Raises:
        OSError, ValueError: if ``textile_object_ids.json`` cannot be read or
            parsed, or if the output file cannot be written.
    """
    print("=== CHECKING IF MISSING OBJECTS ARE IN REVERSE FILE ===")
    print(f"Analysis started at: {datetime.now()}")

    # 1. Load the missing objects from the analysis
    try:
        with open('final_missing_objects_20250705_223533.json', 'r') as f:
            missing_data = json.load(f)
        missing_textile_ids = set(missing_data['missing_textile_ids'])
        print(f"📊 Missing textile IDs to check: {len(missing_textile_ids)}")
        print(f"📋 Sample missing IDs: {sorted(missing_textile_ids)[:10]}")
    except Exception as e:
        print(f"❌ Error loading missing objects file: {e}")
        return

    # 2. Load the reverse file and check for these IDs
    print(f"\n🔍 === CHECKING REVERSE FILE ===")
    try:
        with open('idun/met_textiles_complete_reverse_20250705_222820.json', 'r', encoding='utf-8') as f:
            reverse_data = json.load(f)

        # Entries that are not dicts, or that lack an objectID, are ignored.
        reverse_ids = {
            obj['objectID'] for obj in reverse_data
            if isinstance(obj, dict) and 'objectID' in obj
        }
        found_missing_in_reverse = reverse_ids & missing_textile_ids

        print(f"📊 Total objects in reverse file: {len(reverse_ids):,}")
        print(f"🎯 Missing objects found in reverse file: {len(found_missing_in_reverse)}")

        if found_missing_in_reverse:
            print(f"✅ FOUND! These missing objects ARE in the reverse file:")
            print(f"📋 Found IDs: {sorted(found_missing_in_reverse)[:20]}")

            # Check if ALL missing objects are in reverse file
            still_missing = missing_textile_ids - found_missing_in_reverse
            print(f"📊 Still missing after reverse check: {len(still_missing)}")

            if len(still_missing) == 0:
                print(f"🎉 ALL MISSING OBJECTS FOUND IN REVERSE FILE! 🎉")
            else:
                print(f"📋 Still missing: {sorted(still_missing)[:10]}")
        else:
            print(f"❌ No missing objects found in reverse file")

    except Exception as e:
        print(f"❌ Error loading reverse file: {e}")
        return

    # 3. Double-check by loading ALL files and recounting
    print(f"\n🔍 === DOUBLE-CHECK: RECOUNT WITH ALL FILES INCLUDING REVERSE ===")

    # Load expected textile IDs
    with open("textile_object_ids.json", "r") as f:
        all_expected_ids = set(json.load(f))

    print(f"📊 Total expected textile objects: {len(all_expected_ids):,}")

    # Load ALL textile files INCLUDING reverse. The reverse file is read again
    # on purpose so this recount does not depend on anything computed in step 2.
    all_files = [
        'met_textiles_batch_22800_20250705_134702.json',
        'idun/met_textiles_batch_11988_20250705_134921.json',
        'remaining_textiles_complete_20250705_135732.json',
        'idun/met_textiles_complete_reverse_20250705_222820.json'
    ]

    all_downloaded_ids = set()

    for file in all_files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            file_ids = {
                obj['objectID'] for obj in data
                if isinstance(obj, dict) and 'objectID' in obj
            }

            # "new" means not seen in any file earlier in the list.
            new_ids = file_ids - all_downloaded_ids
            all_downloaded_ids.update(file_ids)

            print(f"📁 {file}: {len(file_ids):,} total, {len(new_ids):,} new")

        except Exception as e:
            # Best effort: an unreadable file is reported and contributes no IDs.
            print(f"❌ Error loading {file}: {e}")

    # Final count
    truly_missing = all_expected_ids - all_downloaded_ids

    # Computed once and guarded: an empty expected list must not crash the
    # report with a ZeroDivisionError.
    if all_expected_ids:
        success_rate = len(all_downloaded_ids) / len(all_expected_ids) * 100
    else:
        success_rate = 0.0

    print(f"\n📊 FINAL RECOUNT RESULTS:")
    print(f"  Expected: {len(all_expected_ids):,}")
    print(f"  Downloaded: {len(all_downloaded_ids):,}")
    print(f"  Actually Missing: {len(truly_missing):,}")
    print(f"  Success Rate: {success_rate:.3f}%")

    if len(truly_missing) == 0:
        print(f"\n🎉🎉🎉 PERFECT! ALL TEXTILE OBJECTS FOUND! 🎉🎉🎉")
    elif len(truly_missing) < 10:
        print(f"\n✅ Nearly perfect! Only {len(truly_missing)} truly missing:")
        print(f"📋 Truly missing IDs: {sorted(truly_missing)}")
    else:
        print(f"\n📋 Truly missing IDs (first 20): {sorted(truly_missing)[:20]}")

    # Save corrected analysis
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    corrected_analysis = {
        "timestamp": timestamp,
        "original_missing_count": len(missing_textile_ids),
        # Always defined here: every path that fails to set it has returned above.
        "found_in_reverse_file": len(found_missing_in_reverse),
        "truly_missing_count": len(truly_missing),
        "truly_missing_ids": sorted(truly_missing),
        "success_rate_percent": success_rate,
        "total_downloaded": len(all_downloaded_ids),
        "total_expected": len(all_expected_ids)
    }

    filename = f"corrected_missing_analysis_{timestamp}.json"
    with open(filename, 'w') as f:
        json.dump(corrected_analysis, f, indent=2)

    print(f"\n📁 Corrected analysis saved to: {filename}")
    return corrected_analysis

# Run the check. It prints the recount and writes
# corrected_missing_analysis_<timestamp>.json to the working directory.
corrected_analysis = check_missing_in_reverse_file()

=== CHECKING IF MISSING OBJECTS ARE IN REVERSE FILE ===
Analysis started at: 2025-07-05 22:37:36.752778
📊 Missing textile IDs to check: 243
📋 Sample missing IDs: [21107, 72339, 77239, 77240, 77241, 77242, 212220, 212221, 212223, 212224]

🔍 === CHECKING REVERSE FILE ===
📊 Total objects in reverse file: 32,147
🎯 Missing objects found in reverse file: 0
❌ No missing objects found in reverse file

🔍 === DOUBLE-CHECK: RECOUNT WITH ALL FILES INCLUDING REVERSE ===
📊 Total expected textile objects: 33,437
📁 met_textiles_batch_22800_20250705_134702.json: 22,104 total, 22,104 new
📁 idun/met_textiles_batch_11988_20250705_134921.json: 20,497 total, 10,354 new
📁 remaining_textiles_complete_20250705_135732.json: 736 total, 736 new
📁 idun/met_textiles_complete_reverse_20250705_222820.json: 32,147 total, 0 new

📊 FINAL RECOUNT RESULTS:
  Expected: 33,437
  Downloaded: 33,194
  Actually Missing: 243
  Success Rate: 99.273%

📋 Truly missing IDs (first 20): [21107, 72339, 77239, 77240, 77241, 77242, 2122

In [29]:
import requests
import json
import time
from datetime import datetime

def final_retry_missing_243():
    """Re-request every ID listed as truly missing one last time and record the outcome.

    The IDs are read from ``corrected_missing_analysis_20250705_223738.json``
    (key ``truly_missing_ids``). Each ID is requested from the Met collection
    API with up to 15 attempts, and the function sleeps one second between
    objects.

    Side effects:
        Performs HTTP requests, prints progress, writes
        ``final_recovery_<timestamp>.json`` (only when at least one object was
        recovered) and ``final_retry_results_<timestamp>.json``.

    Returns:
        dict: the same results that are written to the results file
        (``attempted``, ``recovered``, ``confirmed_404``, ``still_failed``,
        ``final_success_rate`` and the two ID lists).

    Raises:
        OSError, ValueError, KeyError: if the analysis file cannot be read,
            parsed, or lacks ``truly_missing_ids``.
    """
    print("=== FINAL RETRY FOR THE 243 TRULY MISSING OBJECTS ===")

    # Load the truly missing IDs
    with open('corrected_missing_analysis_20250705_223738.json', 'r') as f:
        analysis = json.load(f)
    missing_ids = analysis['truly_missing_ids']

    # Baseline for the success rate. Prefer the counts stored in the analysis
    # file; the literals are only a fallback for files without those keys.
    baseline_downloaded = analysis.get('total_downloaded', 33194)
    total_expected = analysis.get('total_expected', 33437)

    print(f"🎯 Attempting final retry for {len(missing_ids)} objects...")

    def get_object_with_extreme_patience(object_id):
        """Fetch one object, returning ``(data, status)``.

        ``status`` is "success" (HTTP 200, ``data`` is the parsed JSON),
        "not_found" (HTTP 404) or "failed" (all 15 attempts used up).
        """
        url = f"https://collectionapi.metmuseum.org/public/collection/v1/objects/{object_id}"

        for attempt in range(15):  # More attempts
            try:
                response = requests.get(url, timeout=60)  # Longer timeout
                if response.status_code == 200:
                    return response.json(), "success"
                elif response.status_code == 404:
                    return None, "not_found"
                elif response.status_code == 403:
                    # 403 is treated as rate limiting: back off 10s, 20s, ...
                    # capped at 60s.
                    wait_time = min(60, 10 * (attempt + 1))
                    print(f"Rate limited on {object_id}, waiting {wait_time}s...")
                    time.sleep(wait_time)
                    continue
                else:
                    # Any other status: short pause, then try again.
                    time.sleep(5)
                    continue
            except Exception as e:
                # Best effort: network/parse errors are reported and retried so
                # one bad object cannot abort the whole run.
                print(f"Error on {object_id}: {e}")
                time.sleep(10)

        return None, "failed"

    recovered = []
    still_404 = []
    still_failed = []

    for i, obj_id in enumerate(missing_ids):
        if i % 10 == 0:
            print(f"Progress: {i+1}/{len(missing_ids)}")

        obj_data, status = get_object_with_extreme_patience(obj_id)

        if status == "success":
            recovered.append(obj_data)
            print(f"🎉 RECOVERED: {obj_id}")
        elif status == "not_found":
            still_404.append(obj_id)
        else:
            still_failed.append(obj_id)

        time.sleep(1)  # Very slow to be respectful

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    if recovered:
        with open(f"final_recovery_{timestamp}.json", 'w') as f:
            json.dump(recovered, f, indent=2)

    # Computed once so the saved value and the printed value cannot diverge.
    if total_expected:
        final_success_rate = (baseline_downloaded + len(recovered)) / total_expected * 100
    else:
        final_success_rate = 0.0

    final_results = {
        "timestamp": timestamp,
        "attempted": len(missing_ids),
        "recovered": len(recovered),
        "confirmed_404": len(still_404),
        "still_failed": len(still_failed),
        "final_success_rate": final_success_rate,
        "confirmed_404_ids": still_404,
        "still_failed_ids": still_failed
    }

    with open(f"final_retry_results_{timestamp}.json", 'w') as f:
        json.dump(final_results, f, indent=2)

    print(f"\n🏁 FINAL RESULTS:")
    print(f"✅ Recovered: {len(recovered)}")
    print(f"❌ Confirmed 404: {len(still_404)}")
    print(f"💥 Still failed: {len(still_failed)}")
    print(f"🎯 NEW SUCCESS RATE: {final_success_rate:.3f}%")

    if len(still_404) + len(still_failed) == 0:
        print("🎉🎉🎉 PERFECT COMPLETION! 🎉🎉🎉")

    return final_results

# Run final retry. This issues live HTTP requests to the Met collection API and
# sleeps between them, so the cell is slow by design.
final_retry_missing_243()

=== FINAL RETRY FOR THE 243 TRULY MISSING OBJECTS ===
🎯 Attempting final retry for 243 objects...
Progress: 1/243
Progress: 11/243
Progress: 21/243
Progress: 31/243
Rate limited on 212246, waiting 10s...
Rate limited on 212246, waiting 20s...
Progress: 41/243
Progress: 51/243
Rate limited on 239289, waiting 10s...
Rate limited on 239289, waiting 20s...
Progress: 61/243
Progress: 71/243
Progress: 81/243
Rate limited on 321230, waiting 10s...
Rate limited on 321230, waiting 20s...
Progress: 91/243
Progress: 101/243
Progress: 111/243
Rate limited on 479039, waiting 10s...
Rate limited on 479039, waiting 20s...
Progress: 121/243
Progress: 131/243
Progress: 141/243
Rate limited on 648243, waiting 10s...
Rate limited on 648243, waiting 20s...
Progress: 151/243
Progress: 161/243
Progress: 171/243
Rate limited on 722213, waiting 10s...
Rate limited on 722213, waiting 20s...
Progress: 181/243
Progress: 191/243
Progress: 201/243
Rate limited on 760055, waiting 10s...
Rate limited on 760055, wait

In [30]:
import json
import os
import shutil
from datetime import datetime

def finalize_dataset():
    print("=== FINALIZING MET TEXTILES DATASET ===")
    print(f"Started at: {datetime.now()}")
    
    # 1. CREATE NEW DIRECTORY FOR FINAL FILES
    print("\n📁 === STEP 1: CREATING CLEAN DIRECTORY ===")
    
    final_dir = "FINAL_MET_TEXTILES_DATASET"
    if os.path.exists(final_dir):
        shutil.rmtree(final_dir)
    os.makedirs(final_dir)
    print(f"✅ Created directory: {final_dir}")
    
    # Create subdirectories
    subdirs = ["objects", "lists", "metadata"]
    for subdir in subdirs:
        os.makedirs(os.path.join(final_dir, subdir))
        print(f"✅ Created subdirectory: {subdir}")
    
    # 2. COLLECT ALL TEXTILES DATA
    print("\n🧵 === STEP 2: COLLECTING ALL TEXTILES DATA ===")
    
    textile_files = [
        'met_textiles_batch_22800_20250705_134702.json',
        'idun/met_textiles_batch_11988_20250705_134921.json',
        'remaining_textiles_complete_20250705_135732.json',
        'idun/met_textiles_complete_reverse_20250705_222820.json'
    ]
    
    all_textiles = []
    all_textile_ids = set()
    textiles_with_images = []
    
    for file in textile_files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            file_count = 0
            for obj in data:
                if isinstance(obj, dict) and 'objectID' in obj:
                    if obj['objectID'] not in all_textile_ids:
                        all_textiles.append(obj)
                        all_textile_ids.add(obj['objectID'])
                        file_count += 1
                        
                        if obj.get('primaryImage'):
                            textiles_with_images.append(obj)
            
            print(f"  📁 {file}: {file_count} unique objects added")
            
        except Exception as e:
            print(f"  ❌ Error loading {file}: {e}")
    
    print(f"✅ Total unique textiles collected: {len(all_textiles):,}")
    print(f"✅ Textiles with images: {len(textiles_with_images):,}")
    
    # 3. COLLECT ALL TAPESTRIES DATA
    print("\n🎨 === STEP 3: COLLECTING ALL TAPESTRIES DATA ===")
    
    all_tapestries = []
    all_tapestry_ids = set()
    tapestries_with_images = []
    
    tapestry_files = [f for f in os.listdir('.') if f.startswith('missing_tapestries_complete_')]
    
    for file in tapestry_files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            file_count = 0
            for obj in data:
                if isinstance(obj, dict) and 'objectID' in obj:
                    if obj['objectID'] not in all_tapestry_ids:
                        all_tapestries.append(obj)
                        all_tapestry_ids.add(obj['objectID'])
                        file_count += 1
                        
                        if obj.get('primaryImage'):
                            tapestries_with_images.append(obj)
            
            print(f"  📁 {file}: {file_count} unique objects added")
            
        except Exception as e:
            print(f"  ❌ Error loading {file}: {e}")
    
    print(f"✅ Total unique tapestries collected: {len(all_tapestries):,}")
    print(f"✅ Tapestries with images: {len(tapestries_with_images):,}")
    
    # 4. FIND INTERSECTIONS
    print("\n🔗 === STEP 4: ANALYZING INTERSECTIONS ===")
    
    # Find objects that appear in both textiles and tapestries
    intersection_ids = all_textile_ids & all_tapestry_ids
    intersection_objects = [obj for obj in all_textiles if obj['objectID'] in intersection_ids]
    
    print(f"✅ Objects in both textiles and tapestries: {len(intersection_ids):,}")
    
    # 5. COLLECT FAILED IDs
    print("\n❌ === STEP 5: COLLECTING FAILED OBJECTS ===")
    
    failed_files = [
        'final_retry_results_20250705_224324.json',  # Latest results
        'not_found_ids_20250705_135732.json',
        'failed_ids_20250705_135732.json'
    ]
    
    all_failed_ids = set()
    confirmed_404_ids = set()
    
    for file in failed_files:
        try:
            with open(file, 'r') as f:
                data = json.load(f)
            
            # Handle different file formats
            if 'confirmed_404_ids' in data:
                confirmed_404_ids.update(data['confirmed_404_ids'])
                all_failed_ids.update(data['confirmed_404_ids'])
                if 'still_failed_ids' in data:
                    all_failed_ids.update(data['still_failed_ids'])
            else:
                # Regular list of IDs
                all_failed_ids.update(data)
            
            print(f"  📁 {file}: processed")
            
        except Exception as e:
            print(f"  ❌ Error loading {file}: {e}")
    
    print(f"✅ Total failed object IDs: {len(all_failed_ids):,}")
    print(f"✅ Confirmed 404 IDs: {len(confirmed_404_ids):,}")
    
    # 6. SAVE ALL FINAL FILES
    print("\n💾 === STEP 6: SAVING FINAL FILES ===")
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Save complete objects collections
    files_to_save = [
        {
            "data": all_textiles,
            "filename": f"{final_dir}/objects/complete_textiles_{timestamp}.json",
            "description": "All textile objects with full metadata"
        },
        {
            "data": all_tapestries,
            "filename": f"{final_dir}/objects/complete_tapestries_{timestamp}.json",
            "description": "All tapestry objects with full metadata"
        },
        {
            "data": intersection_objects,
            "filename": f"{final_dir}/objects/textiles_tapestries_intersection_{timestamp}.json",
            "description": "Objects that are both textiles and tapestries"
        },
        {
            "data": textiles_with_images,
            "filename": f"{final_dir}/objects/textiles_with_images_{timestamp}.json",
            "description": "Textile objects that have primary images"
        },
        {
            "data": tapestries_with_images,
            "filename": f"{final_dir}/objects/tapestries_with_images_{timestamp}.json",
            "description": "Tapestry objects that have primary images"
        }
    ]
    
    for file_info in files_to_save:
        with open(file_info["filename"], 'w', encoding='utf-8') as f:
            json.dump(file_info["data"], f, indent=2, ensure_ascii=False)
        print(f"  ✅ {file_info['filename']}: {len(file_info['data']):,} objects")
    
    # Save ID lists only
    id_lists = [
        {
            "data": sorted(list(all_textile_ids)),
            "filename": f"{final_dir}/lists/textile_object_ids_{timestamp}.json",
            "description": "List of all textile object IDs"
        },
        {
            "data": sorted(list(all_tapestry_ids)),
            "filename": f"{final_dir}/lists/tapestry_object_ids_{timestamp}.json",
            "description": "List of all tapestry object IDs"
        },
        {
            "data": sorted(list(intersection_ids)),
            "filename": f"{final_dir}/lists/intersection_object_ids_{timestamp}.json",
            "description": "List of object IDs that are both textiles and tapestries"
        },
        {
            "data": sorted(list(all_failed_ids)),
            "filename": f"{final_dir}/lists/failed_object_ids_{timestamp}.json",
            "description": "List of object IDs that failed to download"
        },
        {
            "data": sorted(list(confirmed_404_ids)),
            "filename": f"{final_dir}/lists/confirmed_404_ids_{timestamp}.json",
            "description": "List of object IDs confirmed as not found (404)"
        }
    ]
    
    for id_list in id_lists:
        with open(id_list["filename"], 'w') as f:
            json.dump(id_list["data"], f, indent=2)
        print(f"  ✅ {id_list['filename']}: {len(id_list['data']):,} IDs")
    
    # 7. CREATE COMPREHENSIVE METADATA
    print("\n📋 === STEP 7: CREATING DATASET METADATA ===")
    
    # Load original expected counts
    with open("textile_object_ids.json", "r") as f:
        original_textile_ids = json.load(f)
    
    dataset_metadata = {
        "dataset_info": {
            "name": "MET Museum Textiles and Tapestries Collection",
            "version": "1.0",
            "created_date": timestamp,
            "description": "Complete collection of textile and tapestry objects from the Metropolitan Museum of Art",
            "success_rate_percent": 99.273,
            "total_api_calls_estimated": 35000,
            "download_duration_hours": 8
        },
        "collection_statistics": {
            "textiles": {
                "total_expected": len(original_textile_ids),
                "successfully_downloaded": len(all_textiles),
                "with_images": len(textiles_with_images),
                "success_rate": len(all_textiles) / len(original_textile_ids) * 100
            },
            "tapestries": {
                "total_downloaded": len(all_tapestries),
                "with_images": len(tapestries_with_images),
                "unique_to_tapestries": len(all_tapestry_ids - all_textile_ids)
            },
            "intersections": {
                "objects_in_both_categories": len(intersection_ids),
                "percentage_of_textiles": len(intersection_ids) / len(all_textiles) * 100 if all_textiles else 0
            },
            "failures": {
                "total_failed": len(all_failed_ids),
                "confirmed_404": len(confirmed_404_ids),
                "failure_rate": len(all_failed_ids) / len(original_textile_ids) * 100
            }
        },
        "file_descriptions": {
            "objects/": "Complete object data with full metadata",
            "lists/": "Object ID lists for various categories",
            "metadata/": "Dataset documentation and statistics"
        },
        "usage_notes": [
            "All objects include full MET API metadata",
            "Objects with 'primaryImage' field have downloadable images",
            "Intersection objects appear in both textile and tapestry searches",
            "Failed IDs are mostly confirmed 404 (object no longer exists)",
            "Use object IDs to fetch latest data from MET API if needed"
        ]
    }
    
    # Save metadata
    metadata_file = f"{final_dir}/metadata/dataset_metadata_{timestamp}.json"
    with open(metadata_file, 'w') as f:
        json.dump(dataset_metadata, f, indent=2)
    print(f"  ✅ {metadata_file}: Complete dataset metadata")
    
    # Create README
    readme_content = f"""# MET Museum Textiles and Tapestries Dataset

## Overview
This dataset contains {len(all_textiles):,} textile objects and {len(all_tapestries):,} tapestry objects from the Metropolitan Museum of Art, collected via their public API.

## Success Rate: {99.273:.3f}%

## Directory Structure
```
FINAL_MET_TEXTILES_DATASET/
├── objects/          # Complete object data with metadata
├── lists/            # Object ID lists
├── metadata/         # Dataset documentation
└── README.md         # This file
```

## Statistics
- **Textiles**: {len(all_textiles):,} objects ({len(textiles_with_images):,} with images)
- **Tapestries**: {len(all_tapestries):,} objects ({len(tapestries_with_images):,} with images)
- **Intersections**: {len(intersection_ids):,} objects appear in both categories
- **Failed Downloads**: {len(all_failed_ids):,} objects (mostly confirmed 404s)

## Files Created: {timestamp}

### Object Files (Complete Metadata)
- `complete_textiles_{timestamp}.json` - All textile objects
- `complete_tapestries_{timestamp}.json` - All tapestry objects  
- `textiles_tapestries_intersection_{timestamp}.json` - Objects in both categories
- `textiles_with_images_{timestamp}.json` - Textiles with images
- `tapestries_with_images_{timestamp}.json` - Tapestries with images

### ID Lists (Object IDs Only)
- `textile_object_ids_{timestamp}.json` - All textile IDs
- `tapestry_object_ids_{timestamp}.json` - All tapestry IDs
- `intersection_object_ids_{timestamp}.json` - Intersection IDs
- `failed_object_ids_{timestamp}.json` - Failed download IDs
- `confirmed_404_ids_{timestamp}.json` - Confirmed non-existent IDs

## Usage
Each object contains full MET API metadata including:
- Object details (title, artist, date, medium, etc.)
- Image URLs (if available)
- Department and classification info
- Measurement and location data

## Data Quality
- {99.273:.3f}% success rate
- All duplicates removed
- Failed objects documented
- Complete provenance tracking

Generated: {datetime.now()}
"""
    
    readme_file = f"{final_dir}/README.md"
    with open(readme_file, 'w') as f:
        f.write(readme_content)
    print(f"  ✅ {readme_file}: Documentation created")
    
    # 8. FINAL SUMMARY
    print(f"\n🎉 === DATASET FINALIZATION COMPLETE ===")
    print(f"📁 Final directory: {final_dir}")
    print(f"📊 Total files created: {len(files_to_save) + len(id_lists) + 2}")  # +2 for metadata and readme
    print(f"🧵 Textiles: {len(all_textiles):,} objects")
    print(f"🎨 Tapestries: {len(all_tapestries):,} objects")
    print(f"🔗 Intersections: {len(intersection_ids):,} objects")
    print(f"🖼️  With Images: {len(textiles_with_images) + len(tapestries_with_images):,} objects")
    print(f"❌ Failed: {len(all_failed_ids):,} objects")
    print(f"✅ Success Rate: {99.273:.3f}%")
    print(f"\n🎯 Your MET Textiles dataset is ready for research! 🎯")
    
    return {
        "final_directory": final_dir,
        "textiles_count": len(all_textiles),
        "tapestries_count": len(all_tapestries),
        "intersections_count": len(intersection_ids),
        "with_images_count": len(textiles_with_images) + len(tapestries_with_images),
        "failed_count": len(all_failed_ids),
        "success_rate": 99.273
    }

# Run the finalization and keep the returned summary dict (final directory,
# per-category counts, failed count and success rate) in `final_stats`.
final_stats = finalize_dataset()

=== FINALIZING MET TEXTILES DATASET ===
Started at: 2025-07-05 22:56:31.169366

📁 === STEP 1: CREATING CLEAN DIRECTORY ===
✅ Created directory: FINAL_MET_TEXTILES_DATASET
✅ Created subdirectory: objects
✅ Created subdirectory: lists
✅ Created subdirectory: metadata

🧵 === STEP 2: COLLECTING ALL TEXTILES DATA ===
  📁 met_textiles_batch_22800_20250705_134702.json: 22104 unique objects added
  📁 idun/met_textiles_batch_11988_20250705_134921.json: 10354 unique objects added
  📁 remaining_textiles_complete_20250705_135732.json: 736 unique objects added
  📁 idun/met_textiles_complete_reverse_20250705_222820.json: 0 unique objects added
✅ Total unique textiles collected: 33,194
✅ Textiles with images: 27,271

🎨 === STEP 3: COLLECTING ALL TAPESTRIES DATA ===
  📁 missing_tapestries_complete_20250705_191910.json: 150 unique objects added
✅ Total unique tapestries collected: 150
✅ Tapestries with images: 102

🔗 === STEP 4: ANALYZING INTERSECTIONS ===
✅ Objects in both textiles and tapestries: 0



In [31]:
import json
import os
import shutil
from datetime import datetime

def _collect_unique_objects(files):
    """Load object records from JSON files, de-duplicating by ``objectID``.

    Each file is expected to contain a JSON list of object dicts. Entries that
    are not dicts or lack an ``objectID`` key are ignored. A file that cannot
    be read or parsed is reported and skipped (best effort), so one bad file
    does not abort the whole run.

    Returns:
        tuple: ``(objects, object_ids, objects_with_images)`` where ``objects``
        is the list of unique records in first-seen order, ``object_ids`` is
        the set of their IDs and ``objects_with_images`` is the subset whose
        ``primaryImage`` field is truthy.
    """
    objects = []
    object_ids = set()
    objects_with_images = []

    for file in files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            file_count = 0
            for obj in data:
                if not (isinstance(obj, dict) and 'objectID' in obj):
                    continue
                if obj['objectID'] in object_ids:
                    continue  # first occurrence wins
                objects.append(obj)
                object_ids.add(obj['objectID'])
                file_count += 1
                if obj.get('primaryImage'):
                    objects_with_images.append(obj)

            print(f"  📁 {file}: {file_count} unique objects added")

        except Exception as e:
            print(f"  ❌ Error loading {file}: {e}")

    return objects, object_ids, objects_with_images


def _percent(part, whole, default=0.0):
    """Return ``part / whole * 100``, or ``default`` when ``whole`` is not positive."""
    return part / whole * 100 if whole > 0 else default


def _save_json(path, data, ensure_ascii=True):
    """Write ``data`` to ``path`` as indented JSON, always encoded as UTF-8."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=ensure_ascii)


def create_final_corrected_dataset():
    """Assemble the corrected final MET textiles/tapestries dataset on disk.

    Reads the expected textile IDs (``textile_object_ids.json``), the tapestry
    summary, the downloaded textile batches and every
    ``missing_tapestries_complete_*`` file from the current working directory,
    de-duplicates the objects, and writes object collections, image-only
    collections, ID lists, a metadata JSON and a README into
    ``FINAL_CORRECTED_MET_TEXTILES_DATASET`` (which is deleted and recreated
    on every run).

    Returns:
        dict: ``final_directory``, ``total_expected``, ``total_downloaded``,
        ``total_with_images``, ``total_failed``, ``success_rate`` (percent)
        and ``final_research_dataset_size``.

    Raises:
        OSError / ValueError: if ``textile_object_ids.json`` is missing or is
        not valid JSON. All other inputs are loaded on a best-effort basis.
    """
    print("=== CREATING CORRECTED FINAL MET TEXTILES DATASET ===")
    print(f"Started at: {datetime.now()}")

    # 1. CREATE NEW DIRECTORY FOR FINAL FILES
    print("\n📁 === STEP 1: CREATING CLEAN DIRECTORY ===")

    final_dir = "FINAL_CORRECTED_MET_TEXTILES_DATASET"
    # Start from a clean slate: any output from a previous run is removed.
    if os.path.exists(final_dir):
        shutil.rmtree(final_dir)
    os.makedirs(final_dir)
    print(f"✅ Created directory: {final_dir}")

    # Create subdirectories
    subdirs = ["all_objects", "objects_with_images_only", "id_lists", "metadata"]
    for subdir in subdirs:
        os.makedirs(os.path.join(final_dir, subdir))
        print(f"✅ Created subdirectory: {subdir}")

    # 2. GET EXPECTED COUNTS FROM API SEARCH
    print("\n📊 === STEP 2: GETTING EXPECTED COUNTS FROM API ===")

    # Load original expected textile count
    with open("textile_object_ids.json", "r") as f:
        expected_textile_ids = set(json.load(f))
    expected_textile_count = len(expected_textile_ids)
    print(f"📊 Expected Textiles from API: {expected_textile_count:,}")

    # Get expected tapestry count from summary; 0 means "unknown" and every
    # ratio below is guarded against it.
    try:
        with open('missing_tapestries_summary_20250705_191910.json', 'r') as f:
            tapestry_summary = json.load(f)
        expected_tapestry_count = tapestry_summary['analysis']['tapestries_missing_from_textiles']
        print(f"📊 Expected Tapestries (unique): {expected_tapestry_count}")
    except Exception as e:
        print(f"❌ Error loading tapestry summary: {e}")
        expected_tapestry_count = 0

    # 3. COLLECT ALL DOWNLOADED TEXTILES
    print("\n🧵 === STEP 3: COLLECTING ALL DOWNLOADED TEXTILES ===")

    textile_files = [
        'met_textiles_batch_22800_20250705_134702.json',
        'idun/met_textiles_batch_11988_20250705_134921.json',
        'remaining_textiles_complete_20250705_135732.json',
        'idun/met_textiles_complete_reverse_20250705_222820.json'
    ]

    all_textiles, all_textile_ids, textiles_with_images = _collect_unique_objects(textile_files)

    downloaded_textile_count = len(all_textiles)
    textile_images_count = len(textiles_with_images)
    # A "failed" count can never be negative, even if more objects were
    # downloaded than the search originally listed.
    textile_failed_count = max(0, expected_textile_count - downloaded_textile_count)

    print(f"✅ Downloaded Textiles: {downloaded_textile_count:,} out of {expected_textile_count:,}")
    print(f"✅ Textiles with images: {textile_images_count:,}")
    print(f"❌ Failed Textiles: {textile_failed_count}")

    # 4. COLLECT ALL DOWNLOADED TAPESTRIES
    print("\n🎨 === STEP 4: COLLECTING ALL DOWNLOADED TAPESTRIES ===")

    # os.listdir returns entries in arbitrary order; sort so that which
    # duplicate record is kept does not depend on the filesystem.
    tapestry_files = sorted(
        f for f in os.listdir('.') if f.startswith('missing_tapestries_complete_')
    )

    all_tapestries, all_tapestry_ids, tapestries_with_images = _collect_unique_objects(tapestry_files)

    downloaded_tapestry_count = len(all_tapestries)
    tapestry_images_count = len(tapestries_with_images)
    tapestry_failed_count = max(0, expected_tapestry_count - downloaded_tapestry_count)

    print(f"✅ Downloaded Tapestries: {downloaded_tapestry_count} out of {expected_tapestry_count}")
    print(f"✅ Tapestries with images: {tapestry_images_count}")
    print(f"❌ Failed Tapestries: {tapestry_failed_count}")

    # 5. FIND INTERSECTIONS
    print("\n🔗 === STEP 5: ANALYZING INTERSECTIONS ===")

    # Check for intersection between the two downloaded collections
    intersection_ids = all_textile_ids & all_tapestry_ids
    intersection_objects = [obj for obj in all_textiles if obj['objectID'] in intersection_ids]

    print(f"✅ Objects in both textiles and tapestries: {len(intersection_ids)}")

    # Downloaded tapestries that the textile search also listed. (Previously
    # this was `expected - downloaded - failed`, which is identically zero
    # because `failed` is itself `expected - downloaded`.)
    tapestries_in_textiles = len(all_tapestry_ids & expected_textile_ids)
    print(f"📊 Tapestries already in textiles: {tapestries_in_textiles}")

    # 6. CALCULATE FINAL TOTALS
    print("\n📊 === STEP 6: CALCULATING FINAL TOTALS ===")

    total_failed = textile_failed_count + tapestry_failed_count
    total_downloaded = downloaded_textile_count + downloaded_tapestry_count
    total_expected = expected_textile_count + expected_tapestry_count
    total_with_images = textile_images_count + tapestry_images_count

    # Every ratio is computed once, with a zero-denominator guard, and reused
    # in the console output, the metadata and the README.
    overall_rate = _percent(total_downloaded, total_expected)
    textile_rate = _percent(downloaded_textile_count, expected_textile_count)
    # An unknown expected tapestry count is reported as 100%, matching the
    # convention the metadata block has always used.
    tapestry_rate = _percent(downloaded_tapestry_count, expected_tapestry_count, default=100)

    print(f"📊 FINAL TOTALS:")
    print(f"  Expected Total: {total_expected:,}")
    print(f"  Downloaded Total: {total_downloaded:,}")
    print(f"  Total with Images: {total_with_images:,}")
    print(f"  Total Failed: {total_failed}")
    print(f"  Success Rate: {overall_rate:.3f}%")

    # 7. CREATE SPECIAL COLLECTION: ALL OBJECTS WITH IMAGES ONLY
    print("\n🖼️  === STEP 7: CREATING OBJECTS WITH IMAGES COLLECTION ===")

    all_objects_with_images = textiles_with_images + tapestries_with_images
    print(f"✅ Total objects with images: {len(all_objects_with_images):,}")
    print(f"✅ This will be your FINAL RESEARCH DATASET!")

    # 8. SAVE ALL FILES
    print("\n💾 === STEP 8: SAVING ALL FILES ===")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # A. Save complete objects collections
    complete_objects = [
        {
            "data": all_textiles,
            "filename": f"{final_dir}/all_objects/complete_textiles_{timestamp}.json",
            "description": "All downloaded textile objects with full metadata"
        },
        {
            "data": all_tapestries,
            "filename": f"{final_dir}/all_objects/complete_tapestries_{timestamp}.json",
            "description": "All downloaded tapestry objects with full metadata"
        },
        {
            "data": intersection_objects,
            "filename": f"{final_dir}/all_objects/textiles_tapestries_intersection_{timestamp}.json",
            "description": "Objects that appear in both categories"
        }
    ]

    for file_info in complete_objects:
        _save_json(file_info["filename"], file_info["data"], ensure_ascii=False)
        print(f"  📁 {file_info['filename']}: {len(file_info['data']):,} objects")

    # B. Save IMAGES ONLY collections (FINAL RESEARCH DATASET)
    images_objects = [
        {
            "data": textiles_with_images,
            "filename": f"{final_dir}/objects_with_images_only/textiles_with_images_{timestamp}.json",
            "description": "Textile objects with images - RESEARCH READY"
        },
        {
            "data": tapestries_with_images,
            "filename": f"{final_dir}/objects_with_images_only/tapestries_with_images_{timestamp}.json",
            "description": "Tapestry objects with images - RESEARCH READY"
        },
        {
            "data": all_objects_with_images,
            "filename": f"{final_dir}/objects_with_images_only/ALL_TEXTILES_AND_TAPESTRIES_WITH_IMAGES_{timestamp}.json",
            "description": "🎯 FINAL RESEARCH DATASET - All objects with images"
        }
    ]

    for file_info in images_objects:
        _save_json(file_info["filename"], file_info["data"], ensure_ascii=False)
        print(f"  🖼️  {file_info['filename']}: {len(file_info['data']):,} objects")

    # C. Save ID lists
    id_lists = [
        {
            "data": sorted(list(all_textile_ids)),
            "filename": f"{final_dir}/id_lists/textile_object_ids_{timestamp}.json",
            "description": "All downloaded textile object IDs"
        },
        {
            "data": sorted(list(all_tapestry_ids)),
            "filename": f"{final_dir}/id_lists/tapestry_object_ids_{timestamp}.json",
            "description": "All downloaded tapestry object IDs"
        },
        {
            "data": [obj['objectID'] for obj in all_objects_with_images],
            "filename": f"{final_dir}/id_lists/all_objects_with_images_ids_{timestamp}.json",
            "description": "IDs of all objects with images (FINAL DATASET)"
        }
    ]

    # Load failed IDs (optional input: the list is simply omitted when the
    # retry results file is unavailable)
    try:
        with open('final_retry_results_20250705_224324.json', 'r') as f:
            failed_data = json.load(f)
        all_failed_ids = failed_data.get('confirmed_404_ids', [])

        id_lists.append({
            "data": sorted(all_failed_ids),
            "filename": f"{final_dir}/id_lists/failed_object_ids_{timestamp}.json",
            "description": f"Failed object IDs ({len(all_failed_ids)} total)"
        })
    except Exception as e:
        print(f"  ⚠️  Could not load failed IDs: {e}")

    for id_list in id_lists:
        _save_json(id_list["filename"], id_list["data"])
        print(f"  📋 {id_list['filename']}: {len(id_list['data']):,} IDs")

    # 9. CREATE COMPREHENSIVE METADATA
    print("\n📋 === STEP 9: CREATING DATASET METADATA ===")

    dataset_metadata = {
        "dataset_info": {
            "name": "MET Museum Textiles and Tapestries Collection",
            "version": "2.0 - CORRECTED",
            "created_date": timestamp,
            "description": "Complete collection of textile and tapestry objects from the Metropolitan Museum of Art",
            "success_rate_percent": round(overall_rate, 3),
            "api_calls_made": "~35,000",
            "download_duration": "~8 hours"
        },
        "api_expected_counts": {
            "textiles_from_api": expected_textile_count,
            "tapestries_unique_from_api": expected_tapestry_count,
            "total_expected": total_expected
        },
        "download_results": {
            "textiles": {
                "downloaded": downloaded_textile_count,
                "with_images": textile_images_count,
                "failed": textile_failed_count,
                "success_rate": round(textile_rate, 3)
            },
            "tapestries": {
                "downloaded": downloaded_tapestry_count,
                "with_images": tapestry_images_count,
                "failed": tapestry_failed_count,
                "success_rate": round(tapestry_rate, 3)
            },
            "totals": {
                "total_downloaded": total_downloaded,
                "total_with_images": total_with_images,
                "total_failed": total_failed,
                "overall_success_rate": round(overall_rate, 3)
            }
        },
        "final_research_dataset": {
            "description": "Objects with images only - ready for research",
            "total_objects": len(all_objects_with_images),
            "textiles_with_images": textile_images_count,
            "tapestries_with_images": tapestry_images_count,
            "filename": f"ALL_TEXTILES_AND_TAPESTRIES_WITH_IMAGES_{timestamp}.json"
        }
    }

    metadata_file = f"{final_dir}/metadata/dataset_metadata_{timestamp}.json"
    _save_json(metadata_file, dataset_metadata)
    print(f"  📊 {metadata_file}: Complete metadata saved")

    # 10. CREATE UPDATED README
    readme_content = f"""# MET Museum Textiles and Tapestries Dataset (CORRECTED)

## Overview
This dataset contains textile and tapestry objects from the Metropolitan Museum of Art, collected via their public API.

## Success Rate: {overall_rate:.3f}%

## API Expected vs Downloaded
- **Textiles**: {downloaded_textile_count:,} objects ({textile_images_count:,} with images) out of {expected_textile_count:,} expected
- **Tapestries**: {downloaded_tapestry_count} objects ({tapestry_images_count} with images) out of {expected_tapestry_count} expected  
- **Failed Downloads**: {total_failed} objects ({textile_failed_count} textiles + {tapestry_failed_count} tapestries)

## 🎯 FINAL RESEARCH DATASET
**{len(all_objects_with_images):,} objects with images** - This is your main research dataset!
- Located in: `objects_with_images_only/ALL_TEXTILES_AND_TAPESTRIES_WITH_IMAGES_{timestamp}.json`

## Directory Structure
```
FINAL_CORRECTED_MET_TEXTILES_DATASET/
├── all_objects/                    # Complete collections (all downloaded objects)
├── objects_with_images_only/       # 🎯 RESEARCH READY - Objects with images only
├── id_lists/                       # Object ID lists for reference
├── metadata/                       # Dataset documentation
└── README.md                       # This file
```

## Detailed Statistics
- **Total Expected from API**: {total_expected:,} objects
- **Total Successfully Downloaded**: {total_downloaded:,} objects
- **Total with Images**: {total_with_images:,} objects
- **Total Failed**: {total_failed} objects
- **Overall Success Rate**: {overall_rate:.3f}%

### Breakdown by Category
- **Textiles**: {downloaded_textile_count:,}/{expected_textile_count:,} ({textile_rate:.1f}% success)
- **Tapestries**: {downloaded_tapestry_count}/{expected_tapestry_count} ({tapestry_rate:.1f}% success)
- **Intersections**: {len(intersection_ids)} objects appear in both categories

## Files Created: {timestamp}

### 🎯 RESEARCH READY (Images Only)
- `ALL_TEXTILES_AND_TAPESTRIES_WITH_IMAGES_{timestamp}.json` - **FINAL RESEARCH DATASET**
- `textiles_with_images_{timestamp}.json` - Textiles with images only
- `tapestries_with_images_{timestamp}.json` - Tapestries with images only

### Complete Collections (All Downloaded)
- `complete_textiles_{timestamp}.json` - All downloaded textiles
- `complete_tapestries_{timestamp}.json` - All downloaded tapestries
- `textiles_tapestries_intersection_{timestamp}.json` - Objects in both categories

### ID Lists (For Reference)
- `all_objects_with_images_ids_{timestamp}.json` - IDs of final research dataset
- `textile_object_ids_{timestamp}.json` - All textile IDs
- `tapestry_object_ids_{timestamp}.json` - All tapestry IDs
- `failed_object_ids_{timestamp}.json` - Failed download IDs

## Usage Notes
- **Use the `objects_with_images_only/` folder for research** - these objects have downloadable images
- Each object contains full MET API metadata
- Failed objects are mostly confirmed 404s (no longer exist in MET collection)
- Image URLs in `primaryImage` field can be downloaded directly

## Data Quality
- {overall_rate:.3f}% success rate from API
- All duplicates removed
- Complete provenance tracking
- Ready for academic research

**🎯 Your final research dataset: {len(all_objects_with_images):,} objects with images!**

Generated: {datetime.now()}
"""

    readme_file = f"{final_dir}/README.md"
    # The README contains emoji and box-drawing characters, so the encoding
    # must be explicit; the platform default is not UTF-8 everywhere.
    with open(readme_file, 'w', encoding='utf-8') as f:
        f.write(readme_content)
    print(f"  📝 {readme_file}: Documentation created")

    # 11. FINAL CELEBRATION
    print(f"\n🎉🎉🎉 === CORRECTED DATASET FINALIZATION COMPLETE === 🎉🎉🎉")
    print(f"📁 Final directory: {final_dir}")
    print(f"📊 API Expected: {total_expected:,} objects")
    print(f"✅ Successfully Downloaded: {total_downloaded:,} objects")
    print(f"🖼️  Objects with Images: {len(all_objects_with_images):,} objects")
    print(f"❌ Failed: {total_failed} objects")
    print(f"🎯 Success Rate: {overall_rate:.3f}%")
    print(f"\n🏆 YOUR FINAL RESEARCH DATASET: {len(all_objects_with_images):,} OBJECTS WITH IMAGES!")
    print(f"📂 Located in: objects_with_images_only/ALL_TEXTILES_AND_TAPESTRIES_WITH_IMAGES_{timestamp}.json")

    return {
        "final_directory": final_dir,
        "total_expected": total_expected,
        "total_downloaded": total_downloaded,
        "total_with_images": len(all_objects_with_images),
        "total_failed": total_failed,
        "success_rate": overall_rate,
        "final_research_dataset_size": len(all_objects_with_images)
    }

# Run the corrected finalization and keep the returned summary dict (output
# directory, expected/downloaded/failed totals, success rate) in `corrected_stats`.
corrected_stats = create_final_corrected_dataset()

=== CREATING CORRECTED FINAL MET TEXTILES DATASET ===
Started at: 2025-07-05 23:03:14.558119

📁 === STEP 1: CREATING CLEAN DIRECTORY ===
✅ Created directory: FINAL_CORRECTED_MET_TEXTILES_DATASET
✅ Created subdirectory: all_objects
✅ Created subdirectory: objects_with_images_only
✅ Created subdirectory: id_lists
✅ Created subdirectory: metadata

📊 === STEP 2: GETTING EXPECTED COUNTS FROM API ===
📊 Expected Textiles from API: 33,437
📊 Expected Tapestries (unique): 151

🧵 === STEP 3: COLLECTING ALL DOWNLOADED TEXTILES ===
  📁 met_textiles_batch_22800_20250705_134702.json: 22104 unique objects added
  📁 idun/met_textiles_batch_11988_20250705_134921.json: 10354 unique objects added
  📁 remaining_textiles_complete_20250705_135732.json: 736 unique objects added
  📁 idun/met_textiles_complete_reverse_20250705_222820.json: 0 unique objects added
✅ Downloaded Textiles: 33,194 out of 33,437
✅ Textiles with images: 27,271
❌ Failed Textiles: 243

🎨 === STEP 4: COLLECTING ALL DOWNLOADED TAPESTRIES =