In [1]:
import pandas as pd
import numpy as np

# Load the CSV file
df = pd.read_csv('/home/user1/Desktop/HAMZA/THESIS/MET/openaccess/MetObjects.csv')

# Basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nColumn Names:")
print(df.columns.tolist())

print("\nFirst few rows:")
print(df.head())

print("\nData types:")
print(df.dtypes)

print("\nBasic statistics:")
print(df.describe(include='all'))

print("\nMissing values:")
print(df.isnull().sum())

print("\nMemory usage:")
print(df.info(memory_usage='deep'))

  df = pd.read_csv('/home/user1/Desktop/HAMZA/THESIS/MET/openaccess/MetObjects.csv')


Dataset Shape: (484956, 54)

Column Names:
['Object Number', 'Is Highlight', 'Is Timeline Work', 'Is Public Domain', 'Object ID', 'Gallery Number', 'Department', 'AccessionYear', 'Object Name', 'Title', 'Culture', 'Period', 'Dynasty', 'Reign', 'Portfolio', 'Constituent ID', 'Artist Role', 'Artist Prefix', 'Artist Display Name', 'Artist Display Bio', 'Artist Suffix', 'Artist Alpha Sort', 'Artist Nationality', 'Artist Begin Date', 'Artist End Date', 'Artist Gender', 'Artist ULAN URL', 'Artist Wikidata URL', 'Object Date', 'Object Begin Date', 'Object End Date', 'Medium', 'Dimensions', 'Credit Line', 'Geography Type', 'City', 'State', 'County', 'Country', 'Region', 'Subregion', 'Locale', 'Locus', 'Excavation', 'River', 'Classification', 'Rights and Reproduction', 'Link Resource', 'Object Wikidata URL', 'Metadata Date', 'Repository', 'Tags', 'Tags AAT URL', 'Tags Wikidata URL']

First few rows:
  Object Number  Is Highlight  Is Timeline Work  Is Public Domain  Object ID  \
0    1979.486.1 

In [56]:
import requests

# Search all objects with medium = "Textiles"
search_url = "https://collectionapi.metmuseum.org/public/collection/v1/search"
params = {
    "medium": "Textiles",
    "q": "*",
    "hasImages": "true",
}

try:
    response = requests.get(search_url, params=params)
    if response.status_code == 403:
        raise Exception("Rate limit exceeded (HTTP 403). Please try again later.")
    response.raise_for_status()
    data = response.json()
    print(data["total"], "objects found")
    print(data["objectIDs"][:10])  # Show first 10 object IDs
except Exception as e:
    print("Error:", e)


Error: Rate limit exceeded (HTTP 403). Please try again later.


In [None]:
[459227, 310073, 485416, 467639, 320054, 467638, 467641, 68307, 68310, 68308, 450724, 39742, 450722, 316978, 228995, 321286, 316824, 42115, 788195, 452315, 447966, 452188, 228990, 219681, 468624, 453317, 447967, 788174, 227160, 308024, 308022, 315700, 53438, 57363, 452105, 448587, 446642, 853462, 64896, 229030, 448232, 446636, 849033, 315772, 739062, 451894, 238683, 444073, 316415, 460623, 460624, 226426, 222238, 229761, 197740, 197738, 197742, 197744, 307944, 233024, 233023, 229930, 24007, 197739, 197743, 197741, 452365, 468062, 219396, 448213, 446655, 231000, 314528, 450735, 219509, 444327, 447118, 320804, 454609, 446825, 461385, 460792, 460584, 460585, 316963]


In [57]:
# Simple and efficient comparison
print("=== DIRECT ID COMPARISON ===")

# Get API object IDs
api_ids = set(api_data['objectIDs'])
print(f"API returned: {len(api_ids)} object IDs")

# Get CSV textile object IDs (all objects with 'Textiles' in Classification)
csv_textile_ids = set(df[df['Classification'].str.contains('Textiles', case=False, na=False)]['Object ID'])
print(f"CSV textiles: {len(csv_textile_ids)} object IDs")

# Direct comparison
matches = api_ids & csv_textile_ids
api_only = api_ids - csv_textile_ids
csv_only = csv_textile_ids - api_ids

print(f"\n📊 RESULTS:")
print(f"Perfect matches: {len(matches)} ({len(matches)/len(api_ids)*100:.1f}% of API results)")
print(f"API has but CSV doesn't: {len(api_only)}")
print(f"CSV has but API doesn't: {len(csv_only)}")

print(f"\n✅ Match accuracy: {len(matches)}/{len(api_ids)} = {len(matches)/len(api_ids)*100:.2f}%")

# Quick peek at the mismatches
if len(api_only) > 0:
    print(f"\nSample API-only IDs: {list(api_only)[:5]}")
if len(csv_only) > 0:
    print(f"Sample CSV-only IDs: {list(csv_only)[:5]}")

=== DIRECT ID COMPARISON ===
API returned: 33437 object IDs
CSV textiles: 32138 object IDs

📊 RESULTS:
Perfect matches: 32084 (96.0% of API results)
API has but CSV doesn't: 1353
CSV has but API doesn't: 54

✅ Match accuracy: 32084/33437 = 95.95%

Sample API-only IDs: [24582, 892940, 57363, 57364, 32794]
Sample CSV-only IDs: [319235, 319110, 313233, 319121, 319124]


In [58]:
import requests
import os

# Create directory for images
os.makedirs('csv_textile_images', exist_ok=True)

# Check what image-related columns exist in CSV
print("=== CHECKING IMAGE COLUMNS IN CSV ===")
image_cols = [col for col in df.columns if any(word in col.lower() for word in ['image', 'link', 'resource', 'url', 'photo'])]
print(f"Image-related columns: {image_cols}")

# Get 10 textile objects from CSV that are public domain and likely to have images
textile_objects = df[df['Classification'].str.contains('Textiles', case=False, na=False)]
public_textiles = textile_objects[textile_objects['Is Public Domain'] == True]

print(f"\nFound {len(public_textiles)} public domain textile objects")

# Select 10 objects to download
sample_objects = public_textiles.head(10)

def download_met_image(object_id, title, classification):
    """Download image using Met's API"""
    try:
        # Get object from API
        api_url = f"https://collectionapi.metmuseum.org/public/collection/v1/objects/{object_id}"
        response = requests.get(api_url)
        
        if response.status_code == 200:
            obj_data = response.json()
            
            if 'primaryImage' in obj_data and obj_data['primaryImage']:
                img_url = obj_data['primaryImage']
                
                # Download image
                img_response = requests.get(img_url)
                if img_response.status_code == 200:
                    # Clean filename
                    safe_title = "".join(c for c in str(title)[:30] if c.isalnum() or c in (' ', '-', '_')).rstrip()
                    filename = f"csv_textile_images/{object_id}_{safe_title}.jpg"
                    
                    with open(filename, 'wb') as f:
                        f.write(img_response.content)
                    
                    print(f"✅ Downloaded: {object_id} - {title}")
                    print(f"   Classification: {classification}")
                    print(f"   File: {filename}\n")
                    return True
                else:
                    print(f"❌ Failed to download image for {object_id}")
                    return False
            else:
                print(f"⚠️  No image available for {object_id}")
                return False
        else:
            print(f"❌ API error for {object_id}: {response.status_code}")
            return False
            
    except Exception as e:
        print(f"❌ Error downloading {object_id}: {e}")
        return False

print("\n=== DOWNLOADING 10 TEXTILE IMAGES FROM CSV ===")

success_count = 0
for idx, row in sample_objects.iterrows():
    object_id = row['Object ID']
    title = row['Title']
    classification = row['Classification']
    
    if download_met_image(object_id, title, classification):
        success_count += 1
    
    # Small delay to be nice to the API
    import time
    time.sleep(0.5)

print(f"\n🎉 Successfully downloaded {success_count}/10 images")
print(f"📁 Images saved in: ./csv_textile_images/")

=== CHECKING IMAGE COLUMNS IN CSV ===
Image-related columns: ['Artist ULAN URL', 'Artist Wikidata URL', 'Link Resource', 'Object Wikidata URL', 'Tags AAT URL', 'Tags Wikidata URL']

Found 22374 public domain textile objects

=== DOWNLOADING 10 TEXTILE IMAGES FROM CSV ===
✅ Downloaded: 13737 - Embroidered Picture
   Classification: Textiles
   File: csv_textile_images/13737_Embroidered Picture.jpg

❌ API error for 13740: 403
❌ API error for 14054: 403
❌ API error for 14056: 403
❌ API error for 14081: 403
❌ API error for 14086: 403
❌ API error for 14098: 403
❌ API error for 14101: 403
❌ API error for 14102: 403
❌ API error for 16882: 403

🎉 Successfully downloaded 1/10 images
📁 Images saved in: ./csv_textile_images/


In [61]:
import json
import os

# Load only the latest JSON file
latest_file = '/home/user1/Desktop/HAMZA/THESIS/MET/met_textiles_batch_1050_20250704_204537.json'

print(f"=== CHECKING LATEST FILE ===")
print(f"File: {os.path.basename(latest_file)}")

try:
    with open(latest_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    total_objects = len(data)
    objects_with_primary_images = 0
    objects_with_small_images = 0
    objects_with_additional_images = 0
    
    for obj in data:
        if isinstance(obj, dict):
            if 'primaryImage' in obj and obj['primaryImage']:
                objects_with_primary_images += 1
            if 'primaryImageSmall' in obj and obj['primaryImageSmall']:
                objects_with_small_images += 1
            if 'additionalImages' in obj and obj['additionalImages']:
                objects_with_additional_images += len(obj['additionalImages'])
    
    objects_without_images = total_objects - objects_with_primary_images
    
    print(f"\n📊 LATEST FILE SUMMARY:")
    print(f"Total objects: {total_objects}")
    print(f"Objects with primary images: {objects_with_primary_images}")
    print(f"Objects with small images: {objects_with_small_images}")
    print(f"Total additional images: {objects_with_additional_images}")
    print(f"Objects without images: {objects_without_images}")
    
    if total_objects > 0:
        percentage_with_images = (objects_with_primary_images / total_objects) * 100
        print(f"\n✅ Primary images available: {objects_with_primary_images}/{total_objects} ({percentage_with_images:.1f}%)")
        print(f"🖼️  You can download {objects_with_primary_images} primary images!")
        if objects_with_additional_images > 0:
            print(f"🖼️  Plus {objects_with_additional_images} additional images!")
        
        total_downloadable = objects_with_primary_images + objects_with_additional_images
        print(f"🎯 Total downloadable images: {total_downloadable}")
        
        # Show a sample object structure
        print(f"\n📋 Sample object structure:")
        sample_obj = data[0]
        print(f"Object ID: {sample_obj.get('objectID')}")
        print(f"Title: {sample_obj.get('title')}")
        print(f"Classification: {sample_obj.get('classification')}")
        print(f"Has primary image: {'Yes' if sample_obj.get('primaryImage') else 'No'}")
        print(f"Has small image: {'Yes' if sample_obj.get('primaryImageSmall') else 'No'}")
        
except Exception as e:
    print(f"❌ Failed to load {latest_file}: {e}")

=== CHECKING LATEST FILE ===
File: met_textiles_batch_1050_20250704_204537.json

📊 LATEST FILE SUMMARY:
Total objects: 971
Objects with primary images: 667
Objects with small images: 667
Total additional images: 639
Objects without images: 304

✅ Primary images available: 667/971 (68.7%)
🖼️  You can download 667 primary images!
🖼️  Plus 639 additional images!
🎯 Total downloadable images: 1306

📋 Sample object structure:
Object ID: 229636
Title: Cap crown
Classification: Textiles-Laces
Has primary image: Yes
Has small image: Yes


In [67]:
# Let's compare the API search results with your JSON data
import json

api_ids = [459227, 310073, 485416, 467639, 320054, 467638, 467641, 68307, 68310, 68308, 450724, 39742, 450722, 316978, 228995, 321286, 316824, 42115, 788195, 452315, 447966, 452188, 228990, 219681, 468624, 453317, 447967, 788174, 227160, 308024, 308022, 315700, 53438, 57363, 452105, 448587, 446642, 853462, 64896, 229030, 448232, 446636, 849033, 315772, 739062, 451894, 238683, 444073, 316415, 460623, 460624, 226426, 222238, 229761, 197740, 197738, 197742, 197744, 307944, 233024, 233023, 229930, 24007, 197739, 197743, 197741, 452365, 468062, 219396, 448213, 446655, 231000, 314528, 450735, 219509, 444327, 447118, 320804, 454609, 446825, 461385, 460792, 460584, 460585, 316963]

# Load your JSON data
with open('/home/user1/Desktop/HAMZA/THESIS/MET/met_textiles_batch_1050_20250704_204537.json', 'r') as f:
    json_data = json.load(f)

# Get object IDs from your JSON that have images
json_objects_with_images = []
for obj in json_data:
    if obj.get('primaryImage'):
        json_objects_with_images.append(obj['objectID'])

print(f"JSON file: {len(json_objects_with_images)} objects with images")
print(f"API search: {len(api_ids)} objects with images")

# Check if the API results are a subset of your JSON data
api_ids_set = set(api_ids)  # Fixed: use the list you defined above
json_ids = set(json_objects_with_images)

overlap = api_ids_set & json_ids
print(f"Objects in both: {len(overlap)}")
print(f"API-only: {len(api_ids_set - json_ids)}")
print(f"JSON-only: {len(json_ids - api_ids_set)}")

# Check a few API objects in your JSON
print(f"\nChecking first 5 API objects in your JSON:")
for obj_id in api_ids[:5]:  # Fixed: use the list directly
    json_obj = next((obj for obj in json_data if obj['objectID'] == obj_id), None)
    if json_obj:
        has_image = bool(json_obj.get('primaryImage'))
        print(f"Object {obj_id}: Found in JSON, has image: {has_image}")
    else:
        print(f"Object {obj_id}: NOT found in JSON")

JSON file: 667 objects with images
API search: 85 objects with images
Objects in both: 1
API-only: 84
JSON-only: 666

Checking first 5 API objects in your JSON:
Object 459227: NOT found in JSON
Object 310073: NOT found in JSON
Object 485416: NOT found in JSON
Object 467639: NOT found in JSON
Object 320054: NOT found in JSON


In [68]:
import pandas as pd
import re

print("=== SEARCHING FOR URLs/LINKS IN CSV ===")

# Search for HTTP/HTTPS patterns in all columns
url_pattern = r'https?://[^\s]+'

# Check each column for URLs
columns_with_urls = {}
total_urls_found = 0

for column in df.columns:
    # Convert column to string and search for URLs
    column_data = df[column].astype(str)
    
    # Find rows that contain URLs
    url_mask = column_data.str.contains(url_pattern, case=False, na=False, regex=True)
    urls_in_column = url_mask.sum()
    
    if urls_in_column > 0:
        columns_with_urls[column] = urls_in_column
        total_urls_found += urls_in_column
        
        print(f"\n📄 Column '{column}': {urls_in_column} URLs found")
        
        # Show some examples
        sample_urls = column_data[url_mask].head(3)
        for idx, url_text in enumerate(sample_urls):
            # Extract just the URL part
            urls = re.findall(url_pattern, url_text)
            if urls:
                print(f"   Example {idx+1}: {urls[0]}")

print(f"\n📊 SUMMARY:")
print(f"Total columns with URLs: {len(columns_with_urls)}")
print(f"Total URLs found: {total_urls_found}")

if columns_with_urls:
    print(f"\nColumns with URLs:")
    for col, count in columns_with_urls.items():
        print(f"  {col}: {count} URLs")
        
    # Check specifically for image-related URLs
    print(f"\n🖼️  CHECKING FOR IMAGE URLS:")
    image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.tiff', '.bmp']
    
    for column, count in columns_with_urls.items():
        column_data = df[column].astype(str)
        
        # Check for image URLs
        for ext in image_extensions:
            image_mask = column_data.str.contains(f'{ext}', case=False, na=False)
            image_count = image_mask.sum()
            
            if image_count > 0:
                print(f"   {column}: {image_count} URLs containing '{ext}'")
                # Show an example
                example = column_data[image_mask].iloc[0]
                image_urls = re.findall(url_pattern, example)
                if image_urls:
                    print(f"     Example: {image_urls[0]}")
                break
else:
    print("❌ No URLs found in any CSV columns")

=== SEARCHING FOR URLs/LINKS IN CSV ===

📄 Column 'Artist ULAN URL': 191079 URLs found
   Example 1: http://vocab.getty.edu/page/ulan/500011409
   Example 2: http://vocab.getty.edu/page/ulan/500077295
   Example 3: http://vocab.getty.edu/page/ulan/500095555

📄 Column 'Artist Wikidata URL': 213245 URLs found
   Example 1: https://www.wikidata.org/wiki/Q3806459
   Example 2: https://www.wikidata.org/wiki/Q5109648
   Example 3: https://www.wikidata.org/wiki/Q4881787

📄 Column 'Link Resource': 484956 URLs found
   Example 1: http://www.metmuseum.org/art/collection/search/1
   Example 2: http://www.metmuseum.org/art/collection/search/2
   Example 3: http://www.metmuseum.org/art/collection/search/3

📄 Column 'Object Wikidata URL': 69154 URLs found
   Example 1: https://www.wikidata.org/wiki/Q116250677
   Example 2: https://www.wikidata.org/wiki/Q116373732
   Example 3: https://www.wikidata.org/wiki/Q83545838

📄 Column 'Tags AAT URL': 191978 URLs found
   Example 1: http://vocab.getty.edu/pag