# Food in Art

In [None]:
import os
import requests
import pandas as pd
import urllib.parse
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from PIL import Image
import numpy as np

In [None]:
# Requested thumbnail width; sent to the API as `iiurlwidth` and embedded in
# the output directory name below.
IMG_WIDTH = 512

# MediaWiki API endpoint queried for image info.
API_ENDPOINT = "https://commons.wikimedia.org/w/api.php"

# Directory the downloaded images are written to (one folder per width).
OUTPUT_DIR = f'img/img_{IMG_WIDTH}'

# Text file that receives one filename per successfully downloaded image.
CHECKPOINT_FILE = "data/checkpoints/download_checkpoint.txt"

# Size of the thread pool used for concurrent image downloads.
MAX_WORKERS = 3 

# Number of filenames sent to the API in a single query.
BATCH_SIZE = 30  

# Sent as the User-Agent header on both API queries and image downloads.
USER_AGENT = "IH-final/1.0 (jipijipijipi@gmail.com)"

# Values passed as `timeout=` to requests for the API query and the image
# download respectively.
API_TIMEOUT = 30 
DOWNLOAD_TIMEOUT = 60  

# Maximum number of attempts made for each image download.
MAX_RETRIES = 3

# Pause inserted after each batch before the next API query.
API_DELAY = 0.5  # seconds



In [None]:
def extract_filename(url):
    """Return the file name encoded in the last path segment of *url*.

    The URL path is percent-decoded, its final component is taken, and any
    spaces in that component are replaced with underscores.
    """
    decoded_path = urllib.parse.unquote(urllib.parse.urlparse(url).path)
    _, last_segment = os.path.split(decoded_path)
    return '_'.join(last_segment.split(' '))

def chunked_iterable(iterable, size):
    """Yield successive lists of at most *size* items from *iterable*.

    Every chunk holds exactly *size* items except possibly the last one,
    which holds whatever remains.  Nothing is yielded for an empty iterable.

    Args:
        iterable: Any iterable; it is consumed lazily, one chunk at a time.
        size: Maximum number of items per chunk; must be at least 1.

    Yields:
        Lists of consecutive items, in the original order.

    Raises:
        ValueError: If *size* is smaller than 1.
    """
    if size < 1:
        raise ValueError(f"size must be at least 1, got {size}")

    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []

    # Emit the trailing partial chunk instead of discarding it, so the last
    # few items of the input are not lost.
    if chunk:
        yield chunk



In [None]:


def fetch_thumbnail_urls(filenames):
    """Look up thumbnail URLs for a batch of files with a single API query.

    Args:
        filenames: File names without the ``File:`` namespace prefix.

    Returns:
        A dict keyed by the page titles found in the API response (with the
        leading ``File:`` removed).  Each value is the thumbnail URL, the
        full-size URL when no thumbnail URL is reported, or ``None`` when the
        page has no image info.  An empty dict is returned for empty input
        or when the request or response handling fails.

        NOTE(review): keys come from the response, not from *filenames*, so
        they may be spelled differently from the names passed in (the caller
        compares against a space-separated form) -- confirm against the API.
    """
    # Nothing to ask for: skip the network round-trip entirely.
    if not filenames:
        return {}

    namespace_prefix = 'File:'
    titles = '|'.join(f'{namespace_prefix}{filename}' for filename in filenames)
    params = {
        'action': 'query',
        'titles': titles,
        'prop': 'imageinfo',
        'iiprop': 'url',
        'iiurlwidth': f'{IMG_WIDTH}',
        'format': 'json',
        'formatversion': '2'
    }
    headers = {
        'User-Agent': USER_AGENT
    }

    # Deliberately broad: this is the per-batch best-effort boundary, and a
    # single bad response must not abort the whole download run.
    try:
        response = requests.get(API_ENDPOINT, params=params, headers=headers, timeout=API_TIMEOUT)
        response.raise_for_status()
        data = response.json()
        result = {}
        pages = data.get('query', {}).get('pages', [])
        for page in pages:
            title = page.get('title', '')
            # Strip only the leading namespace prefix; a later "File:" inside
            # the name itself belongs to the file name.
            if title.startswith(namespace_prefix):
                filename = title[len(namespace_prefix):]
            else:
                filename = title
            imageinfo = page.get('imageinfo', [])
            if imageinfo:
                # Prefer 'thumburl' if available, else fall back to the
                # full image 'url'.
                result[filename] = imageinfo[0].get('thumburl') or imageinfo[0].get('url')
            else:
                result[filename] = None
        return result
    except Exception as e:
        print(f"Exception during fetching thumbnail URLs: {e}")
        return {}

def download_image(session, url, filename):
    """Download *url* into ``OUTPUT_DIR/filename``, retrying on failure.

    The payload is first written to a sibling ``.part`` file and then moved
    into place, so an interrupted write never leaves a truncated image under
    the final name (the resume logic treats every file in the output
    directory as already downloaded).

    Args:
        session: Object exposing ``get(url, timeout=...)`` that returns a
            response with ``raise_for_status()`` and ``content``.
        url: Address of the image; a falsy value skips the download.
        filename: Name of the file to create inside ``OUTPUT_DIR``.

    Returns:
        ``True`` when the image was saved, ``False`` when no URL was given
        or every attempt failed.
    """
    if not url:
        print(f"No URL provided for {filename}. Skipping download.")
        return False

    image_path = os.path.join(OUTPUT_DIR, filename)
    temp_path = f"{image_path}.part"

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = session.get(url, timeout=DOWNLOAD_TIMEOUT)
            response.raise_for_status()
            with open(temp_path, 'wb') as f:
                f.write(response.content)
            # Move the finished file into place in one step.
            os.replace(temp_path, image_path)
            return True
        except Exception as e:
            print(f"Error downloading {filename} (Attempt {attempt}/{MAX_RETRIES}): {e}")
            # Only wait when another attempt will actually follow.
            if attempt < MAX_RETRIES:
                time.sleep(1)

    # Do not leave a partial file behind after giving up.
    try:
        os.remove(temp_path)
    except OSError:
        pass

    print(f"Failed to download {filename} after {MAX_RETRIES} attempts.")
    return False

def update_checkpoint(filename):
    """Append *filename* as one line to the checkpoint file.

    The checkpoint's parent directory is created on demand; without it the
    append fails on a fresh checkout and no progress is ever recorded.

    Args:
        filename: Name of the image that was downloaded successfully.
    """
    checkpoint_dir = os.path.dirname(CHECKPOINT_FILE)
    # An empty dirname means the file lives in the working directory.
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    with open(CHECKPOINT_FILE, 'a') as f:
        f.write(f"{filename}\n")

In [None]:
# Load the painting metadata; later cells read its `image_url` column.
df = pd.read_csv('data/wikidata_all_paintings.csv')
# Alternative input kept for switching by hand (presumably to retry only the
# images that are still missing -- TODO confirm).
#df = pd.read_csv('data/missing_image_paths.csv')
# Preview the first rows (`display` is provided by the notebook environment).
display(df.head())


In [None]:
# Keep only rows that have an image URL, derive the file name from that URL,
# and collapse rows that resolve to the same file name.
df = (
    df.dropna(subset=['image_url'])
    .assign(filename=lambda frame: frame['image_url'].apply(extract_filename))
    .drop_duplicates(subset=['filename'])
)

display(df)

In [None]:

# The output directory must exist before it is listed below or written to.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Rows without a source URL cannot be downloaded.
df.dropna(subset=['image_url'], inplace=True)

# Start from the names recorded in the checkpoint file, if there is one.
downloaded = set()
if os.path.exists(CHECKPOINT_FILE):
    with open(CHECKPOINT_FILE, 'r') as checkpoint:
        for entry in checkpoint:
            downloaded.add(entry.strip())

# Anything already present in the output directory also counts as done.
existing_files = set(os.listdir(OUTPUT_DIR))
downloaded |= existing_files





In [None]:


# Derive one file name per row and keep a single row per file.
df['filename'] = df['image_url'].apply(extract_filename)
df = df.drop_duplicates(subset='filename')

filenames = df['filename'].tolist()
# `downloaded` is checked against the space-separated spelling of each name.
# NOTE(review): presumably because the names recorded on disk and in the
# checkpoint come from API titles rather than from the URLs -- confirm.
filenames_to_download = [fn for fn in filenames if fn.replace('_', ' ') not in downloaded]

total_images = len(filenames_to_download)
print(f"Total images to download: {total_images}")

# The session is shared by all downloads and closed once the pool has drained
# (the executor context exits first, the session context last).
with requests.Session() as session, ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    session.headers.update({'User-Agent': USER_AGENT})

    for batch_num, batch in enumerate(chunked_iterable(filenames_to_download, BATCH_SIZE), start=1):
        print(f"Processing batch {batch_num} with {len(batch)} images...")
        thumb_urls = fetch_thumbnail_urls(batch)

        # Keep only the entries for which the API returned a usable URL.
        tasks = []
        for filename, url in thumb_urls.items():
            if url:
                tasks.append((filename, url))
            else:
                print(f"No thumbnail URL found for {filename}. Skipping.")

        if not tasks:
            print(f"No downloadable URLs found in batch {batch_num}. Skipping to next batch.")
            continue

        with tqdm(total=len(tasks), desc=f"Batch {batch_num}", unit="image") as pbar:
            future_to_filename = {
                executor.submit(download_image, session, url, filename): filename
                for filename, url in tasks
            }
            # Checkpoint writes happen here, in the submitting thread, as
            # each download finishes.
            for future in as_completed(future_to_filename):
                filename = future_to_filename[future]
                try:
                    success = future.result()
                    if success:
                        update_checkpoint(filename)
                except Exception as e:
                    print(f"Unexpected error downloading {filename}: {e}")
                finally:
                    pbar.update(1)

        # Pause between batches before issuing the next API query.
        time.sleep(API_DELAY)

print("All downloads completed.")
