# Food in Art

In [23]:
import os
import requests
import pandas as pd
import urllib.parse
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from PIL import Image
import numpy as np

In [24]:
IMG_WIDTH = 512

# Wikimedia Commons API endpoint
API_ENDPOINT = "https://commons.wikimedia.org/w/api.php"

# Directory to save downloaded images
OUTPUT_DIR = f'img/img_{IMG_WIDTH}'

# File to store checkpoint information
CHECKPOINT_FILE = "data/checkpoints/download_checkpoint.txt"

# Number of concurrent threads for downloading
MAX_WORKERS = 3  # Adjust based on your network and Wikimedia's rate limits

# Batch size for API requests
BATCH_SIZE = 30  # Number of filenames per batch API request

# User-Agent header to identify your script (replace with your details)
USER_AGENT = "IH-final/1.0 (jipijipijipi@gmail.com)"

# Timeout settings for HTTP requests
API_TIMEOUT = 30  # seconds
DOWNLOAD_TIMEOUT = 60  # seconds

# Maximum number of retries for failed downloads
MAX_RETRIES = 3

# Delay between API requests to respect rate limits
API_DELAY = 0.5  # seconds

# ---------------------------- Setup ----------------------------


In [25]:
# Function to extract the file name from the URL
def extract_filename(url):
    parsed = urllib.parse.urlparse(url)
    # First decoding: %2520 -> %20
    first_decode = urllib.parse.unquote(parsed.path)
    # Second decoding: %20 -> space
    #second_decode = urllib.parse.unquote(first_decode)
    filename = os.path.basename(first_decode)
    # Replace spaces with underscores as per MediaWiki API requirements
    filename = filename.replace(' ', '_')
    return filename

# Function to split iterable into chunks of size 'size'
def chunked_iterable(iterable, size):
    it = iter(iterable)
    while True:
        try:
            chunk = [next(it) for _ in range(size)]
        except StopIteration:
            chunk = []
        if not chunk:
            break
        yield chunk
        if len(chunk) < size:
            break

# ---------------------------- Helper Functions ----------------------------

def fetch_thumbnail_urls(filenames):
    """
    Fetch thumbnail URLs for a batch of filenames using Wikimedia API.

    :param filenames: List of filenames
    :return: Dictionary mapping filename to thumbnail URL or full URL
    """
    titles = '|'.join([f'File:{filename}' for filename in filenames])
    params = {
        'action': 'query',
        'titles': titles,
        'prop': 'imageinfo',
        'iiprop': 'url',
        'iiurlwidth': f'{IMG_WIDTH}',
        'format': 'json',
        'formatversion': '2'
    }
    headers = {
        'User-Agent': USER_AGENT
    }

    try:
        response = requests.get(API_ENDPOINT, params=params, headers=headers, timeout=API_TIMEOUT)
        response.raise_for_status()
        data = response.json()
        print(data)
        result = {}
        pages = data.get('query', {}).get('pages', [])
        for page in pages:
            title = page.get('title', '')
            filename = title.replace('File:', '')
            imageinfo = page.get('imageinfo', [])
            if imageinfo:
                # Prefer 'thumburl' if available, else fallback to 'url'
                thumb_url = imageinfo[0].get('thumburl')
                if not thumb_url:
                    # If thumburl is not available, use the full image URL
                    thumb_url = imageinfo[0].get('url')
                result[filename] = thumb_url
            else:
                result[filename] = None
        return result
    except Exception as e:
        print(f"Exception during fetching thumbnail URLs: {e}")
        return {}

def download_image(session, url, filename):
    """
    Download an image from the given URL and save it to the output directory.

    :param session: requests.Session object
    :param url: URL of the image to download
    :param filename: Filename to save the image as
    :return: Boolean indicating success or failure
    """
    if not url:
        print(f"No URL provided for {filename}. Skipping download.")
        return False

    image_path = os.path.join(OUTPUT_DIR, filename)

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = session.get(url, timeout=DOWNLOAD_TIMEOUT)
            response.raise_for_status()
            with open(image_path, 'wb') as f:
                f.write(response.content)
            return True
        except Exception as e:
            print(f"Error downloading {filename} (Attempt {attempt}/{MAX_RETRIES}): {e}")
            time.sleep(1)  # Wait before retrying
    print(f"Failed to download {filename} after {MAX_RETRIES} attempts.")
    return False

def update_checkpoint(filename):
    """
    Append the successfully downloaded filename to the checkpoint file.

    :param filename: Filename to add to checkpoint
    """
    with open(CHECKPOINT_FILE, 'a') as f:
        f.write(f"{filename}\n")

In [26]:

#df = pd.read_csv('data/wikidata_all_paintings.csv')
df = pd.read_csv('data/missing_image_paths.csv')
display(df.head())


Unnamed: 0,item,creation_date,display_country,type,school,image_url,depicts,origin_country,time_period,fruit,...,meal,cheese,meat,food,beverage,dairy,vegetable,dessert,food_count,image_path
0,http://www.wikidata.org/entity/Q27064304,1566-01-01T00:00:00Z,France,genre art,,http://commons.wikimedia.org/wiki/Special:File...,"carrot, cherry, woman, fire, man, meat, vegeta...",,,0,...,0,0,1,0,0,1,1,1,5,img/img_512/Intérieur de cuisine - Joachim Beu...
1,http://www.wikidata.org/entity/Q776175,1565-01-01T00:00:00Z,United States of America,genre art,Northern Renaissance,http://commons.wikimedia.org/wiki/Special:File...,"sea, woman, sky, church building, summer, food...",,,0,...,1,0,1,1,1,0,1,0,5,img/img_512/Pieter Bruegel the Elder- The Harv...
2,http://www.wikidata.org/entity/Q12900365,1868-01-01T00:00:00Z,Germany,genre art,,http://commons.wikimedia.org/wiki/Special:File...,"wine, woman, book, child, bread, newspaper, bi...",,,0,...,1,1,0,0,0,0,0,0,5,img/img_512/The Luncheon (SM sg170).png
3,http://www.wikidata.org/entity/Q27974915,1650-01-01T00:00:00Z,Russia,pronk still life,,http://commons.wikimedia.org/wiki/Special:File...,"man, vegetable, house cat, table, Hound, game,...",,,1,...,0,0,0,0,0,1,1,0,4,img/img_512/Paul de Vos and Jacob Jordaens - C...
4,http://www.wikidata.org/entity/Q20532659,1884-01-01T00:00:00Z,Denmark,still life,,http://commons.wikimedia.org/wiki/Special:File...,"beer, cheese, table, chair, butter, pipe, lunch",,,0,...,1,1,0,0,1,0,0,1,4,"img/img_512/OA Hermansen, Et frokostbord, 1884..."


In [27]:
df = df.dropna(subset=['image_url'])
df['filename'] = df['image_url'].apply(extract_filename)
df = df.drop_duplicates(subset=['filename'])

display(df)

Unnamed: 0,item,creation_date,display_country,type,school,image_url,depicts,origin_country,time_period,fruit,...,cheese,meat,food,beverage,dairy,vegetable,dessert,food_count,image_path,filename
0,http://www.wikidata.org/entity/Q27064304,1566-01-01T00:00:00Z,France,genre art,,http://commons.wikimedia.org/wiki/Special:File...,"carrot, cherry, woman, fire, man, meat, vegeta...",,,0,...,0,1,0,0,1,1,1,5,img/img_512/Intérieur de cuisine - Joachim Beu...,Intérieur_de_cuisine_-_Joachim_Beuckelaer_-_Mu...
1,http://www.wikidata.org/entity/Q776175,1565-01-01T00:00:00Z,United States of America,genre art,Northern Renaissance,http://commons.wikimedia.org/wiki/Special:File...,"sea, woman, sky, church building, summer, food...",,,0,...,0,1,1,1,0,1,0,5,img/img_512/Pieter Bruegel the Elder- The Harv...,Pieter_Bruegel_the_Elder-_The_Harvesters_-_Goo...
2,http://www.wikidata.org/entity/Q12900365,1868-01-01T00:00:00Z,Germany,genre art,,http://commons.wikimedia.org/wiki/Special:File...,"wine, woman, book, child, bread, newspaper, bi...",,,0,...,1,0,0,0,0,0,0,5,img/img_512/The Luncheon (SM sg170).png,The_Luncheon_(SM_sg170).png
3,http://www.wikidata.org/entity/Q27974915,1650-01-01T00:00:00Z,Russia,pronk still life,,http://commons.wikimedia.org/wiki/Special:File...,"man, vegetable, house cat, table, Hound, game,...",,,1,...,0,0,0,0,1,1,0,4,img/img_512/Paul de Vos and Jacob Jordaens - C...,Paul_de_Vos_and_Jacob_Jordaens_-_Cook_at_the_T...
4,http://www.wikidata.org/entity/Q20532659,1884-01-01T00:00:00Z,Denmark,still life,,http://commons.wikimedia.org/wiki/Special:File...,"beer, cheese, table, chair, butter, pipe, lunch",,,0,...,1,0,0,1,0,0,1,4,"img/img_512/OA Hermansen, Et frokostbord, 1884...","OA_Hermansen,_Et_frokostbord,_1884,_KMS3132,_S..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,http://www.wikidata.org/entity/Q28031701,1649-01-01T00:00:00Z,,still life,,http://commons.wikimedia.org/wiki/Special:File...,"flower, fruit",,,1,...,0,0,0,0,0,0,0,1,img/img_512/Attributed to Mario Nuzzi (1603-73...,Attributed_to_Mario_Nuzzi_(1603-73)_-_Flowers_...
1388,http://www.wikidata.org/entity/Q3633844,1551-01-01T00:00:00Z,Sweden,inverted still-life,,http://commons.wikimedia.org/wiki/Special:File...,"sausage, Flight into Egypt, butcher shop, mark...",,,0,...,0,0,0,0,0,0,1,1,img/img_512/Pieter Aertsen 005.jpg,Pieter_Aertsen_005.jpg
1389,http://www.wikidata.org/entity/Q55428853,1610-01-01T00:00:00Z,Norway,still life,,http://commons.wikimedia.org/wiki/Special:File...,"vase, fruit",,,1,...,0,0,0,0,0,0,0,1,"img/img_512/Ukjent kunstner, flamsk, Clara Pee...","Ukjent_kunstner,_flamsk,_Clara_Peeters_-_Still..."
1390,http://www.wikidata.org/entity/Q61744340,1642-01-01T00:00:00Z,,still life,,http://commons.wikimedia.org/wiki/Special:File...,"oyster, tablecloth, lemon, berkemeyer, lobster...",,,0,...,0,0,0,0,0,0,0,1,img/img_512/Jan Davidsz. de Heem - Still-Life ...,Jan_Davidsz._de_Heem_-_Still-Life_-_WGA11283.jpg


In [28]:

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load your dataframe
# The dataframe should have a column named 'link' with Wikimedia URLs
df.dropna(subset=['image_url'], inplace=True)

""" image_links = df['image_url'].unique().tolist()
# Filter out images that are already present in the output directory
image_links = [url for url in image_links if extract_filename(url) not in existing_files] """

# Load checkpoint
if os.path.exists(CHECKPOINT_FILE):
    with open(CHECKPOINT_FILE, 'r') as f:
        downloaded = set(line.strip() for line in f)
else:
    downloaded = set()

# Check for existing files and update the checkpoint
existing_files = set(os.listdir(OUTPUT_DIR))
downloaded.update(existing_files)



# ---------------------------- Main Function ----------------------------

df['filename'] = df['image_url'].apply(extract_filename)

# Remove downloaded images from the dataframe
#df = df[~df['filename'].isin(downloaded)]
df = df.drop_duplicates(subset='filename')

filenames = df['filename'].tolist()
filenames_to_download = [fn for fn in filenames if fn.replace('_', ' ') not in downloaded]

total_images = len(filenames_to_download)
print(f"Total images to download: {total_images}")

# Initialize a requests session for HTTP connections
session = requests.Session()
session.headers.update({'User-Agent': USER_AGENT})

# Initialize ThreadPoolExecutor for parallel downloads
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Process in batches
    for batch_num, batch in enumerate(chunked_iterable(filenames_to_download, BATCH_SIZE), start=1):
        print(f"Processing batch {batch_num} with {len(batch)} images...")
        thumb_urls = fetch_thumbnail_urls(batch)

        # Prepare download tasks
        tasks = []
        for filename, url in thumb_urls.items():
            if url:
                tasks.append((filename, url))
            else:
                print(f"No thumbnail URL found for {filename, url}. Skipping.")

        if not tasks:
            print(f"No downloadable URLs found in batch {batch_num}. Skipping to next batch.")
            continue

        # Use tqdm for progress bar
        with tqdm(total=len(tasks), desc=f"Batch {batch_num}", unit="image") as pbar:
            future_to_filename = {
                executor.submit(download_image, session, url, filename): filename
                for filename, url in tasks
            }
            for future in as_completed(future_to_filename):
                filename = future_to_filename[future]
                try:
                    success = future.result()
                    if success:
                        update_checkpoint(filename)
                except Exception as e:
                    print(f"Unexpected error downloading {filename}: {e}")
                finally:
                    pbar.update(1)

        # Optional: Delay between batches to respect API rate limits
        time.sleep(API_DELAY)

print("All downloads completed.")   


Total images to download: 26
All downloads completed.
