# Food in Art

In [21]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time
import os

## Fetch the painting IDs from the Wikidata SPARQL endpoint

In [22]:

# Function to run the SPARQL query
def run_sparql_query(query):
    """Send ``query`` to the endpoint in ``wikidata_endpoint_url`` and return the JSON-converted result.

    The request carries a custom User-Agent header. If the first attempt
    raises, the error is printed, the function sleeps for 10 seconds and
    tries exactly once more; an exception from that second attempt is not
    caught here and propagates to the caller.
    """
    client = SPARQLWrapper(wikidata_endpoint_url)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    # Identify the client to comply with Wikidata's user-agent policy
    client.addCustomHttpHeader(
        'User-Agent',
        'MyPaintingDataRetriever/1.0 (jipijipijipi@gmail.com)',
    )
    try:
        return client.query().convert()
    except Exception as first_error:
        print(f"An error occurred: {first_error}")
        # Give the endpoint a moment to recover, then make a single second attempt
        time.sleep(10)
        return client.query().convert()

# Function to chunk the list into batches
def chunk_list(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Slices are yielded in order; the last one holds whatever remains and
    may therefore be shorter than ``n``.
    """
    total = len(lst)
    yield from (lst[start:start + n] for start in range(0, total, n))

In [None]:
wikidata_endpoint_url = "https://query.wikidata.org/sparql"
# Skip if the final file already exists
if os.path.exists('data/wikidata_paintings_ids.csv'):
    print("Final file already exists. Skipping data retrieval.")
else:
    # Selects every item whose wdt:P31 is wd:Q3305213, together with its English
    # label and the English label of each of its wdt:P170 values, one page at a time.
    # NOTE(review): the query has no ORDER BY, so SPARQL does not guarantee a stable
    # row order across LIMIT/OFFSET pages; rows may be repeated or missed between
    # batches. Adding ORDER BY may be too slow for the public endpoint — confirm
    # before changing the query.
    wikidata_base_query = """
    SELECT ?item ?title ?author_wikidata ?author_name WHERE {{
    ?item wdt:P31 wd:Q3305213.
    ?item rdfs:label ?title.
    ?item wdt:P170 ?author_wikidata.
    ?author_wikidata rdfs:label ?author_name.
    FILTER(LANG(?title) = "en").
    FILTER(LANG(?author_name) = "en").
    }}
    LIMIT {limit}
    OFFSET {offset}
    """

    # Set batch parameters
    limit = 1000  # Number of records to fetch per batch
    checkpoint_interval = 10  # Save a checkpoint every 10 batches
    max_retries = 5  # Maximum number of retries for failed requests

    # Output columns; these are also the variable names selected by the query
    columns = ['item', 'title', 'author_wikidata', 'author_name']
    data_checkpoint_path = 'data/checkpoints/paintings_checkpoint.csv'
    offset_checkpoint_path = 'data/checkpoints/offset_checkpoint.txt'

    # Create the checkpoints folder (and its parent) if it does not exist
    os.makedirs('data/checkpoints', exist_ok=True)

    # Check if a checkpoint exists to resume from
    if os.path.exists(data_checkpoint_path) and os.path.exists(offset_checkpoint_path):
        all_data = pd.read_csv(data_checkpoint_path)
        with open(offset_checkpoint_path, 'r') as f:
            offset = int(f.read())
        batch_number = offset // limit
        print(f"Resuming from offset {offset}")
    else:
        all_data = pd.DataFrame(columns=columns)
        offset = 0
        batch_number = 0

    # Collect per-batch frames and concatenate only when a file has to be written;
    # concatenating the full frame on every batch is quadratic in the row count.
    frames = [] if all_data.empty else [all_data]
    # Becomes True only when the endpoint returns an empty page, i.e. paging finished
    retrieval_complete = False

    # Loop to fetch data in batches
    while True:
        query = wikidata_base_query.format(limit=limit, offset=offset)
        print(f"Fetching data with OFFSET {offset}")
        retries = 0
        while retries < max_retries:
            try:
                results = run_sparql_query(query)
                break
            except Exception as e:
                print(f"Error: {e}. Retrying ({retries+1}/{max_retries})...")
                retries += 1
                time.sleep(5)
        else:
            # Leave retrieval_complete False so the partial data is NOT
            # published as the final file below.
            print("Max retries exceeded. Exiting.")
            break

        # Process the results
        bindings = results['results']['bindings']
        if not bindings:
            print("No more data returned.")
            retrieval_complete = True
            break

        # Convert the bindings to a DataFrame
        frames.append(pd.DataFrame(
            [{col: b[col]['value'] for col in columns} for b in bindings],
            columns=columns,
        ))

        # Advance to the next page; from here on `offset` is the first row not yet fetched
        batch_number += 1
        offset += limit

        # Save a checkpoint at specified intervals
        if batch_number % checkpoint_interval == 0:
            all_data = pd.concat(frames, ignore_index=True)
            frames = [all_data]
            all_data.to_csv(data_checkpoint_path, index=False)
            with open(offset_checkpoint_path, 'w') as f:
                f.write(str(offset))
            print(f"Checkpoint saved at batch {batch_number}")

        time.sleep(1)  # Be polite and avoid overloading the server

    all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)

    if retrieval_complete:
        # Save the final data to a CSV file
        all_data.to_csv('data/wikidata_paintings_ids.csv', index=False)
        print("Data retrieval complete. Saved to wikidata_paintings_ids.csv")
    else:
        # Writing the final file here would make every later run skip retrieval
        # and keep a truncated dataset. Persist progress as a checkpoint instead
        # so that re-running the cell resumes at the failed offset.
        all_data.to_csv(data_checkpoint_path, index=False)
        with open(offset_checkpoint_path, 'w') as f:
            f.write(str(offset))
        print(f"Data retrieval incomplete. Progress saved at offset {offset}; re-run this cell to resume.")


## Fetch detailed data for every painting in the ID list


In [None]:

# Define the SPARQL endpoint URL
wikidata_endpoint_url = "https://query.wikidata.org/sparql"

# Read the basic painting data
basic_data = pd.read_csv('data/wikidata_paintings_ids.csv')
item_list = basic_data['item'].tolist()

# Set batch parameters
batch_size = 50  # Number of paintings to query at once
max_retries = 5  # Maximum number of retries for failed requests

# Columns of the detailed table; every name except 'item' is an optional
# variable selected by the batch query below.
detail_columns = [
    'item', 'creation_date', 'origin_country', 'display_country',
    'display_location', 'type', 'school', 'time_period',
    'wiki_url', 'image_url', 'depicts',
]
detailed_checkpoint_path = 'data/checkpoints/paintings_detailed_checkpoint.csv'
batch_index_checkpoint_path = 'data/checkpoints/batch_index_checkpoint.txt'

# This cell can run without the ID-retrieval cell having created the folder
os.makedirs('data/checkpoints', exist_ok=True)

# Check if a checkpoint exists to resume from
if os.path.exists(detailed_checkpoint_path) and os.path.exists(batch_index_checkpoint_path):
    detailed_data = pd.read_csv(detailed_checkpoint_path)
    with open(batch_index_checkpoint_path, 'r') as f:
        start_batch = int(f.read())
    print(f"Resuming from batch index {start_batch}")
    append_to_checkpoint = True
else:
    # Give the empty frame its columns so the final merge on 'item' still works
    # when no batch returns data.
    detailed_data = pd.DataFrame(columns=detail_columns)
    start_batch = 0
    # A fresh run must overwrite any stale checkpoint CSV on its first write
    append_to_checkpoint = False

# Per-batch frames are collected here and concatenated once after the loop;
# re-concatenating the whole table on every batch is quadratic.
detail_frames = [] if detailed_data.empty else [detailed_data]
# 0-based indices of batches that were skipped after exhausting retries
failed_batches = []

# Convert item URIs to Q-IDs
item_qids = [uri.split('/')[-1] for uri in item_list]

# Create batches
batches = list(chunk_list(item_qids, batch_size))

# Loop over batches to fetch detailed data
for batch_index, batch_qids in enumerate(batches[start_batch:], start=start_batch):
    print(f"Processing batch {batch_index + 1}/{len(batches)}")
    # The ID file holds one row per (item, author) pair, so an ID can repeat inside
    # a batch; de-duplicate (order preserved) so VALUES does not list it twice.
    qid_list_str = ' '.join(f'wd:{qid}' for qid in dict.fromkeys(batch_qids))

    # Construct the SPARQL query for the batch
    batch_query = f"""
    PREFIX schema: <http://schema.org/>
    SELECT ?item ?creation_date ?origin_country ?display_country ?display_location ?type ?school ?time_period ?wiki_url ?image_url (GROUP_CONCAT(?depicts_label; separator=", ") AS ?depicts) WHERE {{
      VALUES ?item {{ {qid_list_str} }}
      
      OPTIONAL {{ ?item wdt:P571 ?creation_date. }}
      OPTIONAL {{
        ?item wdt:P495 ?origin_country_wd.
        ?origin_country_wd rdfs:label ?origin_country.
        FILTER(LANG(?origin_country) = "en")
      }}
      OPTIONAL {{
        ?item wdt:P276 ?display_location_wd.
        ?display_location_wd rdfs:label ?display_location.
        FILTER(LANG(?display_location) = "en")
        OPTIONAL {{
          ?display_location_wd wdt:P17 ?display_country_wd.
          ?display_country_wd rdfs:label ?display_country.
          FILTER(LANG(?display_country) = "en")
        }}
      }}
      OPTIONAL {{
        ?item wdt:P136 ?type_wd.
        ?type_wd rdfs:label ?type.
        FILTER(LANG(?type) = "en")
      }}
      OPTIONAL {{
        ?item wdt:P135 ?school_wd.
        ?school_wd rdfs:label ?school.
        FILTER(LANG(?school) = "en")
      }}
      OPTIONAL {{
        ?item wdt:P2348 ?time_period_wd.
        ?time_period_wd rdfs:label ?time_period.
        FILTER(LANG(?time_period) = "en")
      }}
      OPTIONAL {{
        ?item wdt:P18 ?image_file.
        BIND(CONCAT("https://commons.wikimedia.org/wiki/Special:FilePath/", ENCODE_FOR_URI(REPLACE(STR(?image_file), "^.*\\\\/(?!.*\\\\/)", ""))) AS ?image_url)
      }}
      OPTIONAL {{
        ?item wdt:P180 ?depicts_wd.
        ?depicts_wd rdfs:label ?depicts_label.
        FILTER(LANG(?depicts_label) = "en")
      }}
      OPTIONAL {{
        ?sitelink schema:about ?item;
                  schema:isPartOf <https://en.wikipedia.org/>;
                  schema:url ?wiki_url.
      }}
    }}
    GROUP BY ?item ?creation_date ?origin_country ?display_country ?display_location ?type ?school ?time_period ?wiki_url ?image_url
    """

    retries = 0
    while retries < max_retries:
        try:
            results = run_sparql_query(batch_query)
            break
        except Exception as e:
            print(f"Error: {e}. Retrying ({retries + 1}/{max_retries})...")
            retries += 1
            time.sleep(5)
    else:
        print("Max retries exceeded for this batch. Skipping.")
        # Remember the gap: the next successful batch moves the checkpoint index
        # past this one, so it would otherwise be lost without any trace.
        failed_batches.append(batch_index)
        continue

    # Process the results
    bindings = results['results']['bindings']
    if not bindings:
        print(f"No data returned for batch {batch_index + 1}.")
        continue

    # Convert the bindings to a DataFrame; every column except 'item' is optional
    # in the query and becomes None when the binding is absent.
    rows = []
    for b in bindings:
        row = {'item': b['item']['value']}
        for col in detail_columns[1:]:
            row[col] = b.get(col, {}).get('value', None)
        rows.append(row)
    df = pd.DataFrame(rows, columns=detail_columns)
    detail_frames.append(df)

    # Save a checkpoint: append only this batch's rows instead of rewriting the
    # whole table on every batch. The header is written once, on the first write
    # of a fresh run.
    df.to_csv(
        detailed_checkpoint_path,
        mode='a' if append_to_checkpoint else 'w',
        header=not append_to_checkpoint,
        index=False,
    )
    append_to_checkpoint = True
    with open(batch_index_checkpoint_path, 'w') as f:
        f.write(str(batch_index + 1))
    print(f"Checkpoint saved at batch {batch_index + 1}")

    time.sleep(1)  # Be polite and avoid overloading the server

if failed_batches:
    print(
        f"Warning: {len(failed_batches)} batch(es) could not be fetched and are missing "
        f"from the detailed data (1-based batch numbers): {[i + 1 for i in failed_batches]}"
    )

# Build the detailed table once. Identical rows can occur when the same item ID
# appears in more than one batch (the ID list is not unique) or when a batch was
# fetched again after an interrupted checkpoint; drop them so they do not
# multiply rows in the merge.
if detail_frames:
    detailed_data = pd.concat(detail_frames, ignore_index=True).drop_duplicates()
else:
    detailed_data = pd.DataFrame(columns=detail_columns)

# Merge basic and detailed data
final_data = pd.merge(basic_data, detailed_data, on='item', how='left')

# Save the final data to a CSV file
final_data.to_csv('data/wikidata_paintings_final.csv', index=False)
print("Second pass complete. Detailed painting data saved to wikidata_paintings_final.csv")