# Food in Art

In [13]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time
import os

from datetime import datetime

## Fetch the IDs from the SPARQL endpoint

In [14]:

# Function to run the SPARQL query
def run_sparql_query(query):
    """Execute ``query`` against the configured endpoint and return the parsed JSON.

    The endpoint is taken from the module-level ``wikidata_endpoint_url``.
    If the first attempt raises, the error is printed, the function sleeps
    for 10 seconds and tries exactly once more; an exception raised by that
    second attempt propagates to the caller.

    Args:
        query: SPARQL query text to send.

    Returns:
        The converted query result, as produced by
        ``SPARQLWrapper.query().convert()`` with the JSON return format.
    """
    client = SPARQLWrapper(wikidata_endpoint_url)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    # Identify this client with a custom User-Agent to comply with Wikidata's policy
    client.addCustomHttpHeader(
        'User-Agent',
        'MyPaintingDataRetriever/1.0 (jipijipijipi@gmail.com)',
    )

    try:
        return client.query().convert()
    except Exception as err:
        print(f"An error occurred: {err}")
        # Back off before the single retry
        time.sleep(10)
        return client.query().convert()

# Function to chunk the list into batches
def chunk_list(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding at most ``n`` items.

    Args:
        lst: A sequence supporting ``len()`` and slicing.
        n: Step between slice starts, and the maximum length of each slice.

    Yields:
        ``lst[start:start + n]`` for ``start`` = 0, n, 2n, ... below ``len(lst)``.
        The last slice may be shorter than ``n``.
    """
    total = len(lst)
    for start in range(0, total, n):
        stop = start + n
        yield lst[start:stop]

In [15]:
# SPARQL endpoint URL; run_sparql_query() reads this name when it builds its SPARQLWrapper client.
wikidata_endpoint_url = "https://query.wikidata.org/sparql"

In [16]:
# Query template: one row per (item, author, location) combination for every item that is
# an instance (P31) of Q3305213; author (P170) and location (P276) are optional.
# Doubled braces become literal braces after str.format() fills in {limit} and {offset}.
# NOTE(review): the query pages with LIMIT/OFFSET but has no ORDER BY, so the endpoint is
# not obliged to return rows in the same order for every request; pages may overlap or
# skip rows. Adding ORDER BY may be too slow for this endpoint - confirm before changing.
wikidata_base_query = """
SELECT ?item ?author_wikidata ?location_wikidata WHERE {{
?item wdt:P31 wd:Q3305213.
OPTIONAL {{ ?item wdt:P170 ?author_wikidata. }}
OPTIONAL {{ ?item wdt:P276 ?location_wikidata. }}
}}
LIMIT {limit}
OFFSET {offset}
"""

# Set batch parameters
limit = 1000  # Number of records to fetch per batch
checkpoint_interval = 10  # Save a checkpoint every 10 batches
max_retries = 5  # Maximum number of retries for failed requests

# Files used to persist progress and the final result
checkpoint_dir = 'data/checkpoints'
checkpoint_csv = 'data/checkpoints/paintings_ids_checkpoint.csv'
checkpoint_offset_file = 'data/checkpoints/offset_paintings_ids_checkpoint.txt'
final_csv = 'data/wikidata_paintings_ids_final_2.csv'
result_columns = ['item', 'author_wikidata', 'location_wikidata']


def _flush_records(frame, records):
    """Return ``frame`` extended with ``records`` (a list of row dicts)."""
    if not records:
        return frame
    new_rows = pd.DataFrame(records, columns=result_columns)
    if frame.empty:
        # Nothing to merge with; also avoids concatenating an empty frame.
        return new_rows
    return pd.concat([frame, new_rows], ignore_index=True)


def _save_checkpoint(frame, next_offset):
    """Persist the rows fetched so far together with the offset to resume from."""
    # Write to temporary files and rename them into place, so an interruption in the
    # middle of a write cannot leave a truncated checkpoint behind.
    tmp_csv = checkpoint_csv + '.tmp'
    frame.to_csv(tmp_csv, index=False)
    os.replace(tmp_csv, checkpoint_csv)

    tmp_offset = checkpoint_offset_file + '.tmp'
    with open(tmp_offset, 'w') as f:
        f.write(str(next_offset))
    os.replace(tmp_offset, checkpoint_offset_file)


# Create checkpoints folder if it does not exist
os.makedirs(checkpoint_dir, exist_ok=True)

# Check if a checkpoint exists to resume from
if os.path.exists(checkpoint_csv) and os.path.exists(checkpoint_offset_file):
    all_data = pd.read_csv(checkpoint_csv)
    with open(checkpoint_offset_file, 'r') as f:
        offset = int(f.read())
    batch_number = offset // limit
    print(f"Resuming from offset {offset}")
else:
    # Start with the expected columns so the final de-duplication on 'item' and the
    # checkpoint CSV stay valid even when no rows are ever fetched.
    all_data = pd.DataFrame(columns=result_columns)
    offset = 0
    batch_number = 0

# Rows fetched since all_data was last extended. They are merged into all_data only at
# checkpoints and at the end, instead of re-copying the whole frame on every batch.
pending_records = []
fetch_complete = False  # True only when the endpoint returned an empty page

# Loop to fetch data in batches
while True:
    query = wikidata_base_query.format(limit=limit, offset=offset)
    print(f"Fetching data with OFFSET {offset}")

    results = None
    for attempt in range(1, max_retries + 1):
        try:
            results = run_sparql_query(query)
            break
        except Exception as e:
            print(f"Error: {e}. Retrying ({attempt}/{max_retries})...")
            time.sleep(5)
    else:
        # Every attempt failed: stop fetching, keep what has been collected so far.
        print("Max retries exceeded. Exiting.")
        break

    # An absent or empty bindings list means there is nothing left to page through
    bindings = (results or {}).get('results', {}).get('bindings')
    if not bindings:
        print("No more data returned.")
        fetch_complete = True
        break

    # Convert the bindings to row dicts; optional variables are absent when unbound
    for b in bindings:
        pending_records.append({
            'item': b['item']['value'],
            'author_wikidata': b['author_wikidata']['value'] if 'author_wikidata' in b else None,
            'location_wikidata': b['location_wikidata']['value'] if 'location_wikidata' in b else None,
        })

    # Update the offset for the next batch
    offset += limit
    batch_number += 1

    # Save a checkpoint at specified intervals
    if batch_number % checkpoint_interval == 0:
        all_data = _flush_records(all_data, pending_records)
        pending_records = []
        _save_checkpoint(all_data, offset)
        print(f"Checkpoint saved at batch {batch_number}")

    time.sleep(1)  # Be polite and avoid overloading the server

# Merge whatever was fetched after the last checkpoint and persist it, so that a re-run
# resumes from here whether the loop ended normally or after repeated failures.
all_data = _flush_records(all_data, pending_records)
pending_records = []
_save_checkpoint(all_data, offset)

# Save the final data to a CSV file
unique_paintings = all_data.drop_duplicates(subset='item', keep='first')
unique_paintings.to_csv(final_csv, index=False)
if fetch_complete:
    print(f"Data retrieval complete. Saved to {final_csv}")
else:
    print(
        f"Data retrieval incomplete (stopped at offset {offset}). "
        f"Partial data saved to {final_csv}; re-run to resume from the checkpoint."
    )


Resuming from offset 820000
Fetching data with OFFSET 820000
Fetching data with OFFSET 821000
Fetching data with OFFSET 822000
Fetching data with OFFSET 823000
Fetching data with OFFSET 824000
Fetching data with OFFSET 825000
Fetching data with OFFSET 826000
Fetching data with OFFSET 827000
Fetching data with OFFSET 828000
Fetching data with OFFSET 829000
Checkpoint saved at batch 830
Fetching data with OFFSET 830000
Fetching data with OFFSET 831000
Fetching data with OFFSET 832000
Fetching data with OFFSET 833000
Fetching data with OFFSET 834000
Fetching data with OFFSET 835000
Fetching data with OFFSET 836000
Fetching data with OFFSET 837000
Fetching data with OFFSET 838000
Fetching data with OFFSET 839000
Checkpoint saved at batch 840
Fetching data with OFFSET 840000
Fetching data with OFFSET 841000
Fetching data with OFFSET 842000
Fetching data with OFFSET 843000
Fetching data with OFFSET 844000
Fetching data with OFFSET 845000
Fetching data with OFFSET 846000
Fetching data with OF