# Food in Art

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time
import os

## Helper functions for querying the SPARQL endpoint

In [None]:

# Run a SPARQL query against Wikidata, retrying once on failure
def run_sparql_query(query):
    """Send ``query`` to the Wikidata SPARQL endpoint and return the result.

    The response is requested in JSON format and returned after
    SPARQLWrapper's ``convert()``. If the first attempt raises, the error is
    printed, the function waits 10 seconds and tries exactly once more; an
    exception from that second attempt propagates to the caller.
    """
    client = SPARQLWrapper("https://query.wikidata.org/sparql")
    client.setQuery(query)
    client.setReturnFormat(JSON)
    # Identify the client with a custom User-Agent (Wikidata's policy asks for one)
    client.addCustomHttpHeader('User-Agent', 'MyPaintingDataRetriever/1.0 (jipijipijipi@gmail.com)')

    try:
        return client.query().convert()
    except Exception as err:
        print(f"An error occurred: {err}")
        # Pause before the single retry
        time.sleep(10)
        return client.query().convert()

# Split a sequence into consecutive batches
def chunk_list(lst, n):
    """Lazily yield consecutive slices of ``lst`` with at most ``n`` items each.

    The last slice holds the remainder and may be shorter than ``n``; an
    empty ``lst`` yields nothing.
    """
    batch_starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in batch_starts)

## Fetch detailed data for every location in the original list


In [None]:

# Define the SPARQL endpoint URL
# (the same address that run_sparql_query passes to SPARQLWrapper)
wikidata_endpoint_url = "https://query.wikidata.org/sparql"

# Read the basic painting data from the local CSV file
basic_data = pd.read_csv('data/wikidata_paintings_ids_final.csv')
# Bare expression: displays the DataFrame when this runs as a notebook cell
basic_data

In [None]:

# Location URIs of all paintings. Kept as a named Series so it can be
# left-merged against the fetched details at the end of this cell.
locations = basic_data['location_wikidata']
# Missing values (NaN) are dropped first: they are floats, not strings, and
# would make the URI split below fail.
locations_list = locations.dropna().unique().tolist()

# Set batch parameters
batch_size = 50  # Number of locations to query at once
max_retries = 5  # Maximum number of retries for failed requests

# Checkpoint files that let an interrupted run resume where it stopped
checkpoint_dir = 'data/checkpoints'
checkpoint_csv = 'data/checkpoints/locations_checkpoint.csv'
checkpoint_index_file = 'data/checkpoints/locations_batch_index_checkpoint.txt'
# Make sure the checkpoint directory exists before the first checkpoint write
os.makedirs(checkpoint_dir, exist_ok=True)

# Columns of the per-location detail table built below
result_columns = [
    'location_wikidata',
    'name',
    'country',
    'founding_date',
    'collection_size',
    'museum_type',
    'coordinates',
    'part_of',
]

# Check if a checkpoint exists to resume from
if os.path.exists(checkpoint_csv) and os.path.exists(checkpoint_index_file):
    detailed_data = pd.read_csv(checkpoint_csv)
    with open(checkpoint_index_file, 'r') as f:
        start_batch = int(f.read())
    print(f"Resuming from batch index {start_batch}")
else:
    detailed_data = pd.DataFrame()
    start_batch = 0

# Convert item URIs to Q-IDs (the last path segment of each URI)
item_qids = [uri.split('/')[-1] for uri in locations_list]

# Create batches
batches = list(chunk_list(item_qids, batch_size))

# 1-based numbers of batches that were given up on after max_retries attempts
failed_batches = []

# Loop over batches to fetch detailed data
for batch_index, batch_qids in enumerate(batches[start_batch:], start=start_batch):
    print(f"Processing batch {batch_index + 1}/{len(batches)}")
    qid_list_str = ' '.join(f'wd:{qid}' for qid in batch_qids)

    # Construct the SPARQL query for the batch
    # NOTE(review): ?city_label is selected but never stored below, and
    # ?collection_size is read below but never selected here.
    batch_query = f"""
                    SELECT ?museum ?museum_name ?city ?city_label ?country ?country_label ?founding_date ?museum_type ?museum_type_label ?coordinates ?part_of WHERE  {{
                    VALUES ?museum {{ {qid_list_str} }}
                    
                    OPTIONAL {{
                        ?museum wdt:P17 ?country.                           
                        ?country rdfs:label ?country_label.              
                        FILTER(LANG(?country_label) = "en")
                    }}
                    
                    OPTIONAL {{
                        ?museum wdt:P131 ?city.                     
                        ?city rdfs:label ?city_label.                
                        FILTER(LANG(?city_label) = "en")
                    }}
                    
                    OPTIONAL {{
                        ?museum wdt:P571 ?founding_date.                    
                    }}
                    
                    
                    OPTIONAL {{
                        ?museum wdt:P31 ?museum_type.                        
                        ?museum_type rdfs:label ?museum_type_label.           
                        FILTER(LANG(?museum_type_label) = "en")
                    }}
                    
                    OPTIONAL {{
                        ?museum wdt:P625 ?coordinates.                       
                    }}
                    
                    OPTIONAL {{
                        ?museum rdfs:label ?museum_name.                     
                        FILTER(LANG(?museum_name) = "en")
                    }}
                    
                    OPTIONAL {{
                        ?museum wdt:P361 ?part_of.                            
                    }}
                    }}
                    """

    retries = 0
    while retries < max_retries:
        try:
            results = run_sparql_query(batch_query)
            break
        except Exception as e:
            print(f"Error: {e}. Retrying ({retries + 1}/{max_retries})...")
            retries += 1
            time.sleep(5)
    else:
        # The while loop ended without a successful query. Record the batch:
        # later checkpoints advance past it, so a resumed run will not retry it.
        print("Max retries exceeded for this batch. Skipping.")
        failed_batches.append(batch_index + 1)
        continue

    # Process the results
    bindings = results['results']['bindings']
    if not bindings:
        print(f"No data returned for batch {batch_index + 1}.")
        continue

    # Convert the bindings to a DataFrame; variables missing from a binding
    # (unmatched OPTIONAL clauses) become None.
    data = [
        {
            'location_wikidata': b['museum']['value'],
            'name': b.get('museum_name', {}).get('value'),
            'country': b.get('country_label', {}).get('value'),
            'founding_date': b.get('founding_date', {}).get('value'),
            # Always None with the current query (see note above); the column
            # is kept so the output CSV layout stays the same.
            'collection_size': b.get('collection_size', {}).get('value'),
            'museum_type': b.get('museum_type_label', {}).get('value'),
            'coordinates': b.get('coordinates', {}).get('value'),
            'part_of': b.get('part_of', {}).get('value'),
        }
        for b in bindings
    ]

    df = pd.DataFrame(data)
    detailed_data = pd.concat([detailed_data, df], ignore_index=True)

    # Save a checkpoint: accumulated rows plus the index of the next batch
    detailed_data.to_csv(checkpoint_csv, index=False)
    with open(checkpoint_index_file, 'w') as f:
        f.write(str(batch_index + 1))
    print(f"Checkpoint saved at batch {batch_index + 1}")

    time.sleep(1)  # Be polite and avoid overloading the server

if failed_batches:
    print(f"Batches skipped after {max_retries} failed attempts: {failed_batches}")

# If nothing was retrieved at all, fall back to an empty table that still has
# the expected columns so the de-duplication and merge below do not raise.
if detailed_data.empty:
    detailed_data = pd.DataFrame(columns=result_columns)

# Merge basic and detailed data (a query can return several rows per
# location; only the first row per location is kept)
detailed_data.drop_duplicates(subset=['location_wikidata'], inplace=True)
final_data = pd.merge(locations, detailed_data, on='location_wikidata', how='left')

# Save the final data to a CSV file
final_data.to_csv('data/wikidata_locations.csv', index=False)
print("Location retrieval complete. Detailed location data saved to wikidata_locations.csv")