# Food in Art

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time
import os
import requests

from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

import json
from datetime import datetime

## Helper functions for querying the SPARQL endpoint

In [None]:

# Function to run the SPARQL query
def run_sparql_query(query):
    """Send ``query`` to the SPARQL endpoint and return the converted JSON result.

    The endpoint address is taken from the module-level name
    ``wikidata_endpoint_url``, which must be defined before this is called.

    If the first attempt raises, the error is printed, the function waits
    10 seconds and tries exactly once more; an exception raised by that
    second attempt propagates to the caller.
    """
    client = SPARQLWrapper(wikidata_endpoint_url)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    # Set your user agent to comply with Wikidata's policy
    client.addCustomHttpHeader(
        'User-Agent',
        'MyPaintingDataRetriever/1.0 (jipijipijipi@gmail.com)',
    )

    def fetch():
        # One round trip: execute the query and convert the response.
        return client.query().convert()

    try:
        return fetch()
    except Exception as err:
        print(f"An error occurred: {err}")
        time.sleep(10)  # Wait before the single retry
        return fetch()

# Function to chunk the list into batches
def chunk_list(lst, n):
    """Yield successive n-sized chunks from lst.

    Args:
        lst: A sequence supporting ``len()`` and slicing (list, tuple, str...).
        n: Chunk size; must be a positive integer.

    Yields:
        Consecutive slices of ``lst`` of length ``n``; the final slice may be
        shorter when ``len(lst)`` is not a multiple of ``n``.

    Raises:
        ValueError: If ``n`` is not positive. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject non-positive sizes explicitly instead.
    if n <= 0:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    for start in range(0, len(lst), n):
        yield lst[start:start + n]

## Fetch detailed data for every author in the original list


In [None]:
# Retrieve per-author details (citizenship, gender, birth date/place) from
# Wikidata in batches, checkpointing after every batch so an interrupted run
# can resume, then merge them onto the distinct authors of the paintings list.
if os.path.exists('data/wikidata_authors.csv'):
    print("Final file already exists. Skipping data retrieval.")
else:
    # Define the SPARQL endpoint URL (read as a global by run_sparql_query)
    wikidata_endpoint_url = "https://query.wikidata.org/sparql"

    # Read the basic painting data and keep one row per distinct author
    basic_data = pd.read_csv('data/wikidata_paintings_ids.csv')
    authors = basic_data[['author_wikidata', 'author_name']].drop_duplicates(
        subset=['author_wikidata']).reset_index(drop=True)
    # A missing author URI is read by pandas as NaN, which has no .split();
    # leave it out of the query list (it still stays in `authors`).
    authors_list = authors['author_wikidata'].dropna().tolist()

    # Set batch parameters
    batch_size = 50  # Number of authors to query at once
    max_retries = 5  # Maximum number of retries for failed requests

    # Checkpoint locations; create the directory up front so the first
    # checkpoint write cannot fail on a missing folder.
    checkpoint_dir = 'data/checkpoints'
    checkpoint_csv = 'data/checkpoints/authors_checkpoint.csv'
    checkpoint_index = 'data/checkpoints/authors_batch_index_checkpoint.txt'
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Output column -> SPARQL variable it is read from (order = column order)
    detail_fields = {
        'name': 'author_name',
        'country': 'country_label',
        'birth_country': 'place_of_birth_country_label',
        'birth_place': 'place_of_birth_label',
        'gender': 'gender_label',
        'date_of_birth': 'date_of_birth',
    }
    detail_columns = ['author_wikidata', *detail_fields]

    # Resume from a checkpoint only when both of its files are present
    if os.path.exists(checkpoint_csv) and os.path.exists(checkpoint_index):
        detailed_data = pd.read_csv(checkpoint_csv)
        with open(checkpoint_index, 'r') as f:
            start_batch = int(f.read())
        print(f"Resuming from batch index {start_batch}")
    else:
        detailed_data = pd.DataFrame()
        start_batch = 0

    # Convert item URIs to Q-IDs
    author_qids = [uri.split('/')[-1] for uri in authors_list]

    # Create batches
    batches = list(chunk_list(author_qids, batch_size))

    # Loop over batches to fetch detailed data
    for batch_index, batch_qids in enumerate(batches[start_batch:], start=start_batch):
        print(f"Processing batch {batch_index + 1}/{len(batches)}")
        qid_list_str = ' '.join(f'wd:{qid}' for qid in batch_qids)

        # Construct the SPARQL query for the batch
        batch_query = f"""
                        SELECT ?author ?author_name ?country ?country_label ?gender ?gender_label ?date_of_birth
                            ?place_of_birth ?place_of_birth_label ?place_of_birth_country ?place_of_birth_country_label WHERE {{
                        VALUES ?author {{ {qid_list_str} }}

                        OPTIONAL {{
                            ?author wdt:P27 ?country.                             # P27 = country of citizenship
                            ?country rdfs:label ?country_label.                   # Get the label for country of citizenship
                            FILTER(LANG(?country_label) = "en")
                        }}

                        OPTIONAL {{
                            ?author wdt:P21 ?gender.                              # P21 = gender
                            ?gender rdfs:label ?gender_label.                     # Get the label for gender
                            FILTER(LANG(?gender_label) = "en")
                        }}

                        OPTIONAL {{
                            ?author wdt:P569 ?date_of_birth.                      # P569 = date of birth
                        }}

                        OPTIONAL {{
                            ?author wdt:P19 ?place_of_birth.                      # P19 = place of birth
                            ?place_of_birth rdfs:label ?place_of_birth_label.     # Get the label for place of birth
                            FILTER(LANG(?place_of_birth_label) = "en")

                            OPTIONAL {{
                            ?place_of_birth wdt:P17 ?place_of_birth_country.      # P17 = country for the place of birth
                            ?place_of_birth_country rdfs:label ?place_of_birth_country_label.  # Get the label for this country
                            FILTER(LANG(?place_of_birth_country_label) = "en")
                            }}
                        }}

                        OPTIONAL {{
                            ?author rdfs:label ?author_name.                      # Get the author's name
                            FILTER(LANG(?author_name) = "en")
                        }}
                        }}
                        """

        # Try the batch up to max_retries times; the for/else branch runs only
        # when no attempt succeeded (i.e. the loop never hit `break`).
        for attempt in range(1, max_retries + 1):
            try:
                results = run_sparql_query(batch_query)
                break
            except Exception as e:
                print(f"Error: {e}. Retrying ({attempt}/{max_retries})...")
                time.sleep(5)
        else:
            # Best effort: authors of a skipped batch end up with empty
            # detail columns after the final left merge.
            print("Max retries exceeded for this batch. Skipping.")
            continue

        # Process the results
        bindings = results['results']['bindings']
        if not bindings:
            print(f"No data returned for batch {batch_index + 1}.")
            continue

        # Convert the bindings to a DataFrame; OPTIONAL variables that did not
        # match are absent from a binding and become None.
        rows = [
            {
                'author_wikidata': b['author']['value'],
                **{column: b.get(variable, {}).get('value', None)
                   for column, variable in detail_fields.items()},
            }
            for b in bindings
        ]
        df = pd.DataFrame(rows)
        detailed_data = pd.concat([detailed_data, df], ignore_index=True)

        # Save a checkpoint: data first, then the index of the next batch
        detailed_data.to_csv(checkpoint_csv, index=False)
        with open(checkpoint_index, 'w') as f:
            f.write(str(batch_index + 1))
        print(f"Checkpoint saved at batch {batch_index + 1}")

        time.sleep(1)  # Be polite and avoid overloading the server

    # If no batch produced any row the frame has no columns at all, and both
    # drop_duplicates(subset=...) and the merge would raise KeyError; fall
    # back to an empty frame with the expected schema.
    if 'author_wikidata' not in detailed_data.columns:
        detailed_data = pd.DataFrame(columns=detail_columns)

    # Merge basic and detailed data (the query can return several rows per
    # author; only the first one is kept)
    detailed_data.drop_duplicates(subset=['author_wikidata'], inplace=True)
    final_data = pd.merge(authors, detailed_data, on='author_wikidata', how='left')
    # Save the final data to a CSV file
    final_data.to_csv('data/wikidata_authors.csv', index=False)
    print("Second pass complete. Detailed painting data saved to wikidata_authors.csv")