# Food in Art

In [4]:
import pandas as pd
import os
import time
import concurrent.futures
from typing import List, Dict
import requests
from ratelimit import limits, sleep_and_retry


In [None]:

# Initialize global session for connection pooling
session = requests.Session()
session.headers.update({
    'User-Agent': 'ArtDataBot/1.0 (jipijipijipi@gmail.com) Python/requests',
    'Accept': 'application/json'
})

@sleep_and_retry
@limits(calls=5, period=1)  # Limit to 5 calls per second
def run_sparql_query(query: str, endpoint_url: str) -> Dict:
    """Execute SPARQL query with rate limiting"""
    response = session.get(
        endpoint_url,
        params={'query': query, 'format': 'json'},
        timeout=30
    )
    response.raise_for_status()
    return response.json()

def process_batch(batch_qids: List[str], endpoint_url: str, max_retries: int = 5) -> pd.DataFrame:
    """Process a single batch of author QIDs"""
    qid_list_str = ' '.join(f'wd:{qid}' for qid in batch_qids)
    
    # Optimized SPARQL query
    batch_query = f"""
    SELECT ?author ?author_name ?country ?country_label ?gender ?gender_label 
           ?date_of_birth ?place_of_birth ?place_of_birth_label 
           ?place_of_birth_country ?place_of_birth_country_label 
    WHERE {{
        VALUES ?author {{ {qid_list_str} }}
        
        OPTIONAL {{
            ?author rdfs:label ?author_name;
                    wdt:P27 ?country;
                    wdt:P21 ?gender;
                    wdt:P569 ?date_of_birth;
                    wdt:P19 ?place_of_birth.
            
            ?country rdfs:label ?country_label.
            ?gender rdfs:label ?gender_label.
            ?place_of_birth rdfs:label ?place_of_birth_label.
            ?place_of_birth wdt:P17 ?place_of_birth_country.
            ?place_of_birth_country rdfs:label ?place_of_birth_country_label.
            
            FILTER(LANG(?author_name) = "en")
            FILTER(LANG(?country_label) = "en")
            FILTER(LANG(?gender_label) = "en")
            FILTER(LANG(?place_of_birth_label) = "en")
            FILTER(LANG(?place_of_birth_country_label) = "en")
        }}
    }}"""

    for retry in range(max_retries):
        try:
            results = run_sparql_query(batch_query, endpoint_url)
            bindings = results['results']['bindings']
            
            if not bindings:
                return pd.DataFrame()

            data = [{
                'author_wikidata': b['author']['value'],
                'name': b.get('author_name', {}).get('value'),
                'country': b.get('country_label', {}).get('value'),
                'birth_country': b.get('place_of_birth_country_label', {}).get('value'),
                'birth_place': b.get('place_of_birth_label', {}).get('value'),
                'gender': b.get('gender_label', {}).get('value'),
                'date_of_birth': b.get('date_of_birth', {}).get('value')
            } for b in bindings]
            
            return pd.DataFrame(data)
            
        except Exception as e:
            if retry == max_retries - 1:
                print(f"Max retries exceeded: {e}")
                return pd.DataFrame()
            time.sleep(2 ** retry)  # Exponential backoff
    
    return pd.DataFrame()

def chunk_list(lst: List, chunk_size: int) -> List[List]:
    """Split a list into chunks of specified size"""
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

def get_supplement_from_wikidata():
    # Configuration
    endpoint_url = "https://query.wikidata.org/sparql"
    batch_size = 50
    max_workers = 3  # Adjust based on your needs and API limits
    checkpoint_frequency = 10  # Save checkpoint every N batches

    if os.path.exists('data/wikidata_painters.csv'):
        print("Final file already exists. Skipping data retrieval.")
        return

    # Create checkpoint directory if it doesn't exist
    os.makedirs('data/checkpoints', exist_ok=True)

    # Load checkpoint if exists
    checkpoint_file = 'data/checkpoints/authors_checkpoint.csv'
    batch_index_file = 'data/checkpoints/authors_batch_index_checkpoint.txt'
    
    if os.path.exists(checkpoint_file) and os.path.exists(batch_index_file):
        detailed_data = pd.read_csv(checkpoint_file)
        with open(batch_index_file, 'r') as f:
            start_batch = int(f.read())
        print(f"Resuming from batch {start_batch}")
    else:
        detailed_data = pd.DataFrame()
        start_batch = 0

    # Read and prepare author data
    basic_data = pd.read_csv('data/wikidata_paintings_ids_final_2.csv')
    authors = basic_data['author_wikidata'].drop_duplicates().reset_index(drop=True)
    
    # Convert item URIs to Q-IDs and create batches
    author_qids = [uri.split('/')[-1] for uri in authors['author_wikidata'].tolist()]
    print(author_qids)
    batches = chunk_list(author_qids, batch_size)

    print(f"Processing {len(batches)} batches with {len(author_qids)} authors...")

    # Process batches with ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(process_batch, batch, endpoint_url): batch_idx 
            for batch_idx, batch in enumerate(batches[start_batch:], start=start_batch)
        }
        
        completed_batches = 0
        for future in concurrent.futures.as_completed(futures):
            batch_idx = futures[future]
            try:
                batch_df = future.result()
                if not batch_df.empty:
                    detailed_data = pd.concat([detailed_data, batch_df], ignore_index=True)
                
                completed_batches += 1
                print(f"Completed batch {batch_idx + 1}/{len(batches)} "
                      f"({(batch_idx + 1)/len(batches)*100:.1f}%)")

                # Save checkpoint periodically
                if completed_batches % checkpoint_frequency == 0:
                    detailed_data.to_csv(checkpoint_file, index=False)
                    with open(batch_index_file, 'w') as f:
                        f.write(str(batch_idx + 1))
                    print(f"Checkpoint saved at batch {batch_idx + 1}")

            except Exception as e:
                print(f"Error processing batch {batch_idx}: {e}")

    # Final processing
    print("Processing complete. Preparing final dataset...")
    detailed_data.drop_duplicates(subset=['author_wikidata'], inplace=True)
    final_data = pd.merge(authors, detailed_data, on='author_wikidata', how='left')
    
    # Save final results
    final_data.to_csv('data/wikidata_painters.csv', index=False)
    print("Data saved to wikidata_painters.csv")

    # Clean up checkpoints
    if os.path.exists(checkpoint_file):
        os.remove(checkpoint_file)
    if os.path.exists(batch_index_file):
        os.remove(batch_index_file)

if __name__ == "__main__":
    get_supplement_from_wikidata()

Processing 2673 batches with 133646 authors...
Max retries exceeded: 400 Client Error: Bad Request for url: https://query.wikidata.org/sparql?query=%0A++++SELECT+%3Fauthor+%3Fauthor_name+%3Fcountry+%3Fcountry_label+%3Fgender+%3Fgender_label+%0A+++++++++++%3Fdate_of_birth+%3Fplace_of_birth+%3Fplace_of_birth_label+%0A+++++++++++%3Fplace_of_birth_country+%3Fplace_of_birth_country_label+%0A++++WHERE+%7B%0A++++++++VALUES+%3Fauthor+%7B+wd%3Ahttp%3A%2F%2Fwww.wikidata.org%2F.well-known%2Fgenid%2F195f8d395b47639d97d86068e10fa9d9+wd%3Ahttp%3A%2F%2Fwww.wikidata.org%2Fentity%2FQ156386+wd%3Ahttp%3A%2F%2Fwww.wikidata.org%2Fentity%2FQ704600+wd%3Ahttp%3A%2F%2Fwww.wikidata.org%2Fentity%2FQ128538+wd%3Ahttp%3A%2F%2Fwww.wikidata.org%2Fentity%2FQ5603+wd%3Ahttp%3A%2F%2Fwww.wikidata.org%2Fentity%2FQ366212+wd%3Ahttp%3A%2F%2Fwww.wikidata.org%2Fentity%2FQ1393857+wd%3Ahttp%3A%2F%2Fwww.wikidata.org%2Fentity%2FQ130777+wd%3Ahttp%3A%2F%2Fwww.wikidata.org%2Fentity%2FQ152797+wd%3Ahttp%3A%2F%2Fwww.wikidata.org%2Fentity