# Food in Art

In [38]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time
import os
import requests

from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

import json
from datetime import datetime

In [39]:
#TODO get author's country

## Fetch the IDs from the SPARQL endpoint

In [40]:

# Function to run the SPARQL query
def run_sparql_query(query):
    sparql = SPARQLWrapper(wikidata_endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    # Set your user agent to comply with Wikidata's policy
    sparql.addCustomHttpHeader('User-Agent', 'MyPaintingDataRetriever/1.0 (jipijipijipi@gmail.com)')
    try:
        results = sparql.query().convert()
    except Exception as e:
        print(f"An error occurred: {e}")
        time.sleep(10)  # Wait before retrying
        results = sparql.query().convert()
    return results

# Function to chunk the list into batches
def chunk_list(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [41]:
wikidata_endpoint_url = "https://query.wikidata.org/sparql"
# Skip if the final file already exists
if os.path.exists('data/wikidata_paintings_ids.csv'):
    print("Final file already exists. Skipping data retrieval.")
else:
    wikidata_base_query = """
    SELECT ?item ?title ?author_wikidata ?author_name WHERE {{
    ?item wdt:P31 wd:Q3305213.
    ?item rdfs:label ?title.
    ?item wdt:P170 ?author_wikidata.
    ?author_wikidata rdfs:label ?author_name.
    FILTER(LANG(?title) = "en").
    FILTER(LANG(?author_name) = "en").
    }}
    LIMIT {limit}
    OFFSET {offset}
    """

    # Set batch parameters
    limit = 1000  # Number of records to fetch per batch
    checkpoint_interval = 10  # Save a checkpoint every 10 batches
    max_retries = 5  # Maximum number of retries for failed requests

    # Check if a checkpoint exists to resume from
    # Create checkpoints folder if it does not exist
    if not os.path.exists('data/checkpoints'):
        os.makedirs('data/checkpoints')
    if os.path.exists('data/checkpoints/paintings_checkpoint.csv') and os.path.exists('data/checkpoints/offset_checkpoint.txt'):
        all_data = pd.read_csv('data/checkpoints/paintings_checkpoint.csv')
        with open('data/checkpoints/offset_checkpoint.txt', 'r') as f:
            offset = int(f.read())
        batch_number = offset // limit
        print(f"Resuming from offset {offset}")
    else:
        all_data = pd.DataFrame()
        offset = 0
        batch_number = 0

# Loop to fetch data in batches

    while True:
        query = wikidata_base_query.format(limit=limit, offset=offset)
        print(f"Fetching data with OFFSET {offset}")
        retries = 0
        while retries < max_retries:
            try:
                results = run_sparql_query(query)
                break
            except Exception as e:
                print(f"Error: {e}. Retrying ({retries+1}/{max_retries})...")
                retries += 1
                time.sleep(5)
        else:
            print("Max retries exceeded. Exiting.")
            break

        # Process the results
        bindings = results['results']['bindings']
        if not bindings:
            print("No more data returned.")
            break

        # Convert the bindings to a DataFrame
        data = []
        for b in bindings:
            item = b['item']['value']
            title = b['title']['value']
            author_wikidata = b['author_wikidata']['value']
            author_name = b['author_name']['value']
            data.append({
                'item': item,
                'title': title,
                'author_wikidata': author_wikidata,
                'author_name': author_name
            })
        df = pd.DataFrame(data)
        all_data = pd.concat([all_data, df], ignore_index=True)

        # Save a checkpoint at specified intervals
        batch_number += 1
        if batch_number % checkpoint_interval == 0:
            all_data.to_csv('data/checkpoints/paintings_checkpoint.csv', index=False)
            with open('data/checkpoints/offset_checkpoint.txt', 'w') as f:
                f.write(str(offset + limit))
            print(f"Checkpoint saved at batch {batch_number}")

        # Update the offset for the next batch
        offset += limit
        time.sleep(1)  # Be polite and avoid overloading the server

    # Save the final data to a CSV file
    all_data.to_csv('data/wikidata_paintings_ids.csv', index=False)
    print("Data retrieval complete. Saved to wikidata_paintings_ids.csv")


Final file already exists. Skipping data retrieval.


## Fetch all the data from the previous list


In [None]:
if os.path.exists('data/wikidata_paintings_final.csv'):
    print("Final file already exists. Skipping data retrieval.")
else:
  # Define the SPARQL endpoint URL
  wikidata_endpoint_url = "https://query.wikidata.org/sparql"

  # Read the basic painting data
  basic_data = pd.read_csv('data/wikidata_paintings_ids.csv')
  item_list = basic_data['item'].tolist()

  # Set batch parameters
  batch_size = 50  # Number of paintings to query at once
  max_retries = 5  # Maximum number of retries for failed requests

  # Prepare to store detailed data
  detailed_data = pd.DataFrame()

  # Check if a checkpoint exists to resume from
  if os.path.exists('data/checkpoints/paintings_detailed_checkpoint.csv') and os.path.exists('data/checkpoints/batch_index_checkpoint.txt'):
      detailed_data = pd.read_csv('data/checkpoints/paintings_detailed_checkpoint.csv')
      with open('data/checkpoints/batch_index_checkpoint.txt', 'r') as f:
          start_batch = int(f.read())
      print(f"Resuming from batch index {start_batch}")
  else:
      detailed_data = pd.DataFrame()
      start_batch = 0

  # Convert item URIs to Q-IDs
  item_qids = [uri.split('/')[-1] for uri in item_list]

  # Create batches
  batches = list(chunk_list(item_qids, batch_size))

  # Loop over batches to fetch detailed data
  for batch_index, batch_qids in enumerate(batches[start_batch:], start=start_batch):
      print(f"Processing batch {batch_index + 1}/{len(batches)}")
      qid_list_str = ' '.join(f'wd:{qid}' for qid in batch_qids)

      # Construct the SPARQL query for the batch
      batch_query = f"""
      PREFIX schema: <http://schema.org/>
SELECT ?item ?creation_date ?origin_country ?display_country ?display_location ?type ?school ?time_period ?wiki_url ?image_url (GROUP_CONCAT(?depicts_label; separator=", ") AS ?depicts) WHERE {{
  VALUES ?item {{ {qid_list_str} }}
  
  OPTIONAL {{ ?item wdt:P571 ?creation_date. }}
  
  # Origin country of the item
  OPTIONAL {{
    ?item wdt:P495 ?origin_country_wd.
    ?origin_country_wd rdfs:label ?origin_country.
    FILTER(LANG(?origin_country) = "en")
  }}
  
  # Alternative: Origin country of the author
  OPTIONAL {{
    ?item wdt:P50 ?author.
    ?author wdt:P27 ?author_country_wd.  # Or use wdt:P495 if country of origin applies.
    ?author_country_wd rdfs:label ?author_country.
    FILTER(LANG(?author_country) = "en")
  }}
  
  # Select either the item's origin country or author's country if available
  BIND(COALESCE(?origin_country, ?author_country) AS ?origin_country)
  
  OPTIONAL {{
    ?item wdt:P276 ?display_location_wd.
    ?display_location_wd rdfs:label ?display_location.
    FILTER(LANG(?display_location) = "en")
    OPTIONAL {{
      ?display_location_wd wdt:P17 ?display_country_wd.
      ?display_country_wd rdfs:label ?display_country.
      FILTER(LANG(?display_country) = "en")
    }}
  }}
  OPTIONAL {{
    ?item wdt:P136 ?type_wd.
    ?type_wd rdfs:label ?type.
    FILTER(LANG(?type) = "en")
  }}
  OPTIONAL {{
    ?item wdt:P135 ?school_wd.
    ?school_wd rdfs:label ?school.
    FILTER(LANG(?school) = "en")
  }}
  OPTIONAL {{
    ?item wdt:P2348 ?time_period_wd.
    ?time_period_wd rdfs:label ?time_period.
    FILTER(LANG(?time_period) = "en")
  }}
  OPTIONAL {{
    ?item wdt:P18 ?image_file.
    BIND(CONCAT("https://commons.wikimedia.org/wiki/Special:FilePath/", ENCODE_FOR_URI(REPLACE(STR(?image_file), "^.*\\\\/(?!.*\\\\/)", ""))) AS ?image_url)
  }}
  OPTIONAL {{
    ?item wdt:P180 ?depicts_wd.
    ?depicts_wd rdfs:label ?depicts_label.
    FILTER(LANG(?depicts_label) = "en")
  }}
  OPTIONAL {{
    ?sitelink schema:about ?item;
              schema:isPartOf <https://en.wikipedia.org/>;
              schema:url ?wiki_url.
  }}
}}
GROUP BY ?item ?creation_date ?origin_country ?display_country ?display_location ?type ?school ?time_period ?wiki_url ?image_url
      """

      retries = 0
      while retries < max_retries:
          try:
              results = run_sparql_query(batch_query)
              break
          except Exception as e:
              print(f"Error: {e}. Retrying ({retries + 1}/{max_retries})...")
              retries += 1
              time.sleep(5)
      else:
          print("Max retries exceeded for this batch. Skipping.")
          continue

      # Process the results
      bindings = results['results']['bindings']
      if not bindings:
          print(f"No data returned for batch {batch_index + 1}.")
          continue

      # Convert the bindings to a DataFrame
      data = []
      for b in bindings:
          item = b['item']['value']
          creation_date = b.get('creation_date', {}).get('value', None)
          origin_country = b.get('origin_country', {}).get('value', None)
          display_country = b.get('display_country', {}).get('value', None)
          display_location = b.get('display_location', {}).get('value', None)
          type_ = b.get('type', {}).get('value', None)
          school = b.get('school', {}).get('value', None)
          time_period = b.get('time_period', {}).get('value', None)
          wiki_url = b.get('wiki_url', {}).get('value', None)
          image_url = b.get('image_url', {}).get('value', None)
          depicts = b.get('depicts', {}).get('value', None)
          data.append({
              'item': item,
              'creation_date': creation_date,
              'origin_country': origin_country,
              'display_country': display_country,
              'display_location': display_location,
              'type': type_,
              'school': school,
              'time_period': time_period,
              'wiki_url': wiki_url,
              'image_url': image_url,
              'depicts': depicts
          })
      df = pd.DataFrame(data)
      detailed_data = pd.concat([detailed_data, df], ignore_index=True)

      # Save a checkpoint
      detailed_data.to_csv('data/checkpoints/paintings_detailed_checkpoint.csv', index=False)
      with open('data/checkpoints/batch_index_checkpoint.txt', 'w') as f:
          f.write(str(batch_index + 1))
      print(f"Checkpoint saved at batch {batch_index + 1}")

      time.sleep(1)  # Be polite and avoid overloading the server

  # Merge basic and detailed data
  final_data = pd.merge(basic_data, detailed_data, on='item', how='left')

  # Save the final data to a CSV file
  final_data.to_csv('data/wikidata_paintings_final.csv', index=False)
  print("Second pass complete. Detailed painting data saved to wikidata_paintings_final.csv")

Final file already exists. Skipping data retrieval.


### Get wikipedia links

In [43]:
wikidata = pd.read_csv('data/wikidata_paintings_final.csv')
wikidata = wikidata.drop_duplicates(subset=['item'])
wikidata = wikidata.dropna(subset=['image_url'])
wikidata

Unnamed: 0,item,title,author_wikidata,author_name,creation_date,origin_country,display_country,display_location,type,school,time_period,wiki_url,image_url,depicts
0,http://www.wikidata.org/entity/Q607761,The Death of the Picador,http://www.wikidata.org/entity/Q5432,Francisco Goya,1793-01-01T00:00:00Z,,,,genre art,Romanticism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"picador, stadium, spear, bullfighting, man, ho..."
1,http://www.wikidata.org/entity/Q609572,Manaò tupapaú,http://www.wikidata.org/entity/Q37693,Paul Gauguin,1892-01-01T00:00:00Z,,United States of America,Buffalo AKG Art Museum,genre art,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"gaze, lying, intergluteal cleft, sole, barefoo..."
13,http://www.wikidata.org/entity/Q607598,Virgin of the Councillors,http://www.wikidata.org/entity/Q723863,Lluís Dalmau,1445-01-01T00:00:00Z,,Spain,Museu Nacional d'Art de Catalunya,religious art,Gothic painting,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Madonna and Child, Eulalia of Barcelona, Joan ..."
14,http://www.wikidata.org/entity/Q734082,Regatta at Sainte-Adresse,http://www.wikidata.org/entity/Q296,Claude Monet,1867-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,marine art,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"parasol, sailboat, Sainte-Adresse, church, mar..."
15,http://www.wikidata.org/entity/Q472037,By the Seashore,http://www.wikidata.org/entity/Q39931,Pierre-Auguste Renoir,1883-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,portrait,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"portrait, Saint Peter Port, coast, chair, woman"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514699,http://www.wikidata.org/entity/Q98966261,Musical Entertainment,http://www.wikidata.org/entity/Q18613400,Jakob Emanuel Gaisser,1899-01-01T00:00:00Z,,,,genre art,,,,https://commons.wikimedia.org/wiki/Special:Fil...,
514700,http://www.wikidata.org/entity/Q98977855,"Césarine de Houdetot, Baronne de Barante, read...",http://www.wikidata.org/entity/Q51077254,Louise Bouteiller,1818-01-01T00:00:00Z,France,France,Château de Barante,portrait,,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Saint François d‘Assise, Césarine d'Houdetot, ..."
514714,http://www.wikidata.org/entity/Q99025930,The Broken Jug,http://www.wikidata.org/entity/Q97477673,Jenny Berger-Désoras,1847-01-01T00:00:00Z,,,,genre art,,,,https://commons.wikimedia.org/wiki/Special:Fil...,
514718,http://www.wikidata.org/entity/Q98970362,Dr Philippe Pinel (1745-1826) and his family,http://www.wikidata.org/entity/Q3291501,Marie-Anne-Julie Forestier,1807-01-01T00:00:00Z,,,,family portrait,,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Scipion Pinel, Philippe Pinel, physician, chil..."


In [44]:


class CheckpointManager:
    def __init__(self, checkpoint_dir="checkpoints"):
        self.checkpoint_dir = checkpoint_dir
        self.metadata_file = os.path.join(checkpoint_dir, "checkpoint_metadata.csv")
        os.makedirs(checkpoint_dir, exist_ok=True)
        
    def save_checkpoint(self, batch_num, results, total_processed):
        """Save results and metadata to CSV checkpoint files"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save results to CSV
        filename = f"checkpoint_batch_{batch_num}_{timestamp}.csv"
        filepath = os.path.join(self.checkpoint_dir, filename)
        
        # Convert results to DataFrame and save
        results_df = pd.DataFrame(list(results.items()), columns=['wikidata_id', 'wikipedia_url'])
        results_df.to_csv(filepath, index=False)
        
        # Update metadata
        metadata = pd.DataFrame([{
            'batch_num': batch_num,
            'checkpoint_file': filename,
            'total_processed': total_processed,
            'timestamp': timestamp
        }])
        
        if os.path.exists(self.metadata_file):
            existing_metadata = pd.read_csv(self.metadata_file)
            metadata = pd.concat([existing_metadata, metadata]).reset_index(drop=True)
        
        metadata.to_csv(self.metadata_file, index=False)
        
        # Keep only the latest 3 checkpoints
        self._cleanup_old_checkpoints()
        
    def load_latest_checkpoint(self):
        """Load the most recent checkpoint"""
        if not os.path.exists(self.metadata_file):
            return None
        
        metadata = pd.read_csv(self.metadata_file)
        if metadata.empty:
            return None
        
        # Get the latest checkpoint info
        latest_checkpoint = metadata.iloc[-1]
        checkpoint_file = os.path.join(self.checkpoint_dir, latest_checkpoint['checkpoint_file'])
        
        if os.path.exists(checkpoint_file):
            # Load results from CSV
            results_df = pd.read_csv(checkpoint_file)
            results = dict(zip(results_df['wikidata_id'], results_df['wikipedia_url']))
            
            return {
                'batch_num': latest_checkpoint['batch_num'],
                'results': results,
                'total_processed': latest_checkpoint['total_processed'],
                'timestamp': latest_checkpoint['timestamp']
            }
        return None
    
    def _cleanup_old_checkpoints(self, keep_latest=3):
        """Remove old checkpoint files, keeping only the specified number of latest ones"""
        if not os.path.exists(self.metadata_file):
            return
        
        metadata = pd.read_csv(self.metadata_file)
        if len(metadata) > keep_latest:
            # Keep only the latest records in metadata
            metadata_to_keep = metadata.iloc[-keep_latest:]
            files_to_keep = set(metadata_to_keep['checkpoint_file'])
            
            # Remove old checkpoint files
            for file in os.listdir(self.checkpoint_dir):
                if file.startswith("checkpoint_batch_") and file.endswith(".csv"):
                    if file not in files_to_keep:
                        os.remove(os.path.join(self.checkpoint_dir, file))
            
            # Update metadata file
            metadata_to_keep.to_csv(self.metadata_file, index=False)

def get_wikipedia_url(wikidata_id, session):
    """
    Get English Wikipedia URL for a Wikidata ID using the Wikidata API
    """
    try:
        # Remove any URL prefix and get just the Q-number
        if 'wikidata.org' in wikidata_id:
            wikidata_id = wikidata_id.split('/')[-1]
        
        url = f"https://www.wikidata.org/w/api.php"
        params = {
            "action": "wbgetentities",
            "format": "json",
            "props": "sitelinks",
            "ids": wikidata_id,
            "sitefilter": "enwiki"
        }
        
        response = session.get(url, params=params)
        data = response.json()
        
        try:
            title = data['entities'][wikidata_id]['sitelinks']['enwiki']['title']
            wiki_url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
            return wiki_url
        except KeyError:
            return None
            
    except Exception as e:
        print(f"Error processing {wikidata_id}: {str(e)}")
        return None

def process_batch(wikidata_ids, checkpoint_manager=None, start_batch=0, 
                 max_workers=10, batch_size=100):
    """
    Process a batch of Wikidata IDs using threading with CSV checkpoint support
    """
    wikipedia_urls = {}
    total_processed = 0
    
    # Load existing results from checkpoint if available
    if checkpoint_manager:
        checkpoint = checkpoint_manager.load_latest_checkpoint()
        if checkpoint:
            wikipedia_urls = checkpoint['results']
            start_batch = checkpoint['batch_num'] + 1
            total_processed = checkpoint['total_processed']
            print(f"Resuming from batch {start_batch} with {total_processed} items processed")
    
    session = requests.Session()
    total_batches = (len(wikidata_ids) + batch_size - 1) // batch_size
    
    # Create progress bar
    pbar = tqdm(total=len(wikidata_ids), initial=total_processed)
    
    try:
        for batch_num in range(start_batch, total_batches):
            batch_start = batch_num * batch_size
            batch_end = min((batch_num + 1) * batch_size, len(wikidata_ids))
            batch = wikidata_ids[batch_start:batch_end]
            
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [
                    executor.submit(get_wikipedia_url, wikidata_id, session)
                    for wikidata_id in batch
                ]
                
                batch_results = {}
                for wikidata_id, future in zip(batch, futures):
                    try:
                        batch_results[wikidata_id] = future.result()
                    except Exception as e:
                        print(f"Error processing {wikidata_id}: {str(e)}")
                        batch_results[wikidata_id] = None
                
                wikipedia_urls.update(batch_results)
                total_processed += len(batch)
                pbar.update(len(batch))
                
                if checkpoint_manager:
                    checkpoint_manager.save_checkpoint(batch_num, wikipedia_urls, total_processed)
            
            time.sleep(1)
    
    except KeyboardInterrupt:
        print("\nProcess interrupted. Progress has been saved to checkpoint.")
        if checkpoint_manager:
            checkpoint_manager.save_checkpoint(batch_num, wikipedia_urls, total_processed)
    
    finally:
        pbar.close()
        
    return wikipedia_urls

def add_wikipedia_urls(df, wikidata_url_column, checkpoint_dir="data/checkpoints"):
    """
    Add Wikipedia URLs to a DataFrame containing Wikidata URLs with CSV checkpoint support
    """
    checkpoint_manager = CheckpointManager(checkpoint_dir)
    
    wikidata_ids = df[wikidata_url_column].tolist()
    
    print("Fetching Wikipedia URLs...")
    wikipedia_urls = process_batch(wikidata_ids, checkpoint_manager=checkpoint_manager)
    
    df['wikipedia_url'] = df[wikidata_url_column].map(wikipedia_urls)
    
    return df

# Basic usage (checkpoints will be saved in './checkpoints' directory)
wikidata_with_wiki_url = add_wikipedia_urls(wikidata, 'item')

wikidata_with_wiki_url.to_csv('data/wikidata_paintings_final_with_wiki_url.csv', index=False)


Fetching Wikipedia URLs...


100%|██████████| 93562/93562 [51:11<00:00, 30.46it/s]


In [46]:
display(wikidata_with_wiki_url)
display(wikidata_with_wiki_url.info())

Unnamed: 0,item,title,author_wikidata,author_name,creation_date,origin_country,display_country,display_location,type,school,time_period,wiki_url,image_url,depicts,wikipedia_url
0,http://www.wikidata.org/entity/Q607761,The Death of the Picador,http://www.wikidata.org/entity/Q5432,Francisco Goya,1793-01-01T00:00:00Z,,,,genre art,Romanticism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"picador, stadium, spear, bullfighting, man, ho...",
1,http://www.wikidata.org/entity/Q609572,Manaò tupapaú,http://www.wikidata.org/entity/Q37693,Paul Gauguin,1892-01-01T00:00:00Z,,United States of America,Buffalo AKG Art Museum,genre art,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"gaze, lying, intergluteal cleft, sole, barefoo...",https://en.wikipedia.org/wiki/Spirit_of_the_De...
13,http://www.wikidata.org/entity/Q607598,Virgin of the Councillors,http://www.wikidata.org/entity/Q723863,Lluís Dalmau,1445-01-01T00:00:00Z,,Spain,Museu Nacional d'Art de Catalunya,religious art,Gothic painting,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Madonna and Child, Eulalia of Barcelona, Joan ...",https://en.wikipedia.org/wiki/Virgin_of_the_Co...
14,http://www.wikidata.org/entity/Q734082,Regatta at Sainte-Adresse,http://www.wikidata.org/entity/Q296,Claude Monet,1867-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,marine art,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"parasol, sailboat, Sainte-Adresse, church, mar...",https://en.wikipedia.org/wiki/Regatta_at_Saint...
15,http://www.wikidata.org/entity/Q472037,By the Seashore,http://www.wikidata.org/entity/Q39931,Pierre-Auguste Renoir,1883-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,portrait,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"portrait, Saint Peter Port, coast, chair, woman",https://en.wikipedia.org/wiki/By_the_Seashore
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514699,http://www.wikidata.org/entity/Q98966261,Musical Entertainment,http://www.wikidata.org/entity/Q18613400,Jakob Emanuel Gaisser,1899-01-01T00:00:00Z,,,,genre art,,,,https://commons.wikimedia.org/wiki/Special:Fil...,,
514700,http://www.wikidata.org/entity/Q98977855,"Césarine de Houdetot, Baronne de Barante, read...",http://www.wikidata.org/entity/Q51077254,Louise Bouteiller,1818-01-01T00:00:00Z,France,France,Château de Barante,portrait,,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Saint François d‘Assise, Césarine d'Houdetot, ...",
514714,http://www.wikidata.org/entity/Q99025930,The Broken Jug,http://www.wikidata.org/entity/Q97477673,Jenny Berger-Désoras,1847-01-01T00:00:00Z,,,,genre art,,,,https://commons.wikimedia.org/wiki/Special:Fil...,,
514718,http://www.wikidata.org/entity/Q98970362,Dr Philippe Pinel (1745-1826) and his family,http://www.wikidata.org/entity/Q3291501,Marie-Anne-Julie Forestier,1807-01-01T00:00:00Z,,,,family portrait,,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Scipion Pinel, Philippe Pinel, physician, chil...",


<class 'pandas.core.frame.DataFrame'>
Index: 93562 entries, 0 to 514719
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   item              93562 non-null  object 
 1   title             93562 non-null  object 
 2   author_wikidata   93562 non-null  object 
 3   author_name       93562 non-null  object 
 4   creation_date     87149 non-null  object 
 5   origin_country    9593 non-null   object 
 6   display_country   84829 non-null  object 
 7   display_location  86531 non-null  object 
 8   type              62515 non-null  object 
 9   school            4144 non-null   object 
 10  time_period       298 non-null    object 
 11  wiki_url          0 non-null      float64
 12  image_url         93562 non-null  object 
 13  depicts           51377 non-null  object 
 14  wikipedia_url     4878 non-null   object 
dtypes: float64(1), object(14)
memory usage: 11.4+ MB


None