# Food in Art

In [1]:
import asyncio
import concurrent.futures
import urllib.parse

import aiohttp
import pandas as pd


In [2]:

# Step 1: Read the paintings table
# Loads the CSV at the path below into `df`. The bare `df` on the last line
# makes the notebook render the frame as the cell's output.
df = pd.read_csv(filepath_or_buffer='data/wikidata_paintings_final_with_wiki_url.csv')
df

Unnamed: 0,item,title,author_wikidata,author_name,creation_date,origin_country,display_country,display_location,type,school,time_period,wiki_url,image_url,depicts,wikipedia_url
0,http://www.wikidata.org/entity/Q607761,The Death of the Picador,http://www.wikidata.org/entity/Q5432,Francisco Goya,1793-01-01T00:00:00Z,,,,genre art,Romanticism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"picador, stadium, spear, bullfighting, man, ho...",
1,http://www.wikidata.org/entity/Q609572,Manaò tupapaú,http://www.wikidata.org/entity/Q37693,Paul Gauguin,1892-01-01T00:00:00Z,,United States of America,Buffalo AKG Art Museum,genre art,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"gaze, lying, intergluteal cleft, sole, barefoo...",https://en.wikipedia.org/wiki/Spirit_of_the_De...
2,http://www.wikidata.org/entity/Q607598,Virgin of the Councillors,http://www.wikidata.org/entity/Q723863,Lluís Dalmau,1445-01-01T00:00:00Z,,Spain,Museu Nacional d'Art de Catalunya,religious art,Gothic painting,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Madonna and Child, Eulalia of Barcelona, Joan ...",https://en.wikipedia.org/wiki/Virgin_of_the_Co...
3,http://www.wikidata.org/entity/Q734082,Regatta at Sainte-Adresse,http://www.wikidata.org/entity/Q296,Claude Monet,1867-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,marine art,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"parasol, sailboat, Sainte-Adresse, church, mar...",https://en.wikipedia.org/wiki/Regatta_at_Saint...
4,http://www.wikidata.org/entity/Q472037,By the Seashore,http://www.wikidata.org/entity/Q39931,Pierre-Auguste Renoir,1883-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,portrait,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"portrait, Saint Peter Port, coast, chair, woman",https://en.wikipedia.org/wiki/By_the_Seashore
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93557,http://www.wikidata.org/entity/Q98966261,Musical Entertainment,http://www.wikidata.org/entity/Q18613400,Jakob Emanuel Gaisser,1899-01-01T00:00:00Z,,,,genre art,,,,https://commons.wikimedia.org/wiki/Special:Fil...,,
93558,http://www.wikidata.org/entity/Q98977855,"Césarine de Houdetot, Baronne de Barante, read...",http://www.wikidata.org/entity/Q51077254,Louise Bouteiller,1818-01-01T00:00:00Z,France,France,Château de Barante,portrait,,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Saint François d‘Assise, Césarine d'Houdetot, ...",
93559,http://www.wikidata.org/entity/Q99025930,The Broken Jug,http://www.wikidata.org/entity/Q97477673,Jenny Berger-Désoras,1847-01-01T00:00:00Z,,,,genre art,,,,https://commons.wikimedia.org/wiki/Special:Fil...,,
93560,http://www.wikidata.org/entity/Q98970362,Dr Philippe Pinel (1745-1826) and his family,http://www.wikidata.org/entity/Q3291501,Marie-Anne-Julie Forestier,1807-01-01T00:00:00Z,,,,family portrait,,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Scipion Pinel, Philippe Pinel, physician, chil...",


In [None]:

# Step 2: Extract titles
def extract_title(url):
    """Return the article title encoded in a Wikipedia URL.

    The title is whatever follows the last ``/wiki/`` in the URL path, with
    percent-escapes decoded (e.g. ``Mana%C3%B2`` -> ``Manaò``). Underscores
    are left as they appear in the URL.

    Parameters
    ----------
    url : str or missing value
        A URL such as ``https://en.wikipedia.org/wiki/By_the_Seashore``.
        Missing values (``None`` / ``NaN``) are accepted.

    Returns
    -------
    str or None
        The decoded title, or ``None`` when ``url`` is not a string.
    """
    # Rows without a Wikipedia URL reach this function as NaN (a float) or
    # None; urlparse raises on those, which would abort the whole `.apply`.
    if not isinstance(url, str):
        return None

    path = urllib.parse.urlparse(url).path
    encoded_title = path.split('/wiki/')[-1]
    return urllib.parse.unquote(encoded_title)

# NOTE(review): `df` already has a `title` column loaded from the CSV; this
# assignment replaces its contents with the title parsed from `wikipedia_url`.
df['title'] = df['wikipedia_url'].map(extract_title)


In [None]:

# Step 3: Define asynchronous fetch function
async def fetch(session, title):
    """Fetch the plain-text extract of one English Wikipedia article.

    Parameters
    ----------
    session
        An object exposing ``get(url, params=..., headers=...)`` as an async
        context manager (an ``aiohttp.ClientSession`` in this notebook).
    title : str
        Article title to look up. Redirects are followed by the API.

    Returns
    -------
    tuple
        Always ``(title, content)``. ``content`` is the extract text, or
        ``None`` when the title is missing/empty, the HTTP status is not 200,
        the API returns no page, the page has no extract, or any exception
        occurs while requesting or decoding (the exception is printed, not
        raised).
    """
    # Nothing to look up for missing titles (None / NaN / empty string).
    if not isinstance(title, str) or not title:
        return title, None

    API_URL = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "extracts",
        # Must be an int, not True: aiohttp builds the query string with yarl,
        # which raises TypeError for bool values.
        "explaintext": 1,
        "titles": title,
        "format": "json",
        "redirects": 1
    }
    # NOTE(review): the User-Agent below is a placeholder; replace it with
    # real application/contact details before running at scale.
    headers = {
        "User-Agent": "YourAppName/1.0 (your.email@example.com)"
    }
    try:
        async with session.get(API_URL, params=params, headers=headers) as response:
            if response.status != 200:
                print(f"Failed to fetch {title}: Status {response.status}")
                return title, None
            data = await response.json()
            pages = data.get('query', {}).get('pages', {})
            # Only one title is requested, so only the first page matters.
            first_page = next(iter(pages.values()), None)
            if first_page is None:
                # Previously this case fell through and returned bare None,
                # which callers then reported as an "Unexpected result".
                return title, None
            return title, first_page.get('extract', None)
    except Exception as e:
        # Best-effort: one failing article must not abort the whole batch.
        print(f"Exception for {title}: {e}")
        return title, None

# Step 4: Define the main asynchronous function
async def main(titles):
    """Fetch the extracts for all ``titles`` concurrently.

    Parameters
    ----------
    titles : iterable
        Article titles; each one is passed to ``fetch``.

    Returns
    -------
    list
        One entry per input title, in input order. Each entry is whatever
        ``fetch`` returned for that title, or the exception instance if the
        task raised (``return_exceptions=True``).
    """
    max_concurrency = 20
    connector = aiohttp.TCPConnector(limit=max_concurrency)
    timeout = aiohttp.ClientTimeout(total=60)
    # The 60 s total timeout covers the whole request, including time spent
    # waiting for a free pooled connection. Starting every request at once
    # would make most of them time out while queued behind the connector
    # limit, so a request only starts once a slot is free.
    slots = asyncio.Semaphore(max_concurrency)

    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:

        async def fetch_when_slot_free(title):
            async with slots:
                return await fetch(session, title)

        tasks = [fetch_when_slot_free(title) for title in titles]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return results

# Step 5: Run the asynchronous retrieval
# Query each distinct, non-missing title once. Rows without a title stay NaN
# in the mapped column below, and rows sharing a title share the fetched text.
titles = df['title'].dropna().drop_duplicates().tolist()

try:
    asyncio.get_running_loop()
except RuntimeError:
    # No loop is running (plain script): asyncio.run can own the loop.
    results = asyncio.run(main(titles))
else:
    # Jupyter already runs an event loop in this thread, where asyncio.run
    # raises RuntimeError. Run the coroutine on a fresh loop in a worker
    # thread and block until it finishes.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        results = executor.submit(asyncio.run, main(titles)).result()

# Step 6: Map results back to dataframe
content_dict = {}
for result in results:
    if isinstance(result, tuple):
        title, content = result
        content_dict[title] = content
    else:
        # gather(return_exceptions=True) yields exception objects for tasks
        # that raised; report them instead of failing the whole cell.
        print(f"Unexpected result: {result}")

df['wikipedia_article'] = df['title'].map(content_dict)

# Optional: Save to CSV
df.to_csv('wikidata_paintings_final_with_wiki_articles.csv', index=False)

print("Article retrieval complete.")