# Food in Art

In [69]:
import pandas as pd
import re

import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
import logging
import os
import json
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [70]:
#TODO import from web https://www.wga.hu/index_database.html
#TODO setup caching system

In [None]:
# Catalogue export: tab-separated, double-quoted fields, ISO-8859-1 encoded
artworks = pd.read_csv(
    'data/catalog.txt',
    sep='\t',
    quotechar='"',
    encoding='iso-8859-1',
)

# Paintings table (per the file name: Wikidata paintings with wiki articles)
all_paintings = pd.read_csv(
    'data/wikidata_paintings_final_with_wiki_articles.csv'
)

display(artworks)
display(all_paintings)

### Select only paintings

In [None]:
# Show how many catalogue entries exist for each art form
display(artworks['FORM'].value_counts())

# Keep only the paintings. Take an explicit copy before renaming: the boolean
# filter returns a slice of `artworks`, and renaming it in place (or assigning
# new columns to it later) triggers pandas' SettingWithCopyWarning and may
# operate on a temporary object.
wga_paintings = (
    artworks[artworks['FORM'] == 'painting']
    .copy()
    .rename(columns={'URL': 'wga_url'})
)
wga_paintings.head(30)

### Merge WGA URLs into the Wikidata paintings dataframe

In [None]:
# Normalise an author string to "First Last" in title case
def clean_author_name(name):
    """Return ``name`` as a whitespace-normalised, title-cased "First Last".

    Anything after the first tab character is discarded. If the remaining
    text contains a comma it is read as "Last, First" (split on the first
    comma only) and the two parts are swapped.
    """
    head = name.partition('\t')[0]
    last, comma, first = head.partition(',')
    if comma:
        head = f"{first.strip()} {last.strip()}"
    # Splitting on whitespace trims the ends and collapses inner runs
    return ' '.join(head.split()).title()


# Build a normalised 'author_name' column on the WGA paintings from 'AUTHOR'
wga_paintings['author_name'] = wga_paintings['AUTHOR'].map(clean_author_name)

# Collapse whitespace and title-case the existing 'author_name' column of
# all_paintings so both frames use the same spelling convention
all_paintings['author_name'] = all_paintings['author_name'].map(
    lambda author: ' '.join(author.split()).title()
)

# Title normalisation shared by both dataframes
def clean_title(title):
    """Return ``title`` trimmed, lower-cased and stripped of punctuation.

    Every character that is neither a word character nor whitespace is
    removed; the whitespace left behind is not collapsed.
    """
    lowered = title.strip().lower()
    return re.sub(r'[^\w\s]', '', lowered)

# Add a normalised title column to each frame to serve as a join key
all_paintings['title_clean'] = all_paintings['title'].map(clean_title)
wga_paintings['title_clean'] = wga_paintings['TITLE'].map(clean_title)

# Left-join the WGA URL onto all_paintings by (author_name, title_clean).
# NOTE(review): if wga_paintings holds several rows with the same
# (author_name, title_clean) pair, each matching painting row is repeated
# once per WGA row - confirm whether that fan-out is intended.
all_paintings = all_paintings.merge(
    wga_paintings[['author_name', 'title_clean', 'wga_url']],
    how='left',
    on=['author_name', 'title_clean'],
)

# Show the merged result
display(all_paintings)

### Get description from WGA

In [None]:


# Route INFO-and-above log records to a file, timestamped and level-tagged
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='wga_scraping.log',
)

# Directory for cached descriptions; created on first run if missing
cache_dir = 'data/wga_cache'
os.makedirs(cache_dir, exist_ok=True)

def _cache_file_for(wga_url, directory=None):
    """Return the path of the cache file that belongs to ``wga_url``.

    The file name is the SHA-256 hex digest of the URL. The built-in
    ``hash()`` must not be used here: string hashes are salted per
    interpreter process, so names derived from it differ between runs and
    cached files would never be found again after a restart.
    """
    import hashlib  # local import keeps this block self-contained

    base_dir = cache_dir if directory is None else directory
    digest = hashlib.sha256(wga_url.encode('utf-8')).hexdigest()
    return os.path.join(base_dir, f"{digest}.json")


def get_cached_description(wga_url, directory=None):
    """Return the cached description for ``wga_url``, or None on a miss.

    Args:
        wga_url: URL whose description is looked up.
        directory: Cache directory; defaults to the module-level
            ``cache_dir``.

    Returns:
        The stored description, or None when no cache file exists, the file
        is not valid JSON, or it holds no 'description' entry.
    """
    try:
        with open(_cache_file_for(wga_url, directory), 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return None
    except ValueError:
        # Truncated or corrupt cache file (json.JSONDecodeError is a
        # ValueError): treat as a miss so the caller fetches again.
        return None
    if not isinstance(data, dict):
        return None
    return data.get('description')


def cache_description(wga_url, description, directory=None):
    """Store ``description`` for ``wga_url`` as a small JSON file.

    Args:
        wga_url: URL the description belongs to.
        description: Text to store under the 'description' key.
        directory: Cache directory; defaults to the module-level
            ``cache_dir``. The directory must already exist.
    """
    with open(_cache_file_for(wga_url, directory), 'w', encoding='utf-8') as f:
        json.dump({'description': description}, f)

def _first_substantial_paragraph(paragraphs, min_length=50):
    """Return the text of the first paragraph longer than ``min_length``.

    Returns None when no paragraph qualifies.
    """
    for paragraph in paragraphs:
        # get_text(strip=True) joins the stripped text nodes with an empty
        # separator, which glues words together around inline tags such as
        # <i> or <br>. Take the raw text and normalise whitespace instead.
        text = ' '.join(paragraph.get_text().split())
        if len(text) > min_length:
            return text
    return None


def scrape_wga_description_direct(session, wga_url):
    """Return the description text found on the page at ``wga_url``.

    The cache is consulted first. On a miss the page is fetched through
    ``session`` and the description is taken from the first sufficiently
    long <p> inside the <td> that contains the first <h3> (assumed to be
    the artwork title - confirm against the page layout). If that finds
    nothing, the first sufficiently long <p> anywhere on the page is used.
    A description that was found is written to the cache.

    Args:
        session: Object with a requests-style ``get`` method.
        wga_url: URL of the artwork page.

    Returns:
        The description string, or None if none was found or any error
        occurred. Errors are logged, never raised.
    """
    try:
        # Check cache first
        cached_description = get_cached_description(wga_url)
        if cached_description:
            logging.info(f"Description retrieved from cache for URL: {wga_url}")
            return cached_description

        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; YourBot/0.1; +http://yourwebsite.com/bot)'
        }
        response = session.get(wga_url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Preferred location: paragraphs sharing a table cell with the <h3>
        description = None
        title_tag = soup.find('h3')
        parent_td = title_tag.find_parent('td') if title_tag is not None else None
        if parent_td is not None:
            description = _first_substantial_paragraph(parent_td.find_all('p'))

        # Fallback: any paragraph on the page with sufficient text
        if description is None:
            description = _first_substantial_paragraph(soup.find_all('p'))

        if description is None:
            logging.warning(f"Description not found in URL: {wga_url}")
            return None

        cache_description(wga_url, description)
        return description

    except requests.exceptions.RequestException as e:
        logging.error(f"Request error for URL {wga_url}: {e}")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: one bad page must not abort the
        # whole scraping run, so log it and carry on.
        logging.error(f"Error parsing URL {wga_url}: {e}")
        return None

# Retry policy: up to 3 retries with backoff factor 1, for the listed 5xx
# status codes and for HEAD/GET/OPTIONS requests
retry = Retry(
    allowed_methods=["HEAD", "GET", "OPTIONS"],
    status_forcelist=[500, 502, 503, 504],
    backoff_factor=1,
    total=3,
)
adapter = HTTPAdapter(max_retries=retry)

# One shared session, using the retrying adapter for both URL schemes
session = requests.Session()
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)



# Work on a copy of the rows whose 'wga_url' is present and non-blank
has_url = all_paintings['wga_url'].notna() & (all_paintings['wga_url'].str.strip() != '')
wga_df = all_paintings[has_url].copy()
wga_df.reset_index(inplace=True)

# Scrape one description per row, in row order
descriptions = []
total = len(wga_df)
for position, url in enumerate(wga_df['wga_url'], start=1):
    logging.info(f"Processing {position}/{total}: {url}")
    descriptions.append(scrape_wga_description_direct(session, url))

    # Pause one second between rows to avoid overwhelming the server
    time.sleep(1)

# Store the results alongside the URLs they came from
wga_df['wga_description'] = descriptions


In [None]:
# Inspect the rows that have a WGA URL, including the 'wga_description' column
wga_df

In [None]:
# Inspect the full paintings dataframe
all_paintings

In [None]:

# Attach the scraped descriptions to every painting by matching on 'wga_url'.
# wga_df holds one row per painting, so a URL shared by several paintings
# appears several times there. Reduce it to one row per URL first (preferring
# rows where a description was actually obtained); otherwise the left merge
# would repeat each of those paintings once per duplicate.
description_lookup = (
    wga_df[['wga_url', 'wga_description']]
    .dropna(subset=['wga_description'])
    .drop_duplicates(subset='wga_url')
)
all_paintings = all_paintings.merge(description_lookup, on='wga_url', how='left')


# Save the updated dataframe
all_paintings.to_csv('data/paintings_with_descriptions.csv', index=False)

# Optional: Print a success message
print("Scraping completed. Descriptions added to 'paintings_with_descriptions.csv'.")