In [None]:
# Webscraping Script for ArtPrice

In [3]:
# Import Necessary Libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

import pandas as pd
import time
import os
import requests
import re

In [23]:
driver.quit()

In [24]:
# Setup Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
# options.add_argument('--headless=new')

# Initialize Chrome driver
driver = webdriver.Chrome(options=chrome_options)
driver.maximize_window()

In [11]:
# Sprase URLs of each artwork per page
def get_all_artwork_links(driver, start_url):
    all_links = []
    driver.get(start_url)

    while True:
        # Wait for artwork grid to load
        WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "lot-container")))

        # Parse page content
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Extract artwork links
        for container in soup.find_all("div", class_="lot-container"):
            a_tag = container.find("a", class_="sln_lot_show")
            if a_tag:
                href = a_tag.get("href")
                full_url = "https://www.artprice.com" + href
                all_links.append(full_url)

        print(f"Scraped {len(all_links)} artworks so far...")

        # Try to click the "Next" button
        try:
            next_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "li.next_page a.sln-next-page"))
            )
            next_button.click()
            time.sleep(2)  # Give time for the next page to load
        except:
            print("No more pages or 'Next' button not found.")
            break

    return all_links


In [8]:
# Test Function
start_url = "https://www.artprice.com/artist/15079/wassily-kandinsky/lots/pasts?p=1"
all_artwork_links = get_all_artwork_links(driver, start_url)

print(f"Total artworks scraped: {len(all_artwork_links)}")


Scraped 30 artworks so far...
Scraped 60 artworks so far...
Scraped 90 artworks so far...
Scraped 120 artworks so far...
Scraped 150 artworks so far...
Scraped 180 artworks so far...
No more pages or 'Next' button not found.
Total artworks scraped: 180


In [22]:
# Scraping a Single Artwork
def scrape_artwork_detail(driver, url):

    driver.get(url)
    time.sleep(2)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    # -- Artist --
    artist_tag = soup.select_one("div.artist a.invisiblelink")
    artist = artist_tag.text.strip() if artist_tag else None

    # -- Title --
    title_tag = soup.select_one("div.lot-title h1")
    title = title_tag.text.strip() if title_tag else None

    # -- Lot Number --
    lot_tag = soup.find("div", class_="block marg marg-t-5")
    lot_number = lot_tag.text.strip() if lot_tag else None

    # -- Description (h2/h3/h4 inside .description) --
    description_tag = soup.find("div", class_="description")
    if description_tag:
        desc_parts = [tag.text.strip() for tag in description_tag.find_all(["h2", "h3", "h4"])]
        description = " | ".join(desc_parts)
    else:
        description = None

    # -- Prices --
    final_price = starting_price = estimate = price_incl_premium = None
    prices_tag = soup.find("div", class_="prices")
    if prices_tag:
        # Final hammer price
        hammer = prices_tag.find("div", class_="price")
        if hammer and "Hammer price" in hammer.text:
            span = hammer.find("span", attrs={"ng-show": re.compile("currency.*eur")})
            final_price = span.text.strip() if span else None

        # Price incl. premium
        premium = prices_tag.find("div", class_="price-tax")
        if premium:
            span = premium.find("span", attrs={"ng-show": re.compile("currency.*eur")})
            price_incl_premium = span.text.strip() if span else None

        # Starting price
        start = prices_tag.find("div", class_="price-start")
        if start:
            span = start.find("span", attrs={"ng-show": re.compile("currency.*eur")})
            starting_price = span.text.strip() if span else None

        # Estimate
        estimate_spans = prices_tag.select("div.price-estim span[ng-show*='currency']")
        if estimate_spans:
            estimate = " - ".join([s.text.strip() for s in estimate_spans if "eur" in s.get("ng-show", "")])

    # -- Sale Info --
    sale_title = sale_date = auction_house = location = None
    sale_tag = soup.find("div", class_="sale")
    if sale_tag:
        sale_title_tag = sale_tag.select_one("div.strong.block i")
        sale_title = sale_title_tag.text.strip() if sale_title_tag else None

        sale_date_tag = sale_tag.select_one("div.block.sale-date")
        sale_date = sale_date_tag.text.strip() if sale_date_tag else None

        auction_house_tag = sale_tag.select_one("div.block.strong div")
        auction_house = auction_house_tag.text.strip() if auction_house_tag else None

        # Get last meaningful block for location
        blocks = sale_tag.find_all("div", class_="block")
        for block in reversed(blocks):
            text = block.get_text(strip=True)
            if text and not any(kw in text.lower() for kw in ["sale", "auctioneer", "date"]):
                location = text
                break

    # -- Additional Details --
    additional_details = {}
    datas_tag = soup.select_one("div.sale div.datas")
    if datas_tag:
        for block in datas_tag.find_all("div", class_="block"):
            head_span = block.find("span", class_="head")
            if head_span:
                label = head_span.text.strip().rstrip(":")
                value = block.get_text(separator=" ", strip=True).replace(f"{label}:", "").strip()
                additional_details[label] = value
            else:
                # Try long content (like exhibition blocks)
                content = block.get_text(separator=" ", strip=True)
                if content:
                    key = f"Note {len(additional_details)+1}"
                    additional_details[key] = content

    return {
        "URL": url,
        "Artist": artist,
        "Title": title,
        "Lot Number": lot_number,
        "Description": description,
        "Final Price (EUR)": final_price,
        "Price Incl. Premium (EUR)": price_incl_premium,
        "Starting Price (EUR)": starting_price,
        "Estimate (EUR)": estimate,
        "Sale Title": sale_title,
        "Sale Date": sale_date,
        "Auction House": auction_house,
        "Location": location,
        "Additional Details": additional_details
    }


In [25]:
test_url = "https://www.artprice.com/artist/15079/wassily-kandinsky/drawing-watercolor/35952741/friedlich?p=2"
data = scrape_artwork_detail(driver, test_url)
pd.DataFrame([data])

Unnamed: 0,URL,Artist,Title,Lot Number,Description,Final Price (EUR),Price Incl. Premium (EUR),Starting Price (EUR),Estimate (EUR),Sale Title,Sale Date,Auction House,Location,Additional Details
0,https://www.artprice.com/artist/15079/wassily-...,Wassily KANDINSKY (1866-1944),Friedlich\n (1930),Lot #\n 8,"Drawing-Watercolor | Watercolour, ink & pen/pa...","€ 620,000","€ 787,400",,"€ 300,000 - € 500,000",Evening Sale,06 dec 2024,Ketterer Kunst GmbH,Illustrated on page 19 of the catalog,"{'Note 1': 'Details', 'Note 2': 'Monog. dated ..."


In [26]:
def flatten_artwork_data(records):
    flat_records = []

    for record in records:
        flat = record.copy()
        details = flat.pop("Additional Details", {})  # Remove nested dict and store keys
        for key, value in details.items():
            flat[f"Detail - {key}"] = value  # Prefix to avoid column name clashes
        flat_records.append(flat)

    return flat_records

In [27]:
flat_data = flatten_artwork_data([data])

# Create DataFrame
df = pd.DataFrame(flat_data)

# Preview the output
display(df.head())

Unnamed: 0,URL,Artist,Title,Lot Number,Description,Final Price (EUR),Price Incl. Premium (EUR),Starting Price (EUR),Estimate (EUR),Sale Title,Sale Date,Auction House,Location,Detail - Note 1,Detail - Note 2,Detail - Literature,Detail - Provenance,Detail - Exhibition,Detail - Note 6
0,https://www.artprice.com/artist/15079/wassily-...,Wassily KANDINSKY (1866-1944),Friedlich\n (1930),Lot #\n 8,"Drawing-Watercolor | Watercolour, ink & pen/pa...","€ 620,000","€ 787,400",,"€ 300,000 - € 500,000",Evening Sale,06 dec 2024,Ketterer Kunst GmbH,Illustrated on page 19 of the catalog,Details,Monog. dated\n \n lo...,"Literature : Endicott Barnett, II, 987","Provenance : Paul Klee Collection (1879-1940),...",Exhibition : Memorial exhibition of Wassily Ka...,Illustrated on page 19 of the catalog
