In [None]:
# Selenium opens and interacts with the web page.
# BeautifulSoup parses the page's HTML and extracts data from it.
# Pandas stores the extracted data in a DataFrame.

In [2]:
# Import Necessary Libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

import pandas as pd
import time
import os
import requests
import re

In [3]:
# Setup Chrome options
# Default macOS install location of Chrome. On machines where this file does not
# exist the option is left unset so Selenium can locate the browser itself.
CHROME_BINARY = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"

chrome_options = webdriver.ChromeOptions()
if os.path.exists(CHROME_BINARY):
    chrome_options.binary_location = CHROME_BINARY
# chrome_options.add_argument('--headless=new')  # uncomment to run without a visible window

# Initialize Chrome driver (shared by every scraping cell in this notebook)
driver = webdriver.Chrome(options=chrome_options)
driver.maximize_window()

In [93]:
# Teardown: closes the browser session created by the setup cell. Run this only
# when scraping is finished -- later cells call methods on `driver`.
driver.quit()

In [None]:
# Login to Artsy
def artsy_login(driver, email, password):
    """Log in to Artsy through the "Log In" button in the page header.

    `driver` must already be showing a page with a <header> element. When the
    header contains exactly one button whose text is 'Log In', the function
    clicks it, types `email`, submits, types `password`, submits again and
    prints "Logged in" (the outcome of the login is not verified). Otherwise it
    prints an error message and does nothing else. Returns None in both cases.
    """
    header = driver.find_element(By.TAG_NAME, "header")
    candidates = [btn for btn in header.find_elements(By.TAG_NAME, "button") if btn.text == 'Log In']

    # Guard clause: proceed only when the button is identified unambiguously.
    if len(candidates) != 1:
        print("Error: Not able to locate log in button")
        return

    candidates[0].click()
    time.sleep(2)

    wait = WebDriverWait(driver, 10)
    submit_locator = (By.XPATH, '//button[@type="submit"]')

    # Step 1: e-mail address, then the submit button.
    email_box = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '[placeholder="Enter your email address"]'))
    )
    email_box.send_keys(email)
    time.sleep(2)

    wait.until(EC.element_to_be_clickable(submit_locator)).click()
    time.sleep(2)

    # Step 2: password, then the submit button again.
    password_box = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '[placeholder="Enter your password"]'))
    )
    password_box.send_keys(password)
    time.sleep(2)

    wait.until(EC.element_to_be_clickable(submit_locator)).click()
    print("Logged in")
    time.sleep(10)  # give the post-login page time to settle


In [5]:
# Scrape Artist Description
def close_popup_if_present(driver):
    """Close a popup via its 'Close' button, if one becomes clickable within 5 s.

    Best effort: any Selenium/driver error (typically the wait timing out
    because no popup exists) is reported with a message and otherwise ignored.
    Returns None.
    """
    try:
        popup_close_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label="Close"]'))
        )
        # Click through JavaScript rather than WebElement.click().
        driver.execute_script("arguments[0].click();", popup_close_button)
        print("Popup closed successfully.")
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate and the notebook can be interrupted.
        print("No popup found or popup already closed.")


def get_artist_description(driver):
    """Extract the artist's name, country/year line and description.

    Reads the element matching '[data-test="artistHeader"]' on the page that
    `driver` currently shows, closes a popup if present, and clicks "Read more"
    when such a button exists so the description is expanded.

    Returns:
        tuple[str, str, str]: (name, country_and_year, description). A field
        that cannot be found is 'N/A'; if the header itself cannot be read, all
        three are 'N/A' and the error is printed.
    """
    header_selector = '[data-test="artistHeader"]'

    def _text_or_na(tag):
        # A missing tag yields 'N/A' instead of raising AttributeError.
        return tag.text.strip() if tag is not None else 'N/A'

    try:
        artist_info = driver.find_element(By.CSS_SELECTOR, header_selector)

        # Close popup if present
        close_popup_if_present(driver)

        # Try clicking "Read more" if available
        try:
            artist_info_readmore = WebDriverWait(artist_info, 5).until(
                EC.element_to_be_clickable((By.XPATH, './/button[contains(., "Read more")]'))
            )
            driver.execute_script("arguments[0].click();", artist_info_readmore)

            WebDriverWait(artist_info, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '[aria-expanded="true"]'))
            )
        except Exception:
            # No "Read more" button means no expansion is needed. `Exception`
            # (not a bare except) keeps Ctrl-C / kernel interrupt working.
            pass

        # Re-grab updated HTML after potential expansion
        artist_info_updated = driver.find_element(By.CSS_SELECTOR, header_selector)
        artist_info_html = artist_info_updated.get_attribute('innerHTML')
        soup = BeautifulSoup(artist_info_html, 'html.parser')

        # Each field falls back on its own, so a missing <h2> no longer discards
        # a name that was found.
        artist_name = _text_or_na(soup.find('h1'))
        artist_country_year = _text_or_na(soup.find('h2'))

        # The description is only looked up in a <div aria-expanded="true">.
        artist_description_paragraph = _text_or_na(soup.find('div', {'aria-expanded': 'true'}))

        return artist_name, artist_country_year, artist_description_paragraph

    except Exception as e:
        print(f"Error extracting artist description: {e}")
        return 'N/A', 'N/A', 'N/A'




In [6]:
# Image Download from URL
def download_image(image_url, title):
    """Download `image_url` into the local 'images' folder and return its path.

    The file name is `title` with '/' and ' ' replaced by '_' plus '.jpg', so two
    works with the same title overwrite each other.

    Args:
        image_url: URL of the image to fetch.
        title: Text used to build the file name.

    Returns:
        str: Relative path of the written file ('images/<name>.jpg').

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status (nothing is written then).
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs('images', exist_ok=True)

    # A timeout keeps one stalled download from hanging the whole run.
    response = requests.get(image_url, timeout=30)
    # Do not save an HTML error page under a .jpg name.
    response.raise_for_status()

    file_name = title.replace("/", "_").replace(" ", "_") + ".jpg"
    file_path = f"images/{file_name}"
    with open(file_path, "wb") as file:
        file.write(response.content)
    return file_path

In [7]:
# Get Artist Name and URL
def get_artist_list(letter = 'a'):
    """Collect the profile URLs of all artists whose name starts with `letter`.

    Uses the module-level `driver` to open Artsy's "artists starting with"
    listing and appends every artist link's href to the module-level list
    `artists_url`, which must exist before the call. Pages are advanced by
    clicking the pagination's "next" control. Returns None.
    """
    curr_pg = 1
    driver.get('https://www.artsy.net/artists/artists-starting-with-{0}?page={1}'.format(letter, curr_pg))
    time.sleep(5)

    # Number of pages: the last <nav> on the page is taken to be the pagination.
    page_numbers = []
    navs = driver.find_elements(By.TAG_NAME, 'nav')
    if navs:
        soup = BeautifulSoup(navs[-1].get_attribute('innerHTML'), 'html.parser')
        # fullmatch: only links whose whole text is a number count as page links.
        page_numbers = [int(a.text) for a in soup.find_all('a') if re.fullmatch('[0-9]+', a.text.strip())]
    # A listing without numbered page links is treated as a single page.
    tot_no_pages = max(page_numbers, default=1)

    while True:
        artist_list = driver.find_elements(By.CSS_SELECTOR, '[class*="ArtistsByLetter__Name"]')  # artist name links
        hrefs = [a.get_attribute('href') for a in artist_list]
        artists_url.extend(href for href in hrefs if href)  # skip elements without an href

        # Collect first, then decide whether to advance: the previous
        # `while curr_pg < tot_no_pages` form never collected the last page.
        if curr_pg >= tot_no_pages:
            break

        # Click Next Button
        nxt_button = WebDriverWait(driver.find_elements(By.TAG_NAME, 'nav')[-1], 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[data-testid="next"]'))
        )
        driver.execute_script("arguments[0].click();", nxt_button)
        curr_pg += 1
        time.sleep(5)  # wait for the next page to render

In [8]:
def filter_artist_by_auction_count(artist_url):
    """Tell whether an artist has enough auction results to be worth scraping.

    Opens `artist_url + "/auction-results"` with the module-level `driver` and
    reads the "<N> results" text inside the '[data-test="auctionResults"]'
    element.

    Returns:
        int: 1 if at least 10 results are reported; 0 if the page shows the
        "no auction results" message, fewer than 10 results, no count at all,
        or if any error occurs (the error is printed).
    """
    min_results = 10
    # Digits, optionally with thousands separators, directly before " results".
    count_pattern = re.compile(r"(\d[\d,]*) results")

    try:
        driver.get(artist_url + "/auction-results")
        time.sleep(5)  # Allow time for the page to load

        # find_elements returns an empty list when the message is absent.
        # find_element raises NoSuchElementException in that case, which sent
        # every artist that does have results into the except branch.
        if driver.find_elements(By.XPATH,
                "//*[contains(text(), 'There are currently no auction results for this artist.')]"):
            return 0  # 0 means do not continue

        # Extract auction results count
        auction_results = driver.find_element(By.CSS_SELECTOR, '[data-test="auctionResults"]')
        auction_results_html = auction_results.get_attribute('innerHTML')
        soup = BeautifulSoup(auction_results_html, 'html.parser')

        no_work_text = soup.find(string=count_pattern)
        if no_work_text:
            # Take the number in front of "results", not the first digit run in the text.
            no_work = int(count_pattern.search(no_work_text).group(1).replace(",", ""))
        else:
            no_work = 0  # Assume 0 if number is not found

        # Only keep artists with at least `min_results` auction results
        return 1 if no_work >= min_results else 0

    except Exception as e:
        print(f"Error processing {artist_url}: {e}")
        return 0  # explicit "do not continue" instead of an implicit None


In [9]:
# Scrape all Auction Entries for an Artist
def scrape_all_pages(url):
    """Scrape every result page reachable from `url` and the artist header.

    Loads `url` in the module-level `driver`, prints the artist details returned
    by get_artist_description, then appends each page's matching <a> tags to the
    module-level list `all_work` and clicks the "next" link, until waiting for
    that link fails.

    Returns:
        tuple: (artist_name, artist_country_year, artist_description_paragraph).
    """
    driver.get(url)
    time.sleep(5)

    # Scrape Artist Description
    header_fields = get_artist_description(driver)
    artist_name, artist_country_year, artist_description_paragraph = header_fields
    print(f"Artist Name: {artist_name}")
    print(f"Artist Country and Year: {artist_country_year}")
    print(f"Artist Description: {artist_description_paragraph}\n")

    entry_class = "RouterLink__RouterAwareLink-sc-c712443b-0 laGLjt"
    next_locator = (By.CSS_SELECTOR, "a[data-testid='next'][style*='opacity: 1']")

    page_count = 1
    has_next = True
    while has_next:
        page_soup = BeautifulSoup(driver.page_source, "html.parser")
        page_entries = page_soup.find_all("a", class_=entry_class)
        all_work.extend(page_entries)
        print(f"Page {page_count}: Collected {len(page_entries)} entries.")

        try:
            next_button = WebDriverWait(driver, 15).until(EC.element_to_be_clickable(next_locator))
            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(5)
            page_count += 1
        except Exception as e:
            # Any failure to find or click "next" ends the pagination loop.
            print(f"No 'Next' button found or error on Page {page_count}: {e}")
            has_next = False

    #driver.quit()
    return artist_name, artist_country_year, artist_description_paragraph



In [None]:
def modify_url(url_ori, height=400, quality=80, resize_to='fit&amp', width=400):
    """Rewrite an image URL's query string to request a larger rendition.

    The 'height', 'quality', 'resize_to' and 'width' query parameters are
    replaced with the given values, and 'thumbnail.jpg' inside the 'src'
    parameter becomes 'larger.jpg'. Other parameters are kept unchanged, and a
    URL without a query string is returned as is.

    Args:
        url_ori: Original URL; parameters must be separated by a plain '&'.
        height, width, quality: New values, truncated to integers.
        resize_to: New 'resize_to' value, inserted verbatim.
            NOTE(review): the default 'fit&amp' puts a stray '&amp' into the
            URL; it looks copied from HTML-escaped markup -- confirm, or pass
            resize_to='fit'.

    Returns:
        str: The rewritten URL.
    """
    config_param = {'height': str(int(height)),
                    'quality': str(int(quality)),
                    'resize_to': resize_to,
                    'width': str(int(width))}

    # Split on the first '?' only, so nothing inside the query can break it.
    base, has_query, query = url_ori.partition('?')
    if not has_query:
        return url_ori

    rewritten = []
    for pair in query.split('&'):
        key = pair.partition('=')[0]
        if key == 'src':
            rewritten.append(pair.replace('thumbnail.jpg', 'larger.jpg'))
        elif key in config_param:
            rewritten.append('{0}={1}'.format(key, config_param[key]))
        else:
            # Unknown parameter: keep it instead of raising KeyError.
            rewritten.append(pair)

    return '{0}?{1}'.format(base, '&'.join(rewritten))



In [None]:
# Example
# End-to-end run: log in, scrape the auction results of every artist listed in
# artists_url.txt that passes the auction-count filter, then list what was collected.
if __name__ == "__main__":
    # Read artist URLs from the saved text file (blank lines are skipped)
    with open("artists_url.txt", "r", encoding="utf-8") as file:
        artist_urls = [line.strip() for line in file if line.strip()]

    # Convert the URLs into a list of (Artist Name, URL) tuples
    artist_data = [(url.rstrip("/").split("/")[-1].replace("-", " ").title(), url) for url in artist_urls]

    # Credentials come from the environment so they are never stored in the notebook.
    email = os.environ.get("ARTSY_EMAIL")
    password = os.environ.get("ARTSY_PASSWORD")
    if not email or not password:
        raise RuntimeError("Set the ARTSY_EMAIL and ARTSY_PASSWORD environment variables before running this cell.")

    driver.get('https://www.artsy.net/')
    time.sleep(5)
    artsy_login(driver, email, password)

    time.sleep(10)

    all_work = []  # scrape_all_pages appends the collected <a> tags here

    # artist_data holds (name, url) tuples; only the URL is passed on.
    for _display_name, artist_url in artist_data:
        if filter_artist_by_auction_count(artist_url) == 0:
            continue
        # The results are listed on the artist's "/auction-results" page.
        artist_name, artist_country_year, artist_description_paragraph = scrape_all_pages(artist_url + "/auction-results")

    artist_name, artist_country_year, artist_description_paragraph = scrape_all_pages('https://www.artsy.net/artist/mario-dalpra/auction-results')
    # Close the browser only after the last scrape; it used to be closed before the call above.
    driver.quit()

    print(f"\nTotal auction results collected: {len(all_work)}\n")
    for idx, entry in enumerate(all_work, 1):
        print(f"{idx}: {entry.get('href')}")


Logged in
Popup closed successfully.
Artist Name: Mario Dalpra
Artist Country and Year: Austrian, b. 1960
Artist Description: N/A

Page 1: Collected 50 entries.
Page 2: Collected 50 entries.
Page 3: Collected 38 entries.
No 'Next' button found or error on Page 3: Message: 
Stacktrace:
0   chromedriver                        0x0000000103ee7808 chromedriver + 6105096
1   chromedriver                        0x0000000103edf40a chromedriver + 6071306
2   chromedriver                        0x000000010397a600 chromedriver + 415232
3   chromedriver                        0x00000001039cc2c0 chromedriver + 750272
4   chromedriver                        0x00000001039cc511 chromedriver + 750865
5   chromedriver                        0x0000000103a1c9c4 chromedriver + 1079748
6   chromedriver                        0x00000001039f263d chromedriver + 906813
7   chromedriver                        0x0000000103a19c3d chromedriver + 1068093
8   chromedriver                        0x00000001039f23e3 chr

In [None]:
### For Testing ####
# Manual smoke test: scrape one artist's auction results with the module-level
# `driver` and list the href of every collected entry.

all_work = []  # scrape_all_pages appends to this module-level list

artist_name, artist_country_year, artist_description_paragraph = scrape_all_pages('https://www.artsy.net/artist/mario-dalpra/auction-results')

print(f"\nTotal auction results collected: {len(all_work)}\n")
for idx, entry in enumerate(all_work, 1):
    print(f"{idx}: {entry.get('href')}")


Popup closed successfully.


TimeoutException: Message: 
Stacktrace:
0   chromedriver                        0x000000010090f808 chromedriver + 6105096
1   chromedriver                        0x000000010090740a chromedriver + 6071306
2   chromedriver                        0x00000001003a2600 chromedriver + 415232
3   chromedriver                        0x00000001003f42c0 chromedriver + 750272
4   chromedriver                        0x00000001003f4511 chromedriver + 750865
5   chromedriver                        0x00000001003e7e76 chromedriver + 700022
6   chromedriver                        0x000000010041a63d chromedriver + 906813
7   chromedriver                        0x00000001003e7d68 chromedriver + 699752
8   chromedriver                        0x000000010041a7ce chromedriver + 907214
9   chromedriver                        0x0000000100441c3d chromedriver + 1068093
10  chromedriver                        0x000000010041a3e3 chromedriver + 906211
11  chromedriver                        0x00000001003e629a chromedriver + 692890
12  chromedriver                        0x00000001003e73f1 chromedriver + 697329
13  chromedriver                        0x00000001008ced00 chromedriver + 5840128
14  chromedriver                        0x00000001008d2bd4 chromedriver + 5856212
15  chromedriver                        0x00000001008a9936 chromedriver + 5687606
16  chromedriver                        0x00000001008d35cb chromedriver + 5858763
17  chromedriver                        0x0000000100898024 chromedriver + 5615652
18  chromedriver                        0x00000001008f5368 chromedriver + 5997416
19  chromedriver                        0x00000001008f552f chromedriver + 5997871
20  chromedriver                        0x0000000100906fe8 chromedriver + 6070248
21  libsystem_pthread.dylib             0x00007ff816b3e253 _pthread_start + 99
22  libsystem_pthread.dylib             0x00007ff816b39bef thread_start + 15


In [None]:
# Run
# get_artist_list() appends to the module-level `artists_url` list (its default
# letter is 'a'); the URLs are then saved one per line. The "Example" cell reads
# artists_url.txt, so this cell has to be run before it.
artists_url = []
get_artist_list()
with open("artists_url.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(artists_url))

In [None]:
# Show the first ten collected entries.
all_work[:10]

[<a class="RouterLink__RouterAwareLink-sc-c712443b-0 laGLjt" display="block" href="/auction-result/7058655" text-decoration="none"><div class="Box-sc-15se88d-0 CSSGrid-sc-1q8w5xn-0 GridColumns-sc-1g9p6xx-0 gRoBRz fwdhTL"><div class="Box-sc-15se88d-0 GridColumns__Cell-sc-1g9p6xx-1 fHmcuw"><div class="Box-sc-15se88d-0 ilfZqS" overflow="hidden" style="aspect-ratio: 1 / 1; max-width: 130px;" width="100%"><div class="Box-sc-15se88d-0 fnOOKv" height="100%" width="100%"><img alt="" class="Box-sc-15se88d-0 guRykI" display="block" height="100%" loading="lazy" src="https://d7hftxdivxxvm.cloudfront.net?height=130&amp;quality=80&amp;resize_to=fill&amp;src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F2PZPW73N0I0_hsrFivbT2Q%2Fthumbnail.jpg&amp;width=130" srcset="https://d7hftxdivxxvm.cloudfront.net?height=130&amp;quality=80&amp;resize_to=fill&amp;src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F2PZPW73N0I0_hsrFivbT2Q%2Fthumbnail.jpg&amp;width=130 1x, https://d7hftxdivxxvm.cloudfront.net?height=260

In [None]:
def parse_auction_entries(all_entries):
    """Turn scraped auction-result entries into a DataFrame, one row per entry.

    Each entry (a BeautifulSoup tag, or anything whose str() is its HTML) is
    re-parsed and its fields are read by CSS class and position. A field that is
    not present becomes 'N/A'.

    Args:
        all_entries: Iterable of entries, e.g. the `all_work` list.

    Returns:
        pandas.DataFrame with columns Title, Medium, Dimensions, Sale Date,
        Auction House, Sale Location, Sale Name, Lot Number, Price Sold,
        Price Estimated, Image url ori, Image url better quality.
    """
    def _text_at(nodes, index):
        # Text of nodes[index], or 'N/A' when the list is too short.
        return nodes[index].text.strip() if len(nodes) > index else 'N/A'

    def _text_of(node):
        # Text of a single node, or 'N/A' when it was not found.
        return node.text.strip() if node is not None else 'N/A'

    auction_data = []

    for entry in all_entries:
        soup = BeautifulSoup(str(entry), "html.parser")

        # Run each selector once per entry instead of once per field.
        info_nodes = soup.select('.irDwAE')
        location_nodes = soup.select('.irDwAE.bbAxnM')

        # Extracting Title, Medium, Dimensions
        title = _text_of(soup.select_one('.bxWaGD'))
        medium = _text_at(info_nodes, 0)
        dimensions = _text_at(info_nodes, 1)

        # Extracting Sale Date, Auction House
        sale_date_house = _text_at(info_nodes, 2)
        if '•' in sale_date_house:
            sale_date, auction_house = [x.strip() for x in sale_date_house.split('•', 1)]
        else:
            sale_date, auction_house = sale_date_house, 'N/A'

        # Extracting Sale Name, Lot Number
        sale_name = _text_at(info_nodes, 6)
        lot_number = _text_at(info_nodes, 7)

        # Extracting Sale Location (the part after the bullet)
        sale_location_full = _text_at(location_nodes, 2)
        if '•' in sale_location_full:
            sale_location = sale_location_full.split('•')[1].strip()
        else:
            sale_location = 'N/A'

        # Image URL: look it up in the parsed `soup`, like every other field, so
        # string entries work too; an <img> without src counts as "no image".
        img = soup.find('img')
        image_url = img.get('src') if img is not None else None
        if image_url:
            # Build the URL of the higher-quality rendition.
            modified_url = modify_url(image_url)
        else:
            image_url = 'N/A'
            modified_url = 'N/A'

        # Extracting Price Sold, Price Estimated
        price_sold = _text_of(soup.select_one('.cMfkJA'))
        estimate_node = soup.select_one('.jEONpp')
        if estimate_node is not None:
            price_estimated = estimate_node.text.strip().replace("(est)", "")
        else:
            price_estimated = 'N/A'

        # Merge Data
        auction_data.append({
            'Title': title,
            'Medium': medium,
            'Dimensions': dimensions,
            'Sale Date': sale_date,
            'Auction House': auction_house,
            'Sale Location': sale_location,
            'Sale Name': sale_name,
            'Lot Number': lot_number,
            'Price Sold': price_sold,
            'Price Estimated': price_estimated,
            'Image url ori': image_url,
            'Image url better quality': modified_url
        })

    return pd.DataFrame(auction_data)

# Example usage:
# `all_work` must already hold the entries collected by scrape_all_pages.
df_auctions = parse_auction_entries(all_work)
print(df_auctions.head())



                                               Title  \
0                                     Untitled, 1998   
1                                     Untitled, 1995   
2  “Geh fort aus diesem Land, spürst du diese dun...   
3                          "Don't be so silly", 2006   
4                                       “Wild”, 1992   

                                     Medium        Dimensions     Sale Date  \
0  mixed media on paper laid down on canvas    51.0 x 71.0 cm  18 Sept 2024   
1  mixed media on paper laid down on canvas    51.0 x 71.0 cm  18 Sept 2024   
2               wooden painted with acrylic          115.0 cm   14 Mar 2024   
3                                  painting  238.0 x 200.0 cm   14 Mar 2024   
4                         acrylic on canvas   96.0 x 117.0 cm  20 Sept 2023   

  Auction House Sale Location                             Sale Name  \
0     Dorotheum           N/A             Austrian Contemporary Art   
1     Dorotheum           N/A             Aust

In [105]:
# NOTE(review): the output saved under this cell has the columns "Unnamed: 0" and
# "Image Path", which parse_auction_entries does not create -- it appears to come
# from a different df_auctions (possibly one read back from a CSV); confirm.
df_auctions

Unnamed: 0,Title,Medium,Dimensions,Sale Date,Auction House,Sale Location,Sale Name,Lot Number,Price Sold,Price Estimated,Image Path
0,"Untitled, 1998",mixed media on paper laid down on canvas,51.0 x 71.0 cm,18 Sept 2024,Dorotheum,,Austrian Contemporary Art,Lot 431,,"€1,000–€1,600",
1,"Untitled, 1995",mixed media on paper laid down on canvas,51.0 x 71.0 cm,18 Sept 2024,Dorotheum,,Austrian Contemporary Art,Lot 432,,"€1,000–€1,600",
2,"“Geh fort aus diesem Land, spürst du diese dun...",wooden painted with acrylic,115.0 cm,14 Mar 2024,Dorotheum,,Austrian Contemporary and Modern Art,Lot 129,,"€3,000–€5,000",
3,"""Don't be so silly"", 2006",painting,238.0 x 200.0 cm,14 Mar 2024,Dorotheum,,Austrian Contemporary and Modern Art,Lot 130,,"€3,500–€5,000",
4,"“Wild”, 1992",acrylic on canvas,96.0 x 117.0 cm,20 Sept 2023,Dorotheum,,Austrian Contemporary and Modern Art,Lot 282,,"€2,500–€3,500",
...,...,...,...,...,...,...,...,...,...,...,...
133,"For Anna, 1989",acrylic and mixed media on linen,197.99 x 265.99 cm,22 Mar 2009,Bonhams & Goodman,Sydney & Melbourne,Adelaide Art Auction,Lot 179,,"AU$1,000–AU$2,000",
134,Time,acrylic on linen,200 x 270 cm,22 Mar 2009,Bonhams & Goodman,Sydney & Melbourne,Adelaide Art Auction,Lot 222,,"AU$4,000–AU$6,000",
135,"In a hiding place, 1993",acrylic on linen,148.01 x 118.01 cm,22 Mar 2009,Bonhams & Goodman,Sydney & Melbourne,Adelaide Art Auction,Lot 136,AU$488 • US$335,"AU$1,000–AU$2,000",
136,"Look into this face, 1990",acrylic on linen,150.01 x 119.99 cm,22 Mar 2009,Bonhams & Goodman,Sydney & Melbourne,Adelaide Art Auction,Lot 137,AU$488 • US$335,"AU$1,000–AU$2,000",
