In [4]:
import time
import uuid
from urllib.parse import urljoin

import pandas as pd
import requests
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Headers sent with the plain-HTTP sitemap request below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
}

url = 'https://www.artimage.org.uk/sitemap.xml'

# Download and parse the sitemap first. The timeout keeps a stalled
# connection from hanging the run forever, and raise_for_status() stops us
# from silently parsing an HTTP error page as an (empty) sitemap.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'xml')

# Start the browser only once the sitemap is in hand, so a failed download
# does not leave a stray Chrome process behind.
driver = uc.Chrome()

# Accumulates one dict per scraped artist or asset; later turned into a CSV.
all_data = []

# Crawl every artist URL found in the sitemap, one initial letter (a-z) at a
# time. For each artist this records one row for the artist and one row per
# asset reachable from the artist's "all images" page. After every letter the
# full accumulated data set is rewritten to the CSV file.
#
# Relies on the module-level ``driver`` (Selenium browser), ``soup`` (parsed
# sitemap) and ``all_data`` (list of row dicts) created earlier in the file.
# NOTE(review): the browser is left open when the loop ends; call
# driver.quit() once nothing else needs it.


def _scrape_artist_page(artist_url):
    """Load one artist page and return ``(artist_row, parsed_page)``.

    ``artist_row`` holds a fresh UUID, the artist name, the artist image URL
    and every key/value pair found in the ``ul.artist-info`` list.
    ``parsed_page`` is the BeautifulSoup tree of the page, returned so the
    caller can locate the link to the artist's assets.

    Raises whatever Selenium raises if the page fails to load or no ``h1``
    appears within 10 seconds.
    """
    driver.get(artist_url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'h1')))
    soup_artist = BeautifulSoup(driver.page_source, 'html.parser')

    name_tag = soup_artist.find('h1')
    artist_name = name_tag.text.strip() if name_tag else 'No Name'

    # Both the <figure> and its <img> are optional; fall back to a placeholder
    # instead of raising on pages without a portrait.
    figure = soup_artist.find('figure')
    image_tag = figure.find('img') if figure else None
    image_src = image_tag.get('src', '').strip() if image_tag else ''
    image_url = urljoin(artist_url, image_src) if image_src else 'No Image URL'

    artist_info = {}
    info_list = soup_artist.find('ul', class_='artist-info clear')
    for item in (info_list.find_all('li') if info_list else []):
        spans = item.find_all('span')
        if len(spans) >= 2:
            # "<span>label</span><span>value</span>"; when the value wraps a
            # link, keep the link target rather than its text.
            key = spans[0].text.strip()
            link = spans[1].find('a')
            if link and link.get('href'):
                value = link['href'].strip()
            else:
                value = spans[1].text.strip()
            artist_info[key] = value
        elif len(spans) == 1:
            # "<span>label</span> trailing value": the value is the last child
            # of the <li>, which may be a bare string or another tag.
            key = spans[0].text.strip()
            tail = item.contents[-1]
            value = tail.strip() if isinstance(tail, str) else tail.get_text(strip=True)
            artist_info[key] = value

    artist_row = {
        'UUID': uuid.uuid4(),
        'Artist Name': artist_name,
        'Artist Image URL': image_url,
    }
    artist_row.update(artist_info)
    return artist_row, soup_artist


def _scroll_to_page_bottom():
    """Scroll the current page down until its height stops growing."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # give newly requested content time to render
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _scrape_asset(asset_url, artist_uuid):
    """Load one asset page and return its row, or ``None`` if it has no detail article.

    The row holds a fresh UUID, the owning artist's UUID, the asset name, the
    asset image URL ('' when the page has no image) and one entry per
    ``<section>`` of the ``aside#image-further-info`` block.
    """
    driver.get(asset_url)
    soup_asset_detail = BeautifulSoup(driver.page_source, 'html.parser')

    asset_detail = soup_asset_detail.find('article', class_='clear item-information')
    if not asset_detail:
        return None

    name_tag = asset_detail.find('h1')
    asset_name = ' '.join(name_tag.text.split()) if name_tag else 'No Name'

    figure = soup_asset_detail.find('figure')
    img_tag = figure.find('img') if figure else None
    img_src = img_tag.get('src', '').strip() if img_tag else ''
    if img_src:
        # Resolves protocol-relative ("//host/x"), root-relative and relative
        # sources alike; absolute URLs pass through unchanged.
        img_src = urljoin(asset_url, img_src)

    further_info = {}
    aside = soup_asset_detail.find('aside', {'id': 'image-further-info'})
    for section in (aside.find_all('section') if aside else []):
        heading = section.find('h2')
        content = section.find('span') or section.find('ul')
        if not heading or not content:
            continue
        if content.name == 'ul':
            content_text = ', '.join(li.text.strip() for li in content.find_all('li'))
        else:
            content_text = content.text
        # Collapse runs of whitespace/newlines into single spaces.
        further_info[heading.text.strip()] = ' '.join(content_text.split())

    # The row is recorded even when the page has no <figure>; the image URL
    # is simply left empty in that case.
    asset_row = {
        'UUID': uuid.uuid4(),
        'Artist UUID': artist_uuid,
        'Asset Name': asset_name,
        'Asset Image URL': img_src,
    }
    asset_row.update(further_info)
    return asset_row


def _scrape_artist_assets(soup_artist, artist_url, artist_uuid):
    """Return one row per asset linked from the artist's "all images" page.

    Returns an empty list when the artist page has no such link. A failure on
    an individual asset page is reported and skipped so the remaining assets
    are still collected.
    """
    all_assets_section = soup_artist.find('section', class_='link-to-all-images')
    all_assets_anchor = all_assets_section.find('a') if all_assets_section else None
    if not all_assets_anchor or not all_assets_anchor.get('href'):
        print(f"No asset listing link found for: {artist_url}")
        return []

    assets_page_url = urljoin(artist_url, all_assets_anchor['href'])
    driver.get(assets_page_url)
    WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.TAG_NAME, 'h1')))
    _scroll_to_page_bottom()

    # Parse the listing once, before navigating away to the asset pages.
    soup_assets = BeautifulSoup(driver.page_source, 'html.parser')

    asset_rows = []
    for li_tag in soup_assets.find_all('li', class_='col2'):
        anchor = li_tag.find('a')
        if not anchor or not anchor.get('href'):
            continue
        asset_url = urljoin(assets_page_url, anchor['href'])
        try:
            asset_row = _scrape_asset(asset_url, artist_uuid)
        except Exception as exc:
            print(f"Failed to scrape asset {asset_url}: {exc!r}")
            continue
        if asset_row is not None:
            asset_rows.append(asset_row)
    return asset_rows


# Extract the sitemap URLs once instead of re-querying the tree per letter.
sitemap_urls = [loc.text.strip() for loc in soup.find_all('loc')]

for character in map(chr, range(ord('a'), ord('z') + 1)):
    print(f"Processing page: {character}")

    letter_marker = f'/artists/{character}/'
    letter_index_url = f'https://www.artimage.org.uk/artists/{character}/'

    for artist_url in sitemap_urls:
        # Skip URLs of other letters and the letter's own index page.
        if letter_marker not in artist_url or artist_url == letter_index_url:
            continue

        print(f"Fetching artist data from: {artist_url}")
        try:
            artist_data, soup_artist = _scrape_artist_page(artist_url)
            all_data.append(artist_data)
            all_data.extend(
                _scrape_artist_assets(soup_artist, artist_url, artist_data['UUID'])
            )
        except Exception as exc:
            # Report and move on: one broken page must not abort the rest of
            # the letter. KeyboardInterrupt is deliberately not caught.
            print(f"Failed to scrape {artist_url}: {exc!r}")

        time.sleep(2)  # pause between artists

    # Checkpoint: rewrite the CSV with everything collected so far.
    df = pd.DataFrame(all_data)
    csv_file_path = 'art_image_data.csv'
    df.to_csv(csv_file_path, index=False)
    print(f"Data saved page {character} successfully to {csv_file_path}")
    

Processing page: a
Fetching artist data from: https://www.artimage.org.uk/artists/a/faisal-abduallah/
Fetching artist data from: https://www.artimage.org.uk/artists/a/charles-avery/
Fetching artist data from: https://www.artimage.org.uk/artists/a/keith-arnatt/
Fetching artist data from: https://www.artimage.org.uk/artists/a/hurvin-anderson/
Fetching artist data from: https://www.artimage.org.uk/artists/a/ivor-abrahams/
Fetching artist data from: https://www.artimage.org.uk/artists/a/rasheed-araeen/
Fetching artist data from: https://www.artimage.org.uk/artists/a/laura-aldridge/
Fetching artist data from: https://www.artimage.org.uk/artists/a/larry-achiampong/
Fetching artist data from: https://www.artimage.org.uk/artists/a/john-akomfrah/
Fetching artist data from: https://www.artimage.org.uk/artists/a/ajamu/
Data saved page a successfully to art_image_data.csv
Processing page: b
Fetching artist data from: https://www.artimage.org.uk/artists/b/peter-blake/
Fetching artist data from: htt

Fetching artist data from: https://www.artimage.org.uk/artists/g/galpin-richard/
Fetching artist data from: https://www.artimage.org.uk/artists/g/gilbert-and-george/
Fetching artist data from: https://www.artimage.org.uk/artists/g/anya-gallaccio/
Fetching artist data from: https://www.artimage.org.uk/artists/g/douglas-gordon/
Fetching artist data from: https://www.artimage.org.uk/artists/g/nick-gentry/
Fetching artist data from: https://www.artimage.org.uk/artists/g/john-gerrard/
Fetching artist data from: https://www.artimage.org.uk/artists/g/hugo-glendinning/
Fetching artist data from: https://www.artimage.org.uk/artists/g/ori-gersht/
Fetching artist data from: https://www.artimage.org.uk/artists/g/sunil-gupta/
Fetching artist data from: https://www.artimage.org.uk/artists/g/margarita-gluzberg/
Fetching artist data from: https://www.artimage.org.uk/artists/g/joy-gregory/
Fetching artist data from: https://www.artimage.org.uk/artists/g/louise-giovanelli/
Fetching artist data from: htt

Fetching artist data from: https://www.artimage.org.uk/artists/n/ben-nicholson/
Fetching artist data from: https://www.artimage.org.uk/artists/n/beth-nicholas/
Fetching artist data from: https://www.artimage.org.uk/artists/n/david-noonan/
Fetching artist data from: https://www.artimage.org.uk/artists/n/mariele-neudecker/
Data saved page n successfully to art_image_data.csv
Processing page: o
Fetching artist data from: https://www.artimage.org.uk/artists/o/uriel-orlow/
Fetching artist data from: https://www.artimage.org.uk/artists/o/humphrey-ocean/
Fetching artist data from: https://www.artimage.org.uk/artists/o/julian-opie/
Data saved page o successfully to art_image_data.csv
Processing page: p
Fetching artist data from: https://www.artimage.org.uk/artists/p/jonathan-parsons/
Fetching artist data from: https://www.artimage.org.uk/artists/p/vong-phaophanit-and-claire-oboussier/
Fetching artist data from: https://www.artimage.org.uk/artists/p/peter-phillips/
Fetching artist data from: ht

Fetching artist data from: https://www.artimage.org.uk/artists/w/wood-and-harrison/
Fetching artist data from: https://www.artimage.org.uk/artists/w/caroline-walker/
Fetching artist data from: https://www.artimage.org.uk/artists/w/alberta-whittle/
Fetching artist data from: https://www.artimage.org.uk/artists/w/charmaine-watkiss/
Fetching artist data from: https://www.artimage.org.uk/artists/w/barbara-walker/
Fetching artist data from: https://www.artimage.org.uk/artists/w/simon-wheatley/
Data saved page w successfully to art_image_data.csv
Processing page: x
Data saved page x successfully to art_image_data.csv
Processing page: y
Fetching artist data from: https://www.artimage.org.uk/artists/y/li-yuan-chia/
Fetching artist data from: https://www.artimage.org.uk/artists/y/catherine-yass/
Fetching artist data from: https://www.artimage.org.uk/artists/y/marie-yates/
Fetching artist data from: https://www.artimage.org.uk/artists/y/jack-butler-yeats/
Fetching artist data from: https://www.a