In [101]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import os
import re
from urllib.parse import urlparse

In [86]:
# Load the artwork catalogue from disk and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index
# that was written out with the CSV - confirm whether it is needed.
art_csv_path = 'data/art_data.csv'
df = pd.read_csv(art_csv_path)
df.head()

Unnamed: 0,AUTHOR,BORN-DIED,TITLE,DATE,TECHNIQUE,LOCATION,URL,FORM,TYPE,SCHOOL,TIMEFRAME
0,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Venus and Adonis,1574-88,"Oil on canvas, 68 x 95 cm","Fogg Art Museum, Harvard University, Cambridge",https://www.wga.hu/html/a/aachen/adonis.html,painting,mythological,German,1601-1650
1,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Allegory,1598,"Oil on copper, 56 x 47 cm","Alte Pinakothek, Munich",https://www.wga.hu/html/a/aachen/allegory.html,painting,mythological,German,1601-1650
2,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Allegory of Peace, Art and Abundance",1602,"Oil on canvas, 197 x 142 cm","The Hermitage, St. Petersburg",https://www.wga.hu/html/a/aachen/allegorz.html,painting,mythological,German,1601-1650
3,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Jupiter, Antiope and Cupid",1595-98,"Oil on copper, 31 x 21 cm","Kunsthistorisches Museum, Vienna",https://www.wga.hu/html/a/aachen/antiope.html,painting,mythological,German,1601-1650
4,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Pallas Athena, Venus and Juno",1593,"Oil on canvas, 54 x 67 cm","Museum of Fine Arts, Boston",https://www.wga.hu/html/a/aachen/athena.html,painting,mythological,German,1601-1650


In [87]:
# Restrict the catalogue to rows whose FORM is 'painting', then renumber the
# rows from 0 so index labels and positions line up again.
is_painting = df['FORM'].eq('painting')
df = df.loc[is_painting].reset_index(drop=True)

In [88]:
# Preview the first 30 rows of the filtered frame.
df.head(30)

Unnamed: 0,AUTHOR,BORN-DIED,TITLE,DATE,TECHNIQUE,LOCATION,URL,FORM,TYPE,SCHOOL,TIMEFRAME
0,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Venus and Adonis,1574-88,"Oil on canvas, 68 x 95 cm","Fogg Art Museum, Harvard University, Cambridge",https://www.wga.hu/html/a/aachen/adonis.html,painting,mythological,German,1601-1650
1,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Allegory,1598,"Oil on copper, 56 x 47 cm","Alte Pinakothek, Munich",https://www.wga.hu/html/a/aachen/allegory.html,painting,mythological,German,1601-1650
2,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Allegory of Peace, Art and Abundance",1602,"Oil on canvas, 197 x 142 cm","The Hermitage, St. Petersburg",https://www.wga.hu/html/a/aachen/allegorz.html,painting,mythological,German,1601-1650
3,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Jupiter, Antiope and Cupid",1595-98,"Oil on copper, 31 x 21 cm","Kunsthistorisches Museum, Vienna",https://www.wga.hu/html/a/aachen/antiope.html,painting,mythological,German,1601-1650
4,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Pallas Athena, Venus and Juno",1593,"Oil on canvas, 54 x 67 cm","Museum of Fine Arts, Boston",https://www.wga.hu/html/a/aachen/athena.html,painting,mythological,German,1601-1650
5,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Bacchus, Ceres and Cupid",1595-1605,"Oil on canvas, 163 x 113 cm","Kunsthistorisches Museum, Vienna",https://www.wga.hu/html/a/aachen/bacchus.html,painting,mythological,German,1601-1650
6,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Bacchus, Venus and Cupid",1595-1600,"Oil on canvas, 63 x 50 cm","Kunsthistorisches Museum, Vienna",https://www.wga.hu/html/a/aachen/bacchus1.html,painting,mythological,German,1601-1650
7,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",David and Bathsheba,1612-15,"Oil on canvas, 128 x 105 cm","Kunsthistorisches Museum, Vienna",https://www.wga.hu/html/a/aachen/davidbat.html,painting,historical,German,1601-1650
8,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",The Amazement of the Gods,1590s,"Oil on copper, 36 x 46 cm","National Gallery, London",https://www.wga.hu/html/a/aachen/gods.html,painting,mythological,German,1601-1650
9,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Pan and Selene,1600-05,"Oil on panel, 40 x 49 cm",Private collection,https://www.wga.hu/html/a/aachen/gods1.html,painting,mythological,German,1601-1650


Goal: scrape the image for each painting from its URL, and keep the metadata columns (AUTHOR, DATE, TECHNIQUE, TYPE, SCHOOL, and TIMEFRAME).

In [89]:
def get_image_url(html_url):
    """Convert an artwork page URL into the URL of its detail image.

    The first '/html/' path segment becomes '/detail/' and the last '.html'
    becomes '.jpg', e.g.
    'https://www.wga.hu/html/a/aachen/athena.html' ->
    'https://www.wga.hu/detail/a/aachen/athena.jpg'.

    Args:
        html_url: Page URL as a string.

    Returns:
        The derived image URL, or None when ``html_url`` is not a string
        (for example a NaN coming from an empty CSV cell).
    """
    if not isinstance(html_url, str):
        print(
            "Error occurred while converting HTML URL to image URL: "
            f"expected str, got {type(html_url).__name__}"
        )
        return None

    # Rewrite only the leading directory; a later path component that happens
    # to be called 'html' must stay untouched.
    image_url = html_url.replace('/html/', '/detail/', 1)

    # Swap only the final '.html'; a blanket replace would also rewrite a
    # '.html' that appears earlier in the path.
    head, extension, tail = image_url.rpartition('.html')
    if extension:
        image_url = f"{head}.jpg{tail}"

    print(f"Retrieved image URL: {image_url}")
    return image_url

# test: a known page URL must map to the matching detail-image URL
img_url = get_image_url('https://www.wga.hu/html/a/aachen/athena.html')
assert img_url == 'https://www.wga.hu/detail/a/aachen/athena.jpg', img_url

Retrieved image URL: https://www.wga.hu/detail/a/aachen/athena.jpg


In [90]:
def download_image(url, save_path, image_name, timeout=30):
    """Download an image and save it as ``save_path/image_name``.

    Args:
        url: Address of the image to fetch.
        save_path: Directory to save into; created if it does not exist.
        image_name: File name (including extension) for the saved image.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a single stalled connection would block the caller
            indefinitely.

    Returns:
        True if the image was saved, False if the server answered with a
        status code other than 200.

    Raises:
        Any exception raised by ``requests.get``, ``Image.open`` or
        ``image.save`` propagates to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to retrieve image from {url}")
        return False

    image = Image.open(BytesIO(response.content))

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(save_path, exist_ok=True)
    full_path = os.path.join(save_path, image_name)

    # save image
    image.save(full_path)
    print(f"Image saved to {full_path}")
    return True

In [92]:
# get all image urls
# Assign the whole column in one pass instead of writing cell by cell with
# df.at: this no longer relies on enumerate() positions matching the index
# labels and avoids one DataFrame write per row.
df['img_url'] = df['URL'].map(get_image_url)

Retrieved image URL: https://www.wga.hu/detail/a/aachen/adonis.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/allegory.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/allegorz.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/antiope.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/athena.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/bacchus.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/bacchus1.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/davidbat.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/gods.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/gods1.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/j_couple.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/k_couple.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/portrai1.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/portrai2.jpg
Retrieved image URL: https://www.wga.hu/detail/

In [93]:
# Print every page URL, one per line.
for page_url in df['URL']:
    print(page_url)

https://www.wga.hu/html/a/aachen/adonis.html
https://www.wga.hu/html/a/aachen/allegory.html
https://www.wga.hu/html/a/aachen/allegorz.html
https://www.wga.hu/html/a/aachen/antiope.html
https://www.wga.hu/html/a/aachen/athena.html
https://www.wga.hu/html/a/aachen/bacchus.html
https://www.wga.hu/html/a/aachen/bacchus1.html
https://www.wga.hu/html/a/aachen/davidbat.html
https://www.wga.hu/html/a/aachen/gods.html
https://www.wga.hu/html/a/aachen/gods1.html
https://www.wga.hu/html/a/aachen/j_couple.html
https://www.wga.hu/html/a/aachen/k_couple.html
https://www.wga.hu/html/a/aachen/portrai1.html
https://www.wga.hu/html/a/aachen/portrai2.html
https://www.wga.hu/html/a/aachen/portrai3.html
https://www.wga.hu/html/a/aachen/portrai4.html
https://www.wga.hu/html/a/aachen/rudolf2.html
https://www.wga.hu/html/a/aachen/selfpor1.html
https://www.wga.hu/html/a/aachen/selfport.html
https://www.wga.hu/html/a/aachen/war.html
https://www.wga.hu/html/a/aachen/z_scene.html
https://www.wga.hu/html/a/aagaard

In [94]:
# Spot-check the derived image URL of one row (4095 is presumably an arbitrary sample).
df.loc[4095, 'img_url']

'https://www.wga.hu/detail/b/bruegel/jan_e/2/5sense1.jpg'

In [95]:
# Download every image from `start_idx` onward into `path`, naming each file
# '<row index>_image.png' and collecting the rows that could not be fetched.
path = 'data/images'
faulty_idx = []
# NOTE(review): presumably a resume point from an earlier partial run - set to
# 0 to download the full dataset.
start_idx = 26388
for idx, url in enumerate(df['img_url'][start_idx:], start=start_idx):
    image_name = f'{idx}_image.png'
    try:
        download_image(url, path, image_name)
    except Exception as exc:
        # `except Exception` rather than a bare `except` so that interrupting
        # the kernel (KeyboardInterrupt) still stops this long-running loop.
        print(f"Failed to download {url}: {exc}")
        faulty_idx.append(idx)
        continue
    # download_image does not raise on a non-200 response, so confirm that the
    # file actually exists before treating the row as downloaded.
    if not os.path.exists(os.path.join(path, image_name)):
        faulty_idx.append(idx)
        
        


Image saved to data/images/26388_image.png
Image saved to data/images/26389_image.png
Image saved to data/images/26390_image.png
Image saved to data/images/26391_image.png
Image saved to data/images/26392_image.png
Image saved to data/images/26393_image.png
Image saved to data/images/26394_image.png
Image saved to data/images/26395_image.png
Image saved to data/images/26396_image.png
Image saved to data/images/26397_image.png
Image saved to data/images/26398_image.png
Image saved to data/images/26399_image.png
Image saved to data/images/26400_image.png
Image saved to data/images/26401_image.png
Image saved to data/images/26402_image.png
Image saved to data/images/26403_image.png
Image saved to data/images/26404_image.png
Image saved to data/images/26405_image.png
Image saved to data/images/26406_image.png
Image saved to data/images/26407_image.png
Image saved to data/images/26408_image.png
Image saved to data/images/26409_image.png
Image saved to data/images/26410_image.png
Image saved

In [109]:
# checking for missing IDXs
# Compare the row indices we expect (0 .. len(df) - 1) with the
# '<idx>_image.png' files that are actually present in data/images.
image_files = os.listdir('data/images')
image_name_pattern = re.compile(r'(\d+)_image\.png')
# fullmatch (not match) so leftovers such as '12_image.png.part' are not
# counted as successfully downloaded images; each name is matched only once.
image_matches = (image_name_pattern.fullmatch(filename) for filename in image_files)
image_indices = [int(found.group(1)) for found in image_matches if found]

expected_indices = set(range(len(df)))
downloaded_indices = set(image_indices)

missing_indices = expected_indices - downloaded_indices

# Rebuilt from scratch on every run, so re-running the cell cannot accumulate
# duplicate entries.
updated_faulty_idx = sorted(missing_indices)
print(f'Bad indexes are: {updated_faulty_idx}. Filter these out when creating zipped dataset if need be.')

Bad indexes are: [11578, 11583, 11578, 11583]. Filter these out when creating zipped dataset if need be.
