In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import os

In [4]:
df = pd.read_csv('data/art_data.csv')
df.head()

Unnamed: 0,AUTHOR,BORN-DIED,TITLE,DATE,TECHNIQUE,LOCATION,URL,FORM,TYPE,SCHOOL,TIMEFRAME
0,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Venus and Adonis,1574-88,"Oil on canvas, 68 x 95 cm","Fogg Art Museum, Harvard University, Cambridge",https://www.wga.hu/html/a/aachen/adonis.html,painting,mythological,German,1601-1650
1,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Allegory,1598,"Oil on copper, 56 x 47 cm","Alte Pinakothek, Munich",https://www.wga.hu/html/a/aachen/allegory.html,painting,mythological,German,1601-1650
2,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Allegory of Peace, Art and Abundance",1602,"Oil on canvas, 197 x 142 cm","The Hermitage, St. Petersburg",https://www.wga.hu/html/a/aachen/allegorz.html,painting,mythological,German,1601-1650
3,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Jupiter, Antiope and Cupid",1595-98,"Oil on copper, 31 x 21 cm","Kunsthistorisches Museum, Vienna",https://www.wga.hu/html/a/aachen/antiope.html,painting,mythological,German,1601-1650
4,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Pallas Athena, Venus and Juno",1593,"Oil on canvas, 54 x 67 cm","Museum of Fine Arts, Boston",https://www.wga.hu/html/a/aachen/athena.html,painting,mythological,German,1601-1650


In [5]:
# Keep only the rows where FORM is 'painting'
df = df[df['FORM'] == 'painting']

Need images scraped from url, and metadata(AUTHOR, DATE, TECHNIQUE, TYPE, SCHOOL, and TIMEFRAME)

In [6]:
def get_image_url(html_url):
    # I was going to use bs4 but realised how simple the url difference was and this is way more efficient
    image_url_base = html_url.replace('/html/', '/detail/')
    image_url = image_url_base.replace('.html', '.jpg')
    print(f"retrieved image url: {image_url}")
    return image_url
# test
img_url = get_image_url('https://www.wga.hu/html/a/aachen/athena.html')

retrieved image url: https://www.wga.hu/detail/a/aachen/athena.jpg


In [7]:
def download_image(url, save_path, image_name):
    response = requests.get(url, stream=True)

    if response.status_code == 200:
        image = Image.open(BytesIO(response.content))
        
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        
        full_path = os.path.join(save_path, image_name)
        
        # save image
        image.save(full_path)
        print(f"Image saved to {full_path}")
    else:
        print(f"Failed to retrieve image from {url}")

In [8]:
# get all image urls
for idx, url in enumerate(df['URL']):
    img_url = get_image_url(url)
    df.at[idx, 'img_url'] = img_url

retrieved image url: https://www.wga.hu/detail/a/aachen/adonis.jpg
retrieved image url: https://www.wga.hu/detail/a/aachen/allegory.jpg
retrieved image url: https://www.wga.hu/detail/a/aachen/allegorz.jpg
retrieved image url: https://www.wga.hu/detail/a/aachen/antiope.jpg
retrieved image url: https://www.wga.hu/detail/a/aachen/athena.jpg
retrieved image url: https://www.wga.hu/detail/a/aachen/bacchus.jpg
retrieved image url: https://www.wga.hu/detail/a/aachen/bacchus1.jpg
retrieved image url: https://www.wga.hu/detail/a/aachen/davidbat.jpg
retrieved image url: https://www.wga.hu/detail/a/aachen/gods.jpg
retrieved image url: https://www.wga.hu/detail/a/aachen/gods1.jpg
retrieved image url: https://www.wga.hu/detail/a/aachen/j_couple.jpg
retrieved image url: https://www.wga.hu/detail/a/aachen/k_couple.jpg
retrieved image url: https://www.wga.hu/detail/a/aachen/portrai1.jpg
retrieved image url: https://www.wga.hu/detail/a/aachen/portrai2.jpg
retrieved image url: https://www.wga.hu/detail/

In [9]:
# download images to data/images, idx here should be == to idx in df so not going to worry about that for now
save_path = 'data/images_png'
for idx, url in enumerate(df['img_url']):
    image_name = f'image_{idx}.png'
    download_image(url, save_path, image_name)

Image saved to data/images_png/image_0.png
Image saved to data/images_png/image_1.png
Image saved to data/images_png/image_2.png
Image saved to data/images_png/image_3.png
Image saved to data/images_png/image_4.png
Image saved to data/images_png/image_5.png
Image saved to data/images_png/image_6.png
Image saved to data/images_png/image_7.png
Image saved to data/images_png/image_8.png
Image saved to data/images_png/image_9.png
Image saved to data/images_png/image_10.png
Image saved to data/images_png/image_11.png
Image saved to data/images_png/image_12.png
Image saved to data/images_png/image_13.png
Image saved to data/images_png/image_14.png
Image saved to data/images_png/image_15.png
Image saved to data/images_png/image_16.png
Image saved to data/images_png/image_17.png
Image saved to data/images_png/image_18.png
Image saved to data/images_png/image_19.png
Image saved to data/images_png/image_20.png
Image saved to data/images_png/image_21.png
Image saved to data/images_png/image_22.pn

UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x11048cc70>

In [25]:
url = df['img_url'].loc[6511]
response = requests.get(url, stream=True)
with open('temp_image.jpeg', 'wb') as f:
    f.write(response.content)
# response.headers['Content-Type']
# Manually check 'temp_image' with an image viewer.
url

'image/jpeg'