In [30]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import os

In [31]:
df = pd.read_csv('data/art_data.csv')
df.head()

Unnamed: 0,AUTHOR,BORN-DIED,TITLE,DATE,TECHNIQUE,LOCATION,URL,FORM,TYPE,SCHOOL,TIMEFRAME
0,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Venus and Adonis,1574-88,"Oil on canvas, 68 x 95 cm","Fogg Art Museum, Harvard University, Cambridge",https://www.wga.hu/html/a/aachen/adonis.html,painting,mythological,German,1601-1650
1,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Allegory,1598,"Oil on copper, 56 x 47 cm","Alte Pinakothek, Munich",https://www.wga.hu/html/a/aachen/allegory.html,painting,mythological,German,1601-1650
2,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Allegory of Peace, Art and Abundance",1602,"Oil on canvas, 197 x 142 cm","The Hermitage, St. Petersburg",https://www.wga.hu/html/a/aachen/allegorz.html,painting,mythological,German,1601-1650
3,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Jupiter, Antiope and Cupid",1595-98,"Oil on copper, 31 x 21 cm","Kunsthistorisches Museum, Vienna",https://www.wga.hu/html/a/aachen/antiope.html,painting,mythological,German,1601-1650
4,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Pallas Athena, Venus and Juno",1593,"Oil on canvas, 54 x 67 cm","Museum of Fine Arts, Boston",https://www.wga.hu/html/a/aachen/athena.html,painting,mythological,German,1601-1650


In [32]:
# Keep only the rows where FORM is 'painting'
df = df[df['FORM'] == 'painting']

Need images scraped from url, and metadata(AUTHOR, DATE, TECHNIQUE, TYPE, SCHOOL, and TIMEFRAME)

In [33]:
def get_image_url(html_url):
    try:
        image_url_base = html_url.replace('/html/', '/detail/')
        image_url = image_url_base.replace('.html', '.jpg')
        print(f"Retrieved image URL: {image_url}")
        return image_url
    except Exception as e:
        print(f"Error occurred while converting HTML URL to image URL: {e}")
        return None

# test
img_url = get_image_url('https://www.wga.hu/html/a/aachen/athena.html')

Retrieved image URL: https://www.wga.hu/detail/a/aachen/athena.jpg


In [34]:
def download_image(url, save_path, image_name):
    response = requests.get(url, stream=True)

    if response.status_code == 200:
        image = Image.open(BytesIO(response.content))
        
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        
        full_path = os.path.join(save_path, image_name)
        
        # save image
        image.save(full_path)
        print(f"Image saved to {full_path}")
    else:
        print(f"Failed to retrieve image from {url}")

In [35]:
# get all image urls
for idx, url in enumerate(df['URL']):
    img_url = get_image_url(url)
    df.at[idx, 'img_url'] = img_url

Retrieved image URL: https://www.wga.hu/detail/a/aachen/adonis.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/allegory.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/allegorz.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/antiope.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/athena.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/bacchus.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/bacchus1.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/davidbat.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/gods.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/gods1.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/j_couple.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/k_couple.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/portrai1.jpg
Retrieved image URL: https://www.wga.hu/detail/a/aachen/portrai2.jpg
Retrieved image URL: https://www.wga.hu/detail/

In [36]:
# download images to data/images, idx here should be == to idx in df so not going to worry about that for now
# save_path = 'data/images_png'
# for idx, url in enumerate(df['img_url']):
#     image_name = f'image_{idx}.png'
#     download_image(url, save_path, image_name)

In [37]:
url = df['img_url'].loc[6511]
response = requests.get(url, stream=True)
with open('temp_image.jpeg', 'wb') as f:
    f.write(response.content)
# response.headers['Content-Type']
# Manually check 'temp_image' with an image viewer.
url

'https://www.wga.hu/detail/c/cortona/1/05barber.jpg'

In [44]:
# url = df['img_url'].loc[88]
# title = df['URL'].loc[88]
# print(url, title)
# # df.head()
# df.iloc[88]
none_img_url_indices = df[df['img_url'].isnull()].index.tolist()
print(none_img_url_indices)
df.iloc[33221]

[33159, 33160, 33161, 33162, 33179, 33180, 33221, 33222, 33223, 33224, 33229, 33230, 33231, 33232, 33233, 33234, 33235, 33236, 33237, 33238, 33239, 33240, 33241, 33246, 33247, 33248, 33252, 33253, 33254, 33255, 33256, 33257, 33258, 33259, 33260, 33261, 33262, 33263, 33264, 33265, 33266, 33267, 33268, 33269, 33270, 33271, 33272, 33273, 33274, 33275, 33276, 33277, 33278, 33279, 33280, 33281, 33282, 33283, 33284, 33285, 33286, 33287, 33288, 33289, 33290, 33291, 33292, 33293, 33294, 33295, 33296, 33297, 33298, 33299, 33302, 33303, 33304, 33311, 33312, 33313, 33314, 33315, 33316, 33317, 33318, 33319, 33320, 33321, 33322, 33323, 33324, 33325, 33326, 33327, 33328, 33329, 33330, 33331, 33332, 33333, 33334, 33335, 33336, 33337, 33338, 33339, 33340, 33341, 33342, 33343, 33344, 33345, 33346, 33347, 33348, 33349, 33350, 33351, 33352, 33353, 33354, 33355, 33356, 33357, 33358, 33359, 33360, 33362, 33363, 33364, 33365, 33366, 33367, 33368, 33369, 33370, 33371, 33372, 33373, 33374, 33375, 33376, 33377

AUTHOR                                                     NaN
BORN-DIED                                                  NaN
TITLE                                                      NaN
DATE                                                       NaN
TECHNIQUE                                                  NaN
LOCATION                                                   NaN
URL                                                        NaN
FORM                                                       NaN
TYPE                                                       NaN
SCHOOL                                                     NaN
TIMEFRAME                                                  NaN
img_url      https://www.wga.hu/detail/b/beccaruz/portr_man...
Name: 1694, dtype: object