In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re
from urllib.parse import urlparse

In [5]:
df = pd.read_csv('data/art_data.csv')
df.head()

Unnamed: 0,AUTHOR,BORN-DIED,TITLE,DATE,TECHNIQUE,LOCATION,URL,FORM,TYPE,SCHOOL,TIMEFRAME
0,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Venus and Adonis,1574-88,"Oil on canvas, 68 x 95 cm","Fogg Art Museum, Harvard University, Cambridge",https://www.wga.hu/html/a/aachen/adonis.html,painting,mythological,German,1601-1650
1,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Allegory,1598,"Oil on copper, 56 x 47 cm","Alte Pinakothek, Munich",https://www.wga.hu/html/a/aachen/allegory.html,painting,mythological,German,1601-1650
2,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Allegory of Peace, Art and Abundance",1602,"Oil on canvas, 197 x 142 cm","The Hermitage, St. Petersburg",https://www.wga.hu/html/a/aachen/allegorz.html,painting,mythological,German,1601-1650
3,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Jupiter, Antiope and Cupid",1595-98,"Oil on copper, 31 x 21 cm","Kunsthistorisches Museum, Vienna",https://www.wga.hu/html/a/aachen/antiope.html,painting,mythological,German,1601-1650
4,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Pallas Athena, Venus and Juno",1593,"Oil on canvas, 54 x 67 cm","Museum of Fine Arts, Boston",https://www.wga.hu/html/a/aachen/athena.html,painting,mythological,German,1601-1650


In [6]:
# Keep only the rows where FORM is 'painting'
df = df[df['FORM'] == 'painting']
df = df.reset_index(drop=True)

In [7]:
df.head()

Unnamed: 0,AUTHOR,BORN-DIED,TITLE,DATE,TECHNIQUE,LOCATION,URL,FORM,TYPE,SCHOOL,TIMEFRAME
0,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Venus and Adonis,1574-88,"Oil on canvas, 68 x 95 cm","Fogg Art Museum, Harvard University, Cambridge",https://www.wga.hu/html/a/aachen/adonis.html,painting,mythological,German,1601-1650
1,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Allegory,1598,"Oil on copper, 56 x 47 cm","Alte Pinakothek, Munich",https://www.wga.hu/html/a/aachen/allegory.html,painting,mythological,German,1601-1650
2,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Allegory of Peace, Art and Abundance",1602,"Oil on canvas, 197 x 142 cm","The Hermitage, St. Petersburg",https://www.wga.hu/html/a/aachen/allegorz.html,painting,mythological,German,1601-1650
3,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Jupiter, Antiope and Cupid",1595-98,"Oil on copper, 31 x 21 cm","Kunsthistorisches Museum, Vienna",https://www.wga.hu/html/a/aachen/antiope.html,painting,mythological,German,1601-1650
4,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)","Pallas Athena, Venus and Juno",1593,"Oil on canvas, 54 x 67 cm","Museum of Fine Arts, Boston",https://www.wga.hu/html/a/aachen/athena.html,painting,mythological,German,1601-1650


In [8]:
# drop all styles that isnt portrait, landscape, still life, or genre
df['TYPE'].value_counts()
styles = ['landscape','genre','portrait','still-life']
df = df[df['TYPE'].isin(styles)]

In [9]:
df = df.reset_index(drop=True)

Need images scraped from url, and metadata(AUTHOR, DATE, TECHNIQUE, TYPE, SCHOOL, and TIMEFRAME)

In [13]:
def get_image_url(html_url):
    try:
        image_url_base = html_url.replace('/html/', '/art/')
        image_url = image_url_base.replace('.html', '.jpg')
        print(f"Retrieved image URL: {image_url}")
        return image_url
    except Exception as e:
        print(f"Error occurred while converting HTML URL to image URL: {e}")
        return None

# test
img_url = get_image_url('https://www.wga.hu/html/a/aachen/athena.html')

Retrieved image URL: https://www.wga.hu/art/a/aachen/athena.jpg


In [20]:
# def download_image(url, save_path, image_name):
#     try:
#         response = requests.get(url, stream=True)
#         # If the first attempt fails, change /art/ to /detail/ --> Same img but smaller
#         if response.status_code != 200:
#             print(f"Failed to retrieve image from {url}, trying altered URL...")
#             url = url.replace('/art/', '/detail/')
#             response = requests.get(url, stream=True)
#         if response.status_code == 200:
#             image = Image.open(BytesIO(response.content))
#             if not os.path.exists(save_path):
#                 os.makedirs(save_path)
#             full_path = os.path.join(save_path, image_name)
#             image.save(full_path)
#             print(f"Image saved to {full_path}")
#         else:
#             print(f"Failed to retrieve image from both URLs for {image_name}")
    
#     except Exception as e:
#         print(f"An error occurred while trying to download the image: {e}")
def download_image(url, save_path, image_name):
    def try_download_image(download_url):
        response = requests.get(download_url, stream=True)
        if response.status_code == 200 and 'image' in response.headers.get('Content-Type', ''):
            return Image.open(BytesIO(response.content))
        else:
            raise Exception(f"Failed to retrieve image from {download_url}")

    try:
        # First attempt to download the image using the original URL
        image = try_download_image(url)
    except Exception as UnidentifiedImageError:
        # If the first attempt fails due to an UnidentifiedImageError, try with the altered URL
        print(f"Failed to retrieve image from {url}, trying altered URL...")
        try:
            altered_url = url.replace('/art/', '/detail/')
            image = try_download_image(altered_url)
        except Exception as e:
            # If the second attempt also fails, log the error and exit the function
            print(e)
            return
    except Exception as e:
        # Catch any other exceptions that may occur
        print(e)
        return

    # If we've successfully got an image, save it
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    full_path = os.path.join(save_path, image_name)
    image.save(full_path)
    print(f"Image saved to {full_path}")

In [7]:
def get_description(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        description_elements = soup.select('td p')        
        if description_elements:
            description = description_elements[0].get_text(strip=True)
            if len(description)<1:
                # try description_elements[1]
                return 'empty'
            print(f'scraped idx {idx}')
            return description
        else:
            return ("No <p> tags were found within <td> tags.")

In [15]:
# landscape_df = df[df['TYPE'] == 'landscape']
# landscape_df.head()
# landscape_df.reset_index(drop=True)
# # get all image urls

for idx, url in enumerate(landscape_df['URL']):
    img_url = get_image_url(url)
    landscape_df.at[idx, 'img_url'] = img_url

Retrieved image URL: https://www.wga.hu/art/a/abbate/deerhunt.jpg
Retrieved image URL: https://www.wga.hu/art/a/abbati/abbati1.jpg
Retrieved image URL: https://www.wga.hu/art/a/abbati/abbati2.jpg
Retrieved image URL: https://www.wga.hu/art/a/abbati/abbati3.jpg
Retrieved image URL: https://www.wga.hu/art/a/achenbac/firework.jpg
Retrieved image URL: https://www.wga.hu/art/a/albotto/campo.jpg
Retrieved image URL: https://www.wga.hu/art/a/albotto/giuseppe.jpg
Retrieved image URL: https://www.wga.hu/art/a/allegrai/etienne/landmose.jpg
Retrieved image URL: https://www.wga.hu/art/a/allegrai/etienne/riverla1.jpg
Retrieved image URL: https://www.wga.hu/art/a/allegrai/etienne/riverla2.jpg
Retrieved image URL: https://www.wga.hu/art/a/alsloot/extensiv.jpg
Retrieved image URL: https://www.wga.hu/art/a/alsloot/skating.jpg
Retrieved image URL: https://www.wga.hu/art/a/alsloot/winter_l.jpg
Retrieved image URL: https://www.wga.hu/art/a/alsloot/winterla.jpg
Retrieved image URL: https://www.wga.hu/art/a

Make df s based on type and then split by nationality and year

ie. Portraits --> Italian 17th Century, Dutch 18th Century, Italian 14th Century etc


In [18]:
# Replace 'DATE' values with 'TIMEFRAME' values where 'DATE' is '-'
landscape_df.loc[landscape_df['DATE'] == '-', 'DATE'] = landscape_df.loc[landscape_df['DATE'] == '-', 'TIMEFRAME']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  landscape_df.loc[landscape_df['DATE'] == '-', 'DATE'] = landscape_df.loc[landscape_df['DATE'] == '-', 'TIMEFRAME']


In [19]:
def normalize_date(date_str):
    try:
        date_str = date_str.strip().lower()
        match_4 = re.search(r'\d{4}', date_str)
        match_3 = re.search(r'\d{3}', date_str)
        
        if date_str == '-':
            return None

        if match_4:
            return int(match_4.group(0))
        elif match_3:
            return int(match_3.group(0))
        else:
            print(f'no match no error: {date_str}')
            return None
    except ValueError:
        print(f"Value causing issue: '{date_str}'")
        return None

In [20]:
landscape_df['DATE_NORMALIZED'] = landscape_df['DATE'].apply(normalize_date)


no match no error: -.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  landscape_df['DATE_NORMALIZED'] = landscape_df['DATE'].apply(normalize_date)


In [21]:
landscape_df = landscape_df.dropna(subset=['DATE_NORMALIZED'])
landscape_df.reset_index(drop=True)

Unnamed: 0,AUTHOR,BORN-DIED,TITLE,DATE,TECHNIQUE,LOCATION,URL,FORM,TYPE,SCHOOL,TIMEFRAME,DATE_NORMALIZED
0,"AAGAARD, Carl Frederik","(b. 1833, Odense, d. 1895, København)",Deer beside a Lake,1888,"Oil on canvas, 53 x 82 cm",Private collection,https://www.wga.hu/html/a/aagaard/deerlake.html,painting,landscape,Danish,1851-1900,1888.0
1,"AAGAARD, Carl Frederik","(b. 1833, Odense, d. 1895, København)",The Rose Garden,1877,"Oil on canvas, 98 x 80 cm",Private collection,https://www.wga.hu/html/a/aagaard/rosegard.html,painting,landscape,Danish,1851-1900,1877.0
2,"ABBATE, Niccolò dell'","(b. 1509, Modena, d. 1571, Fontainebleau)",Stag Hunt,1550-52,"Oil on canvas, 116 x 159 cm","Galleria Borghese, Rome",https://www.wga.hu/html/a/abbate/deerhunt.html,painting,landscape,Italian,1501-1550,1550.0
3,"ABBATI, Giuseppe","(b. 1836, Napoli, d. 1868, Firenze)",Landscape at Castiglioncello,1863,"Oil on panel, 10 x 30 cm","Galleria dell'Arte Moderna, Palazzo Pitti, Flo...",https://www.wga.hu/html/a/abbati/abbati1.html,painting,landscape,Italian,1851-1900,1863.0
4,"ABBATI, Giuseppe","(b. 1836, Napoli, d. 1868, Firenze)",Country Road with Cypresses,c. 1860,"Oil on canvas, 28 x 37 cm","Galleria dell'Arte Moderna, Palazzo Pitti, Flo...",https://www.wga.hu/html/a/abbati/abbati2.html,painting,landscape,Italian,1851-1900,1860.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4097,"ZUCCARELLI, Francesco","(b. 1702, Pitigliano, d. 1788, Firenze)",Landscape with a Woman Leading a Cow,before 1774,"Oil on canvas, 60 x 88 cm","The Hermitage, St. Petersburg",https://www.wga.hu/html/z/zuccarel/land_cow.html,painting,landscape,Italian,1701-1750,1774.0
4098,"ZUCCARELLI, Francesco","(b. 1702, Pitigliano, d. 1788, Firenze)",Landscape with Bridge,c. 1735,"Oil on canvas, 56 x 73 cm","Szépmûvészeti Múzeum, Budapest",https://www.wga.hu/html/z/zuccarel/landbrid.html,painting,landscape,Italian,1701-1750,1735.0
4099,"ZUCCARELLI, Francesco","(b. 1702, Pitigliano, d. 1788, Firenze)",Landscape with Girls at the River,1760s,"Oil on canvas, 118 x 137 cm","Gallerie dell'Accademia, Venice",https://www.wga.hu/html/z/zuccarel/landgirl.html,painting,landscape,Italian,1701-1750,1760.0
4100,"ZUCCARELLI, Francesco","(b. 1702, Pitigliano, d. 1788, Firenze)",Landscape with a Rider,1701-1750,Oil on canvas,"Accademia Carrara, Bergamo",https://www.wga.hu/html/z/zuccarel/landscap.html,painting,landscape,Italian,1701-1750,1701.0


In [23]:
landscape_df['CENTURY'] = ((landscape_df['DATE_NORMALIZED'] // 100) + 1).astype(int)

CLEAN ENTIRE DF, 
1) DROP NATIONALITES THAT ARE NEGLIGIBLE
2) DROP CENTURIES <15 and >19
3) DROP BORN-DIED, DATE, LOCATION, 'FORM', 'TIMEFRAME'
4) RENAME DATE_NORMALIZED TO YEAR
5) ADD SOURCE --> ALL SOURCES ARE FROM WGA
6) ADD PATH
7) CLEAN AUTHOR TITLE
8) MAKE YEAR AN INT

In [24]:
# these are the 6 most abundant nationaionalities, everything else is negligible
nationalities = ['Italian', 'Spanish', 'Dutch', 'Flemish', 'French', 'German']
# df = df[df['SCHOOL'].isin(nationalities)]
landscape_df = landscape_df[landscape_df['SCHOOL'].isin(nationalities)]

# df = df[(df['CENTURY'] >= 15) & (df['CENTURY'] <= 19)]
landscape_df = landscape_df[(landscape_df['CENTURY'] >= 15) & (landscape_df['CENTURY'] <= 19)]

columns_to_drop = ['BORN-DIED', 'DATE', 'LOCATION', 'FORM', 'TIMEFRAME']
# df = df.drop(columns=columns_to_drop)
landscape_df = landscape_df.drop(columns=columns_to_drop)

# df['PATH'] = ''
# df['SOURCE'] = 'WGA'
landscape_df['PATH'] = ''
landscape_df['SOURCE'] = 'WGA'

# df.rename(columns={'DATE_NORMALIZED': 'YEAR'}, inplace=True)
landscape_df.rename(columns={'DATE_NORMALIZED': 'YEAR'}, inplace=True)
landscape_df['YEAR'] = landscape_df['YEAR'].astype(int)

landscape_df['AUTHOR'] = landscape_df['AUTHOR'].apply(lambda x: x.split(',')[0].strip().title().replace(' ', ''))

In [25]:
landscape_df = landscape_df.reset_index(drop=True)

In [26]:
# df['Path'] = df.apply(lambda row: f"data/{row['TYPE']+'s'}/{row['CENTURY']}th_Century_{row['SCHOOL'].replace(' ', '_')}", axis=1)
landscape_df['Path'] = landscape_df.apply(lambda row: f"data/{row['TYPE']+'s'}/{row['CENTURY']}th_Century_{row['SCHOOL'].replace(' ', '_')}", axis=1)

## !!! I AM HERE !!!!

In [17]:
# landscape_df.to_csv('data/WGA_landscapes/landscape_csv.csv', index=False)
landscape_df.head()
# landscape_df = pd.read_csv('data/WGA_landscapes/landscape_csv.csv')

Unnamed: 0,AUTHOR,TITLE,TECHNIQUE,URL,TYPE,SCHOOL,YEAR,CENTURY,PATH,SOURCE,Path,DESCRIPTION,img_url
0,Abbate,Stag Hunt,"Oil on canvas, 116 x 159 cm",https://www.wga.hu/html/a/abbate/deerhunt.html,landscape,Italian,1550,16,,WGA,data/landscapes/16th_Century_Italian,In a fabulous setting we see the last moments ...,https://www.wga.hu/art/a/abbate/deerhunt.jpg
1,Abbati,Landscape at Castiglioncello,"Oil on panel, 10 x 30 cm",https://www.wga.hu/html/a/abbati/abbati1.html,landscape,Italian,1863,19,,WGA,data/landscapes/19th_Century_Italian,The mid-nineteenth century in Italy was the pe...,https://www.wga.hu/art/a/abbati/abbati1.jpg
2,Abbati,Country Road with Cypresses,"Oil on canvas, 28 x 37 cm",https://www.wga.hu/html/a/abbati/abbati2.html,landscape,Italian,1860,19,,WGA,data/landscapes/19th_Century_Italian,"To this day, the hilly Tuscan countryside is d...",https://www.wga.hu/art/a/abbati/abbati2.jpg
3,Abbati,The Cloister of Santa Croce,"Oil on cardboard, 19 x 25 cm",https://www.wga.hu/html/a/abbati/abbati3.html,landscape,Italian,1861,19,,WGA,data/landscapes/19th_Century_Italian,Abbati painted in the cloisters of Santa Croce...,https://www.wga.hu/art/a/abbati/abbati3.jpg
4,Achenbach,Fireworks in Naples,"Oil on canvas, 66 x 102 cm",https://www.wga.hu/html/a/achenbac/firework.html,landscape,German,1875,19,,WGA,data/landscapes/19th_Century_German,Achenbach's favourite theme was the striking s...,https://www.wga.hu/art/a/achenbac/firework.jpg


In [8]:
landscape_df['DESCRIPTION'] = ''
for idx, url in enumerate(landscape_df['URL']):
    img_description = get_description(url)
    landscape_df.at[idx, 'DESCRIPTION'] = img_description

scraped idx 0
scraped idx 1
scraped idx 2
scraped idx 3
scraped idx 4
scraped idx 5
scraped idx 7
scraped idx 8
scraped idx 9
scraped idx 10
scraped idx 11
scraped idx 13
scraped idx 14
scraped idx 15
scraped idx 16
scraped idx 17
scraped idx 18
scraped idx 19
scraped idx 20
scraped idx 21
scraped idx 22
scraped idx 24
scraped idx 26
scraped idx 27
scraped idx 28
scraped idx 29
scraped idx 30
scraped idx 32
scraped idx 34
scraped idx 35
scraped idx 36
scraped idx 38
scraped idx 39
scraped idx 40
scraped idx 42
scraped idx 43
scraped idx 44
scraped idx 45
scraped idx 46
scraped idx 47
scraped idx 48
scraped idx 49
scraped idx 52
scraped idx 53
scraped idx 54
scraped idx 55
scraped idx 59
scraped idx 60
scraped idx 62
scraped idx 63
scraped idx 64
scraped idx 66
scraped idx 67
scraped idx 68
scraped idx 69
scraped idx 70
scraped idx 71
scraped idx 72
scraped idx 73
scraped idx 74
scraped idx 75
scraped idx 76
scraped idx 78
scraped idx 81
scraped idx 83
scraped idx 84
scraped idx 85
scra

SPLIT STYLE TYPES INTO CENTURY_NATIONALITY CSVs

In [29]:
portraits_df = df[df['TYPE'] == 'portrait']
genre_df = df[df['TYPE'] == 'genre']
landscape_df = df[df['TYPE'] == 'landscape']
still_life_df = df[df['TYPE'] == 'still-life']

In [1]:
# portrait_dfs = {}

# for (century, school), group_df in portraits_df.groupby(['CENTURY', 'SCHOOL']):
#     key = f"{century}th_Century_{school}"
#     portrait_dfs[key] = group_df
landscape_df.head()

NameError: name 'landscape_df' is not defined

In [18]:
path = 'data/WGA_landscapes'

for (century, school), group_df in landscape_df.groupby(['CENTURY', 'SCHOOL']):
    # Make dir name based on the century and school
    dir_name = f"{century}th_Century_{school}"
    dir_path = os.path.join(path, dir_name)
    os.makedirs(dir_path, exist_ok=True)
    
    # make csv name now and save it in the new dir
    filename = f"{dir_name}.csv"
    filepath = os.path.join(dir_path, filename)
    group_df.to_csv(filepath, index=False)
    
    print(f"Saved: {filepath}")

Saved: data/WGA_landscapes/15th_Century_German/15th_Century_German.csv
Saved: data/WGA_landscapes/15th_Century_Italian/15th_Century_Italian.csv
Saved: data/WGA_landscapes/16th_Century_Dutch/16th_Century_Dutch.csv
Saved: data/WGA_landscapes/16th_Century_Flemish/16th_Century_Flemish.csv
Saved: data/WGA_landscapes/16th_Century_German/16th_Century_German.csv
Saved: data/WGA_landscapes/16th_Century_Italian/16th_Century_Italian.csv
Saved: data/WGA_landscapes/16th_Century_Spanish/16th_Century_Spanish.csv
Saved: data/WGA_landscapes/17th_Century_Dutch/17th_Century_Dutch.csv
Saved: data/WGA_landscapes/17th_Century_Flemish/17th_Century_Flemish.csv
Saved: data/WGA_landscapes/17th_Century_French/17th_Century_French.csv
Saved: data/WGA_landscapes/17th_Century_German/17th_Century_German.csv
Saved: data/WGA_landscapes/17th_Century_Italian/17th_Century_Italian.csv
Saved: data/WGA_landscapes/17th_Century_Spanish/17th_Century_Spanish.csv
Saved: data/WGA_landscapes/18th_Century_Dutch/18th_Century_Dutch.cs

In [32]:
genre_dfs = {}

for (century, school), group_df in genre_df.groupby(['CENTURY', 'SCHOOL']):
    key = f"{century}th_Century_{school}"
    genre_dfs[key] = group_df

In [34]:
path = 'data/genres'

for (century, school), group_df in genre_df.groupby(['CENTURY', 'SCHOOL']):
    # Make dir name based on the century and school
    dir_name = f"{century}th_Century_{school}"
    dir_path = os.path.join(path, dir_name)
    os.makedirs(dir_path, exist_ok=True)
    
    # make csv name now and save it in the new dir
    filename = f"{dir_name}.csv"
    filepath = os.path.join(dir_path, filename)
    group_df.to_csv(filepath, index=False)
    
    print(f"Saved: {filepath}")

Saved: data/genres/15th_Century_Flemish/15th_Century_Flemish.csv
Saved: data/genres/15th_Century_Italian/15th_Century_Italian.csv
Saved: data/genres/16th_Century_Dutch/16th_Century_Dutch.csv
Saved: data/genres/16th_Century_Flemish/16th_Century_Flemish.csv
Saved: data/genres/16th_Century_French/16th_Century_French.csv
Saved: data/genres/16th_Century_German/16th_Century_German.csv
Saved: data/genres/16th_Century_Italian/16th_Century_Italian.csv
Saved: data/genres/16th_Century_Spanish/16th_Century_Spanish.csv
Saved: data/genres/17th_Century_Dutch/17th_Century_Dutch.csv
Saved: data/genres/17th_Century_Flemish/17th_Century_Flemish.csv
Saved: data/genres/17th_Century_French/17th_Century_French.csv
Saved: data/genres/17th_Century_German/17th_Century_German.csv
Saved: data/genres/17th_Century_Italian/17th_Century_Italian.csv
Saved: data/genres/17th_Century_Spanish/17th_Century_Spanish.csv
Saved: data/genres/18th_Century_Dutch/18th_Century_Dutch.csv
Saved: data/genres/18th_Century_Flemish/18th_

In [35]:
still_life_dfs = {}

for (century, school), group_df in still_life_df.groupby(['CENTURY', 'SCHOOL']):
    key = f"{century}th_Century_{school}"
    still_life_dfs[key] = group_df

In [36]:
path = 'data/still-lifes'

for (century, school), group_df in still_life_df.groupby(['CENTURY', 'SCHOOL']):
    # Make dir name based on the century and school
    dir_name = f"{century}th_Century_{school}"
    dir_path = os.path.join(path, dir_name)
    os.makedirs(dir_path, exist_ok=True)
    
    # make csv name now and save it in the new dir
    filename = f"{dir_name}.csv"
    filepath = os.path.join(dir_path, filename)
    group_df.to_csv(filepath, index=False)
    
    print(f"Saved: {filepath}")

Saved: data/still-lifes/15th_Century_Flemish/15th_Century_Flemish.csv
Saved: data/still-lifes/15th_Century_Italian/15th_Century_Italian.csv
Saved: data/still-lifes/16th_Century_Flemish/16th_Century_Flemish.csv
Saved: data/still-lifes/16th_Century_German/16th_Century_German.csv
Saved: data/still-lifes/16th_Century_Italian/16th_Century_Italian.csv
Saved: data/still-lifes/17th_Century_Dutch/17th_Century_Dutch.csv
Saved: data/still-lifes/17th_Century_Flemish/17th_Century_Flemish.csv
Saved: data/still-lifes/17th_Century_French/17th_Century_French.csv
Saved: data/still-lifes/17th_Century_German/17th_Century_German.csv
Saved: data/still-lifes/17th_Century_Italian/17th_Century_Italian.csv
Saved: data/still-lifes/17th_Century_Spanish/17th_Century_Spanish.csv
Saved: data/still-lifes/18th_Century_Dutch/18th_Century_Dutch.csv
Saved: data/still-lifes/18th_Century_Flemish/18th_Century_Flemish.csv
Saved: data/still-lifes/18th_Century_French/18th_Century_French.csv
Saved: data/still-lifes/18th_Century

In [37]:
landscape_dfs = {}

for (century, school), group_df in landscape_df.groupby(['CENTURY', 'SCHOOL']):
    key = f"{century}th_Century_{school}"
    landscape_dfs[key] = group_df

In [38]:
path = 'data/landscapes'

for (century, school), group_df in landscape_df.groupby(['CENTURY', 'SCHOOL']):
    # Make dir name based on the century and school
    dir_name = f"{century}th_Century_{school}"
    dir_path = os.path.join(path, dir_name)
    os.makedirs(dir_path, exist_ok=True)
    
    # make csv name now and save it in the new dir
    filename = f"{dir_name}.csv"
    filepath = os.path.join(dir_path, filename)
    group_df.to_csv(filepath, index=False)
    
    print(f"Saved: {filepath}")

Saved: data/landscapes/15th_Century_German/15th_Century_German.csv
Saved: data/landscapes/15th_Century_Italian/15th_Century_Italian.csv
Saved: data/landscapes/16th_Century_Dutch/16th_Century_Dutch.csv
Saved: data/landscapes/16th_Century_Flemish/16th_Century_Flemish.csv
Saved: data/landscapes/16th_Century_German/16th_Century_German.csv
Saved: data/landscapes/16th_Century_Italian/16th_Century_Italian.csv
Saved: data/landscapes/16th_Century_Spanish/16th_Century_Spanish.csv
Saved: data/landscapes/17th_Century_Dutch/17th_Century_Dutch.csv
Saved: data/landscapes/17th_Century_Flemish/17th_Century_Flemish.csv
Saved: data/landscapes/17th_Century_French/17th_Century_French.csv
Saved: data/landscapes/17th_Century_German/17th_Century_German.csv
Saved: data/landscapes/17th_Century_Italian/17th_Century_Italian.csv
Saved: data/landscapes/17th_Century_Spanish/17th_Century_Spanish.csv
Saved: data/landscapes/18th_Century_Dutch/18th_Century_Dutch.csv
Saved: data/landscapes/18th_Century_Flemish/18th_Centu

In [21]:
def process_csv_and_save_images(csv_path, output_folder):
    df = pd.read_csv(csv_path)
    
    for index, row in df.iterrows():
        image_filename = f"painting{index}_{row['TITLE'].replace(' ','')}_{row['YEAR']}_{row['AUTHOR']}_{row['SCHOOL']}.jpg"
        
        image_url = row['img_url'] 
        download_image(image_url, output_folder, image_filename)

In [22]:
base_dir = 'data/WGA_landscapes'
stop_processing = False

for century_school_dir in sorted(os.listdir(base_dir)): 
    # if century_school_dir == "16th_Century_Italian":
    #     start_processing = True  # Start processing where error occured
    if not stop_processing:
        century_school_path = os.path.join(base_dir, century_school_dir)
        if os.path.isdir(century_school_path):
            for file in os.listdir(century_school_path):
                if file.endswith('.csv'):
                    csv_path = os.path.join(century_school_path, file)
                    print(f"Processing {csv_path}")
                    process_csv_and_save_images(csv_path, century_school_path)

                    # if file == '16th_Century_French.csv':
                    #     stop_processing = True

Processing data/WGA_landscapes/15th_Century_German/15th_Century_German.csv
Image saved to data/WGA_landscapes/15th_Century_German/painting0_PrinceTassiloRidestoHunting_1444_MasterOfThePollingPanels_German.jpg
Processing data/WGA_landscapes/15th_Century_Italian/15th_Century_Italian.csv
Image saved to data/WGA_landscapes/15th_Century_Italian/painting0_ViewofthePiazzetta_1480_Bastiani_Italian.jpg
Image saved to data/WGA_landscapes/15th_Century_Italian/painting1_DanteandtheThreeKingdoms_1465_DomenicoDiMichelino_Italian.jpg
Image saved to data/WGA_landscapes/15th_Century_Italian/painting2_ArchitecturalView_1477_FrancescoDiGiorgioMartini_Italian.jpg
Image saved to data/WGA_landscapes/15th_Century_Italian/painting3_TheHuntintheForest_1460_Uccello_Italian.jpg
Image saved to data/WGA_landscapes/15th_Century_Italian/painting4_ViewofGenoa_1482_UnknownMaster_Italian.jpg
Processing data/WGA_landscapes/16th_Century_Dutch/16th_Century_Dutch.csv
Image saved to data/WGA_landscapes/16th_Century_Dutch/pa

In [51]:
# make master portraits csv

dataframes = []

base_path = 'data/portraits'

for folder_name in os.listdir(base_path):
    folder_path = os.path.join(base_path, folder_name)
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.csv'):
                file_path = os.path.join(folder_path, file_name)
                df = pd.read_csv(file_path)
                dataframes.append(df)

concatenated_df = pd.concat(dataframes, ignore_index=True)

output_path = 'data/portraits/portraits.csv'
concatenated_df.to_csv(output_path, index=False)


AUTHOR                                                     Conte
TITLE                                   Portrait of Michelangelo
TECHNIQUE                               Oil on panel, 88 x 64 cm
URL                https://www.wga.hu/html/c/conte/michelan.html
TYPE                                                    portrait
SCHOOL                                                   Italian
img_url              https://www.wga.hu/art/c/conte/michelan.jpg
YEAR                                                        1540
CENTURY                                                       16
PATH                                                         NaN
SOURCE                                                       WGA
Path                         data/portraits/16th_Century_Italian
DESCRIPTION    We kindly inform you that the location of some...
Name: 164, dtype: object