# Preprocessing

## Process Artists

In [41]:
import pandas as pd

# Rank the artists by the number of paintings the dataset holds for each.
artists = pd.read_csv("data/artists.csv")
artists_sorted = artists.sort_values(by=["paintings"], ascending=False)

# Keep a single artist per genre (the first row for each genre in the sorted
# frame, i.e. the one with the most paintings), then discard artists whose
# genre field lists more than one comma-separated style.
artists_filtered = (
    artists_sorted
    .drop_duplicates(subset="genre")
    .loc[lambda frame: ~frame["genre"].str.contains(",")]
)

# Require at least 100 paintings per artist so enough images are available
# for training. reset_index() keeps the original row label as an "index" column.
artists_top = (
    artists_filtered
    .loc[lambda frame: frame["paintings"] >= 100]
    .reset_index()
)

artists_top

Unnamed: 0,index,id,name,years,genre,nationality,bio,wikipedia,paintings
0,8,8,Vincent van Gogh,1853 – 1890,Post-Impressionism,Dutch,Vincent Willem van Gogh (Dutch: [ˈvɪnsɛnt ˈʋɪl...,http://en.wikipedia.org/wiki/Vincent_van_Gogh,877
1,30,30,Edgar Degas,1834 - 1917,Impressionism,French,Edgar Degas (US: or UK: ; born Hilaire-Germai...,http://en.wikipedia.org/wiki/Edgar_Degas,702
2,13,13,Pablo Picasso,1881 - 1973,Cubism,Spanish,Pablo Ruiz Picasso (; Spanish: [ˈpaβlo piˈkaso...,http://en.wikipedia.org/wiki/Pablo_Picasso,439
3,19,19,Albrecht Durer,1471 - 1528,Northern Renaissance,German,Albrecht Dürer (; German: [ˈʔalbʁɛçt ˈdyːʁɐ]; ...,http://en.wikipedia.org/wiki/Albrecht_Dürer,328
4,16,16,Francisco Goya,1746 - 1828,Romanticism,Spanish,Francisco José de Goya y Lucientes (; Spanish:...,http://en.wikipedia.org/wiki/Francisco_Goya,291
5,31,31,Rembrandt,1606 - 1669,Baroque,Dutch,Rembrandt Harmenszoon van Rijn (; Dutch: [ˈrɛm...,http://en.wikipedia.org/wiki/Rembrandt,262
6,22,22,Marc Chagall,1887 - 1985,Primitivism,"French,Jewish,Belarusian",Marc Zakharovich Chagall ( shə-GAHL; born Mois...,http://en.wikipedia.org/wiki/Marc_Chagall,239
7,0,0,Amedeo Modigliani,1884 - 1920,Expressionism,Italian,Amedeo Clemente Modigliani (Italian pronunciat...,http://en.wikipedia.org/wiki/Amedeo_Modigliani,193
8,45,45,Andy Warhol,1928 – 1987,Pop Art,American,"Andy Warhol (; born Andrew Warhola; August 6, ...",https://en.wikipedia.org/wiki/Andy_Warhol,181
9,12,12,Mikhail Vrubel,1856 - 1910,Symbolism,Russian,Mikhail Aleksandrovich Vrubel (Russian: Михаи́...,http://en.wikipedia.org/wiki/Mikhail_Vrubel,171


## Process Images (for Softmax)

In [None]:
from PIL import Image
from tqdm import tqdm
import numpy as np
import os

# Target size, in pixels, that every painting is resized to.
IMG_HEIGHT, IMG_WIDTH = 64, 64
# Root folder that holds one sub-folder of images per artist.
DATA_DIR = "./data/images/"

# Length of one flattened RGB image: 64 * 64 * 3 channels = 12,288 values.
EXPECTED_FEATURES = 3 * IMG_WIDTH * IMG_HEIGHT
def load_data():
    """Load the paintings of every artist in ``artists_top`` as flat RGB vectors.

    For each artist name (spaces replaced by underscores) the folder
    ``DATA_DIR/<artist>`` is scanned in sorted order. Every readable file is
    resized to ``IMG_WIDTH`` x ``IMG_HEIGHT``, converted to RGB and flattened.
    Files that cannot be processed are reported and skipped; artists whose
    folder does not exist are reported as well instead of being dropped
    silently.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(images, labels)``. ``images`` has
        shape ``(n_images, EXPECTED_FEATURES)`` and ``labels`` holds, for each
        row, the underscore-separated artist name the image was loaded from.
    """
    images = []
    labels = []
    for artist in artists_top["name"].str.replace(" ", "_", regex=False):
        artist_folder = os.path.join(DATA_DIR, artist)
        if not os.path.isdir(artist_folder):
            # Make a missing class visible rather than silently training without it.
            print(f"Skipping artist {artist}: folder {artist_folder} not found")
            continue

        # os.listdir() returns entries in arbitrary order; sort for reproducibility.
        for image in tqdm(sorted(os.listdir(artist_folder)), desc=artist):
            img_path = os.path.join(artist_folder, image)
            try:
                with Image.open(img_path) as img:
                    resized = img.resize((IMG_WIDTH, IMG_HEIGHT)).convert("RGB")
                    img_array = np.array(resized).flatten()
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole load.
                print(f"Skipping corrupted file {img_path}: {e}")
                continue

            images.append(img_array)
            labels.append(artist)

    print(f"Loaded {len(images)} images")
    # reshape(-1, EXPECTED_FEATURES) checks the feature width and keeps a
    # well-formed (0, EXPECTED_FEATURES) array when nothing could be loaded.
    return (
        np.array(images, dtype=np.uint8).reshape(-1, EXPECTED_FEATURES),
        np.array(labels, dtype=str),
    )


# Keep the loaded data so later cells can use it; show only the shapes.
images, labels = load_data()
images.shape, labels.shape

Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
