In [99]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [100]:
import os
import sys
import pandas as pd
import numpy as np

In [101]:
# Put the parent directory first on the module search path so the project-local
# data_wrangling module can be imported (presumably it lives one level above
# this notebook -- confirm).
# NOTE(review): re-running this cell inserts '..' into sys.path again each time.
sys.path.insert(0, '..')
# Helpers used by the cells below; clear_old_images is imported but not called
# directly in the cells visible here.
from data_wrangling import create_new_folder, clear_old_images, get_sample, get_image_info

In [102]:
# Load the raw artist metadata table; the path is resolved relative to the
# current working directory.
art_data = pd.read_csv('../raw_data/artists.csv')

In [103]:
# Drop the `id` column only -- every other column of the artist table is kept.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because `id` has already been removed.
art_data = art_data.drop(columns='id', errors='ignore')

# Normalise artist names so they match the underscore-separated labels used
# for the image folders:
#   1. rename 'Albrecht Dürer' to 'Albrecht Durer' -- the '.' in the pattern
#      matches whatever single character the accented letter was decoded to;
#   2. replace every space with an underscore (literal match, no regex needed).
art_data['name'] = (
    art_data['name']
    .replace(to_replace='Albrecht D.rer', value='Albrecht Durer', regex=True)
    .str.replace(' ', '_', regex=False)
)

In [104]:
#create a new folder for the csv files
# Build the path with os.path.join: the previous literal '..\data' depended on
# the invalid escape sequence '\d' (a warning today, an error in future Python
# versions) and hard-coded the Windows separator. On Windows the resulting
# value is unchanged.
datapath = os.path.join('..', 'data')
create_new_folder(datapath)

..\data path already exists


In [105]:
# Save the cleaned artist table (normalised names, `id` removed) as a CSV
# inside the data folder. All artists are written, including those that list
# several genres.
newfile = 'artists_clean.csv'
fullpath = os.path.join(datapath, newfile)
art_data.to_csv(path_or_buf=fullpath)

In [111]:
#create data frame with artist name, genre, filename, filepath
# Use os.path.join rather than the Windows-only literal r'..\raw_images' so the
# directory also resolves on POSIX systems; on Windows the value is identical.
path = os.path.join('..', 'raw_images')
# get_image_info is a project helper; judging by the merged table shown further
# down it yields one row per image with `label`, `filename` and `filepath`.
# The genre column is attached later by a merge, not by this call.
art_file = get_image_info(path)

In [116]:
# Build the artist -> genre lookup table: keep `genre` and expose the artist
# name under the column `label` (resulting column order: genre, label).
labels = art_data[['genre']].assign(label=art_data['name'])

Unnamed: 0,genre,label
0,Expressionism,Amedeo_Modigliani
1,"Expressionism,Abstractionism",Vasiliy_Kandinskiy
2,"Social Realism,Muralism",Diego_Rivera
3,Impressionism,Claude_Monet
4,"Surrealism,Impressionism",Rene_Magritte
5,Surrealism,Salvador_Dali
6,"Realism,Impressionism",Edouard_Manet
7,Byzantine Art,Andrei_Rublev
8,Post-Impressionism,Vincent_van_Gogh
9,"Symbolism,Art Nouveau",Gustav_Klimt


In [117]:
# Attach a genre to every image row by left-joining the lookup table on the
# shared `label` column; image rows without a matching artist keep a missing genre.
art_file = pd.merge(art_file, labels, how='left', on='label')
art_file.head()

Unnamed: 0,label,filename,filepath,genre
0,Albrecht_Durer,Albrecht_Durer_(1).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance
1,Albrecht_Durer,Albrecht_Durer_(10).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance
2,Albrecht_Durer,Albrecht_Durer_(100).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance
3,Albrecht_Durer,Albrecht_Durer_(101).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance
4,Albrecht_Durer,Albrecht_Durer_(102).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance


In [118]:
#drop artworks with multiple styles
# A genre string containing a comma lists more than one style.
#   regex=False -> the comma is matched literally.
#   na=False    -> rows whose genre is missing (no artist matched in the left
#                  merge) count as "not multi-style" and are kept; without it
#                  the mask contains NaN and the `~` negation fails.
has_multiple_styles = art_file['genre'].str.contains(',', regex=False, na=False)
# Filter and renumber in one chained step instead of an in-place reset on a slice.
art_file = art_file.loc[~has_multiple_styles].reset_index(drop=True)
art_file.shape

(6669, 4)

In [119]:
# Save the per-image label table (single-genre rows only) as a CSV in the data folder.
newfile = 'art_labels.csv'
fullpath = os.path.join(datapath, newfile)
art_file.to_csv(path_or_buf=fullpath)

In [120]:
# Directories that hold the image subsets used for sampling.
# `images` is the fixed sample uploaded to GitHub; per the original note it is
# never changed and is not used for training or validation.
images = '../images'
train = '../train_images'
val = '../val_images'
test = '../test_images'

In [121]:
# Index the fixed sample directory. The path is passed literally because this
# cell rebinds `images` from the directory string to the resulting table:
# feeding `images` back in would hand the table to get_image_info on a second
# run of the cell. Keep this literal in sync with the `images` path constant.
images = get_image_info(r'../images')

In [122]:
# Genres passed to get_sample when building the test, train and validation sets.
genre_sample = [
    'Impressionism',
    'Cubism',
    'Expressionism',
    'Pop Art',
    'Byzantine Art',
    'Abstract Expressionism',
]

In [131]:
# Candidate test rows: labelled images whose filename also appears in the
# fixed ../images sample.
test_file = art_file.loc[art_file['filename'].isin(images['filename'])]

(667, 4)

In [124]:
#get specific genres from ../images to test
# get_sample is a project helper. Per the recorded output it checks that the
# target directory exists, clears any previous sample, creates one sub-folder
# per genre and then generates the images. The last argument (1 here) is
# presumably the fraction of matching rows to take -- confirm against
# data_wrangling.
get_sample(test, test_file, genre_sample, 1) #get all images
# Re-index the freshly written test directory so later cells can exclude
# these files from the training pool.
test_df = get_image_info(test)

Checking if ../test_images exists...
Clearing any previous samples...
Empty directory. Delete ../test_images\Abstract Expressionism
Empty directory. Delete ../test_images\Byzantine Art
Empty directory. Delete ../test_images\Cubism
Empty directory. Delete ../test_images\Expressionism
Empty directory. Delete ../test_images\Impressionism
Empty directory. Delete ../test_images\Pop Art
Deleted 230 images
../test_images\Impressionism created
../test_images\Cubism created
../test_images\Expressionism created
../test_images\Pop Art created
../test_images\Byzantine Art created
../test_images\Abstract Expressionism created
Generated 230 new images


In [129]:
# Training pool: every labelled image whose filename was not placed in the
# test sample.
train_file = art_file.loc[~art_file['filename'].isin(test_df['filename'])]
train_file.shape

(6439, 4)

In [125]:
#get a sample of new images
# NOTE(review): if get_sample draws rows at random, the sample differs between
# runs unless a seed is fixed -- confirm inside data_wrangling.
get_sample(train, train_file, genre_sample, .5)
# Index what was actually written to the training directory.
train_df = get_image_info(train)
# Write next to the other CSVs via `datapath` instead of the hard-coded,
# Windows-only literal r'..\data\train.csv'.
train_df.to_csv(os.path.join(datapath, 'train.csv'))

Checking if ../train_images exists...
Clearing any previous samples...
Empty directory. Delete ../train_images\Abstract Expressionism
Empty directory. Delete ../train_images\Byzantine Art
Empty directory. Delete ../train_images\Cubism
Empty directory. Delete ../train_images\Expressionism
Empty directory. Delete ../train_images\Impressionism
Empty directory. Delete ../train_images\Pop Art
Deleted 2983 images
../train_images\Impressionism created
../train_images\Cubism created
../train_images\Expressionism created
../train_images\Pop Art created
../train_images\Byzantine Art created
../train_images\Abstract Expressionism created
Generated 1038 new images


In [130]:
# Validation pool: what is left of the training pool after removing the files
# that were sampled into the training directory.
val_file = train_file.loc[~train_file['filename'].isin(train_df['filename'])]
val_file.shape

(5401, 4)

In [132]:
# Sample validation images from the remaining pool; .2 is presumably the
# fraction of matching rows to take -- confirm against data_wrangling.
# NOTE(review): if the draw is random, fix a seed to make the split reproducible.
get_sample(val, val_file, genre_sample, .2)
# Index what was actually written to the validation directory.
val_df = get_image_info(val)

Checking if ../val_images exists...
Clearing any previous samples...
Empty directory. Delete ../val_images\Abstract Expressionism
Empty directory. Delete ../val_images\Byzantine Art
Empty directory. Delete ../val_images\Cubism
Empty directory. Delete ../val_images\Expressionism
Empty directory. Delete ../val_images\Impressionism
Empty directory. Delete ../val_images\Pop Art
Deleted 518 images
../val_images\Impressionism created
../val_images\Cubism created
../val_images\Expressionism created
../val_images\Pop Art created
../val_images\Byzantine Art created
../val_images\Abstract Expressionism created
Generated 206 new images
