In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports all loaded modules
# before each cell executes, so edits to project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pandas as pd
import shutil
import random
import numpy as np

In [3]:
# Put the parent directory first on the module search path so that
# data_wrangling can be imported from this notebook
# (presumably data_wrangling.py lives one level up -- verify the repo layout).
sys.path.insert(0, '..')
from data_wrangling import create_new_folder, clear_old_images, get_sample, get_image_info

In [4]:
# Load the raw artist metadata table shipped with the dataset.
artists_csv = '../raw_data/artists.csv'
art_data = pd.read_csv(artists_csv)

In [5]:
# Drop the 'id' column only; years, nationality, bio and wikipedia are kept.
# A non in-place drop with errors='ignore' makes this cell safe to re-run
# after 'id' has already been removed.
art_data = art_data.drop(columns='id', errors='ignore')

# Rename 'Albrecht Dürer' to the ASCII 'Albrecht Durer' (the '.' in the pattern
# matches the accented character), then replace spaces with underscores so
# names look like 'Claude_Monet'.
art_data['name'] = (
    art_data['name']
    .replace(to_replace='Albrecht D.rer', value='Albrecht Durer', regex=True)
    .str.replace(' ', '_', regex=False)  # literal replacement, independent of pandas' regex default
)

# Keep only artists listed under a single genre (no comma in the genre field).
# na=False treats a missing genre as "no comma" instead of producing a NaN mask
# that cannot be negated.
has_single_genre = ~art_data['genre'].str.contains(',', regex=False, na=False)
art_data = art_data.loc[has_single_genre].reset_index(drop=True)
art_data.head()

Unnamed: 0,name,years,genre,nationality,bio,wikipedia,paintings
0,Amedeo_Modigliani,1884 - 1920,Expressionism,Italian,Amedeo Clemente Modigliani (Italian pronunciat...,http://en.wikipedia.org/wiki/Amedeo_Modigliani,193
1,Claude_Monet,1840 - 1926,Impressionism,French,Oscar-Claude Monet (; French: [klod mɔnɛ]; 14 ...,http://en.wikipedia.org/wiki/Claude_Monet,73
2,Salvador_Dali,1904 - 1989,Surrealism,Spanish,Salvador Domingo Felipe Jacinto Dalí i Domènec...,http://en.wikipedia.org/wiki/Salvador_Dalí,139
3,Andrei_Rublev,1360 - 1430,Byzantine Art,Russian,"Andrei Rublev (Russian: Андре́й Рублёв, IPA: [...",http://en.wikipedia.org/wiki/Andrei_Rublev,99
4,Vincent_van_Gogh,1853 – 1890,Post-Impressionism,Dutch,Vincent Willem van Gogh (Dutch: [ˈvɪnsɛnt ˈʋɪl...,http://en.wikipedia.org/wiki/Vincent_van_Gogh,877


In [6]:
# Create the folder that will hold the generated csv files.
# Build the path with os.path.join instead of the literal '..\data': '\d' is an
# invalid escape sequence in a normal string, and the hard-coded backslash only
# works as a separator on Windows. On Windows the joined value is unchanged.
datapath = os.path.join('..', 'data')
create_new_folder(datapath)

Checking if ..\data exists...
..\data path already exists


In [7]:
# Write the cleaned artist table (artists with a single genre) into the data folder.
newfile = 'artists_clean.csv'
fullpath = os.path.join(datapath, newfile)
art_data.to_csv(path_or_buf=fullpath)

In [8]:
# Build a data frame with one row per raw image: artist name, filename, filepath
# (genre is attached in a later cell).
# os.path.join keeps the location portable; on Windows it still evaluates to
# '..\raw_images', the value that was previously hard-coded.
path = os.path.join('..', 'raw_images')
# Collect the image information from the project helper.
img_info = get_image_info(path)
# Pair the helper's outputs positionally with the column names
# (presumably three equally long lists -- verify in data_wrangling).
art_file = pd.DataFrame(dict(zip(['name', 'filename', 'filepath'], img_info)))

In [9]:
# Extract the artist name -> genre lookup from the cleaned artist table.
labels = art_data.loc[:, ['name', 'genre']]
# Label each file with its artist's genre. Any 'genre' column left over from a
# previous run of this cell is dropped first, so re-running does not produce
# 'genre_x' / 'genre_y' duplicates. Artists missing from art_data get NaN
# (left join).
art_file = (
    art_file
    .drop(columns='genre', errors='ignore')
    .merge(labels, how='left', on='name')
)
art_file.head()

Unnamed: 0,name,filename,filepath,genre
0,Albrecht_Durer,Albrecht_Durer_(1).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance
1,Albrecht_Durer,Albrecht_Durer_(10).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance
2,Albrecht_Durer,Albrecht_Durer_(100).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance
3,Albrecht_Durer,Albrecht_Durer_(101).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance
4,Albrecht_Durer,Albrecht_Durer_(102).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance


In [10]:
# Save the per-image label table into the data folder.
newfile = 'art_labels.csv'
fullpath = os.path.join(datapath, newfile)
art_file.to_csv(path_or_buf=fullpath)

In [11]:
# Destination folder for the sampled training images
# (this cell only defines the path; it is passed to get_sample in a later cell).
imagepath = '../images'

In [12]:
# Get a new sample of images for the three selected genres, written under imagepath.
# Seed the global random generators first so that Restart & Run All reproduces
# the same sample.
# NOTE(review): this assumes get_sample draws from Python's `random` module and/or
# NumPy's global generator (which pandas' .sample falls back to) -- confirm in
# data_wrangling; if it uses its own generator the seed has no effect.
SAMPLE_SEED = 42
random.seed(SAMPLE_SEED)
np.random.seed(SAMPLE_SEED)
get_sample(path, imagepath, art_file, ['Impressionism', 'Pop Art', 'Abstract Expressionism'], .8) #this is 64% of the total set in each class

Clearing any previous samples...
Deleted 1260 images
Checking if ../images\Impressionism exists...
../images\Impressionism path already exists
Checking if ../images\Pop Art exists...
../images\Pop Art path already exists
Checking if ../images\Abstract Expressionism exists...
../images\Abstract Expressionism path already exists
Generated 1260 new images


In [13]:
# Get image information for the sampled images under imagepath.
img_info = get_image_info(imagepath)
# Pair the helper's outputs positionally with the column names
# (presumably three equally long lists -- verify in data_wrangling).
df_labels = pd.DataFrame(dict(zip(['label', 'filename', 'filepath'], img_info)))
# Save the training samples to sample.csv in the data folder. The path is built
# from datapath with os.path.join, like the other csv exports, instead of a
# hard-coded Windows-only '..\data\sample.csv'.
df_labels.to_csv(os.path.join(datapath, 'sample.csv'))