In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pandas as pd
import shutil
import random
import numpy as np

In [3]:
#make the parent directory importable so the project's local modules can be found
sys.path.insert(0, '..')
#NOTE(review): star imports hide where names come from; create_new_folder, get_image_info and
#clear_old_images are used below and presumably come from these two modules - confirm and import by name
from function import *
from createimages import *

In [49]:
#load the raw artists table from the raw_data folder
artists = pd.read_csv(filepath_or_buffer='../raw_data/artists.csv')

In [52]:
#drop the 'id' column only; years, nationality, bio and wikipedia are kept
#errors='ignore' keeps this cell re-runnable once 'id' has already been removed
artists = artists.drop(columns='id', errors='ignore')
#rename 'Albrecht Dürer' to 'Albrecht Durer' ('.' matches any single character in place of the 'ü')
artists['name'] = artists['name'].replace(to_replace='Albrecht D.rer', value='Albrecht Durer', regex=True)
#use underscores instead of spaces in the artist names
artists['name'] = artists['name'].str.replace(' ', '_')
artists.head()

Unnamed: 0,name,years,genre,nationality,bio,wikipedia,paintings
0,Amedeo_Modigliani,1884 - 1920,Expressionism,Italian,Amedeo Clemente Modigliani (Italian pronunciat...,http://en.wikipedia.org/wiki/Amedeo_Modigliani,193
1,Vasiliy_Kandinskiy,1866 - 1944,"Expressionism,Abstractionism",Russian,Wassily Wassilyevich Kandinsky (Russian: Васи́...,http://en.wikipedia.org/wiki/Wassily_Kandinsky,88
2,Diego_Rivera,1886 - 1957,"Social Realism,Muralism",Mexican,Diego María de la Concepción Juan Nepomuceno E...,http://en.wikipedia.org/wiki/Diego_Rivera,70
3,Claude_Monet,1840 - 1926,Impressionism,French,Oscar-Claude Monet (; French: [klod mɔnɛ]; 14 ...,http://en.wikipedia.org/wiki/Claude_Monet,73
4,Rene_Magritte,1898 - 1967,"Surrealism,Impressionism",Belgian,René François Ghislain Magritte (French: [ʁəne...,http://en.wikipedia.org/wiki/René_Magritte,194


In [54]:
#create a new folder for the csv files
#raw string: '\d' is not a valid escape sequence, so the non-raw literal triggers an invalid-escape warning
#the resulting path string is unchanged
datapath = r'..\data'
create_new_folder(datapath)

..\data path already exists


In [55]:
#save the cleaned artists table as a csv file in the data folder
#(genre values are written as they are; multi-genre entries are not reduced here)
newfile = 'artists_clean.csv'
fullpath = os.path.join(datapath, newfile)
artists.to_csv(path_or_buf=fullpath)

In [56]:
#build a dataframe with the artist name, filename and filepath of the raw images
#(the genre column is merged in afterwards)
path = r'..\raw_images'
img_info = get_image_info(path)
#pair each column name with the matching entry of img_info and build the dataframe from that mapping
zipped = zip(['name', 'filename', 'filepath'], img_info)
artlabels = pd.DataFrame(dict(zipped))

In [57]:
#extract the artist name and genre
labels = artists.loc[:, ['name', 'genre']]
#label each file with its artist's genre value
#drop any 'genre' column left by a previous run first; otherwise re-running this cell
#would produce genre_x/genre_y columns and break the artlabels.genre lookups further down
artlabels = artlabels.drop(columns='genre', errors='ignore').merge(labels, how='left', on='name')
artlabels.head()

Unnamed: 0,name,filename,filepath,genre
0,Albrecht_Durer,Albrecht_Durer_(1).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance
1,Albrecht_Durer,Albrecht_Durer_(10).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance
2,Albrecht_Durer,Albrecht_Durer_(100).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance
3,Albrecht_Durer,Albrecht_Durer_(101).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance
4,Albrecht_Durer,Albrecht_Durer_(102).jpg,..\raw_images\Albrecht_Durer\Albrecht_Durer_(1...,Northern Renaissance


In [58]:
#write the labelled image table to artlabels.csv in the data folder
newfile = 'artlabels.csv'
fullpath = os.path.join(datapath, newfile)
artlabels.to_csv(path_or_buf=fullpath)

In [59]:
#separate dataframe into train and test sets by genre
genres = artlabels.genre.unique()
train_set = []
#fraction of each genre's rows that goes to the training set
train_size = 0.8

for genre in genres:
    #rows of this genre in their existing order (no shuffling is done)
    df = artlabels.loc[artlabels.genre == str(genre)].reset_index(drop=True)
    #round to the nearest int to get the number of training rows
    train_index = round(len(df)*train_size)
    #take the first train_index rows with one slice instead of one iloc lookup per row
    train_set.extend(df.iloc[:train_index].values.tolist())


In [60]:
#convert the collected training rows into a DataFrame
train_set = pd.DataFrame(train_set, columns=['name', 'filename', 'filepath', 'genre'])

#every image of artlabels whose filename is not in train_set goes to test_set
test_set = artlabels[~artlabels.filename.isin(train_set.filename.tolist())].reset_index(drop=True)

In [61]:
#folders that receive the training images and the test images
trainpath, testpath = r'..\train_images', r'..\test_images'

In [62]:
#make sure the train and test image folders exist
#(plain loop: a list comprehension used only for its side effects just displays a list of None)
for folder in (trainpath, testpath):
    create_new_folder(folder)

..\raw_images path already exists
..\train_images path already exists
..\test_images path already exists


[None, None, None]

In [63]:
#every genre gets its own sub-folder inside both the train and the test image folders
for genre in genres:
    for parent in (trainpath, testpath):
        create_new_folder(os.path.join(parent, genre))

..\train_images\Northern Renaissance path already exists
..\test_images\Northern Renaissance path already exists
..\train_images\Impressionism path already exists
..\test_images\Impressionism path already exists
..\train_images\Expressionism path already exists
..\test_images\Expressionism path already exists
..\train_images\Byzantine Art path already exists
..\test_images\Byzantine Art path already exists
..\train_images\Pop Art path already exists
..\test_images\Pop Art path already exists
..\train_images\Impressionism,Post-Impressionism path already exists
..\test_images\Impressionism,Post-Impressionism path already exists
..\train_images\Baroque path already exists
..\test_images\Baroque path already exists
..\train_images\Social Realism,Muralism path already exists
..\test_images\Social Realism,Muralism path already exists
..\train_images\Realism,Impressionism path already exists
..\test_images\Realism,Impressionism path already exists
..\train_images\Symbolism,Expressionism path 

In [21]:
#copy every image listed in train_set and test_set into its genre folder under train_images and test_images respectively
for src, genre in zip(train_set.filepath, train_set.genre):
    shutil.copy(src, os.path.join(trainpath, genre))
for src, genre in zip(test_set.filepath, test_set.genre):
    shutil.copy(src, os.path.join(testpath, genre))

In [47]:
def get_images(oldpath, newpath, df, genres, perc, random_state=None):
    """Copy a random sample of each genre's images into per-genre folders under newpath.

    Parameters
    ----------
    oldpath : str
        Not used by this function; kept so existing calls keep working.
    newpath : str
        Folder under which one sub-folder per genre is set up via create_new_folder.
    df : pandas.DataFrame
        Table with 'filepath' and 'genre' columns; files are copied from 'filepath'.
    genres : iterable of str
        Genre values to sample; each one is also used as the sub-folder name.
    perc : float
        Fraction of each genre's rows to sample (passed to Series.sample as frac).
    random_state : int, optional
        Seed forwarded to Series.sample so that a sample can be reproduced.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    None
        The number of copied files is printed.
    """
    copied = 0

    for genre in genres:
        #sample without replacement from the file paths that belong to this genre
        genre_paths = df.filepath.loc[df.genre == genre]
        paths = genre_paths.sample(frac=perc, replace=False, random_state=random_state).tolist()
        imgpath = os.path.join(newpath, genre)
        create_new_folder(imgpath)
        for filepath in paths:
            shutil.copy(filepath, imgpath)
            copied += 1

    print(f'Generated {copied} new images')

In [74]:
#folders that hold the sampled training images and the sampled test images
imagepath, testimagepath = '../images', '../sample_test_images'

In [75]:
#set up the folder for the sampled test images
create_new_folder(testimagepath)

../sample_test_images created


In [79]:
#clear old images to procure a new sample
#NOTE(review): df_labels and df_test_labels are only assigned in later cells (built from get_image_info
#further down), so this cell raises NameError when the notebook is run top to bottom on a fresh kernel
clear_old_images(imagepath, df_labels)
clear_old_images(testimagepath, df_test_labels)

Deleted 630 images


In [80]:
#get a sample of new images
#perc=.5 of the training split, i.e. 40% of the total set in each class
get_images(
    oldpath=trainpath,
    newpath=imagepath,
    df=train_set,
    genres=['Impressionism', 'Pop Art', 'Abstract Expressionism'],
    perc=.5,
)

../images\Impressionism path already exists
../images\Pop Art path already exists
../images\Abstract Expressionism path already exists
Generated 630 new images


In [83]:
#perc=.5 of the test split, i.e. 10% of the total set in each class
get_images(
    oldpath=testpath,
    newpath=testimagepath,
    df=test_set,
    genres=['Impressionism', 'Pop Art', 'Abstract Expressionism'],
    perc=.5,
)

../sample_test_images\Impressionism created
../sample_test_images\Pop Art created
../sample_test_images\Abstract Expressionism created
Generated 157 new images


In [84]:
#get image information for the sampled training images under imagepath
img_info = get_image_info(imagepath)
#pair each column name with the matching entry of img_info and build the dataframe from that mapping
zipped = zip(['label', 'filename', 'filepath'], img_info)
df_labels = pd.DataFrame(dict(zipped))
#save the training sample labels to sample_train.csv
df_labels.to_csv(path_or_buf=r'..\data\sample_train.csv')

In [88]:
#get image information for the sampled test images under testimagepath
test_img_info = get_image_info(testimagepath)
#pair each column name with the matching entry of test_img_info and build the dataframe from that mapping
zipped = zip(['label', 'filename', 'filepath'], test_img_info)
df_test_labels = pd.DataFrame(dict(zipped))
#save the test sample labels to sample_test.csv
df_test_labels.to_csv(path_or_buf=r'..\data\sample_test.csv')

In [89]:
#(rows, columns) of the training sample label table
df_labels.shape

(630, 3)

In [90]:
#(rows, columns) of the test sample label table
df_test_labels.shape

(157, 3)