In [1]:
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Seed NumPy's global random number generator so random draws are repeatable.
np.random.seed(28)

In [49]:
# Load the image metadata CSV into a DataFrame.
df = pd.read_csv("resources/all_data_info.csv")

In [50]:
df

Unnamed: 0,artist,date,genre,pixelsx,pixelsy,size_bytes,source,style,title,artist_group,in_train,new_filename
0,Barnett Newman,1955.0,abstract,15530.0,6911.0,9201912.0,wikiart,Color Field Painting,Uriel,train_only,True,102257.jpg
1,Barnett Newman,1950.0,abstract,14559.0,6866.0,8867532.0,wikiart,Color Field Painting,Vir Heroicus Sublimis,train_only,True,75232.jpg
2,kiri nichol,2013.0,,9003.0,9004.0,1756681.0,,Neoplasticism,,test_only,False,32145.jpg
3,kiri nichol,2013.0,,9003.0,9004.0,1942046.0,,Neoplasticism,,test_only,False,20304.jpg
4,kiri nichol,2013.0,,9003.0,9004.0,1526212.0,,Neoplasticism,,test_only,False,836.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...
103245,Jackson Pollock,1948.0,abstract,682.0,220.0,96405.0,wikiart,Action painting,Number 13A (Arabesque),train_and_test,True,25525.jpg
103246,Bernardo Strozzi,,religious painting,329.0,456.0,127594.0,wikiart,Baroque,St. Francis of Assisi,train_only,True,47038.jpg
103247,Josef Sima,,landscape,293.0,512.0,102519.0,wikiart,Surrealism,Maisons à la campagne II,train_and_test,False,7680.jpg
103248,Brett Whiteley,1982.0,marina,293.0,512.0,167423.0,wikiart,,Thebe's Revenge,train_and_test,True,9021.jpg


In [4]:
# Keep only the metadata columns that the rest of the notebook works with.
art = df[
    [
        "artist",
        "date",
        "style",
        "genre",
        "title",
        "new_filename",
        "artist_group",
    ]
]

In [5]:
# Drop rows that are exact duplicates across all selected columns,
# keeping the first occurrence of each (the pandas default, spelled out).
non_duplicates = art.drop_duplicates(keep="first")

In [6]:
def clean_year(item):
    """Extract the first run of 3-4 digits from ``item`` as an integer year.

    Parameters
    ----------
    item : any
        Raw date value. It is converted with ``str()`` before matching, so
        floats such as ``1955.0`` and strings such as ``"c.1921"`` both work.

    Returns
    -------
    int or str
        The first 3-4 digit run as an ``int``, or the empty string ``''`` when
        the text contains no such run (e.g. NaN, whose string form is "nan").
    """
    # re.search returns None when nothing matches, so the "no year" case is
    # handled explicitly instead of through a blanket `except Exception`,
    # which would also have hidden unrelated errors.
    match = re.search(r"\d{3,4}", str(item))
    if match is None:
        # Keep the '' sentinel for backward compatibility with existing callers.
        return ''
    return int(match.group())

In [7]:
# non_duplicates.dropna(subset=["date"], inplace=True)
# non_duplicates["new_date"] = non_duplicates["date"].apply(clean_year)
# non_duplicates = non_duplicates[non_duplicates["new_date"] != ''].copy()

In [8]:
non_duplicates

Unnamed: 0,artist,date,style,genre,title,new_filename,artist_group
0,Barnett Newman,1955.0,Color Field Painting,abstract,Uriel,102257.jpg,train_only
1,Barnett Newman,1950.0,Color Field Painting,abstract,Vir Heroicus Sublimis,75232.jpg,train_only
2,kiri nichol,2013.0,Neoplasticism,,,32145.jpg,test_only
3,kiri nichol,2013.0,Neoplasticism,,,20304.jpg,test_only
4,kiri nichol,2013.0,Neoplasticism,,,836.jpg,test_only
...,...,...,...,...,...,...,...
103245,Jackson Pollock,1948.0,Action painting,abstract,Number 13A (Arabesque),25525.jpg,train_and_test
103246,Bernardo Strozzi,,Baroque,religious painting,St. Francis of Assisi,47038.jpg,train_only
103247,Josef Sima,,Surrealism,landscape,Maisons à la campagne II,7680.jpg,train_and_test
103248,Brett Whiteley,1982.0,,marina,Thebe's Revenge,9021.jpg,train_and_test


In [9]:
# Count distinct image files per style, then order styles from most to fewest files.
ranking = (
    non_duplicates[["new_filename", "style"]]
    .groupby(["style"])
    .nunique()
    .sort_values(["new_filename"], ascending=False)
)

In [10]:
ranking[:15]

Unnamed: 0_level_0,new_filename,style
style,Unnamed: 1_level_1,Unnamed: 2_level_1
Impressionism,10643,1
Realism,10523,1
Romanticism,9285,1
Expressionism,7013,1
Post-Impressionism,5778,1
Art Nouveau (Modern),4899,1
Baroque,4400,1
Surrealism,4167,1
Symbolism,3476,1
Rococo,2733,1


In [11]:
# Names of the 15 styles at the top of the ranking, as a plain Python list.
top_15 = ranking.index[:15].tolist()

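**Sampling plan**

- 22,500 images in total
- Top 15 styles
- 22,500 / 15 = 1,500 images per style
- 70/30 train/test split: 1,050 train and 450 test images per style

Note: the constants below are set to `N_SAMPLES_PER = 1600`, which gives 1,120 / 480 per style and 24,000 images in total, so this plan and the code disagree.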

In [12]:
top_15

['Impressionism',
 'Realism',
 'Romanticism',
 'Expressionism',
 'Post-Impressionism',
 'Art Nouveau (Modern)',
 'Baroque',
 'Surrealism',
 'Symbolism',
 'Rococo',
 'Northern Renaissance',
 'Naïve Art (Primitivism)',
 'Abstract Expressionism',
 'Neoclassicism',
 'Cubism']

In [13]:
top_15

['Impressionism',
 'Realism',
 'Romanticism',
 'Expressionism',
 'Post-Impressionism',
 'Art Nouveau (Modern)',
 'Baroque',
 'Surrealism',
 'Symbolism',
 'Rococo',
 'Northern Renaissance',
 'Naïve Art (Primitivism)',
 'Abstract Expressionism',
 'Neoclassicism',
 'Cubism']

In [39]:
# Number of files drawn per style.
N_SAMPLES_PER = 1600
# Fraction of each style's drawn files that goes to the test split.
TEST_PCT = 0.30

In [40]:
def split_genre(style):
    """Randomly draw ``N_SAMPLES_PER`` files of one style and split them into train/test.

    Parameters
    ----------
    style : str
        Value compared against the ``style`` column of ``non_duplicates``.

    Returns
    -------
    tuple
        ``(X_train, X_test)``: the drawn ``new_filename`` values, partitioned
        by ``train_test_split`` with ``test_size=TEST_PCT``.

    Raises
    ------
    ValueError
        If fewer than ``N_SAMPLES_PER`` rows have the requested style, since
        sampling is done without replacement.
    """
    # Re-seed on every call so each style's draw does not depend on call order.
    np.random.seed(28)
    style_list = non_duplicates[non_duplicates['style'] == style]['new_filename']

    # np.random.choice(..., replace=False) would raise a generic ValueError here;
    # fail early with a message that names the offending style instead.
    if len(style_list) < N_SAMPLES_PER:
        raise ValueError(
            f"Style {style!r} has only {len(style_list)} files; "
            f"cannot sample {N_SAMPLES_PER} without replacement."
        )

    style_files = np.random.choice(style_list, N_SAMPLES_PER, replace=False)

    # No label array is needed: train_test_split accepts a single array, and the
    # partition is driven by the sample count and random_state.
    X_train, X_test = train_test_split(style_files, test_size=TEST_PCT, random_state=42)
    return X_train, X_test

In [41]:
def sample_genre(style):
    """Return the rows sampled for ``style``, tagged with a ``group`` column.

    The file names chosen by ``split_genre`` are looked up in
    ``non_duplicates``; matching rows are labelled ``"train"`` or ``"test"``
    and returned as one DataFrame with the train rows first.
    """
    np.random.seed(28)
    train_files, test_files = split_genre(style)

    all_filenames = non_duplicates["new_filename"]
    labelled_parts = [
        # .assign returns a new frame, so non_duplicates itself is never modified.
        non_duplicates[all_filenames.isin(files)].assign(group=label)
        for files, label in ((train_files, "train"), (test_files, "test"))
    ]
    return pd.concat(labelled_parts)

In [42]:
sample_genre("Impressionism")

Unnamed: 0,artist,date,style,genre,title,new_filename,artist_group,group
402,Paul Signac,1922,Impressionism,cityscape,Le Havre with rain clouds,8250.jpg,train_and_test,train
504,Paul Signac,c.1921,Impressionism,landscape,Saint Paul de Vence,95477.jpg,train_and_test,train
1083,Alfred Sisley,1871,Impressionism,landscape,River steamboat and bridge,34200.jpg,train_and_test,train
1087,Claude Monet,1886,Impressionism,landscape,Haystack at Giverny,52023.jpg,train_and_test,train
1336,Anders Zorn,1893,Impressionism,genre painting,Omnibus,100236.jpg,train_and_test,train
...,...,...,...,...,...,...,...,...
100930,James McNeill Whistler,1895,Impressionism,portrait,Edward Guthrie Kennedy,87589.jpg,train_and_test,test
101304,Firmin Baes,,Impressionism,genre painting,Sunny terrace,69399.jpg,train_only,test
101650,Nikolay Bogdanov-Belsky,,Impressionism,genre painting,Fisherman,30921.jpg,train_and_test,test
101873,Max Slevogt,1917.0,Impressionism,wildlife painting,Tiger in the Jungle,15421.jpg,train_and_test,test


In [43]:
def get_sample():
    """Concatenate the labelled train/test sample of every style in ``top_15``."""
    per_style_frames = list(map(sample_genre, top_15))
    return pd.concat(per_style_frames)

In [44]:
# Build the combined train/test sample across all styles in `top_15`, then display it.
sample = get_sample()
sample

Unnamed: 0,artist,date,style,genre,title,new_filename,artist_group,group
402,Paul Signac,1922,Impressionism,cityscape,Le Havre with rain clouds,8250.jpg,train_and_test,train
504,Paul Signac,c.1921,Impressionism,landscape,Saint Paul de Vence,95477.jpg,train_and_test,train
1083,Alfred Sisley,1871,Impressionism,landscape,River steamboat and bridge,34200.jpg,train_and_test,train
1087,Claude Monet,1886,Impressionism,landscape,Haystack at Giverny,52023.jpg,train_and_test,train
1336,Anders Zorn,1893,Impressionism,genre painting,Omnibus,100236.jpg,train_and_test,train
...,...,...,...,...,...,...,...,...
102198,Juan Gris,1920,Cubism,sketch and study,Harlequin,87717.jpg,train_and_test,test
102240,Willi Baumeister,1930.0,Cubism,abstract,Machine Man with Spiral Turn,40473.jpg,train_and_test,test
102346,Henri Laurens,1915.0,Cubism,portrait,Head of a Woman,32438.jpg,train_only,test
102890,Roy Lichtenstein,1980.0,Cubism,landscape,Landscape with figures and rainbow,2862.jpg,train_only,test


In [None]:
# Placeholder file name, left empty here; set it before running the cells below that use it.
filename = ''

In [31]:
# sample.to_csv(f"utils/{filename}.csv")

In [57]:

# Load the pickled object stored at `filename`.
# `filename` must point to an existing file; an empty string makes open() fail.
# SECURITY: unpickling can execute arbitrary code, so only load files you created
# yourself or otherwise trust.
with open(filename, 'rb') as f:  # the context manager closes the file even if loading fails
    data = pickle.load(f)