This notebook cleans the WikiArt dataset and writes a new CSV file (`clean.csv`) that is used everywhere else in the project.

In [1]:
import pandas as pd

# Load the raw WikiArt metadata straight from the project's GitHub repository
# and preview the first rows.
WIKIART_CSV_URL = 'https://raw.githubusercontent.com/jd1771/ML-Artwork/main/data/wikiart.csv'
df = pd.read_csv(WIKIART_CSV_URL)
df.head()

Unnamed: 0,Style,Artwork,Artist,Date,Link
0,Early-Dynastic,Narmer Palette,Ancient Egypt,3050 BC,https://uploads3.wikiart.org/00265/images/anci...
1,Early-Dynastic,Box Inlay with a Geometric Pattern,Ancient Egypt,3100-2900 BC,https://uploads2.wikiart.org/00244/images/anci...
2,Old-Kingdom,Khafre Enthroned,Ancient Egypt,2570 BC,https://uploads2.wikiart.org/00305/images/anci...
3,Middle-Kingdom,Stele of the Serpent King (Stela of Djet),Ancient Egypt,3000 BC,https://uploads7.wikiart.org/00305/images/anci...
4,Middle-Kingdom,"Laden Donkeys and Ploughing, Tomb of Djar",Ancient Egypt,2060-2010 BC,https://uploads8.wikiart.org/00244/images/anci...


In [2]:
# Size of the raw dataset as (rows, columns). A bare last expression is
# displayed by the notebook, matching how the later shape cells are written.
df.shape

(124170, 5)


In [3]:
# Drop works whose artist is unknown; the dataset labels these
# "Ancient <region>" (e.g. "Ancient Egypt"). A literal prefix test replaces
# the negative-lookahead regex and selects the same rows.
# na=False keeps the mask strictly boolean: a row with a missing Artist is
# treated as "not Ancient" and kept here instead of raising during indexing.
is_ancient = df['Artist'].str.startswith('Ancient', na=False)
edited = df[~is_ancient]

In [4]:
# Preview the first rows that remain once the "Ancient ..." artists are removed.
edited.head()

Unnamed: 0,Style,Artwork,Artist,Date,Link
249,Classical,Aphrodite Anadyomene from Pompeii (detail),Apelles,400-301 BC,https://uploads3.wikiart.org/00202/images/apel...
250,Classical,Aphrodite Anadyomene from Pompeii (detail),Apelles,400-301 BC,https://uploads8.wikiart.org/00202/images/apel...
251,Classical,Aphrodite Anadyomene from Pompeii,Apelles,400-301 BC,https://uploads7.wikiart.org/00202/images/apel...
252,Classical,Aphrodite Anadyomene from Pompeii (detail),Apelles,400-301 BC,https://uploads7.wikiart.org/00202/images/apel...
253,Classical,Aphrodite Anadyomene from Pompeii (detail),Apelles,400-301 BC,https://uploads1.wikiart.org/00205/images/apel...


In [5]:
# Drop partial artworks. This removes every title that contains the
# lower-case substring "detail" anywhere (e.g. "... (detail)"), not only
# titles that end in "(detail)"; capitalised "Detail ..." titles are kept.
# NOTE(review): anchor the pattern if only a trailing "(detail)" should go.
# regex=False does a literal substring search; na=False treats a missing
# title as "no match", so such a row is kept rather than raising an error.
has_detail = edited['Artwork'].str.contains('detail', regex=False, na=False)
edited = edited[~has_detail]

In [6]:
# Preview the first rows again now that titles containing "detail" are gone.
edited.head()

Unnamed: 0,Style,Artwork,Artist,Date,Link
251,Classical,Aphrodite Anadyomene from Pompeii,Apelles,400-301 BC,https://uploads7.wikiart.org/00202/images/apel...
255,Classical,Alexander Mosaic (depicting the Battle of Issu...,Apelles,400-301 BC,https://uploads2.wikiart.org/00202/images/apel...
256,Classical,"Painting of Alexander as Zeus, based on an ori...",Apelles,400-301 BC,https://uploads4.wikiart.org/00205/images/apel...
293,Hellenistic,Aphrodite Anadyomene from Pompeii,Apelles,400-301 BC,https://uploads7.wikiart.org/00202/images/apel...
297,Hellenistic,Alexander Mosaic (depicting the Battle of Issu...,Apelles,400-301 BC,https://uploads2.wikiart.org/00202/images/apel...


In [7]:
# List the distinct style labels still present in the filtered data.
edited.Style.unique()

array(['Classical', 'Hellenistic', 'Geometric', 'Early-Christian',
       'Early-Byzantine-(c.-–)', 'Middle-Byzantine-(c.-–)', 'Mozarabic',
       'Komnenian-style-(-)', 'Latin-Empire-of-Constantinople-(-)',
       'Late-Byzantine/Palaeologan-Renaissance-(c.-–)', 'Byzantine',
       'Macedonian-Renaissance-(–)', 'Romanesque', 'Mosan-art', 'Gothic',
       'International-Gothic', 'Viking-art', 'Medieval-Art',
       'Crusader-workshop', 'Moscow-school-of-icon-painting',
       'Cretan-school-of-icon-painting',
       'Macedonian-school-of-icon-painting', 'Coptic-art',
       'Novgorod-school-of-icon-painting', 'Kyiv-school-of-icon-painting',
       'Vladimir-school-of-icon-painting', 'Galicia-Volyn-school',
       'Pskov-school-of-icon-painting',
       'Yaroslavl-school-of-icon-painting',
       'Vologda-school-of-icon-painting',
       'Chernihiv-school-of-icon-painting', 'Proto-Renaissance',
       'Early-Renaissance', 'High-Renaissance',
       'Mannerism-(Late-Renaissance)', 'North

In [8]:
# Drop photography styles. The search term omits the first letter,
# presumably so that both "Photography" and "photography" spellings match;
# styles such as "Photorealism" do not contain it and are unaffected.
# regex=False does a literal substring search; na=False treats a missing
# style as "no match", so such a row is kept rather than raising an error.
is_photography = edited['Style'].str.contains('hotography', regex=False, na=False)
edited = edited[~is_photography]

In [9]:
# List the distinct styles again to check the result of the photography filter.
edited.Style.unique()

array(['Classical', 'Hellenistic', 'Geometric', 'Early-Christian',
       'Early-Byzantine-(c.-–)', 'Middle-Byzantine-(c.-–)', 'Mozarabic',
       'Komnenian-style-(-)', 'Latin-Empire-of-Constantinople-(-)',
       'Late-Byzantine/Palaeologan-Renaissance-(c.-–)', 'Byzantine',
       'Macedonian-Renaissance-(–)', 'Romanesque', 'Mosan-art', 'Gothic',
       'International-Gothic', 'Viking-art', 'Medieval-Art',
       'Crusader-workshop', 'Moscow-school-of-icon-painting',
       'Cretan-school-of-icon-painting',
       'Macedonian-school-of-icon-painting', 'Coptic-art',
       'Novgorod-school-of-icon-painting', 'Kyiv-school-of-icon-painting',
       'Vladimir-school-of-icon-painting', 'Galicia-Volyn-school',
       'Pskov-school-of-icon-painting',
       'Yaroslavl-school-of-icon-painting',
       'Vologda-school-of-icon-painting',
       'Chernihiv-school-of-icon-painting', 'Proto-Renaissance',
       'Early-Renaissance', 'High-Renaissance',
       'Mannerism-(Late-Renaissance)', 'North

In [10]:
# Collapse rows that describe the same artwork (same Artist, Artwork and Date)
# into one row, keeping the first Style and the first Link found for each.
# NOTE(review): groupby drops rows whose key contains NaN by default — confirm
# that no artwork is lost because of a missing Artist, Artwork or Date.
artwork_keys = ['Artist', 'Artwork', 'Date']
agg_functions = dict.fromkeys(('Style', 'Link'), 'first')
edited = edited.groupby(by=artwork_keys).agg(agg_functions)

In [11]:
# Preview the aggregated frame; Artist, Artwork and Date now form the row index.
edited.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Style,Link
Artist,Artwork,Date,Unnamed: 3_level_1,Unnamed: 4_level_1
3D,3D,1984,Street-art,https://uploads7.wikiart.org/images/3d/3d-1984...
3D,Agent Blue,1991,Street-art,https://uploads6.wikiart.org/images/3d/agent-b...
3D,In Baghdad,XX cent.,Street-art,https://uploads1.wikiart.org/images/3d/in-bagh...
3D,Naked Lunch,1994,Street-art,https://uploads7.wikiart.org/images/3d/naked-l...
3D,No Great Crime,1983,Street-art,https://uploads1.wikiart.org/images/3d/no-grea...


In [12]:
# (rows, columns): one row per (Artist, Artwork, Date) group; columns are Style and Link.
edited.shape

(107731, 2)

In [13]:
# Count the artworks per artist, then rank artists from most to fewest works.
# The index is reset before sorting, so each row keeps the positional label it
# had in the alphabetical per-artist listing.
paintings_per_artist = edited.groupby('Artist').size()
edited_counts = (
    paintings_per_artist
    .reset_index(name='num_paintings')
    .sort_values(by='num_paintings', ascending=False)
)
edited_counts.head(10)


Unnamed: 0,Artist,num_paintings
1078,Giovanni Battista Piranesi,1126
2902,Vincent van Gogh,917
2288,Pablo Picasso,812
96,Albrecht Durer,797
1964,Marc Chagall,713
2610,Salvador Dali,678
557,Claude Monet,670
2471,Rembrandt,579
138,Alfred Freddy Krupa,528
1236,Henri Matisse,498


In [14]:
# edited_counts is sorted by num_paintings in descending order, so its first
# ten rows are the ten most prolific artists. .iloc makes the positional slice
# explicit: after sorting, the index labels are no longer 0, 1, 2, ...
top_10 = edited_counts['Artist'].iloc[:10]
# Turn the Artist/Artwork/Date index levels back into ordinary columns so the
# frame can be filtered by artist name.
edited_w_artist = edited.reset_index()

In [15]:
# Keep only the artworks created by one of the top-10 artists.
is_top_10_artist = edited_w_artist['Artist'].isin(top_10)
edited_top_10 = edited_w_artist.loc[is_top_10_artist]

In [16]:
# (rows, columns) of the subset restricted to the top-10 artists.
edited_top_10.shape

(7318, 5)

In [17]:
# List the distinct styles that remain after restricting the data to the
# 10 most prolific artists (use .nunique() instead for just the count).
edited_top_10.Style.unique()

array(['Northern-Renaissance', 'Renaissance', 'Ink-and-wash-painting',
       'Neo-Expressionism', 'New-Ink-Painting', 'Neo-Figurative-Art',
       'Sumi-e-(Suiboku-ga)', 'New-Realism', 'Photorealism',
       'New-European-Painting', 'Contemporary',
       'Figurative-Expressionism', 'Fantastic-Realism', 'Post-Minimalism',
       'Minimalism', 'Fantasy-Art', 'Conceptual-Art',
       'P&D-(Pattern-and-Decoration)', 'Neo-Minimalism', 'Art-Informel',
       'Contemporary-Realism', 'Pop-Art', 'Cubo-Expressionism',
       'Existential-Art', 'Neo-Pop-Art', 'Street-art', 'Fauvism',
       'Academicism', 'Japonism', 'Poster-Art-Realism',
       'Neo-Impressionism', 'Impressionism', 'Realism', 'Neoclassicism',
       'Orientalism', 'Expressionism', 'Abstract-Expressionism',
       'Post-Impressionism', 'Cubism', 'Divisionism', 'Pointillism',
       'Abstract-Art', 'Naïve-Art-(Primitivism)', 'Surrealism',
       'Symbolism', 'Analytical-Cubism', 'Art-Nouveau-(Modern)',
       'Synthetic-Cubism',

In [18]:
# Write the cleaned top-10-artist subset to disk for the rest of the project.
# The DataFrame index is written as the first (unnamed) column.
output_path = 'clean.csv'
edited_top_10.to_csv(output_path)

Reference for loading images with PIL/Pillow: https://machinelearningmastery.com/how-to-load-and-manipulate-images-for-deep-learning-in-python-with-pil-pillow/

TODO (Finn): resize data for just the top 10 artists; ensure variance.