In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [27]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [28]:
#Select Columns
artworks = artworks[['Artist','Nationality','Gender','Date','Department','DateAcquired','URL',
                    'ThumbnailURL','Height (cm)','Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

#Drop films and some other rows
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

#Drop missing data.
artworks = artworks.dropna()

In [29]:
artworks['Department'].unique()

array(['Architecture & Design', 'Prints & Illustrated Books', 'Drawings',
       'Painting & Sculpture', 'Photography'], dtype=object)

In [30]:
artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [31]:
#Remove date and month from Date Acquired to just Year Acquired, and check for datatype.
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [32]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to to year, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat artists with other variables
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = artworks.Department

In [33]:
from sklearn.model_selection import train_test_split
#Divide into training and test sets.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.9,
    random_state=42)

In [None]:
#Running the MLP classifier with a smaller subset and changing the hidden layer sizes.

In [34]:
from sklearn.neural_network import MLPClassifier #Multi Layer Perceptron

#Establish and fit the model, with a single 5000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(5000,), max_iter=10, batch_size=500)
mlp.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size=500, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=10, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [35]:
mlp.score(X,Y)

0.54691402069211559

In [36]:
Y.value_counts()/len(Y)

Prints & Illustrated Books    0.524120
Photography                   0.225444
Architecture & Design         0.111840
Drawings                      0.103952
Painting & Sculpture          0.034644
Name: Department, dtype: float64

In [37]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)



array([ 0.54808388,  0.6424818 ,  0.42903288,  0.42312328,  0.50756991])