In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Drill: Playing with layers

Now it's your turn. Using the space below, experiment with different hidden layer structures. You can try this on a subset of the data to improve runtime. See how things vary. See what seems to matter the most. Feel free to manipulate other parameters as well. It may also be beneficial to do some real feature selection work...

In [2]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [3]:
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

In [7]:
# Select Columns.
artworks = artworks[['Artist', 'Gender', 'Department', 'URL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
#artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

# Drop missing data.
artworks = artworks.dropna()

In [8]:
artworks.head()

Unnamed: 0,Artist,Gender,Department,URL,Height (cm),Width (cm)
0,Otto Wagner,(Male),Architecture & Design,True,48.6,168.9
1,Christian de Portzamparc,(Male),Architecture & Design,True,40.6401,29.8451
2,Emil Hoppe,(Male),Architecture & Design,True,34.3,31.8
3,Bernard Tschumi,(Male),Architecture & Design,True,50.8,50.8
4,Emil Hoppe,(Male),Architecture & Design,True,38.4,19.1


The `DateAcquired` column is an object. Let's transform that to a datetime object and add a feature for just the year the artwork was acquired.

### Drop some variable and not create dummies, so to save time

In [12]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
#artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
#artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
#artworks['Date'] = pd.Series(artworks.Date.str.extract(
#    '([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = artworks.drop(['Department', 'Artist'], 1)

# Create dummies separately.
#artists = pd.get_dummies(artworks.Artist)
#nationalities = pd.get_dummies(artworks.Nationality)
#dates = pd.get_dummies(artworks.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
#X = pd.concat([X, artists], axis=1)

Y = artworks.Department

In [15]:
X.head()

Unnamed: 0,URL,Height (cm),Width (cm),Gender_(),Gender_(Female),Gender_(Male),Gender_(male),Gender_\(multiple_persons\)
0,True,48.6,168.9,0,0,1,0,0
1,True,40.6401,29.8451,0,0,1,0,0
2,True,34.3,31.8,0,0,1,0,0
3,True,50.8,50.8,0,0,1,0,0
4,True,38.4,19.1,0,0,1,0,0


In [16]:
# Alright! We've done our prep, let's build the model.
# Neural networks are hugely computationally intensive.
# This may take several minutes to run.

# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(10,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [17]:
mlp.score(X, Y)

0.56572485432883524

In [18]:
Y.value_counts()/len(Y)

Prints & Illustrated Books               0.510240
Photography                              0.220962
Architecture & Design                    0.136783
Drawings                                 0.099712
Painting & Sculpture                     0.032106
Architecture & Design - Image Archive    0.000198
Name: Department, dtype: float64

In [19]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)

array([ 0.50754039,  0.55880769,  0.48781145,  0.5240853 ,  0.51789001])

## I lowered the variables and hidden layer size and got almost the same result