In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


In [81]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [82]:
len(artworks)

134672

In [83]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

#Drop Films and tricky columns
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!= 'Media and Performance Art']
artworks = artworks[artworks['Department']!= 'Flexus Collection']

# Drop missing data.
artworks = artworks.dropna()

In [84]:
artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1


In [85]:
# Get data types.
artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [86]:
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [87]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

In [88]:
# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)


In [89]:
# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = artworks.Department

In [90]:
X, X_test, Y, y_test = train_test_split(
        X, Y, test_size=0.8, random_state=42)

### 300 hidden layers, same parameters as original. 

In [91]:
# Alright! We've done our prep, let's build the model.
# Neural networks are hugely computationally intensive.
# This may take several minutes to run.

# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 300perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(300))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=300, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [92]:
mlp.score(X, Y)

0.46474494842447367

In [93]:
Y.value_counts()/len(Y)

Prints & Illustrated Books    0.526636
Photography                   0.222929
Architecture & Design         0.110829
Drawings                      0.099289
Painting & Sculpture          0.032405
Fluxus Collection             0.007913
Name: Department, dtype: float64

In [94]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=3)

array([0.59627066, 0.62074325, 0.44833922])

The 1000 layer model baseline was:
- score 0.68022899006559012
- Prints & Illustrated Books    0.521662
- Photography                   0.229354
- Architecture & Design         0.111225
- Drawings                      0.103381
- Painting & Sculpture          0.034377
Name: Department, dtype: float64
[ 0.57519536,  0.52577072,  0.36922856,  0.48580744,  0.54039799])

Reducing the width of the layer made our model worse. Let's change the # of layers, but keep the overal size the same

### Same paramaters, 2 layers of 50 each. 

In [96]:
mlp = MLPClassifier(hidden_layer_sizes=(150,150))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(150, 150), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [97]:
mlp.score(X, Y)

0.3809052800150723

In [98]:
cross_val_score(mlp, X, Y, cv=3)

array([0.12798418, 0.47378833, 0.55265018])

The score is much better, but the scores don't hold up well across folds. 

### Same parameters, 2 layers of 300. 

In [99]:
mlp = MLPClassifier(hidden_layer_sizes=(300,300))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(300, 300), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [100]:
mlp.score(X, Y)

0.5550374452451604

In [101]:
cross_val_score(mlp, X, Y, cv=3)

array([0.53651646, 0.56492864, 0.55689046])

This is by far my best combination of score and cross-val. Let's change some parameters to see what that does. 

## 300 X 2  layers, logistic activation. 

In [116]:
mlp = MLPClassifier(hidden_layer_sizes=(300,300), activation = 'logistic')
mlp.fit(X, Y)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(300, 300), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [117]:
mlp.score(X, Y)

0.663369601055061

In [103]:
cross_val_score(mlp, X, Y, cv=3)

array([0.62282808, 0.64646036, 0.64946996])

Wow. logistic activation made a huge difference in accuracy and the crossfold score stayed consistent. Let's try the lbfgs solver which is supposed to work well on smaller data sets. 

### 300 X 2 layers, Logistic activation, lbfgs solver. 

In [106]:
mlp = MLPClassifier(hidden_layer_sizes=(300,300), 
                    activation = 'logistic',
                   solver = 
                    'lbfgs')
mlp.fit(X, Y)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(300, 300), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [107]:
cross_val_score(mlp, X, Y, cv=3)

array([0.53340867, 0.53610287, 0.53215548])

In [108]:
mlp.score(X, Y)

0.5403890537421695

Switching to a different solver made our scores worse, but slightly more consistent across folds. Let's try stochastic gradient descent with an adaptive learning rate.

### 300 X 2, logitistic activation, sgd solver with adaptive learning rate 

In [112]:
mlp = MLPClassifier(hidden_layer_sizes=(300,300), 
                    activation = 'logistic',
                   solver = 'sgd',
                   learning_rate = 'adaptive')
mlp.fit(X, Y)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(300, 300), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='sgd', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [113]:
cross_val_score(mlp, X, Y, cv=3)

array([0.52648679, 0.52663558, 0.52678445])

In [114]:
mlp.score(X, Y)

0.526635580048043

This actually made things a little worse overall, but look at the consistency of those folds! The adaptive learning rate seemed to make a huge difference in accuracy across folds. 

## 200 X 3 logistic activation

In [109]:
mlp = MLPClassifier(hidden_layer_sizes=(200,200,200),
                    activation = 'logistic')
mlp.fit(X, Y)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200, 200, 200), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [110]:
mlp.score(X, Y)

0.657010974518393

In [111]:
cross_val_score(mlp, X, Y, cv=3)

array([0.64359373, 0.64165607, 0.64310954])

This was our best overall model in terms of a balance of accuracy and consistency. Let's try adding a few more layers. 

# 100 x 6, logisitic activation

In [118]:
mlp = MLPClassifier(hidden_layer_sizes=(100,100,100,100, 100, 100),
                    activation = 'logistic')
mlp.fit(X, Y)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100, 100, 100, 100),
       learning_rate='constant', learning_rate_init=0.001, max_iter=200,
       momentum=0.9, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [119]:
mlp.score(X, Y)

0.5454759549715039

In [120]:
cross_val_score(mlp, X, Y, cv=3)

array([0.54767623, 0.53045076, 0.56720848])

Worse than 200 X 3 in both accuracy and consistency. 

## Conclusion

200 X 3 logistic activation was my best performing nueral network. Unlike some of the classification/Regression models, tuning parameters actually seems to make a noticeable differnce on the model's performance. 