In [1]:
import numpy as np, pandas as pd, scipy, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

# Load the museum collection dataset from its hosted CSV.
artworks = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/museum-collection-dataset/artworks.csv')

# Discard identifier / bookkeeping columns that the rest of the notebook does not use.
# `columns=` is used instead of a positional axis argument, which pandas 2.0 no longer accepts.
artworks = artworks.drop(columns=['Artwork ID', 'Artist ID', 'Catalogue', 'Object Number', 'Credit'])
artworks.columns

Index(['Title', 'Name', 'Date', 'Medium', 'Dimensions', 'Acquisition Date',
       'Department', 'Classification', 'Diameter (cm)', 'Circumference (cm)',
       'Height (cm)', 'Length (cm)', 'Width (cm)', 'Depth (cm)', 'Weight (kg)',
       'Duration (s)'],
      dtype='object')

First Neural Net

Multiple features feed into a set of perceptron models, each of which generates a response that is then fed into our final model.

In [2]:
# Drop films and some other tricky rows.
excluded_departments = [
    'Film',
    'Media and Performance Art',
    'Fluxus Collection',
    'Architecture & Design - Image Archive',
]
artworks = artworks[~artworks['Department'].isin(excluded_departments)]

# Convert acquisition date to numerical format (the year); dates that cannot
# be parsed are coerced to NaT and therefore produce a missing year.
acquired = pd.to_datetime(artworks['Acquisition Date'], errors='coerce')
artworks = artworks.assign(YearAcquired=acquired.dt.year)

# Remove the raw date and the medium, then fill every missing value with 0.
# Because missing values are filled rather than dropped, no rows are lost here
# and a subsequent dropna() would have nothing left to remove.
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
artworks = artworks.drop(columns=['Acquisition Date', 'Medium']).replace(np.nan, 0)

# Reduce the overall number of observations
artworks = artworks.sample(n=10000, random_state=23)

# Number of rows kept after sampling.
artworks.shape[0]

10000

In [3]:
# Convert dates to start date, cutting down number of distinct examples.
# The first run of four digits is taken as the year. The extracted Series is
# assigned in full: slicing it with [:-1] dropped its last element, so index
# alignment left the last row's Date as NaN.
artworks['Date'] = artworks.Date.str.extract('([0-9]{4})', expand=False)

# One indicator column per distinct artist name.
artists = pd.get_dummies(artworks.Name)
# NOTE(review): missing names appear to be filled with the integer 0 earlier in
# the notebook, which would give one dummy column an integer label. Recent
# scikit-learn releases reject frames whose column names mix str and non-str,
# so every label is coerced to str - confirm against the installed version.
artists.columns = artists.columns.astype(str)
# `axis` must be passed by keyword; pandas 2.0 rejects a positional axis.
artworks = pd.concat([artworks, artists], axis=1)

# Final and N/A drops
X = artworks.drop(columns=['Title', 'Department', 'Name', 'Date', 'Dimensions', 'Classification'])
Y = artworks.Department

In [4]:
# Summarize the feature matrix: index range, column count, dtypes and memory usage.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 79978 to 49481
Columns: 2994 entries, Diameter (cm) to Öyvind Fahlström
dtypes: float64(9), uint8(2985)
memory usage: 29.2 MB


In [5]:
# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model with five hidden layers of 100 perceptrons each.
# random_state fixes the random weight initialisation and batch sampling so
# that the fitted model, and the score below, are reproducible between runs.
mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100, 100), random_state=23)
mlp.fit(X, Y)
# Accuracy measured on the same data the model was fitted on, so it is an
# optimistic estimate of how the model performs on unseen data.
mlp.score(X, Y)

0.8867

In [6]:
# Relative frequency of each class in Y: per-class counts divided by the
# total number of observations.
Y.value_counts().div(len(Y))

Prints & Illustrated Books    0.4979
Photography                   0.2312
Architecture & Design         0.1475
Drawings                      0.0951
Painting & Sculpture          0.0283
Name: Department, dtype: float64

In [None]:
from sklearn.model_selection import cross_val_score

# Score the classifier with 5-fold cross-validation; the result is one score per fold.
cross_val_score(estimator=mlp, X=X, y=Y, cv=5)

Parameters
1. Hidden layer sizes - pass a tuple or list with the size of each hidden layer; the number of hidden layers equals the number of items in it
2. Alpha - the regularization term penalizes large coefficients; alpha scales the strength of that penalty
3. Activation function - the function applied to each perceptron's weighted input, which determines the form of its output (for example binary versus continuous)

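ReLU or rectified linear unit function - outputs 0 for negative inputs and the input itself otherwise, i.e. max(0, x); it is not binary, but it is cheap to compute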

Sigmoid or logistic sigmoid function - squashes the output into a continuous value between 0 and 1, allowing a more nuanced model but at a higher computational cost

Additional layers helps prevent overfitting, additional 