Create a multi-layer perceptron neural network model to predict on a labeled dataset of your choosing. Compare this model to either a boosted tree or a random forest model and describe the relative tradeoffs between complexity and accuracy. Be sure to vary the hyperparameters of your MLP!

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from scipy import stats
from scipy.stats import norm


from sklearn.preprocessing import StandardScaler
from sklearn import ensemble, tree, linear_model
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn.tree import DecisionTreeRegressor

from xgboost import XGBClassifier
from xgboost import XGBRegressor

from sklearn.feature_selection import RFECV

import timeit

import warnings
# NOTE(review): this silences every warning for the rest of the session, so any
# deprecation or data-handling warnings that later cells emit are never shown.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the artworks table from 'Artworks.csv' (path is relative to the
# notebook's working directory).
artworks = pd.read_csv('Artworks.csv')

In [4]:
# Inspect (rows, columns) of the full table before subsetting.
artworks.shape

(135804, 29)

In [5]:
# Keep only the first `sample_size` rows (presumably to keep run time
# manageable).
# NOTE(review): this is a head slice, not a random sample, so the subset
# inherits whatever ordering the CSV has.
sample_size = 8000
artworks = artworks.head(sample_size)

In [6]:
# Confirm the size of the subset.
artworks.shape

(8000, 29)

In [7]:
# List the available columns to choose features from.
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

In [8]:
# Check the dtype of URL before it is turned into a boolean flag.
artworks['URL'].dtypes

dtype('O')

In [9]:
# Select columns: everything not listed here is discarded.
keep_columns = ['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']

# Take an explicit copy so the column assignments below write to an
# independent frame rather than to a selection of the original one. Assigning
# into a selection is what triggers pandas' SettingWithCopyWarning, which the
# global warnings filter would otherwise hide.
artworks = artworks[keep_columns].copy()

# Convert URLs to booleans: True when a value is present, False when missing.
# NOTE: re-running this cell on its own output turns every flag into True,
# because the boolean flags themselves are never null.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
excluded_departments = ['Film', 'Media and Performance Art', 'Fluxus Collection']
artworks = artworks[~artworks['Department'].isin(excluded_departments)]

# Drop missing data.
artworks = artworks.dropna()

In [10]:
# Verify that URL is now a boolean presence flag.
artworks['URL'].dtypes

dtype('bool')

In [11]:
# Preview the first rows of the cleaned frame.
artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1


## Building a Model

In [12]:
# Get data types: shows which columns are still plain objects (strings) and
# therefore need converting or encoding before they can be used as features.
artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [13]:
# Parse the acquisition date once, store it back as datetimes, and derive the
# acquisition year as a numeric feature.
acquired = pd.to_datetime(artworks['DateAcquired'])
artworks['DateAcquired'] = acquired
artworks['YearAcquired'] = acquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [14]:
# Remove multiple nationalities, genders, and artists by collapsing them into
# single catch-all categories. Single values look like "(Male)", so a ") ("
# inside the string means more than one value is listed. The pattern is a raw
# string so the regex escapes are not treated as (invalid) string escapes.
multiple_marker = r'\) \('
# The replacement labels are plain text, not regexes, so they carry no backslashes.
artworks.loc[artworks['Gender'].str.contains(multiple_marker), 'Gender'] = '(multiple_persons)'
artworks.loc[artworks['Nationality'].str.contains(multiple_marker), 'Nationality'] = '(multiple_nationalities)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to the first four-digit year found, cutting down the number of
# distinct values. Rows without a four-digit year become NaN and so get no
# date dummy below. (The previous trailing [:-1] slice additionally turned the
# last row's date into NaN through index alignment.)
artworks['Date'] = artworks['Date'].str.extract('([0-9]{4})', expand=False)

# Drop the target and the columns that are encoded separately below.
# `columns=` is used because DataFrame.drop no longer accepts the axis as a
# positional argument in pandas 2.x.
X = artworks.drop(columns=['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'])

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat with the other variables. The artist dummies slow this down a lot, so
# they are left out of X for now.
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

# Target: the department each artwork belongs to.
Y = artworks.Department

## MLP

In [15]:
from sklearn.neural_network import MLPClassifier

# Class balance of the target. The share of the largest class is the accuracy
# that always guessing that class would reach, so it is the baseline for the
# scores below. It does not depend on the model, so it is shown once.
value_imp = Y.value_counts() / len(Y)
print(value_imp)
print()

# hidden_layer_sizes=(i,) builds ONE hidden layer with i units, so this loop
# varies the width of that layer, not the number of layers.
# NOTE(review): the features are passed in unscaled although StandardScaler is
# imported above; consider scaling before fitting the MLP.
for i in [1000, 1200]:
    print("Hidden layer size (units in one hidden layer):", i)
    print()
    start = timeit.default_timer()
    # Fixed seed (same value as the random forest cell) so repeated runs give
    # the same scores.
    mlp = MLPClassifier(hidden_layer_sizes=(i,), random_state=0)
    mlp.fit(X, Y)

    # Score on the same data the model was trained on (optimistic).
    score = mlp.score(X, Y)
    print(score)
    print()

    # 5-fold cross-validated scores and their mean.
    cross_score = cross_val_score(mlp, X, Y, cv=5)
    print(cross_score)
    print()
    score = np.mean(cross_score)
    print(score)
    print()

    # The timer spans the full fit, the training score and the cross-validation.
    stop = timeit.default_timer()
    time = stop - start
    print("Time Runing:", time)
    print()

Number of Hidden Layers: 1000

0.9960131738602878

Architecture & Design         0.996013
Prints & Illustrated Books    0.003987
Name: Department, dtype: float64

[0.995671   0.98700173 0.99566724 0.99653079 0.94015611]

0.9830053753740137

Time Runing: 117.35108145299999

Number of Hidden Layers: 1200

0.9960131738602878

Architecture & Design         0.996013
Prints & Illustrated Books    0.003987
Name: Department, dtype: float64

[0.995671   0.99566724 0.99566724 0.99653079 0.99653079]

0.9960134125793448

Time Runing: 117.87119657400001



### Random Forest Model

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Time the 5-fold cross-validation of a 100-tree random forest.
# NOTE(review): unlike the MLP cell, the timer here does not include a separate
# fit and score on the full data, so the two run times cover different work.
start = timeit.default_timer()
RF = RandomForestClassifier(n_estimators=100, random_state=0)

cross_score = cross_val_score(RF, X, Y, cv=5)
print(cross_score, end='\n\n')

# Mean of the per-fold scores.
score = cross_score.mean()
print(score, end='\n\n')

stop = timeit.default_timer()
time = stop - start
print("Time Runing:", time)

[0.9982684  0.9982669  0.99913345 1.         1.        ]

0.9991337489777697

Time Runing: 1.8584027530000071


## Summary

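### As the results of the two models show, the random forest model not only takes less time to run but also predicts more accurately than the MLP model.

Note, however, that 99.6% of the samples belong to a single department, so both cross-validated accuracies (about 0.983–0.996 for the MLP and 0.999 for the random forest) are close to what always predicting the majority class would achieve.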