In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [3]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

# Drop missing data.
artworks = artworks.dropna()
artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1


__Goal of this Model:__ See if we can predict which department a piece of artwork is supposed to go to, using the department as the Y (of course).

In [4]:
#The DateAcquired column is an object. 
# Let's transform that to a datetime object and add a feature for just the year the artwork was acquired.

artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [5]:
# miscellaneous cleaning

# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], axis=1)

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = artworks.Department

Now that the data cleaning is done:

### DRILL GOAL:

Modify the parameters of the MLP classifier and experiment with different hidden layer structures. You can try this on a subset of the data to improve runtime. See how things vary. See what seems to matter the most. Feel free to manipulate other parameters as well. It may also be beneficial to do some real feature selection work...

In [6]:
# Thinkful's parameters - DON'T RUN! 
# I'm throwing in an undefined variable just to make sure it doesn't run.
# initial score with following parameters = 68%
asd
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, Y)

NameError: name 'asd' is not defined

In [9]:
# Now for my own parameters:

from sklearn.neural_network import MLPClassifier

# 1st Model: going to add multiple, smaller layers, dropping max_iter for runtime
# 2nd Model: same as 1st Model (except tweaking layers) PLUS increasing alpha (L2 regularization)
# 3rd Model: exact same as 2nd Model but going to do some feature work

# for all, I'm doing early stopping. My computer is a weakling... :(

mlp1 = MLPClassifier(hidden_layer_sizes=(50,4), 
                     alpha=0.005, 
                     max_iter=150, 
                     early_stopping=True)
mlp1.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.005, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(50, 4), learning_rate='constant',
       learning_rate_init=0.001, max_iter=150, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [10]:
# significantly faster
mlp1.score(X,Y)

0.7388297872340426

In [11]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp1, X, Y, cv=4)

array([0.62185062, 0.62185062, 0.62189705, 0.62189705])

In [12]:
# second model
mlp2 = MLPClassifier(hidden_layer_sizes=(100,3), 
                     alpha=0.015, 
                     max_iter=150, 
                     early_stopping=True)
mlp2.fit(X, Y)
mlp2.score(X,Y)

0.6218738335199702

In [13]:
cross_val_score(mlp2, X, Y, cv=4)

array([0.62185062, 0.62185062, 0.62189705, 0.62189705])

In [14]:
# third model - smaller alpha and larger layers
mlp3 = MLPClassifier(hidden_layer_sizes=(250,2), 
                     alpha=0.0005, 
                     max_iter=150, 
                     early_stopping=True)
mlp3.fit(X, Y)
mlp3.score(X,Y)

0.6218738335199702

In [15]:
cross_val_score(mlp3, X, Y, cv=4)

array([0.62185062, 0.62185062, 0.62189705, 0.62189705])

In [16]:
# fourth model: larger layers, up max_iter (although I don't know if early stopping will block that)
mlp4 = MLPClassifier(hidden_layer_sizes=(500,2), 
                     max_iter=200, 
                     early_stopping=True)
mlp4.fit(X, Y)
mlp4.score(X,Y)

0.6218738335199702

In [23]:
cross_val_score(mlp4, X, Y, cv=4)

array([0.62185062, 0.62185062, 0.62189705, 0.62189705])

In [17]:
X.head()

Unnamed: 0,URL,ThumbnailURL,Height (cm),Width (cm),YearAcquired,Gender_(),Gender_(Female),Gender_(Male),Gender_(male),Gender_\(multiple_persons\),...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,True,True,48.6,168.9,1996,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,True,True,40.6401,29.8451,1995,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,True,True,34.3,31.8,1997,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,True,True,50.8,50.8,1995,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,True,True,38.4,19.1,1997,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# fifth model - new X - dropping dummy columns but keeping gender:
X2 = X.iloc[:, :11]

In [22]:
X2.head()

Unnamed: 0,URL,ThumbnailURL,Height (cm),Width (cm),YearAcquired,Gender_(),Gender_(Female),Gender_(Male),Gender_(male),Gender_\(multiple_persons\),()
0,True,True,48.6,168.9,1996,0,0,1,0,0,0
1,True,True,40.6401,29.8451,1995,0,0,1,0,0,0
2,True,True,34.3,31.8,1997,0,0,1,0,0,0
3,True,True,50.8,50.8,1995,0,0,1,0,0,1
4,True,True,38.4,19.1,1997,0,0,1,0,0,0


In [25]:
# for the grand finale - modifying parameters and 
# TAKING AWAY EARLY STOPPING - GASP!

mlp5 = MLPClassifier(hidden_layer_sizes=(1000,4), 
                     max_iter=200,
                     alpha=0.001)
mlp5.fit(X2, Y)
mlp5.score(X2,Y)

0.6218738335199702

In [26]:
mlp6 = MLPClassifier(hidden_layer_sizes=(1000,3), 
                     max_iter=175,
                     alpha=0.00005,
                     early_stopping=True)
cross_val_score(mlp6, X2, Y, cv=4)

array([0.62185062, 0.62185062, 0.62189705, 0.62189705])

In [40]:
from sklearn.preprocessing import KBinsDiscretizer
discrete = KBinsDiscretizer(n_bins=20, encode='onehot') # trying to split it up as close as possible by the decade

X['YearAcq Reshaped'] = np.array(X['YearAcquired']).reshape(-1,1)

In [50]:
X['YearAcq Reshaped'] = np.array(X['YearAcq Reshaped'])
X['YearAcq Reshaped'][:5]

0    1996
1    1995
2    1997
3    1995
4    1997
Name: YearAcq Reshaped, dtype: int64

In [52]:
X3 = discrete.fit_transform(X2)

In [57]:
type(X3)

scipy.sparse.csr.csr_matrix

In [75]:
# doing discretization by hand
# wanted to do it by decade but I guess this works too

XCentury = []

for row in X['YearAcquired']:
    if row < 1950:
        XCentury.append("Before 1950")
    elif ((row >= 1950) & (row < 1960)):
        XCentury.append("1950s")
    elif ((row >= 1960) & (row < 1965)):
        XCentury.append("1960-65")
    elif ((row >= 1965) & (row < 1970)):
        XCentury.append("1965-70")
    elif ((row >= 1970) & (row < 1980)):
        XCentury.append("1970s")
    elif ((row >= 1980) & (row < 1990)):
        XCentury.append("1980s")
    elif ((row >= 1990) & (row < 2000)):
        XCentury.append("1990s")
    elif ((row >= 2000) & (row < 2005)):
        XCentury.append("2000-05")
    elif ((row >= 2005) & (row < 2010)):
        XCentury.append("2005-10")
    elif ((row >= 2010) & (row < 2015)):
        XCentury.append("2010-15")
    else:
        XCentury.append("2015-Present")


In [80]:
X['Century'] = XCentury

In [92]:
X.tail()

Unnamed: 0,URL,ThumbnailURL,Height (cm),Width (cm),YearAcquired,Gender_(),Gender_(Female),Gender_(Male),Gender_(male),Gender_\(multiple_persons\),...,2012,2013,2014,2015,2016,2017,2018,YearAcq Reshaped,Century,Date
136749,False,False,0.0,0.0,2008,0,0,1,0,0,...,0,0,0,0,0,0,0,2008,2005-10,1962.0
136750,False,False,0.0,0.0,2008,0,0,1,0,0,...,0,0,0,0,0,0,0,2008,2005-10,1962.0
136751,False,False,0.0,0.0,2008,0,0,1,0,0,...,0,0,0,0,0,0,0,2008,2005-10,1962.0
136752,False,False,0.0,0.0,2008,0,0,1,0,0,...,0,0,0,0,0,0,0,2008,2005-10,1962.0
136753,False,False,0.0,0.0,2008,0,0,1,0,0,...,0,0,0,0,0,0,0,2008,2005-10,


In [84]:
# adding this back in there
X['Date'] = artworks.Date

In [86]:
X.Date.value_counts()

1967    2215
1969    2105
1968    2047
1965    2033
1966    2013
1971    1902
1970    1794
1964    1670
1930    1615
1962    1578
1963    1570
1973    1532
2003    1357
1972    1350
1948    1275
1928    1250
1931    1243
1938    1218
2001    1150
1926    1143
2002    1138
1980    1123
1947    1113
1920    1112
1976    1097
1961    1094
1927    1078
1974    1068
1950    1066
1975    1056
        ... 
1840      22
1879      20
1886      20
1825      18
1816      18
1878      15
1851      14
1844      12
2018      10
1845       9
1841       9
1882       9
1837       9
1847       4
1768       2
1832       2
1839       2
1846       2
1838       1
1811       1
1828       1
1848       1
1808       1
1799       1
1501       1
1786       1
1805       1
1809       1
1842       1
1800       1
Name: Date, Length: 198, dtype: int64

In [87]:
X4 = X[X['Height (cm)'] != 0]
X4 = X[X['Width (cm)'] != 0]

In [90]:
X5 = X4.iloc[:, :11]
X5['Century'] = X4['Century']

In [91]:
X5.rename(columns=({'Century':'Decade Acquired'}), inplace=True)

In [93]:
X5['Date'] = X4.Date

In [94]:
X5.dropna(inplace=True)

In [100]:
# tried to run mlp at this point - wouldn't take in strings
# will have to one-hot encode my Decade Acquired section

In [96]:
# Now I think we're ready to do some work
# Final mlp configuration

mlp = MLPClassifier(hidden_layer_sizes=(500,4), 
                     alpha=0.005, 
                     max_iter=200, 
                     early_stopping=True)
mlp.fit(X5, Y)
cross_val_score(mlp, X5, Y, cv=5)

ValueError: could not convert string to float: '1990s'

In [98]:
# got error
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore')

encoder.fit(X5)

OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=True)

In [102]:
# dropping decade acquired because it's the wrong data type
X5.drop(['Decade Acquired'], axis=1, inplace=True)

In [103]:
# now to finally do it
mlp = MLPClassifier(hidden_layer_sizes=(500,4), 
                     alpha=0.005, 
                     max_iter=200, 
                     early_stopping=True)
mlp.fit(X5, Y)
cross_val_score(mlp, X5, Y, cv=5)

ValueError: Found input variables with inconsistent numbers of samples: [105481, 107160]

In [106]:
artworks['Date'] = X5['Date']

In [110]:
# now have to fix lengths, which is frustrating

print(artworks.count())
print(X5.count())

Artist          107160
Nationality     107160
Gender          107160
Date            105481
Department      107160
DateAcquired    107160
URL             107160
ThumbnailURL    107160
Height (cm)     107160
Width (cm)      107160
YearAcquired    107160
dtype: int64
URL                            105481
ThumbnailURL                   105481
Height (cm)                    105481
Width (cm)                     105481
YearAcquired                   105481
Gender_()                      105481
Gender_(Female)                105481
Gender_(Male)                  105481
Gender_(male)                  105481
Gender_\(multiple_persons\)    105481
()                             105481
Date                           105481
dtype: int64


In [111]:
artworksdrop = artworks.dropna()

In [112]:
Ydrop = artworksdrop.Department

In [113]:
print(X5.count())
len(Ydrop)

URL                            105481
ThumbnailURL                   105481
Height (cm)                    105481
Width (cm)                     105481
YearAcquired                   105481
Gender_()                      105481
Gender_(Female)                105481
Gender_(Male)                  105481
Gender_(male)                  105481
Gender_\(multiple_persons\)    105481
()                             105481
Date                           105481
dtype: int64


105481

In [114]:
# okay, NOWWWWWWW we can do this

mlp = MLPClassifier(hidden_layer_sizes=(500,4), 
                     alpha=0.005, 
                     max_iter=200, 
                     early_stopping=True)
mlp.fit(X5, Ydrop)
cross_val_score(mlp, X5, Ydrop, cv=5)

array([0.62280785, 0.62280785, 0.62283737, 0.62284902, 0.62290807])