### MOMA dataset!

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
collection = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artists.csv')

In [3]:
collection.columns

Index(['ConstituentID', 'DisplayName', 'ArtistBio', 'Nationality', 'Gender',
       'BeginDate', 'EndDate', 'Wiki QID', 'ULAN'],
      dtype='object')

In [4]:
collection.describe()

Unnamed: 0,ConstituentID,BeginDate,EndDate,ULAN
count,15845.0,15845.0,15845.0,2941.0
mean,20839.550458,1469.513474,631.260208,500074600.0
std,19982.433157,824.393125,921.665576,86756.1
min,1.0,0.0,0.0,500000000.0
25%,4416.0,1832.0,0.0,500017600.0
50%,9693.0,1923.0,0.0,500033100.0
75%,35549.0,1949.0,1956.0,500114600.0
max,131789.0,2017.0,2019.0,500356600.0


In [5]:
# Select Columns.
collection = collection[['ConstituentID', 'DisplayName', 'ArtistBio', 'Nationality', 'Gender',
       'BeginDate', 'EndDate', 'Wiki QID', 'ULAN']]





# Drop missing data.
collection = collection.dropna()

In [6]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(collection['Nationality'])

LabelEncoder()

In [7]:
collection["country_encoded"] = le.transform(collection.Nationality)


In [8]:
collection.head()

Unnamed: 0,ConstituentID,DisplayName,ArtistBio,Nationality,Gender,BeginDate,EndDate,Wiki QID,ULAN,country_encoded
3,4,Charles Arnoldi,"American, born 1946",American,Male,1946,0,Q1063584,500027998.0,1
7,9,David Aronson,"American, born Lithuania 1923",American,Male,1923,0,Q5230870,500003363.0,1
8,10,Irene Aronson,"American, born Germany 1918",American,Female,1918,0,Q19748568,500042413.0,1
9,11,Jean (Hans) Arp,"French, born Germany (Alsace). 1886–1966",French,Male,1886,1966,Q153739,500031000.0,25
15,19,Richard Artschwager,"American, 1923–2013",American,Male,1923,2013,Q568262,500114981.0,1


In [9]:
# Get data types.
collection.dtypes

ConstituentID        int64
DisplayName         object
ArtistBio           object
Nationality         object
Gender              object
BeginDate            int64
EndDate              int64
Wiki QID            object
ULAN               float64
country_encoded      int64
dtype: object

In [10]:


# Final column drops and NA drop.
X = collection.drop(['Wiki QID', 'DisplayName', 'ArtistBio', 'Nationality', 'Gender'], 1)

# Create dummies separately.
gender = pd.get_dummies(collection.Gender)


# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, gender], axis=1)

Y = collection.Nationality

In [11]:
from sklearn.model_selection import train_test_split


X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.9,
    random_state=42)

In [12]:
# Alright! We've done our prep, let's build the model.
# Neural networks are hugely computationally intensive.
# This may take several minutes to run.

# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [13]:
Y.value_counts()/len(Y)

American      0.421016
French        0.097425
British       0.096729
German        0.093946
Italian       0.043145
                ...   
Albanian      0.000348
Namibian      0.000348
Georgian      0.000348
Ecuadorian    0.000348
Ghanaian      0.000348
Name: Nationality, Length: 72, dtype: float64

In [14]:
Y.value_counts()/len(Y)

American      0.421016
French        0.097425
British       0.096729
German        0.093946
Italian       0.043145
                ...   
Albanian      0.000348
Namibian      0.000348
Georgian      0.000348
Ecuadorian    0.000348
Ghanaian      0.000348
Name: Nationality, Length: 72, dtype: float64

In [15]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)



array([0.39477977, 0.41367521, 0.4245614 , 0.01798561, 0.44      ])

In [16]:
mlp.score(X, Y)

0.011482254697286013

In [20]:
# Alright! We've done our prep, let's build the model.
# Neural networks are hugely computationally intensive.
# This may take several minutes to run.

# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(2000,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(2000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [21]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)



array([0.04078303, 0.41367521, 0.09473684, 0.4352518 , 0.10181818])

In [22]:
mlp.score(X, Y)

0.09394572025052192

### Ok so we got an MLP score of .0125, now let's comparing with Random Forest 

In [17]:
from sklearn.model_selection import train_test_split


In [19]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier


# Final column drops and NA drop.
X = collection.drop(['Wiki QID', 'DisplayName', 'ArtistBio', 'Nationality', 'Gender'], 1)



# Create dummies separately.
gender = pd.get_dummies(collection.Gender)
nationality = pd.get_dummies(collection.Nationality)

randomforest = ensemble.RandomForestClassifier()
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, gender], axis=1)

Y = collection["country_encoded"]

randomforest.fit(X_train, Y_train)
randomforest.score(X_test, Y_test)
cross_val_score(randomforest, X, Y, cv=5)



array([0.67862969, 0.84786325, 0.84561404, 0.86151079, 0.67636364])

### Here I see that the random forest is performing almost twice as well as our neural network. Seeing as I only have about 2500 rows, this might not be enough data for the neural network to be properly trained which is why the random forest is performing much better. 