# Jonathan Halverson
# Monday, November 6, 2017
# Scitkit-Learn MLP versus Tensorflow

The MLPClassifier is a fully connected neural network. It offers choices of the activation function as well as L2 regularization. There are a large number of parameters to optimize.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('halverson')

In [2]:
from sklearn.datasets import load_digits

In [3]:
dir(load_digits())

['DESCR', 'data', 'images', 'target', 'target_names']

In [4]:
X, y = load_digits().data, load_digits().target

In [5]:
X.shape

(1797, 64)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
X_train[0]

array([  0.,   0.,   0.,   1.,  15.,   2.,   0.,   0.,   0.,   0.,   0.,
         5.,  15.,   0.,   4.,   0.,   0.,   0.,   0.,  13.,   8.,   1.,
        16.,   3.,   0.,   0.,   5.,  15.,   2.,   5.,  15.,   0.,   0.,
         5.,  15.,  16.,  16.,  16.,   8.,   0.,   0.,  14.,  12.,  12.,
        14.,  16.,   2.,   0.,   0.,   0.,   0.,   0.,  12.,  12.,   0.,
         0.,   0.,   0.,   0.,   2.,  16.,   5.,   0.,   0.])

In [8]:
y_train

array([4, 7, 1, ..., 7, 0, 3])

In [9]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

std_sc = StandardScaler()
X_train_std = std_sc.fit_transform(X_train).astype(np.float32)
X_test_std = std_sc.transform(X_test).astype(np.float32)
mm_sc = MinMaxScaler()
X_train_mm = mm_sc.fit_transform(X_train).astype(np.float32)
X_test_mm = mm_sc.transform(X_test).astype(np.float32)

In [10]:
X_train_mm.min(), X_train_mm.max()

(0.0, 1.0)

### Classification

#### Scikit-Learn

In [11]:
from sklearn.neural_network import MLPClassifier

In [12]:
# setting early_stopping to True made the model worse
# setting alpha to 1000 produces poor results (too much regularization)
# note that alpha is the prefactor for L2 regularization of the neuron weights
mlp_clf = MLPClassifier(activation='relu', batch_size=200, hidden_layer_sizes=(100, 50),
                        learning_rate='constant', early_stopping=False, alpha=0.0, max_iter=400)
mlp_clf.fit(X_train_mm, y_train)

MLPClassifier(activation='relu', alpha=0.0, batch_size=200, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=400, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [13]:
from sklearn.metrics import accuracy_score

accuracy_score(mlp_clf.predict(X_train_mm), y_train)

1.0

In [14]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(mlp_clf, X_train_mm, y_train, cv=10)
scores.min(), scores.mean(), scores.max()

(0.95104895104895104, 0.97217820543583344, 0.99300699300699302)

Lastly, we compute the accuracy on the test set:

In [15]:
accuracy_score(mlp_clf.predict(X_test_mm), y_test)

0.98333333333333328

#### Tensorflow

In [16]:
import tensorflow as tf

In [17]:
n_inputs = 64
n_hidden1 = 100
n_hidden2 = 50
n_outputs = 10
learning_rate = 0.01

In [18]:
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")

Define the computation graph:

In [19]:
with tf.name_scope('dnn'):
     hidden1 = tf.layers.dense(X, n_hidden1, name='hidden1', activation=tf.nn.relu)
     hidden2 = tf.layers.dense(hidden1, n_hidden2, name='hidden2', activation=tf.nn.relu)
     logits  = tf.layers.dense(hidden2, n_outputs, name='outputs')

In [20]:
with tf.name_scope('loss'):
     xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
     loss = tf.reduce_mean(xentropy, name='loss')
with tf.name_scope('train'):
     optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
     training_op = optimizer.minimize(loss)
with tf.name_scope('eval'):
     correct = tf.nn.in_top_k(logits, y, 1)
     accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()

In [21]:
def fetch_batch(batch_size):
     indices = np.random.choice(range(X_train_mm.shape[0]), size=batch_size, replace=False)
     return X_train_mm[indices], y_train[indices]

In [22]:
n_epochs = 2000
batch_size = 200
with tf.Session() as sess:
     init.run()
     for epoch in xrange(n_epochs + 1):
          for iteration in xrange(X_train_mm.shape[0] // batch_size):
               X_batch, y_batch = fetch_batch(batch_size)
               sess.run(training_op, feed_dict={X:X_batch, y:y_batch})
          acc_train = accuracy.eval(feed_dict={X:X_batch, y:y_batch})
          acc_test = accuracy.eval(feed_dict={X:X_test_mm, y:y_test})
          if (epoch % 100 == 0): print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)

(0, 'Train accuracy:', 0.1, 'Test accuracy:', 0.097222224)
(100, 'Train accuracy:', 0.88499999, 'Test accuracy:', 0.89166665)
(200, 'Train accuracy:', 0.94999999, 'Test accuracy:', 0.9222222)
(300, 'Train accuracy:', 0.95499998, 'Test accuracy:', 0.94722223)
(400, 'Train accuracy:', 0.98500001, 'Test accuracy:', 0.96111113)
(500, 'Train accuracy:', 0.94999999, 'Test accuracy:', 0.96388888)
(600, 'Train accuracy:', 0.98500001, 'Test accuracy:', 0.96666664)
(700, 'Train accuracy:', 0.99000001, 'Test accuracy:', 0.96944445)
(800, 'Train accuracy:', 0.98500001, 'Test accuracy:', 0.97500002)
(900, 'Train accuracy:', 0.99000001, 'Test accuracy:', 0.97500002)
(1000, 'Train accuracy:', 0.98500001, 'Test accuracy:', 0.97500002)
(1100, 'Train accuracy:', 0.98500001, 'Test accuracy:', 0.97777778)
(1200, 'Train accuracy:', 0.99000001, 'Test accuracy:', 0.96944445)
(1300, 'Train accuracy:', 1.0, 'Test accuracy:', 0.96944445)
(1400, 'Train accuracy:', 0.995, 'Test accuracy:', 0.96944445)
(1500, 'Tra

### Regression

In [23]:
import pandas as pd

hs = pd.read_csv('../machine_learning/geron_housing/housing.csv', header=0)
hs.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [24]:
hs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [25]:
hs.drop('ocean_proximity', axis=1, inplace=True)
hs.dropna(inplace=True)

In [26]:
hs.shape

(20433, 9)

In [27]:
y = hs.median_house_value.values
X = hs.drop('median_house_value', axis=1).values

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [29]:
std_hs_X = StandardScaler()
X_train_std = std_hs_X.fit_transform(X_train).astype(np.float32)
X_test_std = std_hs_X.transform(X_test).astype(np.float32)
std_hs_y = StandardScaler()
y_train_std = std_hs_y.fit_transform(y_train.reshape(-1, 1)).ravel().astype(np.float32)
y_test_std = std_hs_y.transform(y_test.reshape(-1, 1)).ravel().astype(np.float32)

In [30]:
X_train_std[0]

array([ 0.94581938, -1.00267756, -0.92998344, -0.18430434, -0.39382353,
       -0.31464478, -0.29997829,  0.19951358], dtype=float32)

In [31]:
y_train_std

array([ 0.05905237,  0.15671499,  1.82043672, ..., -0.27887762,
       -0.0403388 ,  2.06156826], dtype=float32)

#### Scikit-Learn

In [32]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

##### RF

In [33]:
rf_reg = RandomForestRegressor(n_estimators=100)
rf_reg.fit(X_train_std, y_train_std)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [34]:
mean_squared_error(y_train_std, rf_reg.predict(X_train_std)), r2_score(y_train_std, rf_reg.predict(X_train_std))

(0.024451167197697809, 0.97554883263635039)

In [35]:
scores = -cross_val_score(rf_reg, X_train_std, y_train_std, cv=5, scoring='neg_mean_squared_error')
scores.min(), scores.mean(), scores.max()

(0.1690171062111393, 0.18287603561513902, 0.19139185462305963)

In [36]:
scores = cross_val_score(rf_reg, X_train_std, y_train_std, cv=5, scoring='r2')
scores.min(), scores.mean(), scores.max()

(0.80511180395336557, 0.81606894228270899, 0.83382605177382008)

##### MLP

In [37]:
from sklearn.neural_network import MLPRegressor

mlp_reg = MLPRegressor(activation='relu', batch_size=200, hidden_layer_sizes=(100, 50),
                       learning_rate='constant', early_stopping=False, alpha=0.0, max_iter=400)

In [38]:
mlp_reg.fit(X_train_std, y_train_std)

MLPRegressor(activation='relu', alpha=0.0, batch_size=200, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=400, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [39]:
mean_squared_error(y_train_std, mlp_reg.predict(X_train_std)), r2_score(y_train_std, mlp_reg.predict(X_train_std))

(0.1896118823718608, 0.81038811634122987)

##### Tensorflow with batch approach

In [40]:
tf.reset_default_graph()

In [41]:
n_inputs = 8
n_hidden1 = 100
n_hidden2 = 50
n_outputs = 1
learning_rate = 0.01

In [42]:
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.float32, shape=(None), name="y")

In [43]:
he_init = tf.contrib.layers.variance_scaling_initializer()

In [44]:
hidden1 = tf.layers.dense(X, n_hidden1, name='hidden1', activation=tf.nn.elu, kernel_initializer=he_init)
hidden2 = tf.layers.dense(hidden1, n_hidden2, name='hidden2', activation=tf.nn.elu, kernel_initializer=he_init)
output  = tf.layers.dense(inputs=hidden2, units=n_outputs, name='outputs', activation=None)
y_pred = tf.squeeze(output, 1)

In [45]:
mse = tf.losses.mean_squared_error(y, y_pred)
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9, use_nesterov=True)
training_op = optimizer.minimize(mse)
init = tf.global_variables_initializer()

In [46]:
n_epochs = 2000
with tf.Session() as sess:
     init.run()
     for epoch in range(n_epochs + 1):
          sess.run(training_op, feed_dict={X:X_train_std, y:y_train_std})
          if (epoch % 200 == 0):
               mse_train = mse.eval(feed_dict={X:X_train_std, y:y_train_std})
               mse_test = mse.eval(feed_dict={X:X_test_std, y:y_test_std})
               print(epoch, "Train mse:", mse_train, "Test mse:", mse_test)

(0, 'Train mse:', 14.490368, 'Test mse:', 15.237665)
(200, 'Train mse:', 0.2566714, 'Test mse:', 0.26099709)
(400, 'Train mse:', 0.24329495, 'Test mse:', 0.24770327)
(600, 'Train mse:', 0.23506688, 'Test mse:', 0.24013588)
(800, 'Train mse:', 0.22917186, 'Test mse:', 0.23519807)
(1000, 'Train mse:', 0.22470091, 'Test mse:', 0.23158254)
(1200, 'Train mse:', 0.22110449, 'Test mse:', 0.22842504)
(1400, 'Train mse:', 0.21808864, 'Test mse:', 0.2256929)
(1600, 'Train mse:', 0.21550678, 'Test mse:', 0.22333924)
(1800, 'Train mse:', 0.21321961, 'Test mse:', 0.22125718)


#### Tensorflow with mini-batch approach

In [47]:
tf.reset_default_graph()

In [48]:
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.float32, shape=(None), name="y")

In [49]:
with tf.name_scope('dnn'):
     hidden1 = tf.layers.dense(X, n_hidden1, name='hidden1', activation=tf.nn.relu)
     hidden2 = tf.layers.dense(hidden1, n_hidden2, name='hidden2', activation=tf.nn.relu)
     output  = tf.layers.dense(inputs=hidden2, units=n_outputs, name='outputs', activation=None)
     y_pred  = tf.squeeze(output, 1)

In [50]:
with tf.name_scope('mse'):
     mse = tf.losses.mean_squared_error(y, y_pred)
     r2 = 1.0 - tf.reduce_mean(tf.square(y - y_pred)) / tf.reduce_mean(tf.square(y - tf.reduce_mean(y)))
with tf.name_scope('train'):
     optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9, use_nesterov=True)
     training_op = optimizer.minimize(mse)
init = tf.global_variables_initializer()

In [51]:
def fetch_batch2(batch_size_):
     indices = np.random.choice(range(X_train_std.shape[0]), size=batch_size_, replace=False)
     return X_train_std[indices], y_train_std[indices]

In [52]:
n_epochs = 1000
batch_size = 200
with tf.Session() as sess:
     init.run()
     for epoch in xrange(n_epochs + 1):
          for iteration in xrange(X_train_std.shape[0] // batch_size):
               X_batch, y_batch = fetch_batch2(batch_size)
               sess.run(training_op, feed_dict={X:X_batch, y:y_batch})
          mse_train = mse.eval(feed_dict={X:X_batch, y:y_batch})
          mse_test = mse.eval(feed_dict={X:X_test_std, y:y_test_std})
          r2_test = r2.eval(feed_dict={X:X_test_std, y:y_test_std})
          if (epoch % 100 == 0):
               print(epoch, "Train mse:", mse_train, "Test mse:", mse_test, "R2:", r2_test)

(0, 'Train mse:', 0.29302537, 'Test mse:', 0.31087714, 'R2:', 0.68134558)
(100, 'Train mse:', 0.15288311, 'Test mse:', 0.20172754, 'R2:', 0.79322577)
(200, 'Train mse:', 0.1477693, 'Test mse:', 0.19711611, 'R2:', 0.79795259)
(300, 'Train mse:', 0.12728213, 'Test mse:', 0.19776991, 'R2:', 0.7972824)
(400, 'Train mse:', 0.12178169, 'Test mse:', 0.20159271, 'R2:', 0.79336399)
(500, 'Train mse:', 0.17457905, 'Test mse:', 0.19567171, 'R2:', 0.79943311)
(600, 'Train mse:', 0.09048447, 'Test mse:', 0.20376097, 'R2:', 0.79114151)
(700, 'Train mse:', 0.090534143, 'Test mse:', 0.20151956, 'R2:', 0.79343897)
(800, 'Train mse:', 0.12815505, 'Test mse:', 0.20259689, 'R2:', 0.79233468)
(900, 'Train mse:', 0.10429945, 'Test mse:', 0.20377927, 'R2:', 0.79112273)
(1000, 'Train mse:', 0.11579258, 'Test mse:', 0.20178998, 'R2:', 0.79316175)


##### Tensorflow from contrib

In [53]:
# use tf.estimator.DNNRegressor with version 1.4
feature_cols = tf.contrib.learn.infer_real_valued_columns_from_input(X_train_std)
dnn_reg = tf.contrib.learn.DNNRegressor(hidden_units=[100, 50], feature_columns=feature_cols)
dnn_reg = tf.contrib.learn.SKCompat(dnn_reg)
dnn_reg.fit(X_train_std, y_train_std, batch_size=200, steps=2000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': None, '_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_tf_random_seed': None, '_task_type': None, '_environment': 'local', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12dc81bd0>, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_num_worker_replicas': 0, '_task_id': 0, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_evaluation_master': '', '_keep_checkpoint_every_n_hours': 10000, '_master': ''}
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving

SKCompat()

In [54]:
y_pred = dnn_reg.predict(X_train_std)

INFO:tensorflow:Restoring parameters from /var/folders/tz/vmnmfncn05xfhb89h953_8_w0000gn/T/tmpsSRLgM/model.ckpt-2000


In [55]:
mean_squared_error(y_train_std, y_pred['scores']), r2_score(y_train_std, y_pred['scores'])

(0.21761793, 0.78238207029449358)

And on the test set:

In [56]:
y_pred_test = dnn_reg.predict(X_test_std)
mean_squared_error(y_test_std, y_pred_test['scores']), r2_score(y_test_std, y_pred_test['scores'])

INFO:tensorflow:Restoring parameters from /var/folders/tz/vmnmfncn05xfhb89h953_8_w0000gn/T/tmpsSRLgM/model.ckpt-2000


(0.22843881, 0.76584622898517685)