In [0]:
import numpy as np         # linear algebra
import sklearn as sk       # machine learning
import pandas as pd        # reading in data files, data cleaning
import matplotlib.pyplot as plt   # for plotting
import seaborn as sns      # visualization tool

file_id = 'letter-recognition.data'
link = 'https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/{FILE_ID}'
csv_url = link.format(FILE_ID = file_id)

data = pd.read_csv(csv_url)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 17 columns):
T      19999 non-null object
2      19999 non-null int64
8      19999 non-null int64
3      19999 non-null int64
5      19999 non-null int64
1      19999 non-null int64
8.1    19999 non-null int64
13     19999 non-null int64
0      19999 non-null int64
6      19999 non-null int64
6.1    19999 non-null int64
10     19999 non-null int64
8.2    19999 non-null int64
0.1    19999 non-null int64
8.3    19999 non-null int64
0.2    19999 non-null int64
8.4    19999 non-null int64
dtypes: int64(16), object(1)
memory usage: 2.6+ MB


In [0]:
data.columns = ['letter', 'hori', 'vert', 'width', 'height', 
                'pixels', 'x-bar', 'y-bar', 'x2bar', 'y2bar',
                'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy',
                'y-ege', 'yegvx']
data.head()

Unnamed: 0,letter,hori,vert,width,height,pixels,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
1,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
2,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
3,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
4,S,4,11,5,8,3,8,8,6,9,5,6,6,0,8,9,7


In [0]:
def z_trans(data):
  return (data - data.mean()) / data.std()

cols = [i for i in data.columns if i not in ['letter']]
for col in cols:
  data[col] = pd.to_numeric(data[col])
  data[col] = z_trans(data[col])
  
data.head()

Unnamed: 0,letter,hori,vert,width,height,pixels,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,I,0.510321,1.5023,-1.053306,0.719686,-0.687522,1.531267,-1.075304,0.137478,-0.495032,1.895867,-1.312733,0.51474,-0.448554,-0.219082,0.12001,1.35938
1,D,-0.012362,1.199695,0.435847,1.161881,1.138595,1.531267,-0.645213,-0.9737,0.344995,0.690314,-1.312733,-0.446489,-0.01983,-0.865595,-0.269549,0.741145
2,N,1.555687,1.199695,0.435847,0.277491,-0.230992,-0.936564,0.645062,-0.232915,0.344995,-1.720792,-0.932652,0.995354,1.266344,1.073944,-0.659109,0.122911
3,G,-1.057728,-1.826362,-1.053306,-1.933484,-1.144051,0.544134,-0.645213,0.507871,0.344995,-0.91709,-0.552571,0.51474,-0.877279,-0.865595,0.509569,1.35938
4,S,-0.012362,1.199695,-0.060537,1.161881,-0.230992,0.544134,0.21497,0.507871,1.605036,-1.318941,-0.172489,-0.927103,-1.306003,-0.219082,2.067806,-0.495323


Some general rules about the hidden layer are the following based on this paper:* Approximating Number of Hidden layer neurons in Multiple Hidden Layer BPNN Architecture* by Saurabh Karsoliya.

In general:

*   The number of hidden layer neurons are 2/3 (or 70% to 90%) of the size of the input layer.
*   The number of hidden layer neurons should be less than twice of the number of neurons in input layer.
*   The size of the hidden layer neurons is between the input layer size and the output layer size.



In [0]:
X = data.loc[:, data.columns != 'letter']
y = data['letter']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(17, 17, 17), random_state=0)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

from sklearn.metrics import accuracy_score
print('Training set R^2 =', accuracy_score(y_train, y_pred_train))
print('Test set R^2 =', accuracy_score(y_test, y_pred_test))

Training set R^2 = 0.872054503406463
Test set R^2 = 0.85325


In [0]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=100)

parameter_space = {
    'hidden_layer_sizes': [(17,17,17), (17,15,12), (17, 12, 5)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['sgd', 'adam', 'lbfgs'],
    'alpha': [0.0001, 0.001, 1e-5],
    'learning_rate': ['constant', 'adaptive', 'invscaling'],
    'random_state' : [0]
}

from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)

# Best paramete set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    
y_true, y_pred = y_test , clf.predict(X_test)

from sklearn.metrics import classification_report
print('Results on the test set:')
print(classification_report(y_true, y_pred))

Best parameters found:
 {'activation': 'tanh', 'alpha': 1e-05, 'hidden_layer_sizes': (17, 17, 17), 'learning_rate': 'constant', 'random_state': 0, 'solver': 'adam'}
0.739 (+/-0.004) for {'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (17, 17, 17), 'learning_rate': 'constant', 'random_state': 0, 'solver': 'sgd'}
0.766 (+/-0.004) for {'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (17, 17, 17), 'learning_rate': 'constant', 'random_state': 0, 'solver': 'adam'}
0.769 (+/-0.008) for {'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (17, 17, 17), 'learning_rate': 'constant', 'random_state': 0, 'solver': 'lbfgs'}
0.739 (+/-0.004) for {'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (17, 17, 17), 'learning_rate': 'adaptive', 'random_state': 0, 'solver': 'sgd'}
0.766 (+/-0.004) for {'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (17, 17, 17), 'learning_rate': 'adaptive', 'random_state': 0, 'solver': 'adam'}
0



In [0]:
clf = MLPClassifier(activation='tanh', alpha=1e-05, hidden_layer_sizes=(17, 17, 17), 
                    learning_rate='constant', random_state=0, solver='adam', max_iter=1000)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

print('Training set R^2 =', accuracy_score(y_train, y_pred_train))
print('Test set R^2 =', accuracy_score(y_test, y_pred_test))

Training set R^2 = 0.939496218513657
Test set R^2 = 0.91075


In [0]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=1000)

parameter_space = {
    'hidden_layer_sizes': [(17,17,17)],
    'activation': ['tanh'],
    'solver': ['adam', 'lbfgs'],
    'alpha': [0.0001, 1e-5],
    'learning_rate': ['constant', 'adaptive', 'invscaling'],
    'random_state' : [0]
}

from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)

# Best paramete set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    
y_true, y_pred = y_test , clf.predict(X_test)

from sklearn.metrics import classification_report
print('Results on the test set:')
print(classification_report(y_true, y_pred))

Best parameters found:
 {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (17, 17, 17), 'learning_rate': 'constant', 'random_state': 0, 'solver': 'adam'}
0.891 (+/-0.014) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (17, 17, 17), 'learning_rate': 'constant', 'random_state': 0, 'solver': 'adam'}
0.877 (+/-0.012) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (17, 17, 17), 'learning_rate': 'constant', 'random_state': 0, 'solver': 'lbfgs'}
0.891 (+/-0.014) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (17, 17, 17), 'learning_rate': 'adaptive', 'random_state': 0, 'solver': 'adam'}
0.877 (+/-0.012) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (17, 17, 17), 'learning_rate': 'adaptive', 'random_state': 0, 'solver': 'lbfgs'}
0.891 (+/-0.014) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (17, 17, 17), 'learning_rate': 'invscaling', 'random_state': 0, 'solver': 'adam'}
0.877 (+/-0.012

In [0]:
clf = MLPClassifier(activation='tanh', alpha=0.0001, hidden_layer_sizes=(17, 17, 17), 
                    learning_rate='constant', random_state=0, solver='adam', max_iter=1000)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

print('Training set R^2 =', accuracy_score(y_train, y_pred_train))
print('Test set R^2 =', accuracy_score(y_test, y_pred_test))

Training set R^2 = 0.9395587224201513
Test set R^2 = 0.9105
