# Analysis of Classification Models in Letter Recognition
### by Joshua Gabella
_______________________________________________________________________________________________________________________________


Imports

In [76]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [63]:
# Load the letter-recognition samples. The UCI distribution of this file has
# no header row, so header=None keeps the first sample as data instead of
# letting pandas consume it as column names.
df = pd.read_csv("letter-recognition.data", header=None, index_col=False)
data = df.to_numpy()

# One subset per binary task: keep only the rows whose label (column 0) is
# one of the two letters being compared.
HKdata = data[np.isin(data[:, 0], ['H', 'K']), :]
MYdata = data[np.isin(data[:, 0], ['M', 'Y']), :]
DOdata = data[np.isin(data[:, 0], ['D', 'O']), :]

## H and K Analysis

In [64]:
# Hold out 10% of the H/K rows; random_state=0 pins the shuffle.
training_data, test_data = train_test_split(
    HKdata,
    test_size=0.1,
    random_state=0,
)
# Column 0 carries the letter label, every remaining column is a feature.
training_y, training_X = training_data[:, 0], training_data[:, 1:]
test_y, test_X = test_data[:, 0], test_data[:, 1:]

### Kernel SVM - H/K

In [65]:
# Sweep the RBF-SVM penalty C with 5-fold cross-validation on the training
# split and report the mean and spread of the fold scores for each value.
cs = [0.1, 1, 3, 5, 10]
scores = {
    param_value: cross_val_score(
        svm.SVC(kernel='rbf', C=param_value), training_X, training_y, cv=5
    )
    for param_value in cs
}
# Summarise each C once and reuse the summaries when printing, rather than
# recomputing the mean/std inside the loop.
means = {param_value: np.mean(fold_scores) for param_value, fold_scores in scores.items()}
stdevs = {param_value: np.std(fold_scores) for param_value, fold_scores in scores.items()}
for i in scores:
    print(f"C value: {i} --> Mean Score: {means[i]}, Standard Deviation: {stdevs[i]}")

C value: 0.1 --> Mean Score: 0.9222641509433963, Standard Deviation: 0.012309061456830254
C value: 1 --> Mean Score: 0.9607547169811321, Standard Deviation: 0.010566037735849054
C value: 3 --> Mean Score: 0.969811320754717, Standard Deviation: 0.007915538476755858
C value: 5 --> Mean Score: 0.9743396226415093, Standard Deviation: 0.007317252614968042
C value: 10 --> Mean Score: 0.9743396226415093, Standard Deviation: 0.005546014511961911


In [66]:
# Exhaustive 5-fold grid search over SVC penalty, kernel and gamma (H/K).
clf = GridSearchCV(
    estimator=svm.SVC(),
    param_grid={
        'C': [0.1, 1, 5, 10, 20],
        'kernel': ['rbf', 'linear', 'sigmoid'],
        'gamma': ['auto', 'scale'],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores.
param_SVM_results = pd.DataFrame(clf.cv_results_)
trimmed_SVM_results = param_SVM_results.loc[
    :, ['param_C', 'param_kernel', 'param_gamma', 'mean_test_score', 'std_test_score']
]



In [67]:
# Rank the SVM configurations from best to worst mean fold score.
trimmed_SVM_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_score,std_test_score
12,5.0,rbf,auto,0.981887,0.003697
18,10.0,rbf,auto,0.981132,0.002387
24,20.0,rbf,auto,0.981132,0.002387
27,20.0,rbf,scale,0.978113,0.005006
6,1.0,rbf,auto,0.975849,0.003848
21,10.0,rbf,scale,0.97434,0.005546
15,5.0,rbf,scale,0.97434,0.007317
9,1.0,rbf,scale,0.960755,0.010566
25,20.0,linear,auto,0.928302,0.011194
19,10.0,linear,auto,0.928302,0.011194


---
### Decision Tree Classifier - H/K

In [68]:
# 5-fold grid search over the decision tree's split criterion and depth cap (H/K).
# NOTE(review): the tree is built without random_state, so tie-breaking
# between equally good splits may differ from run to run.
clf = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(),
    param_grid={
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 5, 10, 20, 50],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores.
param_tree_results = pd.DataFrame(clf.cv_results_)
trimmed_tree_results = param_tree_results.loc[
    :, ['param_criterion', 'param_max_depth', 'mean_test_score', 'std_test_score']
]


In [69]:
# Rank the decision-tree configurations from best to worst mean fold score.
trimmed_tree_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,param_criterion,param_max_depth,mean_test_score,std_test_score
8,entropy,20,0.952453,0.012075
9,entropy,50,0.948679,0.007395
7,entropy,10,0.944151,0.010512
2,gini,10,0.939623,0.015467
3,gini,20,0.93434,0.014828
4,gini,50,0.93434,0.010832
1,gini,5,0.896604,0.019767
6,entropy,5,0.890566,0.025705
0,gini,2,0.843774,0.019911
5,entropy,2,0.831698,0.023699


---
### Random Forest - H/K

In [74]:
# 5-fold grid search over forest size, split criterion and depth cap (H/K).
# NOTE(review): the forest is built without random_state, so the scores
# below will vary slightly between runs.
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid={
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 5, 10, 20, 50],
        'n_estimators': [10, 50, 100, 200, 500],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores.
param_forest_results = pd.DataFrame(clf.cv_results_)
trimmed_forest_results = param_forest_results.loc[
    :,
    ['param_n_estimators', 'param_criterion', 'param_max_depth', 'mean_test_score', 'std_test_score'],
]

In [75]:
# Rank the random-forest configurations from best to worst mean fold score.
trimmed_forest_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,param_n_estimators,param_criterion,param_max_depth,mean_test_score,std_test_score
48,200,entropy,50,0.973585,0.005337
42,100,entropy,20,0.973585,0.00675
24,500,gini,50,0.972075,0.006579
44,500,entropy,20,0.972075,0.005119
14,500,gini,10,0.971321,0.005648
23,200,gini,50,0.971321,0.005648
47,100,entropy,50,0.970566,0.004401
49,500,entropy,50,0.969811,0.007916
43,200,entropy,20,0.969811,0.004134
41,50,entropy,20,0.969811,0.007916


---
### K-Nearest Neighbor - H/K

In [71]:
# 5-fold grid search over the neighbour-search algorithm, the number of
# neighbours and the vote weighting (H/K).
clf = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={
        'algorithm': ['ball_tree', 'brute', 'kd_tree'],
        'n_neighbors': [1, 3, 5, 10, 20],
        'weights': ['uniform', 'distance'],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores.
param_knn_results = pd.DataFrame(clf.cv_results_)
trimmed_knn_results = param_knn_results.loc[
    :,
    ['param_n_neighbors', 'param_algorithm', 'param_weights', 'mean_test_score', 'std_test_score'],
]

In [72]:
# Rank the k-NN configurations from best to worst mean fold score.
trimmed_knn_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,param_n_neighbors,param_algorithm,param_weights,mean_test_score,std_test_score
25,5,kd_tree,distance,0.955472,0.007317
5,5,ball_tree,distance,0.954717,0.008268
15,5,brute,distance,0.953208,0.007395
21,1,kd_tree,distance,0.953208,0.006131
20,1,kd_tree,uniform,0.953208,0.006131
10,1,brute,uniform,0.953208,0.006579
11,1,brute,distance,0.953208,0.006579
24,5,kd_tree,uniform,0.951698,0.009057
23,3,kd_tree,distance,0.951698,0.010237
1,1,ball_tree,distance,0.951698,0.007317


---
### Artificial Neural Network - H/K

In [77]:
# 5-fold grid search over MLP architectures and activation functions (H/K).
# NOTE(review): MLPClassifier() is built without random_state, so the weight
# initialisation - and therefore these scores - can differ between runs.
clf = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid={
        # Each entry is a tuple of hidden-layer widths. The trailing comma in
        # (20,) matters: (20) would be the plain integer 20, not a tuple.
        'hidden_layer_sizes': [(20,), (10, 15, 20, 15, 10), (10, 20, 10)],
        'activation': ['identity', 'relu', 'logistic'],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores.
param_ann_results = pd.DataFrame(clf.cv_results_)
trimmed_ann_results = param_ann_results[
    ['param_activation', 'param_hidden_layer_sizes', 'mean_test_score', 'std_test_score']
]



In [78]:
# Rank the neural-network configurations from best to worst mean fold score.
trimmed_ann_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,param_activation,param_hidden_layer_sizes,mean_test_score,std_test_score
5,relu,"(10, 20, 10)",0.949434,0.010832
8,logistic,"(10, 20, 10)",0.943396,0.00675
4,relu,"(10, 15, 20, 15, 10)",0.939623,0.013288
3,relu,20,0.936604,0.016774
6,logistic,20,0.929057,0.01078
1,identity,"(10, 15, 20, 15, 10)",0.923019,0.00777
0,identity,20,0.923019,0.015577
2,identity,"(10, 20, 10)",0.921509,0.014596
7,logistic,"(10, 15, 20, 15, 10)",0.58717,0.174348


---
## M and Y Analysis

In [80]:
# Hold out 10% of the M/Y rows; random_state=0 pins the shuffle.
training_data, test_data = train_test_split(
    MYdata,
    test_size=0.1,
    random_state=0,
)
# Column 0 carries the letter label, every remaining column is a feature.
training_y, training_X = training_data[:, 0], training_data[:, 1:]
test_y, test_X = test_data[:, 0], test_data[:, 1:]

---
### K-Nearest Neighbor - M/Y

In [85]:
# 5-fold grid search over the neighbour-search algorithm, the number of
# neighbours and the vote weighting (M/Y).
clf = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={
        'algorithm': ['ball_tree', 'brute', 'kd_tree'],
        'n_neighbors': [1, 3, 5, 10, 20],
        'weights': ['uniform', 'distance'],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores,
# then show the configurations ranked best-first.
param_knn_results = pd.DataFrame(clf.cv_results_)
trimmed_knn_results = param_knn_results.loc[
    :,
    ['param_n_neighbors', 'param_algorithm', 'param_weights', 'mean_test_score', 'std_test_score'],
]
trimmed_knn_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,param_n_neighbors,param_algorithm,param_weights,mean_test_score,std_test_score
0,1,ball_tree,uniform,0.998592,0.001725
10,1,brute,uniform,0.998592,0.001725
25,5,kd_tree,distance,0.998592,0.001725
24,5,kd_tree,uniform,0.998592,0.001725
21,1,kd_tree,distance,0.998592,0.001725
20,1,kd_tree,uniform,0.998592,0.001725
1,1,ball_tree,distance,0.998592,0.001725
14,5,brute,uniform,0.998592,0.001725
11,1,brute,distance,0.998592,0.001725
15,5,brute,distance,0.998592,0.001725


---
### Decision Tree Classifier - M/Y

In [84]:
# 5-fold grid search over the decision tree's split criterion and depth cap (M/Y).
# NOTE(review): the tree is built without random_state, so tie-breaking
# between equally good splits may differ from run to run.
clf = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(),
    param_grid={
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 5, 10, 20, 50],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores,
# then show the configurations ranked best-first.
param_tree_results = pd.DataFrame(clf.cv_results_)
trimmed_tree_results = param_tree_results.loc[
    :, ['param_criterion', 'param_max_depth', 'mean_test_score', 'std_test_score']
]
trimmed_tree_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,param_criterion,param_max_depth,mean_test_score,std_test_score
3,gini,20,0.993662,0.002635
7,entropy,10,0.992958,0.003857
1,gini,5,0.992254,0.00345
2,gini,10,0.992254,0.001408
4,gini,50,0.992254,0.004106
8,entropy,20,0.992254,0.005175
9,entropy,50,0.992254,0.00345
6,entropy,5,0.987324,0.006531
0,gini,2,0.96831,0.006299
5,entropy,2,0.947183,0.012399


---
### Random Forest - M/Y

In [86]:
# 5-fold grid search over forest size, split criterion and depth cap (M/Y).
# NOTE(review): the forest is built without random_state, so the scores
# below will vary slightly between runs.
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid={
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 5, 10, 20, 50],
        'n_estimators': [10, 50, 100, 200, 500],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores,
# then show the configurations ranked best-first.
param_forest_results = pd.DataFrame(clf.cv_results_)
trimmed_forest_results = param_forest_results.loc[
    :,
    ['param_n_estimators', 'param_criterion', 'param_max_depth', 'mean_test_score', 'std_test_score'],
]
trimmed_forest_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,param_n_estimators,param_criterion,param_max_depth,mean_test_score,std_test_score
47,100,entropy,50,0.997887,0.002817
43,200,entropy,20,0.997887,0.002817
42,100,entropy,20,0.997887,0.002817
17,100,gini,20,0.997183,0.002635
41,50,entropy,20,0.997183,0.002635
39,500,entropy,10,0.997183,0.002635
38,200,entropy,10,0.997183,0.004106
36,50,entropy,10,0.997183,0.002635
14,500,gini,10,0.997183,0.002635
12,100,gini,10,0.997183,0.002635


---
### Kernel SVM - M/Y

In [82]:
# Exhaustive 5-fold grid search over SVC penalty, kernel and gamma (M/Y).
clf = GridSearchCV(
    estimator=svm.SVC(),
    param_grid={
        'C': [0.1, 1, 5, 10, 20],
        'kernel': ['rbf', 'linear', 'sigmoid'],
        'gamma': ['auto', 'scale'],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores,
# then show the configurations ranked best-first.
param_SVM_results = pd.DataFrame(clf.cv_results_)
trimmed_SVM_results = param_SVM_results.loc[
    :, ['param_C', 'param_kernel', 'param_gamma', 'mean_test_score', 'std_test_score']
]
trimmed_SVM_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_score,std_test_score
27,20.0,rbf,scale,0.999296,0.001408
21,10.0,rbf,scale,0.999296,0.001408
6,1.0,rbf,auto,0.998592,0.001725
24,20.0,rbf,auto,0.998592,0.001725
12,5.0,rbf,auto,0.998592,0.001725
18,10.0,rbf,auto,0.998592,0.001725
15,5.0,rbf,scale,0.997183,0.002635
1,0.1,linear,auto,0.997183,0.002635
4,0.1,linear,scale,0.997183,0.002635
9,1.0,rbf,scale,0.997183,0.002635


---
### Artificial Neural Network - M/Y

In [87]:
# 5-fold grid search over MLP architectures and activation functions (M/Y).
# NOTE(review): MLPClassifier() is built without random_state, so the weight
# initialisation - and therefore these scores - can differ between runs.
clf = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid={
        # Each entry is a tuple of hidden-layer widths. The trailing comma in
        # (20,) matters: (20) would be the plain integer 20, not a tuple.
        'hidden_layer_sizes': [(20,), (10, 15, 20, 15, 10), (10, 20, 10)],
        'activation': ['identity', 'relu', 'logistic'],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores,
# then show the configurations ranked best-first.
param_ann_results = pd.DataFrame(clf.cv_results_)
trimmed_ann_results = param_ann_results[
    ['param_activation', 'param_hidden_layer_sizes', 'mean_test_score', 'std_test_score']
]
trimmed_ann_results.sort_values('mean_test_score', ascending=False)



Unnamed: 0,param_activation,param_hidden_layer_sizes,mean_test_score,std_test_score
1,identity,"(10, 15, 20, 15, 10)",0.997183,0.002635
4,relu,"(10, 15, 20, 15, 10)",0.996479,0.003857
8,logistic,"(10, 20, 10)",0.996479,0.002227
5,relu,"(10, 20, 10)",0.995775,0.002635
3,relu,20,0.995775,0.002635
6,logistic,20,0.995775,0.00345
0,identity,20,0.99507,0.003591
2,identity,"(10, 20, 10)",0.99507,0.003591
7,logistic,"(10, 15, 20, 15, 10)",0.99507,0.002817


---
## D and O Analysis

In [93]:
# Hold out 10% of the D/O rows; random_state=0 pins the shuffle.
training_data, test_data = train_test_split(
    DOdata,
    test_size=0.1,
    random_state=0,
)
# Column 0 carries the letter label, every remaining column is a feature.
training_y, training_X = training_data[:, 0], training_data[:, 1:]
test_y, test_X = test_data[:, 0], test_data[:, 1:]

---
### K-Nearest Neighbor - D/O

In [94]:
# 5-fold grid search over the neighbour-search algorithm, the number of
# neighbours and the vote weighting (D/O).
clf = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={
        'algorithm': ['ball_tree', 'brute', 'kd_tree'],
        'n_neighbors': [1, 3, 5, 10, 20],
        'weights': ['uniform', 'distance'],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores,
# then show the configurations ranked best-first.
param_knn_results = pd.DataFrame(clf.cv_results_)
trimmed_knn_results = param_knn_results.loc[
    :,
    ['param_n_neighbors', 'param_algorithm', 'param_weights', 'mean_test_score', 'std_test_score'],
]
trimmed_knn_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,param_n_neighbors,param_algorithm,param_weights,mean_test_score,std_test_score
22,3,kd_tree,uniform,0.989301,0.004514
13,3,brute,distance,0.989301,0.003912
12,3,brute,uniform,0.989301,0.003912
23,3,kd_tree,distance,0.989301,0.004514
25,5,kd_tree,distance,0.988589,0.004159
2,3,ball_tree,uniform,0.988589,0.004159
3,3,ball_tree,distance,0.988589,0.004159
4,5,ball_tree,uniform,0.988589,0.004159
5,5,ball_tree,distance,0.988589,0.004159
24,5,kd_tree,uniform,0.988589,0.004159


---
### Decision Tree Classifier - D/O

In [95]:
# 5-fold grid search over the decision tree's split criterion and depth cap (D/O).
# NOTE(review): the tree is built without random_state, so tie-breaking
# between equally good splits may differ from run to run.
clf = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(),
    param_grid={
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 5, 10, 20, 50],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores,
# then show the configurations ranked best-first.
param_tree_results = pd.DataFrame(clf.cv_results_)
trimmed_tree_results = param_tree_results.loc[
    :, ['param_criterion', 'param_max_depth', 'mean_test_score', 'std_test_score']
]
trimmed_tree_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,param_criterion,param_max_depth,mean_test_score,std_test_score
8,entropy,20,0.964326,0.009065
7,entropy,10,0.962186,0.006672
2,gini,10,0.961479,0.01555
9,entropy,50,0.961472,0.01185
4,gini,50,0.960768,0.012584
3,gini,20,0.952928,0.010665
6,entropy,5,0.950061,0.011104
1,gini,5,0.949357,0.014692
0,gini,2,0.877298,0.017395
5,entropy,2,0.867316,0.023005


---
### Random Forest - D/O

In [96]:
# 5-fold grid search over forest size, split criterion and depth cap (D/O).
# NOTE(review): the forest is built without random_state, so the scores
# below will vary slightly between runs.
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid={
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 5, 10, 20, 50],
        'n_estimators': [10, 50, 100, 200, 500],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores,
# then show the configurations ranked best-first.
param_forest_results = pd.DataFrame(clf.cv_results_)
trimmed_forest_results = param_forest_results.loc[
    :,
    ['param_n_estimators', 'param_criterion', 'param_max_depth', 'mean_test_score', 'std_test_score'],
]
trimmed_forest_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,param_n_estimators,param_criterion,param_max_depth,mean_test_score,std_test_score
18,200,gini,20,0.982875,0.012457
37,100,entropy,10,0.982161,0.013744
43,200,entropy,20,0.98074,0.010494
39,500,entropy,10,0.980737,0.012894
36,50,entropy,10,0.980737,0.013275
46,50,entropy,50,0.980735,0.011206
48,200,entropy,50,0.980023,0.012078
13,200,gini,10,0.980018,0.010747
42,100,entropy,20,0.979314,0.008855
44,500,entropy,20,0.979311,0.013439


---
### Kernel SVM - D/O

In [97]:
# Exhaustive 5-fold grid search over SVC penalty, kernel and gamma (D/O).
clf = GridSearchCV(
    estimator=svm.SVC(),
    param_grid={
        'C': [0.1, 1, 5, 10, 20],
        'kernel': ['rbf', 'linear', 'sigmoid'],
        'gamma': ['auto', 'scale'],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores,
# then show the configurations ranked best-first.
param_SVM_results = pd.DataFrame(clf.cv_results_)
trimmed_SVM_results = param_SVM_results.loc[
    :, ['param_C', 'param_kernel', 'param_gamma', 'mean_test_score', 'std_test_score']
]
trimmed_SVM_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_score,std_test_score
18,10.0,rbf,auto,0.992867,0.005036
24,20.0,rbf,auto,0.992867,0.005036
12,5.0,rbf,auto,0.992867,0.005036
21,10.0,rbf,scale,0.99215,0.004733
15,5.0,rbf,scale,0.992148,0.007285
6,1.0,rbf,auto,0.991436,0.006224
27,20.0,rbf,scale,0.990722,0.005801
9,1.0,rbf,scale,0.990015,0.006921
4,0.1,linear,scale,0.967191,0.013008
1,0.1,linear,auto,0.967191,0.013008


---
### Artificial Neural Network - D/O

In [98]:
# 5-fold grid search over MLP architectures and activation functions (D/O).
# NOTE(review): MLPClassifier() is built without random_state, so the weight
# initialisation - and therefore these scores - can differ between runs.
clf = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid={
        # Each entry is a tuple of hidden-layer widths. The trailing comma in
        # (20,) matters: (20) would be the plain integer 20, not a tuple.
        'hidden_layer_sizes': [(20,), (10, 15, 20, 15, 10), (10, 20, 10)],
        'activation': ['identity', 'relu', 'logistic'],
    },
    cv=5,
)
clf.fit(training_X, training_y)

# Keep only the searched parameters plus the mean/std of the fold scores,
# then show the configurations ranked best-first.
param_ann_results = pd.DataFrame(clf.cv_results_)
trimmed_ann_results = param_ann_results[
    ['param_activation', 'param_hidden_layer_sizes', 'mean_test_score', 'std_test_score']
]
trimmed_ann_results.sort_values('mean_test_score', ascending=False)



Unnamed: 0,param_activation,param_hidden_layer_sizes,mean_test_score,std_test_score
5,relu,"(10, 20, 10)",0.977168,0.010985
4,relu,"(10, 15, 20, 15, 10)",0.974334,0.00822
8,logistic,"(10, 20, 10)",0.969319,0.011671
3,relu,20,0.96076,0.012802
1,identity,"(10, 15, 20, 15, 10)",0.957211,0.010778
6,logistic,20,0.952921,0.010464
2,identity,"(10, 20, 10)",0.947224,0.008218
0,identity,20,0.945803,0.012571
7,logistic,"(10, 15, 20, 15, 10)",0.694329,0.224951
