In [2]:
import numpy as np
np.random.seed(0)
import scipy.misc
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score
import time

# Naive Bayes

In [3]:
# Dataset geometry: 71 classes, each image shrunk to 32x32 before modelling.
nb_classes = 71
img_rows, img_cols = 32, 32  # target (rows, cols) after resizing
# (use img_rows, img_cols = 127, 128 together with a plain copy to skip the resize)

samples_per_class = 160
n_images = nb_classes * samples_per_class
target_size = (img_rows, img_cols)

# Load the archive, view it as a stack of 127x128 float32 images and divide by 15.
ary = np.load("hiragana.npz")['arr_0'].reshape([-1, 127, 128]).astype(np.float32) / 15

# Resize image by image into a pre-allocated float32 buffer.
# NOTE(review): scipy.misc.imresize was deprecated in SciPy 1.0 and removed in
# SciPy 1.3, so this cell only runs on an older SciPy with Pillow installed.
X_train = np.zeros((n_images, img_rows, img_cols), dtype=np.float32)
for i in range(n_images):
    X_train[i] = scipy.misc.imresize(ary[i], target_size, mode='F')

# Labels: 0..nb_classes-1, each repeated once per sample of that class,
# i.e. the images are taken to be stored class by class.
Y_train = np.repeat(np.arange(nb_classes), samples_per_class)

# Hold out 20% for testing. No random_state is passed, so the shuffle draws on
# NumPy's global RNG, which the import cell seeds with np.random.seed(0).
split = train_test_split(X_train, Y_train, test_size=0.2)
X_train, X_test, y_train, y_test = split

In [4]:
# Sanity check: dimensions of the loaded image stack (count, rows, cols).
ary.shape

(11360, 127, 128)

In [6]:
# Flatten every training image into one feature row (one column per pixel),
# since the scikit-learn estimators below take 2-D (samples, features) input.
nsamples, nx, ny = X_train.shape
n_train_features = nx * ny
train_dataset = np.reshape(X_train, (nsamples, n_train_features))
train_dataset.shape

(9088, 1024)

In [7]:
# Flatten the held-out images the same way as the training images.
# Note: this rebinds nsamples/nx/ny to the test split's dimensions.
nsamples, nx, ny = X_test.shape
n_test_features = nx * ny
test_dataset = np.reshape(X_test, (nsamples, n_test_features))
test_dataset.shape

(2272, 1024)

In [8]:
# Sanity check: one label per held-out image.
y_test.shape

(2272,)

In [9]:
# Gaussian Naive Bayes baseline on the flattened pixel features.
clf = GaussianNB()

# The timer covers both training and inference on the held-out set.
start_time = time.time()
preds = clf.fit(train_dataset, y_train).predict(test_dataset)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Predictions, then ground truth, then the summary metrics.
print(preds)
print(y_test)
nb_accuracy = accuracy_score(y_test, preds)
nb_f1 = f1_score(y_test, preds, average='micro')
print("accuracy score: {}".format(nb_accuracy))
print("f1 score: {}".format(nb_f1))

--- 1.235764980316162 seconds ---
[62  5 57 ..., 60 14 61]
[ 2 41 29 ..., 60  8 11]
accuracy score: 0.2746478873239437
f1 score: 0.2746478873239437


## Random Forest
The accuracy of our model goes up as we increase the number of estimators and the maximum depth of each tree in the forest.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Largest forest in this section: 100 trees, each limited to depth 32.
random_forest_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=32,
    random_state=4,
    min_samples_split=5,
)

# The timer covers both training and inference on the held-out set.
start_time = time.time()
ranf_preds = random_forest_clf.fit(train_dataset, y_train).predict(test_dataset)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Predictions, then ground truth, then the summary metrics.
print(ranf_preds)
print(y_test)
ranf_accuracy = accuracy_score(y_test, ranf_preds)
ranf_f1 = f1_score(y_test, ranf_preds, average='micro')
print("accuracy score: {}".format(ranf_accuracy))
print("f1 score: {}".format(ranf_f1))

In [None]:
# Small, shallow forest: 10 trees, each limited to depth 4.
random_forest_clf2 = RandomForestClassifier(
    n_estimators=10,
    max_depth=4,
    random_state=4,
    min_samples_split=5,
)

# The timer covers both training and inference on the held-out set.
start_time = time.time()
ranf_preds2 = random_forest_clf2.fit(train_dataset, y_train).predict(test_dataset)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Predictions, then ground truth, then the summary metrics.
print(ranf_preds2)
print(y_test)
ranf_accuracy2 = accuracy_score(y_test, ranf_preds2)
ranf_f1_2 = f1_score(y_test, ranf_preds2, average='micro')
print("accuracy score: {}".format(ranf_accuracy2))
print("f1 score: {}".format(ranf_f1_2))

In [None]:
# Random forest with 10 trees, each limited to depth 20.
random_forest_clf3 = RandomForestClassifier(n_estimators=10, max_depth=20, random_state=4, min_samples_split=5)

# The timer covers both training and inference on the held-out set.
start_time = time.time()
random_forest_clf3.fit(train_dataset, y_train)
ranf_preds3 = random_forest_clf3.predict(test_dataset)
print("--- %s seconds ---" % (time.time() - start_time))

# Print THIS model's predictions next to the ground truth. The cell used to
# print `ranf_preds`, i.e. the output of a different forest, which did not
# match the scores reported below.
print(ranf_preds3)
print(y_test)
print("accuracy score: {}".format(accuracy_score(y_test, ranf_preds3)))
print("f1 score: {}".format(f1_score(y_test, ranf_preds3, average='micro')))

In [None]:
# Random forest with 10 trees, each limited to depth 30.
random_forest_clf4 = RandomForestClassifier(n_estimators=10, max_depth=30, random_state=4, min_samples_split=5)

# The timer covers both training and inference on the held-out set.
start_time = time.time()
random_forest_clf4.fit(train_dataset, y_train)
ranf_preds4 = random_forest_clf4.predict(test_dataset)
print("--- %s seconds ---" % (time.time() - start_time))

# Print THIS model's predictions next to the ground truth. The cell used to
# print `ranf_preds`, i.e. the output of a different forest, which did not
# match the scores reported below.
print(ranf_preds4)
print(y_test)
print("accuracy score: {}".format(accuracy_score(y_test, ranf_preds4)))
print("f1 score: {}".format(f1_score(y_test, ranf_preds4, average='micro')))