In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time

In [2]:
# Load the image matrix and the matching labels from disk
images = np.load("Data/sample_1000_image.npy")
labels = np.load("Data/sample_1000_label.npy")
# Normalize image data.  0-255 to 0-1
images = images / 255
# Fail early with a clear message if the two arrays are not row-aligned
assert len(images) == len(labels), "images and labels must have the same number of rows"
# One column per pixel, named 0 .. n_pixels-1. The column count comes from the
# array itself, so nothing depends on a hard-coded image width of 784.
df = pd.DataFrame(images)
# Attach the labels as the last column, already typed as integers. Assigning the
# column directly avoids the full-size float copy that np.concatenate would
# build and the in-place rename; reshape(-1) accepts labels stored either as
# (n,) or as (n, 1).
df["label"] = np.asarray(labels).reshape(-1).astype("int64")

In [3]:
# Preview: show the top ten rows of the data frame
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [4]:
# Separate the target ("lbl": the label column) from the features
# ("img": every column except the last one)
lbl = df['label']
img = df.iloc[:, :-1]
# Stratified 80/20 split into a training set and a test set
x_train, x_test, y_train, y_test = train_test_split(
    img,
    lbl,
    test_size=0.20,
    random_state=123,
    stratify=lbl,
)


In [5]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Configure the Random Forest classifier (it is only created here; training
# happens in a separate cell)
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=16,
    n_jobs=-1,
    random_state=123,
)

In [10]:
# Inspect the shape of the training labels
y_train.shape

(276000,)

In [14]:
# Train the forest on x_train / y_train and report the elapsed time in minutes.
# perf_counter() is a monotonic, high-resolution clock, so the measured interval
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
rnd_clf.fit(x_train, y_train)

print('Time elapsed: %.2f min' % ((time.perf_counter() - start_time) / 60))

Time elapsed: 10.56 min


In [15]:
# Score the fitted forest on the test split and report how long scoring took.
# perf_counter() is used because it is monotonic; time.time() follows the wall
# clock and can jump if the system clock is adjusted mid-measurement.
start_time = time.perf_counter()
print('Accuracy: %.2f%%' % (rnd_clf.score(x_test, y_test)*100))
print('Time elapsed: %.2f min' % ((time.perf_counter() - start_time) / 60))

Accuracy: 31.25%
Time elapsed: 2.27 min


In [16]:
from sklearn.decomposition import PCA

In [17]:
# Project the features onto 100 principal components. The projection is fitted
# on the training split only and then applied unchanged to the test split.
# random_state is pinned to the same seed used for the split and the
# classifiers: PCA may select a randomized SVD solver, and without a seed the
# components (and everything trained on them) would vary between runs.
pca = PCA(n_components=100, random_state=123)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)
# Show the reduced shape next to the original one for comparison
print(x_train_pca.shape)
print(x_train.shape)

(276000, 100)
(276000, 784)


In [18]:
# Fit a fresh Random Forest on the PCA-reduced training features.
# Note: this rebinds rnd_clf, replacing the previously fitted model.
rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=16, n_jobs=-1, random_state=123)
# Time the fit with the monotonic perf_counter() clock rather than time.time(),
# which follows the wall clock and can jump if the system clock is adjusted.
start_time = time.perf_counter()
rnd_clf.fit(x_train_pca, y_train)

print('Time elapsed: %.2f min' % ((time.perf_counter() - start_time) / 60))

Time elapsed: 25.12 min


In [19]:
# Score the PCA-trained forest on the PCA-transformed test split and report how
# long scoring took, measured with the monotonic perf_counter() clock.
start_time = time.perf_counter()
print('Accuracy: %.2f%%' % (rnd_clf.score(x_test_pca, y_test)*100))
print('Time elapsed: %.2f min' % ((time.perf_counter() - start_time) / 60))

Accuracy: 30.87%
Time elapsed: 0.86 min


In [20]:
# The autoencoder-transformed features and their labels were saved to disk
# beforehand, so they are read back here instead of being recomputed.

# Features first (train, test), then the matching labels (train, test)
x_train_ae, x_test_ae = np.load("Data/xtrain_ae.npy"), np.load("Data/xtest_ae.npy")
y_train_ae, y_test_ae = np.load("Data/ytrain_ae.npy"), np.load("Data/ytest_ae.npy")

In [21]:
# Inspect the shape of the autoencoder-transformed training features
x_train_ae.shape

(276000, 100)

In [22]:
# Fit a fresh Random Forest on the autoencoder-transformed training data.
# Note: this rebinds rnd_clf, replacing the previously fitted model.
rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=16, n_jobs=-1, random_state=123)
# Time the fit with the monotonic perf_counter() clock rather than time.time(),
# which follows the wall clock and can jump if the system clock is adjusted.
start_time = time.perf_counter()
rnd_clf.fit(x_train_ae, y_train_ae)

print('Time elapsed: %.2f min' % ((time.perf_counter() - start_time) / 60))

Time elapsed: 24.60 min


In [23]:
# Score the autoencoder-trained forest on the autoencoder test data and report
# how long scoring took, measured with the monotonic perf_counter() clock.
start_time = time.perf_counter()
print('Accuracy: %.2f%%' % (rnd_clf.score(x_test_ae, y_test_ae)*100))
print('Time elapsed: %.2f min' % ((time.perf_counter() - start_time) / 60))

Accuracy: 28.33%
Time elapsed: 1.12 min
