# Models
In this notebook we will build models with the hyperparameters obtained in the other notebooks.

In [1]:
import numpy as np
import pandas as pd
import pickle
import warnings
import time

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

from tensorflow.keras.datasets import mnist

warnings.filterwarnings('ignore')

SAVE_MODELS = False




## Load data

In [2]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

x_train shape: (60000, 28, 28)
x_test shape: (10000, 28, 28)
y_train shape: (60000,)
y_test shape: (10000,)


## Reshape data

In [3]:
# Reshape
x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)

## Scaling between 0 and 1

In [4]:
x_train_sc = x_train / 255
x_test_sc = x_test / 255

## PCA
We will use a 95% PCA.

In [5]:
scaler = StandardScaler()
pca = PCA(0.95)

x_train_std = scaler.fit_transform(x_train)
x_test_std = scaler.transform(x_test)

x_train_pca = pca.fit_transform(x_train_std)
x_test_pca = pca.transform(x_test_std)

print("Original number of features:", x_train_std.shape[1])
print("Number of principal components:", pca.n_components_)

Original number of features: 784
Number of principal components: 331


## Train models

In [6]:
knn = KNeighborsClassifier(n_neighbors=4, weights="distance", n_jobs=-1).fit(x_train_sc, y_train)
knn_pca = KNeighborsClassifier(n_neighbors=4, weights="distance", n_jobs=-1).fit(x_train_pca, y_train)
print("KNN complete")

KNN complete


In [7]:
rf = RandomForestClassifier(criterion="gini", max_features="sqrt", random_state=25, n_jobs=-1).fit(x_train_sc, y_train)
rf_pca = RandomForestClassifier(criterion="entropy", max_features="sqrt", random_state=25, n_jobs=-1).fit(x_train_pca, y_train)
print("Random Forest complete")

Random Forest complete


In [8]:
lr = LogisticRegression(penalty="l1", solver="saga", multi_class="multinomial").fit(x_train_sc, y_train)
lr_pca = LogisticRegression(penalty="l2", solver="lbfgs", multi_class="multinomial").fit(x_train_pca, y_train)
print("Logistic Regression complete")

Logistic Regression complete


In [9]:
models = {
    "knn": knn,
    "knn_pca": knn_pca,
    "lr": lr,
    "lr_pca": lr_pca,
    "rf": rf,
    "rf_pca": rf_pca,
}

In [10]:
if SAVE_MODELS:
    # Save models with pickle
    for key, m in models.items():
        file_name = key + ".pkl"
        pickle.dump(m, open(file_name, "wb"))

## Predictions

In [11]:
predictions = dict()
predictions["groundtruth"] = y_test
for key in models.keys():
    if "_pca" in key:
        predictions[key] = models[key].predict(x_test_pca)
    else:
        predictions[key] = models[key].predict(x_test_sc)

In [12]:
predictions_df = pd.DataFrame(predictions)
display(predictions_df)

Unnamed: 0,groundtruth,knn,knn_pca,lr,lr_pca,rf,rf_pca
0,7,7,7,7,7,7,7
1,2,2,2,2,2,2,2
2,1,1,1,1,1,1,1
3,0,0,0,0,0,0,0
4,4,4,4,4,4,4,4
...,...,...,...,...,...,...,...
9995,2,2,2,2,2,2,2
9996,3,3,3,3,3,3,3
9997,4,4,4,4,4,4,4
9998,5,5,5,5,5,5,5


## Save predictions with csv file

In [13]:
# Save DataFrame
predictions_df.to_csv("predictions_mnist.csv", index=False)

In [14]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

x_train shape: (60000, 28, 28)
x_test shape: (10000, 28, 28)
y_train shape: (60000,)
y_test shape: (10000,)


In [15]:
def predict_from_image(image, mod, use_pca=False, scaler=scaler, pca=pca):
    im = image.reshape(1, -1)
    if use_pca:
        im = scaler.transform(im)
        im = pca.transform(im)
        return mod.predict(im)
    else:
        im = im / 255
        return mod.predict(im)        

In [16]:
time_dict = dict()
for model_name in models.keys():
    time_dict[model_name] = []

n_iterations = 20
n_samples = 200
for it in range(n_iterations):
    sample = resample(x_test, n_samples=n_samples)
    
    for model_name in models.keys():
        mod = models[model_name]
        use_pca = False
        if "_pca" in model_name:
            use_pca = True

        # Make predictions
        t0 = time.time()
        for im in sample:
            predict_from_image(im, mod, use_pca=use_pca)
        t1 = time.time()
        
        elapsed_time = t1 - t0
        time_dict[model_name].append(elapsed_time)
    print("Iteration {} complete".format(it + 1))

Iteration 1 complete
Iteration 2 complete
Iteration 3 complete
Iteration 4 complete
Iteration 5 complete
Iteration 6 complete
Iteration 7 complete
Iteration 8 complete
Iteration 9 complete
Iteration 10 complete
Iteration 11 complete
Iteration 12 complete
Iteration 13 complete
Iteration 14 complete
Iteration 15 complete
Iteration 16 complete
Iteration 17 complete
Iteration 18 complete
Iteration 19 complete
Iteration 20 complete


In [17]:
time_df = pd.DataFrame(time_dict, index=["sample {}".format(i+1) for i in range(n_iterations)])
display(time_df)

Unnamed: 0,knn,knn_pca,lr,lr_pca,rf,rf_pca
sample 1,5.524789,3.204314,0.046875,0.21951,9.103432,9.275419
sample 2,5.279866,2.424293,0.031258,0.127184,9.306986,9.328355
sample 3,5.547308,2.968476,0.015623,0.157279,9.253816,9.386984
sample 4,5.593665,3.046826,0.062933,0.236748,9.048225,9.513781
sample 5,5.566937,3.033562,0.031637,0.191558,9.137695,9.247313
sample 6,5.549124,2.990804,0.031856,0.141285,9.065421,9.358049
sample 7,5.486656,2.967658,0.03165,0.17233,9.216817,9.267126
sample 8,5.489112,3.049477,0.031291,0.172971,9.255413,9.137701
sample 9,5.50942,2.917354,0.031585,0.187922,9.309682,9.382509
sample 10,5.609041,2.831205,0.015657,0.156806,9.167919,9.350077


In [18]:
display(time_df.describe())

Unnamed: 0,knn,knn_pca,lr,lr_pca,rf,rf_pca
count,20.0,20.0,20.0,20.0,20.0,20.0
mean,5.508149,2.93649,0.033337,0.181304,9.127993,9.282345
std,0.078041,0.184605,0.011956,0.037258,0.216064,0.171133
min,5.279866,2.424293,0.015521,0.126773,8.500882,8.891154
25%,5.492758,2.895816,0.031239,0.157161,9.061122,9.249128
50%,5.520609,2.986275,0.0316,0.175374,9.177358,9.32882
75%,5.547762,3.047489,0.03561,0.196345,9.274883,9.387087
max,5.609041,3.204314,0.062933,0.252821,9.353096,9.533273


In [19]:
time_df.to_csv("time_mnist.csv", index=True)