# Models
In this notebook we will build models with the hyperparameters obtained in the other notebooks.

In [1]:
import numpy as np
import pandas as pd
import pickle
import warnings
import time

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

from tensorflow.keras.datasets import mnist

warnings.filterwarnings('ignore')

SAVE_MODELS = False




## Load data

In [2]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

x_train shape: (60000, 28, 28)
x_test shape: (10000, 28, 28)
y_train shape: (60000,)
y_test shape: (10000,)


## Reshape data

In [3]:
# Reshape
x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)

## Scaling between 0 and 1

In [4]:
x_train_sc = x_train / 255
x_test_sc = x_test / 255

## PCA
We will use a 95% PCA.

In [5]:
scaler = StandardScaler()
pca = PCA(0.95)

x_train_std = scaler.fit_transform(x_train)
x_test_std = scaler.transform(x_test)

x_train_pca = pca.fit_transform(x_train_std)
x_test_pca = pca.transform(x_test_std)

print("Original number of features:", x_train_std.shape[1])
print("Number of principal components:", pca.n_components_)

Original number of features: 784
Number of principal components: 331


## Train models

In [6]:
knn = KNeighborsClassifier(n_neighbors=4, weights="distance", n_jobs=-1).fit(x_train_sc, y_train)
knn_pca = KNeighborsClassifier(n_neighbors=4, weights="distance", n_jobs=-1).fit(x_train_pca, y_train)
print("KNN complete")

KNN complete


In [7]:
rf = RandomForestClassifier(criterion="gini", max_features="sqrt", random_state=25, n_jobs=-1).fit(x_train_sc, y_train)
rf_pca = RandomForestClassifier(criterion="entropy", max_features="sqrt", random_state=25, n_jobs=-1).fit(x_train_pca, y_train)
print("Random Forest complete")

Random Forest complete


In [8]:
lr = LogisticRegression(penalty="l1", solver="saga", multi_class="multinomial", n_jobs=-1, max_iter=300).fit(x_train_sc, y_train)
lr_pca = LogisticRegression(penalty="l2", solver="lbfgs", multi_class="multinomial", n_jobs=-1, max_iter=300).fit(x_train_pca, y_train)
print("Logistic Regression complete")

Logistic Regression complete


In [9]:
models = {
    "knn": knn,
    "knn_pca": knn_pca,
    "lr": lr,
    "lr_pca": lr_pca,
    "rf": rf,
    "rf_pca": rf_pca,
}

In [10]:
if SAVE_MODELS:
    # Save models with pickle
    for key, m in models.items():
        file_name = key + ".pkl"
        pickle.dump(m, open(file_name, "wb"))

## Predictions

In [11]:
predictions = dict()
predictions["groundtruth"] = y_test
for key in models.keys():
    if "_pca" in key:
        predictions[key] = models[key].predict(x_test_pca)
    else:
        predictions[key] = models[key].predict(x_test_sc)

In [12]:
predictions_df = pd.DataFrame(predictions)
display(predictions_df)

Unnamed: 0,groundtruth,knn,knn_pca,lr,lr_pca,rf,rf_pca
0,7,7,7,7,7,7,7
1,2,2,2,2,2,2,2
2,1,1,1,1,1,1,1
3,0,0,0,0,0,0,0
4,4,4,4,4,4,4,4
...,...,...,...,...,...,...,...
9995,2,2,2,2,2,2,2
9996,3,3,3,3,3,3,3
9997,4,4,4,4,4,4,4
9998,5,5,5,5,5,5,5


## Save predictions with csv file

In [13]:
# Save DataFrame
predictions_df.to_csv("predictions_mnist.csv", index=False)

# Speed comparison

In [14]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

x_train shape: (60000, 28, 28)
x_test shape: (10000, 28, 28)
y_train shape: (60000,)
y_test shape: (10000,)


In [15]:
def predict_from_image(image, mod, use_pca=False, scaler=scaler, pca=pca):
    im = image.reshape(1, -1)
    if use_pca:
        im = scaler.transform(im)
        im = pca.transform(im)
        return mod.predict(im)
    else:
        im = im / 255
        return mod.predict(im)        

In [16]:
time_dict = dict()
for model_name in models.keys():
    time_dict[model_name] = []

n_iterations = 500
n_samples = 100
for it in range(n_iterations):
    sample = resample(x_test, n_samples=n_samples)
    
    for model_name in models.keys():
        mod = models[model_name]
        use_pca = False
        if "_pca" in model_name:
            use_pca = True

        # Make predictions
        t0 = time.time()
        for im in sample:
            predict_from_image(im, mod, use_pca=use_pca)
        t1 = time.time()
        
        elapsed_time = t1 - t0
        time_dict[model_name].append(elapsed_time)
    print("Iteration {} complete".format(it + 1))

Iteration 1 complete
Iteration 2 complete
Iteration 3 complete
Iteration 4 complete
Iteration 5 complete
Iteration 6 complete
Iteration 7 complete
Iteration 8 complete
Iteration 9 complete
Iteration 10 complete
Iteration 11 complete
Iteration 12 complete
Iteration 13 complete
Iteration 14 complete
Iteration 15 complete
Iteration 16 complete
Iteration 17 complete
Iteration 18 complete
Iteration 19 complete
Iteration 20 complete
Iteration 21 complete
Iteration 22 complete
Iteration 23 complete
Iteration 24 complete
Iteration 25 complete
Iteration 26 complete
Iteration 27 complete
Iteration 28 complete
Iteration 29 complete
Iteration 30 complete
Iteration 31 complete
Iteration 32 complete
Iteration 33 complete
Iteration 34 complete
Iteration 35 complete
Iteration 36 complete
Iteration 37 complete
Iteration 38 complete
Iteration 39 complete
Iteration 40 complete
Iteration 41 complete
Iteration 42 complete
Iteration 43 complete
Iteration 44 complete
Iteration 45 complete
Iteration 46 comple

In [17]:
time_df = pd.DataFrame(time_dict, index=["sample {}".format(i+1) for i in range(n_iterations)])
display(time_df)

Unnamed: 0,knn,knn_pca,lr,lr_pca,rf,rf_pca
sample 1,2.777538,1.519985,0.015656,0.062865,4.610059,4.855984
sample 2,2.937077,1.492658,0.015732,0.094157,4.536593,4.338106
sample 3,2.925663,1.511454,0.015729,0.094251,4.327929,4.673692
sample 4,2.734674,1.530000,0.031254,0.062934,4.629632,4.758421
sample 5,2.855869,1.465750,0.032177,0.057655,4.506133,4.670340
...,...,...,...,...,...,...
sample 496,2.812327,1.491885,0.015621,0.047267,4.583116,4.678260
sample 497,2.781240,1.413015,0.031343,0.062918,4.586740,4.692921
sample 498,2.778851,1.427982,0.022109,0.056393,4.633978,4.741769
sample 499,2.798491,1.475965,0.032411,0.062137,4.647099,4.649060


In [18]:
display(time_df.describe())

Unnamed: 0,knn,knn_pca,lr,lr_pca,rf,rf_pca
count,500.0,500.0,500.0,500.0,500.0,500.0
mean,2.771954,1.522386,0.018761,0.082497,4.574719,4.66274
std,0.055297,0.112954,0.010091,0.022479,0.141822,0.138215
min,2.299629,1.098679,0.0,0.044898,3.473997,3.350544
25%,2.75268,1.455366,0.015612,0.0629,4.538838,4.641448
50%,2.776395,1.527469,0.015659,0.078491,4.606567,4.682996
75%,2.797341,1.599994,0.031233,0.094339,4.648971,4.723721
max,2.93754,1.858797,0.044575,0.142228,4.906602,4.903152


In [19]:
time_df.to_csv("time_mnist.csv", index=True)