# Models
In this notebook we will build models with the hyperparameters obtained in the other notebooks.

In [1]:
import numpy as np
import pandas as pd
import pickle
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras.datasets import mnist

# Silence every warning category for the rest of the notebook.
warnings.simplefilter("ignore")

# Toggle: set to True to pickle the fitted models to disk further down.
SAVE_MODELS = False




## Load data

In [2]:
# Fetch the MNIST train/test split bundled with Keras.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Report the shape of each array, one line per array.
shape_report = {
    "x_train shape:": x_train,
    "x_test shape:": x_test,
    "y_train shape:": y_train,
    "y_test shape:": y_test,
}
for caption, array in shape_report.items():
    print(caption, array.shape)

x_train shape: (60000, 28, 28)
x_test shape: (10000, 28, 28)
y_train shape: (60000,)
y_test shape: (10000,)


## Reshape data

In [3]:
# Flatten every image into a single feature vector (28 x 28 -> 784).
# The row count is read from the array and the feature count is inferred
# with -1, so the cell keeps working if the data is subsampled.
x_train = x_train.reshape(x_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)

## Scaling between 0 and 1

In [4]:
# Divide the raw pixel values by 255 to obtain the scaled feature matrices.
PIXEL_SCALE = 255
x_train_sc, x_test_sc = x_train / PIXEL_SCALE, x_test / PIXEL_SCALE

## PCA
We will apply PCA to the standardized features, keeping enough principal components to explain 95% of the variance.

In [14]:
# Standardize the flattened pixels; the scaler is fitted on the training
# set only and then reused for the test set.
scaler = StandardScaler().fit(x_train)
x_train_std = scaler.transform(x_train)
x_test_std = scaler.transform(x_test)

# Fit PCA on the standardized training data (0.95 = fraction of variance
# to retain) and project both splits onto the retained components.
pca = PCA(n_components=0.95)
x_train_pca = pca.fit_transform(x_train_std)
x_test_pca = pca.transform(x_test_std)

n_original_features = x_train_std.shape[1]
print("Original number of features:", n_original_features)
print("Number of principal components:", pca.n_components_)

Original number of features: 784
Number of principal components: 331


## Train models

In [6]:
# Both KNN classifiers share the same hyperparameters; only the feature
# space differs (scaled pixels vs. PCA components).
knn_params = dict(n_neighbors=4, weights="distance", n_jobs=-1)

knn = KNeighborsClassifier(**knn_params).fit(x_train_sc, y_train)
knn_pca = KNeighborsClassifier(**knn_params).fit(x_train_pca, y_train)
print("KNN complete")

KNN complete


In [7]:
# Settings common to both forests; the split criterion is the only
# hyperparameter that differs between the two feature spaces.
rf_shared = dict(max_features="sqrt", random_state=25, n_jobs=-1)

rf = RandomForestClassifier(criterion="gini", **rf_shared).fit(x_train_sc, y_train)
rf_pca = RandomForestClassifier(criterion="entropy", **rf_shared).fit(x_train_pca, y_train)
print("Random Forest complete")

Random Forest complete


In [8]:
# Multinomial logistic regression: L1 / saga on the scaled pixels and
# L2 / lbfgs on the PCA components.
# saga shuffles the training data, so it is seeded (same seed as the random
# forests) to make the fitted model reproducible across runs.
# NOTE(review): max_iter is left at the library default and warnings are
# globally ignored in this notebook - confirm that both fits converge.
# NOTE(review): `multi_class` is reported as deprecated in recent
# scikit-learn releases - confirm against the installed version.
lr = LogisticRegression(
    penalty="l1",
    solver="saga",
    multi_class="multinomial",
    random_state=25,
).fit(x_train_sc, y_train)
lr_pca = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    multi_class="multinomial",
).fit(x_train_pca, y_train)
print("Logistic Regression complete")

Logistic Regression complete


In [9]:
# Registry of fitted estimators. A "_pca" suffix in the key marks models
# that were trained on the PCA features.
models = dict(
    knn=knn,
    knn_pca=knn_pca,
    lr=lr,
    lr_pca=lr_pca,
    rf=rf,
    rf_pca=rf_pca,
)

In [10]:
if SAVE_MODELS:
    # Persist every fitted model as "<key>.pkl" in the working directory.
    for key, m in models.items():
        file_name = key + ".pkl"
        # The context manager closes the file handle even if pickling
        # fails, instead of leaving it open until garbage collection.
        with open(file_name, "wb") as model_file:
            pickle.dump(m, model_file)

## Predictions

In [11]:
# Collect the true labels plus one prediction vector per model.
predictions = {"groundtruth": y_test}
for key, model in models.items():
    # Models whose key contains "_pca" are given the PCA-projected test
    # set; all others are given the scaled test set.
    test_features = x_test_pca if "_pca" in key else x_test_sc
    predictions[key] = model.predict(test_features)

In [12]:
# One column per entry of `predictions` (ground truth plus each model).
predictions_df = pd.DataFrame.from_dict(predictions, orient="columns")
display(predictions_df)

Unnamed: 0,groundtruth,knn,knn_pca,lr,lr_pca,rf,rf_pca
0,7,7,7,7,7,7,7
1,2,2,2,2,2,2,2
2,1,1,1,1,1,1,1
3,0,0,0,0,0,0,0
4,4,4,4,4,4,4,4
...,...,...,...,...,...,...,...
9995,2,2,2,2,2,2,2
9996,3,3,3,3,3,3,3
9997,4,4,4,4,4,4,4
9998,5,5,5,5,5,5,5


## Save predictions to a CSV file

In [13]:
# Write the prediction table to disk, leaving out the row index.
csv_path = "predictions_mnist.csv"
predictions_df.to_csv(csv_path, index=False)