In [1]:
import numpy as np
import pandas as pd
import pickle
import warnings

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras.datasets import mnist

# Suppress every warning for the rest of the session. Note that this also
# hides any warnings raised while the models below are being fitted.
warnings.filterwarnings(action="ignore")

# Switch to True to pickle each fitted model to disk in the saving cell.
SAVE_MODELS = False

# Data

In [2]:
# Load the MNIST train/test split through the Keras dataset helper.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Report the shape of each array, in the same order as they are unpacked
# per kind (images first, then labels).
for _name, _array in (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{_name} shape:", _array.shape)

# Reshape data

In [None]:
# Flatten every image into a single feature vector (one row per sample).
# The row count is read from the array itself and -1 lets NumPy infer the
# number of pixels, so this cell keeps working if the data is subsampled or
# the image size changes. Re-running the cell is a no-op on already-flat data.
x_train = x_train.reshape(len(x_train), -1)
x_test = x_test.reshape(len(x_test), -1)

# Scaling between 0 and 1

In [None]:
# Divide by the maximum pixel value so the features lie between 0 and 1,
# as announced in the section heading.
PIXEL_MAX = 255
x_train_sc, x_test_sc = x_train / PIXEL_MAX, x_test / PIXEL_MAX

# PCA
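We standardize the features and keep the smallest number of principal components that together explain 95% of the variance.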

In [None]:
# Standardize each pixel, then project onto the principal components that
# together explain 95% of the variance. Both transformers are fitted on the
# training set only and then applied unchanged to the test set.
scaler = StandardScaler()
pca = PCA(0.95)

x_train_std = scaler.fit_transform(x_train)
x_test_std = scaler.transform(x_test)

x_train_pca = pca.fit_transform(x_train_std)
x_test_pca = pca.transform(x_test_std)

# Features live on axis 1 (axis 0 is the number of samples), and the number
# of components selected during fitting is exposed as `pca.n_components_`.
print("Original number of features:", x_train_std.shape[1])
print("Number of principal components:", pca.n_components_)
print("Number of principal components:", x_train_pca.shape[1])

# Models
We will use the hyperparameters found in the other notebooks.

In [6]:
# Both KNN models share one configuration; only the training features differ
# (0-1 scaled pixels vs. PCA projection).
knn_params = dict(n_neighbors=4, weights="distance", n_jobs=-1)

knn = KNeighborsClassifier(**knn_params)
knn.fit(x_train_sc, y_train)

knn_pca = KNeighborsClassifier(**knn_params)
knn_pca.fit(x_train_pca, y_train)

print("KNN complete")

KNN complete


In [7]:
# Settings common to both forests; the split criterion is the only
# hyperparameter that differs between the scaled-pixel and PCA variants.
rf_shared = {"max_features": "sqrt", "random_state": 25, "n_jobs": -1}

rf = RandomForestClassifier(criterion="gini", **rf_shared)
rf.fit(x_train_sc, y_train)

rf_pca = RandomForestClassifier(criterion="entropy", **rf_shared)
rf_pca.fit(x_train_pca, y_train)

print("Random Forest complete")

Random Forest complete


In [8]:
# L1-penalised multinomial regression on the 0-1 scaled pixels. The `saga`
# solver shuffles the data, so it is seeded (same seed as the Random Forest
# cell) to make the fitted coefficients reproducible across runs.
lr = LogisticRegression(
    penalty="l1",
    solver="saga",
    multi_class="multinomial",
    random_state=25,
).fit(x_train_sc, y_train)

# L2-penalised multinomial regression on the PCA features. `lbfgs` does not
# use `random_state`, so no seed is needed here.
lr_pca = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    multi_class="multinomial",
).fit(x_train_pca, y_train)

print("Logistic Regression complete")

Logistic Regression complete


In [9]:
# Registry of the models fitted above, keyed by a short name. Keys containing
# "_pca" mark models trained on the PCA features; the prediction cell relies
# on that to pick the matching test matrix. Only models that are actually
# fitted in this notebook are listed (there is no `rf_bw` here).
models = {
    "knn": knn,
    "knn_pca": knn_pca,
    "lr": lr,
    "lr_pca": lr_pca,
    "rf": rf,
    "rf_pca": rf_pca,
}

In [10]:
if SAVE_MODELS:
    # Pickle each model to "<key>.pkl" in the working directory. The context
    # manager guarantees the file is flushed and closed even if dumping fails.
    for key, m in models.items():
        file_name = key + ".pkl"
        with open(file_name, "wb") as model_file:
            pickle.dump(m, model_file)

# Predictions

In [12]:
# Collect the test-set predictions of every model next to the true labels.
# Each model must be fed the same representation it was trained on:
#   * keys containing "_pca" -> PCA-projected test data (x_test_pca)
#   * all other keys         -> 0-1 scaled pixels (x_test_sc)
predictions = {"groundtruth": y_test}
for key, model in models.items():
    test_features = x_test_pca if "_pca" in key else x_test_sc
    predictions[key] = model.predict(test_features)

In [13]:
# One column for the ground truth and one per model, one row per test sample.
predictions_df = pd.DataFrame(data=predictions)

In [14]:
# Render the predictions table (ground truth plus one column per model).
# NOTE(review): the saved output beneath this cell lists `_bw` columns and
# class names such as "airplane" that this notebook never produces; it looks
# stale — re-run the notebook to refresh it.
display(predictions_df)

Unnamed: 0,groundtruth,knn,knn_bw,knn_pca,knn_bw_pca,lr,lr_bw,lr_pca,lr_bw_pca,rf,rf_bw,rf_pca,rf_bw_pca
0,airplane,airplane,ship,airplane,airplane,airplane,ship,airplane,ship,ship,ship,airplane,ship
1,airplane,airplane,airplane,airplane,bird,ship,cat,ship,cat,ship,airplane,airplane,frog
2,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,bird,airplane
3,airplane,truck,ship,truck,ship,bird,airplane,bird,airplane,horse,horse,bird,truck
4,airplane,ship,ship,airplane,ship,airplane,cat,airplane,frog,airplane,bird,ship,deer
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,truck,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,truck,airplane
9996,truck,ship,ship,ship,ship,truck,truck,truck,truck,ship,ship,truck,ship
9997,truck,truck,truck,truck,truck,truck,truck,truck,truck,truck,truck,truck,truck
9998,truck,frog,deer,frog,deer,automobile,automobile,automobile,automobile,truck,truck,automobile,automobile


In [15]:
# Persist the predictions table as CSV in the working directory, leaving out
# the row index.
predictions_df.to_csv(path_or_buf="predictions.csv", index=False)