In [1]:
import numpy as np
import pandas as pd
import pickle
import warnings

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any warning emitted while fitting the models
# further down — confirm that silencing them is intended.
warnings.filterwarnings('ignore')

# Data

## Load RGB data

In [2]:
def _load_pickle(path):
    """Return the object pickled at ``path``, closing the file once it is read."""
    # NOTE: pickle can execute arbitrary code while loading; only point this at
    # trusted files.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# The RGB training set is stored as two batches; concatenate them in batch order
# so that rows of data_train and labels_train stay aligned.
data_train_1 = _load_pickle("data/data_train_flatten_batch_1.pkl")
data_train_2 = _load_pickle("data/data_train_flatten_batch_2.pkl")
labels_train_1 = _load_pickle("data/labels_train_batch_1.pkl")
labels_train_2 = _load_pickle("data/labels_train_batch_2.pkl")

data_train = np.concatenate((data_train_1, data_train_2))
labels_train = np.concatenate((labels_train_1, labels_train_2))

data_test = _load_pickle("data/data_test_flatten.pkl")
labels_test = _load_pickle("data/labels_test.pkl")

# Shuffle train set (data and labels are permuted together, with a fixed seed)
data_train, labels_train = shuffle(data_train, labels_train, random_state=25)

## Load black and white data

In [3]:
train_batches = []
labels_batches = []

# The black and white training set is stored as five numbered batches.
# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
for i in [1, 2, 3, 4, 5]:
    data_filepath = "data/data_train_bw_flatten_batch_" + str(i) + ".pkl"
    labels_filepath = "data/labels_train_bw_batch_" + str(i) + ".pkl"
    # Context managers close each file as soon as its batch is read.
    with open(data_filepath, "rb") as data_file:
        train_batches.append(pickle.load(data_file))
    with open(labels_filepath, "rb") as labels_file:
        labels_batches.append(pickle.load(labels_file))

# Concatenate in batch order so data rows and labels stay aligned
data_train_bw = np.concatenate(train_batches)
labels_train_bw = np.concatenate(labels_batches)

with open("data/data_test_bw_flatten.pkl", "rb") as data_file:
    data_test_bw = pickle.load(data_file)
with open("data/labels_test_bw.pkl", "rb") as labels_file:
    labels_test_bw = pickle.load(labels_file)

# Shuffle train set (data and labels are permuted together, with a fixed seed)
data_train_bw, labels_train_bw = shuffle(data_train_bw, labels_train_bw, random_state=25)

## Scaling

In [4]:
# Standardise the RGB features: the scaler is fitted on the training set only
# and then applied to both splits.
scaler = StandardScaler().fit(data_train)
X_train_std = scaler.transform(data_train)
X_test_std = scaler.transform(data_test)

# Same procedure for the black and white features, with a separate scaler.
scaler_bw = StandardScaler().fit(data_train_bw)
X_train_std_bw = scaler_bw.transform(data_train_bw)
X_test_std_bw = scaler_bw.transform(data_test_bw)

## PCA
We will keep the principal components that together explain 95% of the variance.

In [5]:
# RGB: fit the PCA on the standardised training set, then project both splits.
n_features_rgb = X_train_std.shape[1]
print("Original number of features RGB:", n_features_rgb)
pca = PCA(n_components=0.95).fit(X_train_std)
print("Number of pricipal components RGB:", pca.n_components_)

X_train_pca, X_test_pca = (pca.transform(X) for X in (X_train_std, X_test_std))

# Black and white: same steps with an independent PCA.
n_features_bw = X_train_std_bw.shape[1]
print("Original number of features black and white:", n_features_bw)
pca_bw = PCA(n_components=0.95).fit(X_train_std_bw)
print("Number of pricipal components black and white:", pca_bw.n_components_)

X_train_bw_pca, X_test_bw_pca = (pca_bw.transform(X) for X in (X_train_std_bw, X_test_std_bw))

Original number of features RGB: 3072
Number of pricipal components RGB: 221
Original number of features black and white: 1024
Number of pricipal components black and white: 163


# Models
We will use the hyperparameters obtained in the other notebooks.

In [6]:
# One KNN per feature representation (RGB, black and white, and their PCA
# projections). The black and white models are fitted with labels_train_bw,
# the labels that were shuffled together with data_train_bw, so that features
# and labels are guaranteed to be aligned row by row.
knn = KNeighborsClassifier(n_neighbors=8, weights="distance", n_jobs=-1).fit(X_train_std, labels_train)
knn_bw = KNeighborsClassifier(n_neighbors=11, weights="distance", n_jobs=-1).fit(X_train_std_bw, labels_train_bw)
knn_pca = KNeighborsClassifier(n_neighbors=6, weights="distance", n_jobs=-1).fit(X_train_pca, labels_train)
knn_bw_pca = KNeighborsClassifier(n_neighbors=10, weights="distance", n_jobs=-1).fit(X_train_bw_pca, labels_train_bw)
print("KNN complete")

KNN complete


In [8]:
# One logistic regression per feature representation. The black and white
# models are fitted with labels_train_bw, the labels that were shuffled together
# with data_train_bw, so that features and labels are aligned row by row.
# The "saga" solver shuffles the data internally, so it gets a fixed
# random_state (same seed as the rest of the notebook) for reproducible fits.
lr = LogisticRegression(penalty="l2", solver="lbfgs", multi_class="multinomial").fit(X_train_std, labels_train)
lr_bw = LogisticRegression(penalty="l1", solver="saga", multi_class="multinomial", random_state=25).fit(X_train_std_bw, labels_train_bw)
lr_pca = LogisticRegression(penalty="l1", solver="saga", multi_class="multinomial", random_state=25).fit(X_train_pca, labels_train)
lr_bw_pca = LogisticRegression(penalty="l2", solver="lbfgs", multi_class="multinomial").fit(X_train_bw_pca, labels_train_bw)
print("Logistic Regression complete")

Logistic Regression complete


In [9]:
# All four forests share the same hyperparameters; only the training features differ.
rf_params = dict(criterion="gini", max_features="sqrt", random_state=25, n_jobs=4)

# The black and white models are fitted with labels_train_bw, the labels that
# were shuffled together with data_train_bw, so that features and labels are
# aligned row by row.
rf = RandomForestClassifier(**rf_params).fit(X_train_std, labels_train)
rf_bw = RandomForestClassifier(**rf_params).fit(X_train_std_bw, labels_train_bw)
rf_pca = RandomForestClassifier(**rf_params).fit(X_train_pca, labels_train)
rf_bw_pca = RandomForestClassifier(**rf_params).fit(X_train_bw_pca, labels_train_bw)
print("Random Forest complete")

Random Forest complete


In [10]:
# Registry of the fitted models, keyed by the name used for the pickle files
# and the prediction columns. Insertion order: KNN, logistic regression, random forest.
models = dict(
    knn=knn,
    knn_bw=knn_bw,
    knn_pca=knn_pca,
    knn_bw_pca=knn_bw_pca,
    lr=lr,
    lr_bw=lr_bw,
    lr_pca=lr_pca,
    lr_bw_pca=lr_bw_pca,
    rf=rf,
    rf_bw=rf_bw,
    rf_pca=rf_pca,
    rf_bw_pca=rf_bw_pca,
)

In [18]:
# Save models with pickle, one "<name>.pkl" file per entry of `models`
for key, m in models.items():
    file_name = key + ".pkl"
    # The context manager flushes and closes the file once the dump is written.
    with open(file_name, "wb") as model_file:
        pickle.dump(m, model_file)

# Predictions

In [20]:
# Check test_bw equal to test_rgb.
# labels_test is used below as the ground truth for every model, including the
# black and white ones, so the two label sets must match exactly.
# np.array_equal also fails on a length mismatch, which an element-wise zip()
# comparison would silently truncate.
if not np.array_equal(labels_test, labels_test_bw):
    raise ValueError("Error! labels_test and labels_test_bw are not identical")

In [21]:
# Test features matching each model-name tag. Order matters: the first tag
# found in the name wins, so "_bw_pca" must be tried before "_pca" and "_bw".
test_features_by_tag = (
    ("_bw_pca", X_test_bw_pca),
    ("_pca", X_test_pca),
    ("_bw", X_test_std_bw),
)

predictions = {"groundtruth": labels_test}
for model_name, model in models.items():
    # Names without any tag belong to the models trained on standardised RGB data.
    test_features = next(
        (features for tag, features in test_features_by_tag if tag in model_name),
        X_test_std,
    )
    predictions[model_name] = model.predict(test_features)

In [22]:
# One column per entry of `predictions` (ground truth first, then each model)
predictions_df = pd.DataFrame(data=predictions)

In [23]:
# Show the ground truth next to every model's prediction
display(predictions_df)

Unnamed: 0,groundtruth,knn,knn_bw,knn_pca,knn_bw_pca,lr,lr_bw,lr_pca,lr_bw_pca,rf,rf_bw,rf_pca,rf_bw_pca
0,airplane,airplane,airplane,airplane,ship,airplane,ship,airplane,ship,ship,ship,airplane,ship
1,airplane,airplane,cat,airplane,cat,ship,cat,ship,cat,ship,airplane,airplane,frog
2,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,bird,airplane
3,airplane,truck,deer,truck,deer,bird,airplane,bird,airplane,horse,horse,bird,truck
4,airplane,airplane,ship,airplane,ship,airplane,cat,airplane,frog,airplane,bird,ship,deer
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,truck,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,airplane,truck,airplane
9996,truck,ship,ship,ship,ship,truck,truck,truck,truck,ship,ship,truck,ship
9997,truck,truck,truck,truck,truck,truck,truck,truck,truck,truck,truck,truck,truck
9998,truck,frog,deer,frog,deer,automobile,automobile,automobile,automobile,truck,truck,automobile,automobile


In [24]:
# Write the predictions table to disk, leaving out the row index
predictions_df.to_csv(path_or_buf="predictions.csv", index=False)