In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import warnings

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, KFold

# Install a global "ignore" filter so Python warnings are not shown in cell outputs.
# NOTE(review): this blanket filter also hides any warnings emitted while fitting the
# models later in the notebook (e.g. solver convergence) -- consider narrowing it.
warnings.filterwarnings('ignore')

# Data

## Load RGB data

In [2]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as the object has been read, rather than being left open until garbage
    collection.

    NOTE: unpickling can execute arbitrary code -- only load trusted files.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# The RGB training set is stored as two batches: load both, then join them.
data_train_1 = _read_pickle("data/data_train_flatten_batch_1.pkl")
data_train_2 = _read_pickle("data/data_train_flatten_batch_2.pkl")
labels_train_1 = _read_pickle("data/labels_train_batch_1.pkl")
labels_train_2 = _read_pickle("data/labels_train_batch_2.pkl")

data_train = np.concatenate((data_train_1, data_train_2))
labels_train = np.concatenate((labels_train_1, labels_train_2))

data_test = _read_pickle("data/data_test_flatten.pkl")
labels_test = _read_pickle("data/labels_test.pkl")

# Shuffle train set: samples and labels are permuted together, with a fixed
# seed so the order is reproducible across runs.
data_train, labels_train = shuffle(data_train, labels_train, random_state=25)

## Load black and white data

In [3]:
# The black-and-white training set is stored as five numbered batches.
# Each file is opened in a `with` block so its handle is closed right after
# reading instead of being leaked.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
train_batches = []
labels_batches = []

for i in range(1, 6):
    data_filepath = "data/data_train_bw_flatten_batch_" + str(i) + ".pkl"
    labels_filepath = "data/labels_train_bw_batch_" + str(i) + ".pkl"
    with open(data_filepath, "rb") as data_file:
        train_batches.append(pickle.load(data_file))
    with open(labels_filepath, "rb") as labels_file:
        labels_batches.append(pickle.load(labels_file))

data_train_bw = np.concatenate(train_batches)
labels_train_bw = np.concatenate(labels_batches)

with open("data/data_test_bw_flatten.pkl", "rb") as data_file:
    data_test_bw = pickle.load(data_file)
with open("data/labels_test_bw.pkl", "rb") as labels_file:
    labels_test_bw = pickle.load(labels_file)

# Shuffle train set: samples and labels are permuted together, with a fixed
# seed so the order is reproducible across runs.
data_train_bw, labels_train_bw = shuffle(data_train_bw, labels_train_bw, random_state=25)

## Scaling

In [4]:
# Standardize features. Each scaler learns its statistics from the training
# split only and then applies them unchanged to the matching test split.

# RGB
scaler = StandardScaler()
scaler.fit(data_train)
X_train_std = scaler.transform(data_train)
X_test_std = scaler.transform(data_test)

# Black and white
scaler_bw = StandardScaler()
scaler_bw.fit(data_train_bw)
X_train_std_bw = scaler_bw.transform(data_train_bw)
X_test_std_bw = scaler_bw.transform(data_test_bw)

## PCA
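We keep the smallest number of principal components that explains 95% of the variance. PCA is fitted on the standardized training data only and then applied to the test data.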

In [5]:
# Reduce dimensionality with PCA, keeping enough components to reach 95% of
# the explained variance. Each PCA is fitted on the standardized training
# split and then used to project both the training and the test split.

# RGB
print("Original number of features RGB:", X_train_std.shape[1])
pca = PCA(n_components=0.95).fit(X_train_std)
print("Number of pricipal components RGB:", pca.n_components_)

X_train_pca, X_test_pca = (
    pca.transform(split) for split in (X_train_std, X_test_std)
)

# Black and white
print("Original number of features black and white:", X_train_std_bw.shape[1])
pca_bw = PCA(n_components=0.95).fit(X_train_std_bw)
print("Number of pricipal components black and white:", pca_bw.n_components_)

X_train_bw_pca, X_test_bw_pca = (
    pca_bw.transform(split) for split in (X_train_std_bw, X_test_std_bw)
)

Original number of features RGB: 3072
Number of pricipal components RGB: 221
Original number of features black and white: 1024
Number of pricipal components black and white: 163


# Models
We fit each model with the hyperparameters that were selected in the other notebooks.

In [None]:
# Fit every classifier on four feature sets: standardized RGB, standardized
# black and white, and the PCA projection of each.
#
# Models trained on black-and-white features use `labels_train_bw`, i.e. the
# labels that were shuffled together with `data_train_bw`. Pairing them with
# the RGB labels is only correct if both datasets happen to share the exact
# same order and length.

# K-nearest neighbours
knn = KNeighborsClassifier(n_neighbors=6, weights="distance", n_jobs=4).fit(X_train_std, labels_train)
knn_bw = KNeighborsClassifier(n_neighbors=4, weights="distance", n_jobs=4).fit(X_train_std_bw, labels_train_bw)
knn_pca = KNeighborsClassifier(n_neighbors=6, weights="distance", n_jobs=4).fit(X_train_pca, labels_train)
knn_bw_pca = KNeighborsClassifier(n_neighbors=5, weights="distance", n_jobs=4).fit(X_train_bw_pca, labels_train_bw)
print("KNN complete")

# Logistic regression (L2 penalty)
lr = LogisticRegression(penalty="l2").fit(X_train_std, labels_train)
lr_bw = LogisticRegression(penalty="l2").fit(X_train_std_bw, labels_train_bw)
lr_pca = LogisticRegression(penalty="l2").fit(X_train_pca, labels_train)
lr_bw_pca = LogisticRegression(penalty="l2").fit(X_train_bw_pca, labels_train_bw)
print("Logistic Regression complete")

# Random forest (fixed seed so the fitted forests are reproducible)
rf = RandomForestClassifier(criterion="gini", max_features="sqrt", random_state=25, n_jobs=4).fit(X_train_std, labels_train)
rf_bw = RandomForestClassifier(criterion="gini", max_features="sqrt", random_state=25, n_jobs=4).fit(X_train_std_bw, labels_train_bw)
rf_pca = RandomForestClassifier(criterion="gini", max_features="sqrt", random_state=25, n_jobs=4).fit(X_train_pca, labels_train)
rf_bw_pca = RandomForestClassifier(criterion="gini", max_features="sqrt", random_state=25, n_jobs=4).fit(X_train_bw_pca, labels_train_bw)
print("Random Forest complete")