# Reference
https://pyimagesearch.com/2016/08/08/k-nn-classifier-for-image-classification/

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

import numpy as np
import imutils
import cv2
import os

In [9]:
def image_to_feature_vector(image, size=(32, 32)):
    return cv2.resize(image, size).flatten()

def extract_color_histogram(image, bins=[8, 8, 8]):
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins, [0, 180, 0, 256, 0, 256])
    hist = cv2.normalize(hist, hist)
    
    return hist.flatten()


In [22]:
# describe images
raw_images = []
features = []
labels = []
verbose = 500
total = len(os.listdir('../datasets/dog-vs-cat/train/'))

for (i, image_path) in enumerate(os.listdir('../datasets/dog-vs-cat/train/')):
    # format sample: dog.8011.jpg
    image = cv2.imread(f'../datasets/dog-vs-cat/train/{image_path}')
    label = image_path.split('.')[0]
    
    pixels = image_to_feature_vector(image=image)
    hist = extract_color_histogram(image=image)
    
    raw_images.append(pixels)
    features.append(hist)
    labels.append(label)
    
    if (i > 0 and i % verbose == 0) or (i == total - 1):
        print(f'[INFO] processed {i+1}/{total}')
    

[INFO] processed 500/25000
[INFO] processed 1000/25000
[INFO] processed 1500/25000
[INFO] processed 2000/25000
[INFO] processed 2500/25000
[INFO] processed 3000/25000
[INFO] processed 3500/25000
[INFO] processed 4000/25000
[INFO] processed 4500/25000
[INFO] processed 5000/25000
[INFO] processed 5500/25000
[INFO] processed 6000/25000
[INFO] processed 6500/25000
[INFO] processed 7000/25000
[INFO] processed 7500/25000
[INFO] processed 8000/25000
[INFO] processed 8500/25000
[INFO] processed 9000/25000
[INFO] processed 9500/25000
[INFO] processed 10000/25000
[INFO] processed 10500/25000
[INFO] processed 11000/25000
[INFO] processed 11500/25000
[INFO] processed 12000/25000
[INFO] processed 12500/25000
[INFO] processed 13000/25000
[INFO] processed 13500/25000
[INFO] processed 14000/25000
[INFO] processed 14500/25000
[INFO] processed 15000/25000
[INFO] processed 15500/25000
[INFO] processed 16000/25000
[INFO] processed 16500/25000
[INFO] processed 17000/25000
[INFO] processed 17500/25000
[INFO

In [23]:
# memory consumed by data
raw_images = np.array(raw_images)
features = np.array(features)
labels = np.array(labels)

print("[INFO] pixels matrix: {:.2f}MB".format(
	raw_images.nbytes / (1024 * 1000.0)))
print("[INFO] features matrix: {:.2f}MB".format(
	features.nbytes / (1024 * 1000.0)))
print("[INFO] labels matrix: {:.2f}MB".format(
	labels.nbytes / (1024 * 1000.0)))


[INFO] pixels matrix: 75.00MB
[INFO] features matrix: 50.00MB
[INFO] labels matrix: 0.29MB


In [24]:
X_train_rl, X_test_rl, y_train_rl, y_test_rl = train_test_split(raw_images, labels, test_size=0.2, random_state=42)
X_train_fl, X_test_fl, y_train_fl, y_test_fl = train_test_split(features, labels, test_size=0.2, random_state=42)

In [27]:
model = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
model.fit(X_train_rl, y_train_rl)
model.score(X_test_rl, y_test_rl)

0.5502

In [26]:
model = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
model.fit(X_train_fl, y_train_fl)
model.score(X_test_fl, y_test_fl)

0.5988

color histogram is not good feature descriptor for this problem, difference color of dog/cat may lead to false negative. e.g black dog/cat, white dog/cat