#### Building a K-NN Classifier for the images
As a preliminary baseline, I will use the k-nearest neighbors (K-NN) algorithm: a simple classifier whose accuracy later, more sophisticated models can be compared against.


In [1]:
# import the necessary packages
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from imutils import paths
import numpy as np
import argparse
import imutils
import cv2
import os
import pandas as pd

In [2]:
# Load the image metadata table into `data` and print a summary of its
# columns, dtypes and memory usage. Later cells read its 'path' and
# 'label' columns.
metadataCsv = "data/meta/working_df_subset.csv"
data = pd.read_csv(metadataCsv)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35350 entries, 0 to 35349
Data columns (total 5 columns):
food_class        35350 non-null object
label             35350 non-null object
id                35350 non-null int64
path              35350 non-null object
training_image    35350 non-null object
dtypes: int64(1), object(4)
memory usage: 1.3+ MB


In [3]:
def image_to_feature_vector(image, size=(32, 32)):
    """Turn an image into a fixed-length vector of raw pixel intensities.

    The image is resized to ``size`` so that every input produces a vector
    of the same length (a standard format for the classifier), and the
    resized result is then flattened to one dimension.

    Parameters
    ----------
    image
        The image to convert, in a form accepted by ``cv2.resize``.
    size
        Target size handed unchanged to ``cv2.resize``; defaults to
        ``(32, 32)``.

    Returns
    -------
    The flattened output of ``cv2.resize(image, size)``.
    """
    resized = cv2.resize(image, size)
    return resized.flatten()

In [4]:
def extract_color_histogram(image, bins=(8, 8, 8)):
    """Describe an image by a flattened, normalized 3D HSV color histogram.

    The image is converted with ``cv2.COLOR_BGR2HSV`` and a joint histogram
    over all three channels is computed with ``bins`` bins per channel.
    The histogram is normalized and returned as a 1-D feature vector.

    Parameters
    ----------
    image
        The image to describe; it is passed to ``cv2.cvtColor`` with the
        BGR-to-HSV conversion code.
    bins
        Number of histogram bins for each of the three channels; defaults
        to ``(8, 8, 8)``.

    Returns
    -------
    The normalized histogram, flattened.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # histogram channels 0, 1 and 2 together, with no mask; the value
    # ranges are [0, 180) for the first channel and [0, 256) for the others
    channel_ids = [0, 1, 2]
    value_ranges = [0, 180, 0, 256, 0, 256]
    histogram = cv2.calcHist([hsv_image], channel_ids, None, bins,
                             value_ranges)

    if not imutils.is_cv2():
        # OpenCV 3 and later: normalize writes into the array passed as
        # the destination, so the histogram is updated in place
        cv2.normalize(histogram, histogram)
    else:
        # OpenCV 2.4.X: normalize takes only the source and returns the
        # normalized histogram
        histogram = cv2.normalize(histogram)

    return histogram.flatten()

In [5]:
# Describe the tunable k-NN settings (number of neighbors, number of
# parallel jobs) with an argument parser.
ap = argparse.ArgumentParser()
# type=int: without it, a value supplied for -k would be stored as a string
# rather than as a number (only the default would be an int).
ap.add_argument("-k", "--neighbors", type=int, default=1,
                help="# of nearest neighbors")
ap.add_argument("-j", "--jobs", type=int, default=-1,
               help="# of jobs for k-NN distance (-1 uses all available cores)")
# Parse an explicit empty argument list instead of sys.argv: inside a
# notebook sys.argv holds the kernel's own launch arguments, which this
# parser does not recognise. The result is a dict of the defaults:
# {"neighbors": 1, "jobs": -1}.
args = vars(ap.parse_args([]))


_StoreAction(option_strings=['-j', '--jobs'], dest='jobs', nargs=None, const=None, default=-1, type=<class 'int'>, choices=None, help='# of jobs for k-NN distance (-1 uses all available cores)', metavar=None)

In [6]:
# Build three parallel collections with one entry per successfully loaded
# image: raw pixel vectors, color-histogram features, and class labels.
rawImages = []
features = []
labels = []

imagePaths = data['path'].tolist()
imageLabels = data['label'].tolist()
# paths whose image could not be read, kept for inspection after the loop
unreadablePaths = []

for (i, (imagePath, imageLabel)) in enumerate(zip(imagePaths, imageLabels)):
    # paths from the table are extended with ".jpg" before reading
    image = cv2.imread(imagePath + ".jpg")

    if image is None:
        # cv2.imread reports a missing or undecodable file by returning
        # None rather than raising; skip it so the feature extractors do
        # not fail on it part-way through the run
        unreadablePaths.append(imagePath)
    else:
        pixels = image_to_feature_vector(image)
        hist = extract_color_histogram(image)

        # update the raw images, features, and labels together so the
        # three collections stay aligned index-for-index
        rawImages.append(pixels)
        features.append(hist)
        labels.append(imageLabel)

    # show an update every 1,000 images
    if i > 0 and i % 1000 == 0:
        print("[INFO] processed {}/{}".format(i, len(imagePaths)))

if unreadablePaths:
    print("[WARN] skipped {} unreadable image(s), first: {}".format(
        len(unreadablePaths), unreadablePaths[0]))

rawImages = np.array(rawImages)
features = np.array(features)
labels = np.array(labels)

# report the in-memory size of each matrix; 1 MB is taken as
# 1024 * 1024 bytes
print("[INFO] pixels matrix: {:.2f}MB".format(
    rawImages.nbytes / (1024 * 1024.0)))
print("[INFO] features matrix: {:.2f}MB".format(
    features.nbytes / (1024 * 1024.0)))

[INFO] processed 1000/35350
[INFO] processed 2000/35350
[INFO] processed 3000/35350
[INFO] processed 4000/35350
[INFO] processed 5000/35350
[INFO] processed 6000/35350
[INFO] processed 7000/35350
[INFO] processed 8000/35350
[INFO] processed 9000/35350
[INFO] processed 10000/35350
[INFO] processed 11000/35350
[INFO] processed 12000/35350
[INFO] processed 13000/35350
[INFO] processed 14000/35350
[INFO] processed 15000/35350
[INFO] processed 16000/35350
[INFO] processed 17000/35350
[INFO] processed 18000/35350
[INFO] processed 19000/35350
[INFO] processed 20000/35350
[INFO] processed 21000/35350
[INFO] processed 22000/35350
[INFO] processed 23000/35350
[INFO] processed 24000/35350
[INFO] processed 25000/35350
[INFO] processed 26000/35350
[INFO] processed 27000/35350
[INFO] processed 28000/35350
[INFO] processed 29000/35350
[INFO] processed 30000/35350
[INFO] processed 31000/35350
[INFO] processed 32000/35350
[INFO] processed 33000/35350
[INFO] processed 34000/35350
[INFO] processed 35000/

In [None]:
# Write both matrices to disk as comma-separated text files.
for csvName, matrix in (('pixels.csv', rawImages), ('features.csv', features)):
    np.savetxt(csvName, matrix, delimiter=',')

In [None]:
# Display how many labels are available.
labelCount = len(labels)
labelCount

In [27]:
# Partition the data into training and testing splits: 80% of the samples
# for training and the remaining 20% (test_size=0.2) for testing.
# A single call splits the pixel vectors, the histogram features and the
# labels with the same indices, so both feature sets share one train/test
# partition by construction rather than by repeating identical arguments
# in two separate calls.
(trainRI, testRI, trainFeat, testFeat, trainLabels, testLabels) = train_test_split(
    rawImages, features, labels, test_size=0.2, random_state=42)
# the raw-pixel experiment uses the same label split; keep its names defined
(trainRL, testRL) = (trainLabels, testLabels)

In [28]:
# Baseline on raw pixel intensities: fit a 10-neighbor k-NN classifier on
# the training pixel vectors and report its accuracy on the test split.
print("[INFO] evaluating raw pixel accuracy...")
model = KNeighborsClassifier(n_jobs=-1, n_neighbors=10).fit(trainRI, trainRL)
acc = model.score(testRI, testRL)
rawPixelReport = "[INFO] raw pixel accuracy: {:.2f}%".format(acc * 100)
print(rawPixelReport)

[INFO] evaluating raw pixel accuracy...
[INFO] raw pixel accuracy: 2.19%


In [29]:
# Baseline on color histograms: fit a 2-neighbor k-NN classifier on the
# training histogram features and report its accuracy on the test split.
print("[INFO] evaluating histogram accuracy...")
model = KNeighborsClassifier(n_jobs=-1, n_neighbors=2).fit(trainFeat, trainLabels)
acc = model.score(testFeat, testLabels)
histogramReport = "[INFO] histogram accuracy: {:.2f}%".format(acc * 100)
print(histogramReport)

[INFO] evaluating histogram accuracy...
[INFO] histogram accuracy: 3.17%
