In [38]:
# necessary libraries, functions, and constants
import os
import numpy as np
import random
from PIL import Image
from sklearn import cross_validation
from sklearn import metrics
from sklearn import neighbors

IMAGE_SIZE = (32, 32)
labels = ['ocean', 'ship', 'sky']

def normalize_rgb(r, g, b):
    """Scale three 8-bit colour channel values into the range 0..1.

    Each argument is divided by 255.0, so 0 maps to 0.0 and 255 maps to 1.0.
    The scaled values are returned as an (r, g, b) tuple.
    """
    scale = 255.0
    return tuple(channel / scale for channel in (r, g, b))

def extract_features_labels(file_list):
    """trains the classifier given a set of files"""
    # X contains the features, Y contains the classes
    X = []
    Y = []
    
    # train on the file_list
    for file in file_list:
        # metadata
        label, dataset_name, dataset_index = file.split("_") # class_dataset_index.extension
        dataset_index, extension = dataset_index.split(".") # index.extension
        #print("%s %s %s" % (classification, dataset_name, dataset_index))

        # set the classifier
        Y.append(label)

        path = image_dir + "/" + file
        im = Image.open(path)
        im = im.resize(IMAGE_SIZE, resample=Image.LANCZOS)

        # get the rgb color components of all pixels
        image_features = []
        for x in range(0,IMAGE_SIZE[0]):
            for y in range(0,IMAGE_SIZE[1]):
                r, g, b = im.getpixel((x, y))
                r, g, b = normalize_rgb(r, g, b)
                image_features.extend([r, g, b])
        X.append(image_features)

        #print("%s %s %s" % (r, g, b))

        new_name = ("%s_%s_%s.%s" % (label, dataset_name, dataset_index, "png"))

        im.save(tmp_dir + "/" + new_name, "PNG")
    
    return X, Y   

In [39]:
# directory structure
cwd = os.getcwd()
image_dir = "../images/combined"
tmp_dir = "../images/tmp"

# echo the locations in use so the recorded output documents the run
for line in ("cwd = " + cwd, "image_dir = " + image_dir, "tmp_dir = " + tmp_dir):
    print(line)

# the scratch directory receives the resized PNG copies; create it on first use
tmp_dir_missing = not os.path.exists(tmp_dir)
if tmp_dir_missing:
    os.mkdir(tmp_dir)

cwd = C:\Users\jchadwick\Documents\ml-project\code
image_dir = ../images/combined
tmp_dir = ../images/tmp


In [40]:
# process images, create labels and features
# os.listdir() returns entries in arbitrary order; sort them so the row order
# of X / y (and therefore any seeded train/test split) is the same on every run
files = sorted(os.listdir(image_dir))

X, y = extract_features_labels(files)


# some stats about the dataset
print("%s observations" % (len(y)))

# per-class counts, to make class imbalance visible
print("Label counts:")
for label in labels:
    print("%s - %s" % (label, y.count(label)))


3347 observations
Label counts:
ocean - 504
ship - 2347
sky - 496


In [47]:
# K-NN
#
# inputs:
# number of neighbors
#
# output metrics:
# confusion matrix
# normalized accuracy score
# precision score (micro-averaged)
# TODO: area under the ROC curve is not computed yet

# record metrics for each candidate value of n_neighbors
# knn_metrics is a list of dicts with the keys
# 'n_neighbors', 'accuracy', 'precision' and 'cm' (confusion matrix)
knn_metrics = []

# create training and test sets
# first method - evenly split training and test set into two random sets
# randomly select from the original data into even-sized training and test sets
# The split is seeded (random_state=0), so it is identical for every value of
# n_neighbors; compute it once instead of once per loop iteration.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5, random_state=0)

for n_neighbors in range(1, 11):
    # weight neighbors by inverse distance; passed by keyword so the call does
    # not depend on the positional order of the constructor arguments
    classifier = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
    classifier.fit(X_train, y_train)
    y_prediction = classifier.predict(X_test)

    # rows/columns of the confusion matrix follow the order of `labels`
    cm = metrics.confusion_matrix(y_test, y_prediction, labels=labels)
    accuracy = metrics.accuracy_score(y_test, y_prediction, normalize=True)
    # NOTE(review): the recorded output shows micro-averaged precision equal to
    # accuracy for every run, so this metric adds no information here;
    # consider average='macro' if a per-class view is wanted.
    precision = metrics.precision_score(y_test, y_prediction, average='micro')

    knn_metrics.append({'n_neighbors': n_neighbors, 'accuracy': accuracy, 'precision': precision, 'cm': cm})

print(knn_metrics)

# second method - Stratified k-folds


# third method - LOOCV


[{'cm': array([[143,  64,  44],
       [113, 993,  57],
       [ 90,  42, 128]]), 'precision': 0.75507765830346474, 'n_neighbors': 1, 'accuracy': 0.75507765830346474}, {'cm': array([[143,  64,  44],
       [113, 993,  57],
       [ 90,  42, 128]]), 'precision': 0.75507765830346474, 'n_neighbors': 2, 'accuracy': 0.75507765830346474}, {'cm': array([[ 146,   71,   34],
       [ 110, 1010,   43],
       [  81,   48,  131]]), 'precision': 0.76881720430107525, 'n_neighbors': 3, 'accuracy': 0.76881720430107525}, {'cm': array([[ 140,   75,   36],
       [ 117, 1007,   39],
       [  86,   48,  126]]), 'precision': 0.76045400238948624, 'n_neighbors': 4, 'accuracy': 0.76045400238948624}, {'cm': array([[ 138,   81,   32],
       [ 107, 1029,   27],
       [  83,   54,  123]]), 'precision': 0.77060931899641572, 'n_neighbors': 5, 'accuracy': 0.77060931899641572}, {'cm': array([[ 141,   75,   35],
       [ 115, 1020,   28],
       [  78,   55,  127]]), 'precision': 0.76941457586618878, 'n_neighbors'

In [18]:
# create the classifier
n_neighbors = 5
classifier = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
# the labels returned by extract_features_labels are stored in `y`; `Y` is
# never defined at notebook level and raises NameError on a fresh kernel
classifier.fit(X[:5], y[:5])

# predict the test set
# predict() expects a 2-D collection of samples, so wrap the single feature
# vector in a list
print(classifier.predict([X[6]]))
print(y[6])


['ocean']
ocean


