In [1]:
# necessary libraries, functions, and constants
import matplotlib.pyplot as plt
import os
import numpy as np
import random
from PIL import Image
from sklearn import metrics
from sklearn import model_selection
from sklearn import neighbors

# (width, height) in pixels that every image is resized to before its pixels
# are flattened into a feature vector
IMAGE_SIZE = (32, 32)

# class names; each file name starts with one of these, followed by "_"
labels = ["ocean", "ship", "sky"]

def normalize_rgb(r, g, b):
    """Scale three colour channel values by 1/255.

    Each of ``r``, ``g`` and ``b`` is divided by 255.0, so channel values
    in the range 0-255 map onto 0.0-1.0.

    Returns the scaled values as an ``(r, g, b)`` tuple.
    """
    full_scale = 255.0
    return tuple(channel / full_scale for channel in (r, g, b))

def extract_features_labels(file_list):
    """Build the feature matrix and label list for a set of image files.

    Each entry of ``file_list`` is a file name (not a path) of the form
    ``label_dataset_index.extension``. Empty names and names starting with
    "." are skipped. Every other file is opened from the module-level
    ``image_dir``, converted to RGB, resized to ``IMAGE_SIZE`` and flattened
    into one list of normalized r, g, b values (x is the outer loop, y the
    inner loop). As a side effect the resized image is written to the
    module-level ``tmp_dir`` as ``label_dataset_index.png``.

    Returns:
        (X, Y): X is a list with one feature list per image, Y is the list
        of labels in the same order.

    Raises:
        ValueError: if a file name does not consist of exactly three
        "_"-separated fields before its extension.
    """
    # X contains the features, Y contains the classes
    X = []
    Y = []

    for file_name in file_list:
        # skip empty names and hidden files such as .DS_Store
        if not file_name or file_name.startswith("."):
            continue

        # metadata: label_dataset_index.extension
        # splitext only strips the last extension, so a stem that itself
        # contains a "." no longer breaks the parsing
        stem, _extension = os.path.splitext(file_name)
        fields = stem.split("_")
        if len(fields) != 3:
            raise ValueError(
                "unexpected file name %r, expected label_dataset_index.extension"
                % (file_name,))
        label, dataset_name, dataset_index = fields

        path = os.path.join(image_dir, file_name)
        # the context manager closes the underlying file; convert() and
        # resize() both return new in-memory images, so `im` stays usable.
        # Converting to RGB first guarantees 3-tuples below even for
        # grayscale, palette or RGBA inputs.
        with Image.open(path) as source:
            im = source.convert("RGB").resize(IMAGE_SIZE, resample=Image.LANCZOS)

        # get the rgb color components of all pixels; load() gives indexed
        # pixel access, which avoids the per-call overhead of getpixel()
        pixels = im.load()
        image_features = []
        for x in range(IMAGE_SIZE[0]):
            for y in range(IMAGE_SIZE[1]):
                image_features.extend(normalize_rgb(*pixels[x, y]))

        # append feature and label together so X and Y can never get out of
        # step if an image fails to load
        X.append(image_features)
        Y.append(label)

        new_name = "%s_%s_%s.%s" % (label, dataset_name, dataset_index, "png")
        im.save(os.path.join(tmp_dir, new_name), "PNG")

    return X, Y

# this function taken from:
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters:
        cm: 2-D confusion matrix (rows are indexed by true label, columns
            by predicted label, matching the axis labels set below).
        classes: tick labels, one per row/column of ``cm``.
        normalize: if True, every row is divided by its sum before the
            matrix is printed and drawn.
        title: figure title.
        cmap: matplotlib colormap used for the cells.

    The function draws with pyplot but does not call ``plt.show()``.
    """
    cm = np.asarray(cm)

    # normalize BEFORE drawing so that the colours, the colour bar, the text
    # threshold and the printed numbers all refer to the same values
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # a class with no true samples has a row sum of 0; divide by 1
        # instead so the row stays all-zero rather than becoming NaN
        row_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # whole counts are shown as integers, anything else with two decimals
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # cells darker than half the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i, j in np.ndindex(*cm.shape):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [2]:
# directory structure
# image_dir holds the source images, tmp_dir receives the resized PNG copies;
# both are relative to the notebook's working directory
cwd = os.getcwd()
image_dir = "../images/combined"
tmp_dir = "../images/tmp"

print("\n".join([
    "cwd = " + cwd,
    "image_dir = " + image_dir,
    "tmp_dir = " + tmp_dir,
]))

# create the output directory the first time the notebook runs
if not os.path.exists(tmp_dir):
    os.mkdir(tmp_dir)

cwd = /Users/jordan/Documents/ml-project/code
image_dir = ../images/combined
tmp_dir = ../images/tmp


In [3]:
# process images, create labels and features
# os.listdir returns names in arbitrary order; sorting keeps the sample order,
# and therefore the seeded train/test split further down, the same on every
# run and every machine
files = sorted(os.listdir(image_dir))

X, y = extract_features_labels(files)

# some stats about the dataset
print("%s observations" % (len(y)))

print("Label counts:")
for label in labels:
    print("%s - %s" % (label, y.count(label)))


3347 observations
Label counts:
ocean - 504
ship - 2347
sky - 496


In [4]:
# K-NN
#
# inputs:
# number of neighbors (k), swept over 1..19
#
# output metrics, recorded for every k:
# confusion matrix
# accuracy score (fraction of correctly classified test samples)
# micro-averaged precision score
# area under the ROC curve (TODO: not computed yet)

# record metrics for the cross-validated value - n-neighbors
# the four lists are parallel: entry i of each belongs to k = knn_neighbors[i]
knn_neighbors = []
knn_accuracy = []
knn_precision = []
knn_cm = []

# create training and test sets
# first method - evenly split training and test set into two random sets
# randomly select from the original data into even-sized training and test sets
# random_state is fixed, so the split is identical for every k; it is computed
# once here instead of once per loop iteration
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.5, random_state=0)

for n_neighbors in range(1, 20):
    # weights is passed by keyword: recent scikit-learn releases accept only
    # n_neighbors positionally
    classifier = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
    classifier.fit(X_train, y_train)
    y_prediction = classifier.predict(X_test)

    cm = metrics.confusion_matrix(y_test, y_prediction, labels=labels)
    accuracy = metrics.accuracy_score(y_test, y_prediction, normalize=True)
    # NOTE: for single-label multiclass data, micro-averaged precision is
    # numerically equal to accuracy, so these two series coincide
    precision = metrics.precision_score(y_test, y_prediction, average='micro')

    knn_neighbors.append(n_neighbors)
    knn_accuracy.append(accuracy)
    knn_precision.append(precision)
    knn_cm.append(cm)

print("max accuracy = %s" % (max(knn_accuracy)))
print("max precision = %s" % (max(knn_precision)))


# second method - Stratified k-folds


# third method - LOOCV


max accuracy = 0.797491039427
max precision = 0.797491039427


In [None]:
import itertools

# look the matrix up by its k value instead of a hand-computed list position,
# so the plotted matrix and the k shown in the title cannot drift apart
k_to_plot = 17
plot_confusion_matrix(knn_cm[knn_neighbors.index(k_to_plot)], labels,
                      title='Confusion matrix for k=%d' % k_to_plot)
plt.show()

In [None]:
# position of the first best score in each list (not used by the plot below)
accuracy_index = knn_accuracy.index(max(knn_accuracy))
precision_index = knn_precision.index(max(knn_precision))

# x values are the k values recorded alongside each accuracy, so the two
# sequences always have matching lengths even if the sweep range changes
plt.plot(knn_neighbors, knn_accuracy, label='accuracy')
plt.title("Accuracy for different values of k")
plt.xlabel("k")
plt.ylabel("accuracy")
plt.show()