In [8]:
# imports and functions
import csv
import os
from PIL import Image

IMAGE_SIZE = (32, 32)
labels = ['ocean', 'ship', 'sky']

def normalize_rgb(r, g, b):
    """Scale three colour channel values by 1/255.

    Each argument is divided by 255.0, so 8-bit channel values (0-255)
    map onto floats in the range 0.0 to 1.0. The result is returned as an
    (r, g, b) tuple in the same channel order as the arguments.
    """
    scale = 255.0
    return tuple(channel / scale for channel in (r, g, b))

def extract_features_labels(file_list, feature_dims, rgb=True):
    """Turn a list of image files into per-pixel feature vectors and labels.

    No model is trained here; the function only builds the data. For every
    non-hidden file it:

    1. parses the label from the file name (``label_dataset_index.extension``),
    2. opens the image from the module-level ``image_dir``,
    3. converts it to RGB (``rgb=True``) or 8-bit grayscale (``rgb=False``),
    4. resizes it to ``feature_dims`` with Lanczos resampling,
    5. flattens the pixels into a list of floats scaled by 1/255,
    6. saves the resized image as a PNG in the module-level ``tmp_dir``.

    Pixels are visited with x in the outer loop and y in the inner loop,
    i.e. column by column; anything that reshapes a feature vector back
    into an image has to use the same order.

    Parameters
    ----------
    file_list : list of str
        File names (not paths) located in ``image_dir``. Names starting
        with "." are skipped.
    feature_dims : tuple of (int, int)
        Target size passed to ``Image.resize``; also the loop bounds for
        x and y respectively.
    rgb : bool, optional
        True  -> three features (r, g, b) per pixel.
        False -> one grayscale feature per pixel.

    Returns
    -------
    (X, Y) : tuple of lists
        ``X[k]`` is the feature list for the k-th processed image and
        ``Y[k]`` is its label string.

    Raises
    ------
    ValueError
        If a file name does not split into exactly three "_"-separated
        parts, or its last part does not contain exactly one ".".
    """
    # X contains the features, Y contains the classes
    X = []
    Y = []

    num_files = len(file_list)
    width, height = feature_dims

    # Force a known pixel layout so getpixel always returns what the
    # feature loop expects: a 3-tuple for "RGB", a single int for "L".
    mode = "RGB" if rgb else "L"
    suffix = "rgb" if rgb else "gray"

    for i, file_name in enumerate(file_list):
        # skip hidden files such as .DS_Store
        if file_name.startswith("."):
            continue

        # metadata: label_dataset_index.extension
        label, dataset_name, dataset_index = file_name.split("_")
        dataset_index, extension = dataset_index.split(".")

        path = image_dir + "/" + file_name
        # The context manager closes the underlying file handle; convert()
        # and resize() return new, fully loaded images that outlive it.
        with Image.open(path) as source:
            im = source.convert(mode).resize(feature_dims, resample=Image.LANCZOS)

        # TODO
        # Look into other methods for extracting features
        # Ideas:
        # 1. randomly sample a subset of pixels (consistent across all images)
        # 2. use PCA to determine the pixels that have the most impact on the outcome
        #  does this get too close to CNN?

        # extract features from each pixel (x outer, y inner)
        image_features = []
        for x in range(width):
            for y in range(height):
                pixel_value = im.getpixel((x, y))
                if rgb:
                    image_features.extend(normalize_rgb(*pixel_value))
                else:
                    # mode "L" yields a single 0-255 intensity per pixel
                    image_features.append(pixel_value / 255.0)

        # Record features and label together so X and Y stay aligned.
        X.append(image_features)
        Y.append(label)

        # keep a copy of exactly what the features were computed from
        new_name = "%s_%s_%s_%s.%s" % (label, dataset_name, dataset_index, suffix, "png")
        im.save(tmp_dir + "/" + new_name, "PNG")

        if i % 100 == 0:
            print("processed %d out of %d images" % (i, num_files))

    return X, Y

In [9]:
# directory structure
# image_dir holds the labelled source images; tmp_dir receives the resized
# copies written by extract_features_labels. Both are relative to cwd.
image_dir = "../images/combined"
tmp_dir = "../images/tmp"
cwd = os.getcwd()

# echo the three locations, one per line
print("\n".join([
    "cwd = " + cwd,
    "image_dir = " + image_dir,
    "tmp_dir = " + tmp_dir,
]))

# create the scratch directory on first run
if not os.path.exists(tmp_dir):
    os.mkdir(tmp_dir)

cwd = /Users/jordan/Documents/ml-project/code
image_dir = ../images/combined
tmp_dir = ../images/tmp


In [10]:
# process images, create labels and features
# os.listdir returns names in arbitrary order; sorting makes the row order
# (and therefore the exported CSV files) reproducible across runs and machines.
files = sorted(os.listdir(image_dir))

# create rgb 32x32 pixel images for KNN and LDA/QDA
X_rgb, y_rgb = extract_features_labels(files, IMAGE_SIZE)

# create grayscale 28x28 pixel images for CNN
X_gray, y_gray = extract_features_labels(files, (28, 28), rgb=False)


def print_dataset_stats(title, X, y):
    """Print the observation count and per-label counts for one dataset.

    title -- heading line printed first (e.g. "RGB:")
    X     -- list of feature vectors
    y     -- list of label strings, expected to be the same length as X

    Counts are reported for each entry of the module-level ``labels`` list.
    Raises AssertionError if X and y have different lengths.
    """
    print(title)
    print("%s observations" % (len(y)))
    assert len(X) == len(y)

    print("Label counts:")
    for label in labels:
        print("%s - %s" % (label, y.count(label)))


# some stats about the dataset
print_dataset_stats("RGB:", X_rgb, y_rgb)
print_dataset_stats("\nGrayscale:", X_gray, y_gray)

RGB:
3347 observations
Label counts:
ocean - 504
ship - 2347
sky - 496

Grayscale:
3347 observations
Label counts:
ocean - 504
ship - 2347
sky - 496


In [11]:
# export to data_rgb.csv and data_gray.csv
def write_labeled_rows(csv_path, row_labels, row_features):
    """Write one CSV row per observation: the label followed by its features.

    csv_path     -- destination file; overwritten if it already exists
    row_labels   -- list of label strings
    row_features -- list of feature lists, parallel to row_labels

    Raises ValueError (before touching the file) if the two lists differ
    in length, so a mismatched dataset never produces a partial export.
    """
    if len(row_labels) != len(row_features):
        raise ValueError(
            "cannot export %s: %d labels but %d feature rows"
            % (csv_path, len(row_labels), len(row_features))
        )

    # newline='' lets the csv module control line endings itself; without it
    # the excel dialect's "\r\n" becomes "\r\r\n" on Windows (blank rows).
    with open(csv_path, 'w', newline='') as csvfile:
        data_writer = csv.writer(csvfile, dialect='excel')
        for label, features in zip(row_labels, row_features):
            data_writer.writerow([label] + list(features))


write_labeled_rows('data_rgb.csv', y_rgb, X_rgb)
write_labeled_rows('data_gray.csv', y_gray, X_gray)