In [1]:
!pwd

/Users/kevinli/src/Python_For_Data_Science/Kevin/Homework4


In [3]:
!ls

[34m50_categories[m[m
50_categories.tar.gz
HW4-V1.ipynb
hw_4-machine-learning-parallel-strawman.py


### Libraries 

In [7]:
import os
from six.moves import cPickle 

In [8]:
import warnings
import sys
from glob import glob

In [9]:
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import KFold
from sklearn import preprocessing

### Loading Data

In [None]:
"""Image processing and feature extraction

To compute features for training, run:

`python image_processing.py`.

This script assumes there is a directory called `50_categories`, which
contains subdirectories named by category, and then within each
subdirectory the actual images.

Running this script from the command line will load the images,
perform some basic preprocessing (equalizing, scaling, etc.), and
compute features. It will then save the feature array to a file called
`image_dataset.npy`, and the categories to a file called
`image_categories.npy` (these are both numpy files, and can be loaded
by calling `np.load`).

"""

# built-in
import os
import sys
from glob import glob
# external
import numpy as np
import skimage.exposure
import skimage.feature
import skimage.filter
import skimage.filter.rank
import skimage.io
import skimage.morphology


def load_image(img_path, n=400):
    """Load an image from file, and perform minimal processing on it to
    prepare it for feature extraction.

    Specifically, this function does the following operations:

        1) Load image
        2) Convert to RGB if grayscale
        3) Equalize histograms
        4) Denoise
        5) Resize

    Parameters
    ----------
    img_path : string
        The path to the image
    n : int (optional)
        The size to scale the largest dimension to

    Returns
    -------
    img : numpy.ndarray

    """
    # load the image from file
    img = skimage.io.imread(img_path).astype('f8')
    # make sure it has three channels
    if img.ndim == 2:
        img = img[:, :, None] * np.ones(img.shape + (3,))

    # equalize histograms
    img = skimage.exposure.equalize_hist(img)
    # reduce noise
    img = skimage.filter.denoise_bilateral(img, 3, 0.1)

    # scale largest dimension to be of size n
    shape = img.shape[:2]
    scale = float(n) / max(shape)
    img = skimage.transform.rescale(img, scale)
    shape = img.shape[:2]

    return img


def extract_features(img):
    """Extract a vector of features from an image. The feature vector is
    flat, but has the following components:

        1) Mean of R, G, and B channels
        2) Covariance between R, G, and B channels
        3) Summary statistics of image entropy

    Parameters
    ----------
    img : numpy.ndarray
        The image to extract features from.

    Returns
    -------
    feature_vec : numpy.ndarray
        One-dimensional numpy array of features

    """
    RGB = img.reshape((-1, 3)).T

    # mean of each channel
    mean = np.mean(RGB, axis=1)
    # median of each channel
    median = np.median(RGB, axis=1)
    # covariance between channels
    cov = np.cov(RGB).ravel()
    # (normalized) entropy of the grayscale image
    entropy = skimage.filter.rank.entropy(
        np.mean(img, axis=-1).astype('uint16'),
        skimage.morphology.disk(5))
    entropy = entropy / float(img.size)
    entropy_sum = np.sum(entropy)
    entropy_mean = np.mean(entropy)
    entropy_var = np.var(entropy)

    # concatenate all the features together
    feature_vec = np.concatenate(
        [mean, median, cov, [entropy_sum, entropy_mean, entropy_var]])

    return feature_vec


def get_image_categories(images):
    """Get the true categories of a set of paths to images, based on the
    directory they are located in.

    The paths should have the form:
        path/to/image/category/image.jpg

    Where the image filename is the last item in the path, and the
    directory (category name) is the second to last item in the path.

    Parameters
    ----------
    images : list
        List of paths to images

    Returns
    -------
    categories : numpy.ndarray
        An array of integers in order of the images, corresponding to
        each image's category
    category_map : list
        A list of category names. The category integers in
        `categories` are indices into this list.

    """
    get_category = lambda x: os.path.split(os.path.split(x)[0])[1]
    categories = map(get_category, images)
    category_map = sorted(set(categories))
    categories = np.array(map(category_map.index, categories))
    return categories, category_map


def load_and_extract(images):
    # placeholder variable for feature array
    features = None

    # go through each image and calculate features, saving them in the
    # feature array
    for i, image_path in enumerate(images):
        # display progress
        msg = "[%d / %d] %s" % (i, len(images), image_path)
        sys.stdout.write(msg + "\r")
        sys.stdout.flush()

        # load and extract features
        img = load_image(image_path)
        img_features = extract_features(img)
        if i == 0:
            features = np.empty((len(images), img_features.size), dtype='f4')
            sys.stdout.write(" "*len(msg) + "\r")
            print("Feature array has shape %s" % str(features.shape))
        features[i] = img_features

        # clear the output (the \r moves the cursor back to the
        # beginning of the line, so we can overwrite it)
        sys.stdout.write(" "*len(msg) + "\r")

    return features


if __name__ == "__main__":
    # get the list of images
    images = glob("./50_categories/*/*.jpg")

    # compute feature matrix
    features = load_and_extract(images)

    # create an integer mapping to categories
    categories, category_map = get_image_categories(images)

    # concatenate the features (X) and categories (Y)
    dataset = np.hstack([features, categories[:, None]])

    # save to disk
    filename = "./image_dataset.npy"
    np.save(filename, dataset)
    print("Saved features to '%s'" % filename)

    filename = "./image_categories.npy"
    np.save(filename, category_map)
    print("Saved categories to '%s'" % filename)

In [5]:
# %load hw_4-machine-learning-parallel-strawman.py
#!/usr/bin/env python
"""
AY 250 - Scientific Research Computing with Python
Homework Assignment 4 - Parallel Feature Extraction Example
Author: Christopher Klein, Joshua Bloom
"""
from os import listdir
from multiprocessing import Pool, cpu_count
from pylab import imread
from time import time

## CHANGE THIS NEXT LINE!
#MYDIRECTORY = "/Users/jbloom/Classes/ay250-py4sci/week4/50_categories"
MYDIRECTORY = "/Users/kevinli/src/Python_For_Data_Science/Kevin/Homework4/50_categories"
# FUNCTION DEFINITIONS
# Quick function to divide up a large list into multiple small lists, 
# attempting to keep them all the same size. 
def split_seq(seq, size):
        newseq = []
        splitsize = 1.0/size*len(seq)
        for i in range(size):
            newseq.append(seq[int(round(i*splitsize)):
                int(round((i+1)*splitsize))])
        return newseq
# Our simple feature extraction function. It takes in a list of image paths, 
# does some measurement on each image, then returns a list of the image paths
# paired with the results of the feature measurement.
def extract_features(image_path_list):
    feature_list = []
    for image_path in image_path_list:
        image_array = imread(image_path)
        feature = image_array.size # This feature is simple. You can modify this
        # code to produce more complicated features and to produce multiple
        # features in one function call.
        feature_list.append([image_path, feature])
    return feature_list



### Main program starts here ###################################################
# We first collect all the local paths to all the images in one list
image_paths = []
categories = listdir(MYDIRECTORY)
for category in categories:
    image_names = listdir(MYDIRECTORY  + "/" + category)
    for name in image_names:
        image_paths.append(MYDIRECTORY + "/" + category + "/" + name)

print ("There should be 4244 images, actual number is " + 
    str(len(image_paths)) + ".")

# Then, we run the feature extraction function using multiprocessing.Pool so 
# so that we can parallelize the process and run it much faster.
numprocessors = cpu_count() # To see results of parallelizing, set numprocessors
                            # to less than cpu_count().
# numprocessors = 1

# We have to cut up the image_paths list into the number of processes we want to
# run. 
split_image_paths = split_seq(image_paths, numprocessors)

# Ok, this block is where the parallel code runs. We time it so we can get a 
# feel for the speed up.
start_time = time()
p = Pool(numprocessors)
result = p.map_async(extract_features, split_image_paths)
poolresult = result.get()
end_time = time()

# All done, print timing results.
print ("Finished extracting features. Total time: " + 
    str(round(end_time-start_time, 3)) + " s, or " + 
    str( round( (end_time-start_time)/len(image_paths), 5 ) ) + " s/image.")
# This took about 10-11 seconds on my 2.2 GHz, Core i7 MacBook Pro. It may also
# be affected by hard disk read speeds.

# To tidy-up a bit, we loop through the poolresult to create a final list of
# the feature extraction results for all images.
combined_result = []
for single_proc_result in poolresult:
    for single_image_result in single_proc_result:
        combined_result.append(single_image_result)


NotADirectoryError: [Errno 20] Not a directory: '/Users/kevinli/src/Python_For_Data_Science/Kevin/Homework4/50_categories/.DS_Store'