In [1]:
# import the necessary packages
from keras.applications import ResNet50
from keras.applications import imagenet_utils
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from imutils import paths
import numpy as np
import progressbar
import random
import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# since we are not using command line arguments (like we typically
# would inside Deep Learning for Computer Vision with Python, let's
# "pretend" we are by using an `args` dictionary -- this will enable
# us to easily reuse and swap out code depending if we are using the
# command line or Jupyter Notebook
args = {
    "dataset": "/home/james/data/dogscats/train",
    "batch_size": 32,
}

# store the batch size in a convenience variable
bs = args["batch_size"]

In [3]:
# grab the list of images in the Kaggle Dogs vs. Cats download and# grab  
# shuffle them to allow for easy training and testing splits via
# array slicing during training time
imagePaths = list(paths.list_images(args["dataset"]))

random.seed(2027)
random.shuffle(imagePaths)
print(len(imagePaths))

25000


All files in the Dogs vs. Cats dataset have filenames such as `cat.153.jpg` or `dog.4375.jpg` – since the class labels are baked right into the filenames, we can easily extract them before the dot.

In [4]:
# extract the class labels from the image paths then encode the
# labels
labels = [p.split(os.path.sep)[-1].split(".")[0] for p in imagePaths]
le = LabelEncoder()
labels = le.fit_transform(labels)

Download the `ResNet50` weights and load the model. This took around half an hour on a slow connection.

Python would not download the weights on this connections, so downloaded separately from https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5 and stored in `~/.keras/models`.

In order to perform feature extraction, we need a pre-trained network – `ResNet50` is a good choice for this application. Notice how we have set `include_top=False` to leave off the fully-connected layers, enabling us to easily perform feature extraction.

In [5]:
# load the ResNet50 network (i.e., the network we'll be using for
# feature extraction)

model = ResNet50(weights="imagenet", include_top=False)

Once we have all image paths we need to loop over them individually and build batches to pass through the network for feature extraction.

In [17]:
percentage = 0.1

k = int(len(imagePaths) * percentage)
# indices = random.sample(range(len(imagePaths)), k)

imagePaths_sample = imagePaths[:k]
labels_sample = labels[:k]

In [18]:
len(imagePaths_sample)

2500

In [19]:
len(labels_sample)

2500

In [20]:
# initialize the progress bar
widgets = ["Extracting Features: ", progressbar.Percentage(), " ",
    progressbar.Bar(), " ", progressbar.ETA()]
pbar = progressbar.ProgressBar(maxval=len(imagePaths_sample),
    widgets=widgets).start()

Extracting Features:   0% |                                    | ETA:  --:--:--

In [21]:
%%time
# initialize our data matrix (where we will store our extracted
# features)
data = None

# loop over the images in batches
for i in np.arange(0, len(imagePaths_sample), bs):
    # extract the batch of images and labels, then initialize the
    # list of actual images that will be passed through the network
    # for feature extraction
    batchPaths = imagePaths_sample[i:i + bs]
    batchLabels = labels_sample[i:i + bs]
    batchImages = []
    
    # loop over the images and labels in the current batch
    for (j, imagePath) in enumerate(batchPaths):
        # load the input image using the Keras helper utility
        # while ensuring the image is resized to 224x224 pixels
        image = load_img(imagePath, target_size=(224, 224))
        image = img_to_array(image)

        # preprocess the image by (1) expanding the dimensions and
        # (2) subtracting the mean RGB pixel intensity from the
        # ImageNet dataset
        image = np.expand_dims(image, axis=0)
        image = imagenet_utils.preprocess_input(image)

        # add the image to the batch
        batchImages.append(image)

    # pass the images through the network and use the outputs as
    # our actual features
#     print(1)
#     print(image.shape)
    batchImages = np.vstack(batchImages)
#     print(2)
    features = model.predict(batchImages, batch_size=bs)

    # reshape the features so that each image is represented by
    # a flattened feature vector of the `MaxPooling2D` outputs
#     print(features.shape)
    features = features.reshape((features.shape[0], 2048))
    
    # if our data matrix is None, initialize it
    if data is None:
        data = features
    
    # otherwise, stack the data and features together
    else:
        data = np.vstack([data, features])
    
    # update the progress bar
    pbar.update(i)

# finish up the progress bar
pbar.finish()

Extracting Features:  98% |#################################### | ETA:  0:00:08

CPU times: user 35min 31s, sys: 1min 59s, total: 37min 30s
Wall time: 10min 5s


Extracting Features:  99% |#################################### | ETA:  0:00:00Extracting Features: 100% |#####################################| Time: 0:10:09


This looks like it took about 1 hour 15 minutes. Will need to trim down to try in class. Sample to say 5000 of each?

A 10% sample took 37min 55s.  
Next run 20% took 21min 15s.  
40% took 2 hours or 40 minutes??

In [22]:
# data = np.load('dogscats_features.npy')

In [23]:
# show the data matrix shape and amount of memory it consumes
print(data.shape)
print(data.nbytes)

(2500, 2048)
20480000


In [24]:
%%time
# determine the index of the training and testing split (75% for
# training and 25% for testing)
i = int(data.shape[0] * 0.75)

# define the set of parameters that we want to tune then start a
# grid search where we evaluate our model for each value of C
print("[INFO] tuning hyperparameters...")
params = {"C": [0.0001, 0.001, 0.01, 0.1, 1.0]}
clf = GridSearchCV(LogisticRegression(), params, cv=3, n_jobs=-1)
clf.fit(data[:i], labels_sample[:i])
print("[INFO] best hyperparameters: {}".format(clf.best_params_))

[INFO] tuning hyperparameters...
[INFO] best hyperparameters: {'C': 1.0}
CPU times: user 1.53 s, sys: 177 ms, total: 1.71 s
Wall time: 4.11 s


In [25]:
# generate a classification report for the model
print("[INFO] evaluating...")
preds = clf.predict(data[i:])
print(classification_report(labels_sample[i:], preds, target_names=le.classes_))

# compute the raw accuracy with extra precision
acc = accuracy_score(labels_sample[i:], preds)
print("[INFO] score: {}".format(acc))


[INFO] evaluating...
             precision    recall  f1-score   support

        cat       0.99      0.99      0.99       297
        dog       0.99      0.99      0.99       328

avg / total       0.99      0.99      0.99       625

[INFO] score: 0.9872


> **To do:** pickle the file that I create.
  Use H2O as well, for example, AutoML.
  
 10%: 0.9872!!

In [26]:
np.save('dogscats_features', data)

Save the predictions and the labels as well so we don't need to run through the feature extraction.

In [14]:
np.save('labels_sample', labels_sample)
np.save('predictions', preds)

# save the paths
np.save('imagePaths_sample', imagePaths_sample)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)