In [1]:
# imports
import os
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import cv2
from sklearn.decomposition import RandomizedPCA as PCA
from time import time
import re

# for plotting in jupyter
%matplotlib inline 

# constants
# Root folder of the training images; its sub-directories are listed when collecting image paths.
TRAIN_DIR = './imgs/train/'
# (width, height) passed as `resize` to preprocess_images for every image set.
STD_SIZE = (100, 75)

# Get Training Image Paths and Split into Train and Validation

In [2]:
# Collect the path of every training image, one sub-directory of TRAIN_DIR at a time.
# Entries are visited in sorted order because os.listdir() returns names in an
# arbitrary, filesystem-dependent order; the seeded train/validation mask applied
# to this array is positional, so a stable order is needed for the same seed to
# pick the same images on every machine.
all_imgs = []
for subdir in sorted(os.listdir(TRAIN_DIR)):
    path = os.path.join(TRAIN_DIR, subdir)
    if not os.path.isdir(path):
        # stray files (e.g. .DS_Store) are not image folders; listing them would raise
        continue
    all_imgs += [os.path.join(path, f) for f in sorted(os.listdir(path))]

all_imgs = np.array(all_imgs)
n_samples = len(all_imgs)

In [3]:
# Seed NumPy's global RNG so the random hold-out mask is repeatable.
np.random.seed(1123)

# Fraction of images held out for validation.
val_pct = 1./4

# One uniform draw per image: draws above val_pct go to training, the rest to
# validation, so the realised split is approximately (not exactly) 75/25.
# NOTE(review): the split is per image; if the data contains several frames of the
# same subject, near-duplicates may land on both sides -- confirm whether a grouped
# split is needed before trusting the validation loss.
in_train = np.random.uniform(size=n_samples) > val_pct
train_imgs = all_imgs[in_train]
val_imgs = all_imgs[~in_train]

print('{} Images split into {} training and {} validation images.'.format(
    n_samples, len(train_imgs), len(val_imgs)))

22424 Images split into 16738 training and 5686 validation images.


Process the training images for input into PCA

In [4]:
def preprocess_images(file_ary, resize):
    """
    Given a list of image files, resize and convert to flattened array of grayscale pixel intensities.

    Parameters
    ----------
    file_ary : ndarray
        file paths to be processed
    resize : tuple of ints
        width, height of new image

    Return
    ------
    numpy array (n_sample, w*h) of grayscale pixel intensities.

    Raises
    ------
    IOError
        if a path cannot be loaded as an image.
    """
    n = len(file_ary)
    # one row per image, one column per pixel of the resized image
    pixel_matrix = np.zeros((n, np.prod(resize)))
    t0 = time()
    for i, fpath in enumerate(file_ary):
        img = cv2.imread(fpath, 0)  # 0 flag converts to grayscale during load
        if img is None:
            # cv2.imread reports an unreadable/missing file by returning None;
            # fail here with the offending path instead of inside cv2.resize
            raise IOError('Could not read image file: {}'.format(fpath))
        pixel_matrix[i] = cv2.resize(img, resize).reshape(-1)

    # single parenthesised argument: prints identically on Python 2 and Python 3
    print('Processed {0} images in {1:.{2}f} seconds'.format(n, time() - t0, 3))

    return pixel_matrix

In [5]:
# Flattened grayscale pixel matrix for the training images (one row per image).
train_proc = preprocess_images(train_imgs, STD_SIZE)

Processed 16738 images in 63.720 seconds


In [6]:
# Dimensionality-reduction settings.
N_COMPONENTS = 200  # number of principal components to keep
SEED = 2718         # random_state shared by the estimators in this notebook

pca = PCA(
    n_components=N_COMPONENTS,
    whiten=True,
    random_state=SEED,
)

# Fit on the training pixels only and report the wall-clock time.
t0 = time()
pca.fit(train_proc)
print('{0} principal components fit in {1:.{2}f} seconds'.format(
    N_COMPONENTS, time() - t0, 3))

200 principal components fit in 10.685 seconds


# Prepare Images for Learning

In [7]:
def get_labels(file_ary):
    """
    Get the class labels for a list of images, given their path.

    The image id is taken from the file name (the part starting at ``img_``) and
    the class label (``c0`` .. ``c9``) from the directory that directly contains
    the file. Unrelated parts of the path therefore cannot be mistaken for either
    one (e.g. ``/home/mac2/...`` is not read as label ``c2``). If the containing
    directory carries no label, the whole path is searched as a fallback.

    Parameters
    ----------
    file_ary : ndarray
        Image file paths

    Returns
    -------
    dictionary with two entries: img, y
    Each entry contains an aligned sequence of img file names and their class label

    Raises
    ------
    ValueError
        if a path has no ``img_*`` file name or no class label.
    """
    d = {'img' : [], 'y' : []}
    for fname in file_ary:
        folder, base = os.path.split(fname)
        img_match = re.search('img_.*', base)
        # prefer the image's own folder; fall back to the full path so paths
        # that os.path.split cannot break up are still handled
        label_match = (re.search('c[0123456789]', os.path.basename(folder))
                       or re.search('c[0123456789]', fname))
        if img_match is None or label_match is None:
            raise ValueError(
                'Cannot extract image id and class label from path: {}'.format(fname))
        d['img'].append(img_match.group(0))
        d['y'].append(label_match.group(0))

    return d

In [8]:
# Project the training pixels onto the fitted principal components.
trainX = pca.transform(train_proc)

# Class labels aligned with the rows of trainX (both follow the order of train_imgs).
train_labels = get_labels(train_imgs)
trainY = train_labels['y']

Prepare Validation Data

In [9]:
# Validation images go through the same preprocessing and the PCA fitted on the training set.
val_proc = preprocess_images(val_imgs, STD_SIZE)
valX = pca.transform(val_proc)

# Class labels aligned with the rows of valX.
val_labels = get_labels(val_imgs)
valY = val_labels['y']

Processed 5686 images in 22.885 seconds


Prepare Test Images

In [10]:
# Location of the test images.
TEST_DIR = './imgs/test/'

# Keep the bare file names as well: make_predictions writes them to the 'img' column.
test_files = os.listdir(TEST_DIR)
test_paths = [TEST_DIR + f for f in test_files]

# Same preprocessing and the already-fitted PCA projection as for the training images.
test_proc = preprocess_images(test_paths, STD_SIZE)
testX = pca.transform(test_proc)

Processed 79726 images in 284.428 seconds


# Learn Classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import log_loss

We will build a random forest classifier and a support vector machine, and use a validation set to compare the two. The best model will be retrained with the full training set before making predictions on the test data for submission.

In [40]:
# Random forest with 800 trees, seeded for repeatability; n_jobs=-1 and verbose=1
# are passed through to the estimator.
rf = RF(
    n_estimators=800,
    random_state=SEED,
    n_jobs=-1,
    verbose=1,
)
rf.fit(trainX, trainY)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.4min


Training time 87.797 seconds


[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.4min finished


In [15]:
def get_error(model):
    """
    Print the log loss of a fitted classifier on the training and validation sets.

    Reads the notebook-level arrays trainX / trainY and valX / valY.

    Parameters
    ----------
    model : object
        must have a predict_proba method

    Returns
    -------
    None; the two losses are only printed.
    """
    loss_tr = log_loss(y_true=trainY, y_pred=model.predict_proba(trainX))
    loss_val = log_loss(y_true=valY, y_pred=model.predict_proba(valX))
    print('Training Log Loss : {}, Validation Log Loss : {}'.format(loss_tr, loss_val))

In [None]:
# Training vs. validation log loss of the random forest.
get_error(model=rf)

The random forest model is overfitting a little, but a log loss of 0.377 on the validation set is a great start. The variance could be reduced by limiting the maximum tree depth, which may be worth investigating in the future. First, see whether an SVM gives similar results.

In [12]:
# RBF-kernel SVM with probability estimates enabled (get_error and
# make_predictions call predict_proba).
svm = SVC(
    kernel='rbf',
    probability=True,
    cache_size=3000,
    class_weight='balanced',
    random_state=SEED,
)

# 3 values of C x 3 values of gamma = 9 candidates, each log-spaced.
param_grid = {
    'C': np.logspace(-3, 8, num=3),
    'gamma': np.logspace(-6, -3, num=3),
}

# 3-fold cross-validated grid search scored with the 'log_loss' scorer.
clf = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='log_loss',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [13]:
# Run the grid search on the training features and report the wall-clock time.
t0 = time()
clf.fit(trainX, trainY)
print('Training time: {0:.3f} seconds'.format(time() - t0))

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed: 79.7min finished


Training time: 5474.668 seconds


In [17]:
# Show the estimator selected by the grid search, including its chosen C and gamma.
clf.best_estimator_

SVC(C=316.2277660168379, cache_size=3000, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=True, random_state=2718, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
# Training vs. validation log loss of the grid-searched SVM.
get_error(model=clf)

Training Log Loss : 0.00290193608158, Validation Log Loss : 0.0295990117968


In [26]:
def make_predictions(clf, filename):
    """
    Write class-probability predictions for the test images to a CSV file.

    The output has an 'img' column holding the notebook-level test_files names,
    followed by one probability column per class. Predictions are computed on
    the notebook-level testX matrix.

    Parameters
    ----------
    clf : object
        must have a predict_proba method
    filename : str
        CSV filename for output
    """
    predictions = clf.predict_proba(testX)

    # A fitted search object keeps the class order on its best_estimator_;
    # a plain classifier exposes classes_ directly.
    label_source = clf.best_estimator_ if hasattr(clf, 'best_estimator_') else clf
    proba_frame = pd.DataFrame(predictions, columns=label_source.classes_)
    name_frame = pd.DataFrame({'img': test_files})

    submission = pd.concat([name_frame, proba_frame], axis=1)
    submission.to_csv(filename, index=False)

In [45]:
# Submission file from the random forest.
make_predictions(clf=rf, filename='testY_8pm_519.csv')

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.3s
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed:    5.3s finished


In [27]:
# Submission file from the grid-searched SVM.
make_predictions(clf=clf, filename='testY_10am_520.csv')