In [25]:
import numpy as np
import visual_bow as bow
from sklearn.cluster import MiniBatchKMeans
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.externals import joblib
import glob
import random
import warnings

# Metric name passed as the `scoring` argument to every GridSearchCV in this notebook.
SCORING = 'f1_micro'
print('Scoring grid search with metric: %s' % (SCORING,))

Scoring grid search with metric: f1_micro


In [2]:
# Build the pool of negative examples: every path produced by
# bow.neg_img_cal101 for the positive category, paired with the label False.
# NOTE(review): presumably neg_img_cal101 skips the `positive_folder` category
# itself -- confirm in visual_bow.
positive_folder = 'panda'
all_negs = []
for neg_path in bow.neg_img_cal101(positive_folder):
    all_negs.append((neg_path, False))

# Report the pool size and peek at the first few (path, label) pairs.
print('%i total negative imgs to choose from' % len(all_negs))
print(all_negs[:5])

9106 total negative imgs to choose from
[('101_ObjectCategories/rooster/image_0014.jpg', False), ('101_ObjectCategories/rooster/image_0023.jpg', False), ('101_ObjectCategories/rooster/image_0040.jpg', False), ('101_ObjectCategories/rooster/image_0013.jpg', False), ('101_ObjectCategories/rooster/image_0038.jpg', False)]


In [3]:
# Collect every file in the panda_rip folder as a positive example,
# pairing each path with the label True.
positive_imgs = []
for pos_path in glob.glob('panda_rip/*'):
    positive_imgs.append((pos_path, True))

# Report how many positives were found and peek at the first few pairs.
print('%i positive images' % len(positive_imgs))
print(positive_imgs[:5])

674 positive images
[('panda_rip/image_0014.jpg', True), ('panda_rip/75.JPEG', True), ('panda_rip/345.JPEG', True), ('panda_rip/30.JPEG', True), ('panda_rip/106.JPEG', True)]


In [4]:
# Build a balanced dataset: draw N random negative images (N = number of
# positive images), concatenate N negatives + N positives, then shuffle.

# Seed both RNGs used in this cell so that re-running it (or Restart & Run All)
# picks the same negatives and yields the same ordering. Without a seed every
# run trains on a different sample and the reported scores cannot be reproduced.
# NOTE: the result is only reproducible if all_negs and positive_imgs are
# themselves built in a stable order.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# random.sample draws without replacement; it raises ValueError if there are
# fewer negatives available than positives.
chosen_negs = random.sample(all_negs, len(positive_imgs))
imgs = chosen_negs + positive_imgs

# Shuffle in place so positives and negatives are interleaved.
np.random.shuffle(imgs)

print('%i total images (1:1 positive:negative)' % len(imgs))
print(imgs[:5])

1348 total images (1:1 positive:negative)
[('101_ObjectCategories/gerenuk/image_0017.jpg', False), ('101_ObjectCategories/Faces/image_0392.jpg', False), ('101_ObjectCategories/watch/image_0014.jpg', False), ('panda_rip/72.JPEG', True), ('panda_rip/184.JPEG', True)]


In [None]:
%%time

# Run bow.gen_sift_features over every (path, label) pair in imgs.
# NOTE(review): presumably img_descs holds one descriptor array per image and
# y the matching labels, in the same order as imgs -- confirm in visual_bow.
img_descs, y = bow.gen_sift_features(imgs)

In [36]:
# joblib.dump(img_descs, 'pickles/img_descs/img_descs.pickle')
# joblib.dump(y, 'pickles/img_descs/y.pickle')

['pickles/img_descs/y.pickle', 'pickles/img_descs/y.pickle_01.npy']

In [7]:
# Row indexes for the train / test / validation partitions: 15% of the rows
# are requested for test and 15% for validation.
# NOTE(review): total_rows is taken from imgs; this assumes gen_sift_features
# kept one entry per image -- confirm.
(training_idxs, test_idxs, val_idxs) = bow.train_test_val_split_idxs(
    total_rows=len(imgs), percent_test=0.15, percent_val=0.15)

Train-test-val split: 944 training rows, 202 test rows, 202 validation rows


# Cluster the SIFT descriptors

In [None]:
%%time

# Number of clusters requested from MiniBatchKMeans.
K_CLUSTERS = 250

# MiniBatchKMeans throws tons of deprecation warnings that fill up the notebook.
# Silence them only for the duration of the clustering call: catch_warnings()
# restores the previous warning filters on exit, whereas calling
# filterwarnings('ignore') followed by filterwarnings('default') leaves a
# global "default" filter installed for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    X, cluster_model = bow.cluster_features(
        img_descs,
        training_idxs=training_idxs,
        cluster_model=MiniBatchKMeans(n_clusters=K_CLUSTERS)
    )

# Slice the clustered features and the labels into the three partitions using
# the index sets generated earlier.
X_train, X_test, X_val, y_train, y_test, y_val = bow.perform_data_split(X, y, training_idxs, test_idxs, val_idxs)

## Uncomment to pickle the clustered Visual BoW features

In [32]:
# for obj, obj_name in zip( [X_train, X_test, X_val, y_train, y_test, y_val], 
#                          ['X_train', 'X_test', 'X_val', 'y_train', 'y_test', 'y_val'] ):
#     joblib.dump(obj, 'pickles/feature_data/%s.pickle' % obj_name)

## Uncomment to LOAD pickle of clustered Visual BoW features

In [None]:
# for obj_name in ['X_train', 'X_test', 'X_val', 'y_train', 'y_test', 'y_val']:
#     exec("{obj_name} = joblib.load('pickles/feature_data/{obj_name}.pickle')".format(obj_name=obj_name))
#     exec("print obj_name, len({0})".format(obj_name))

# Classify with SVM

In [30]:
%%time

# Candidate values for the SVM regularisation parameter C.
# Alternatives tried earlier: [0.0001, 0.01, 0.1, 1, 10, 100, 1000] and [1].
c_vals = [0.1, 1, 5, 10]

# Candidate values for the RBF kernel coefficient gamma.
# Alternatives tried earlier: [0.5, 0.1] and [0.1].
gamma_vals = [0.5, 0.1, 0.01, 0.0001, 0.00001]

# Two sub-grids: the linear kernel only varies C, the RBF kernel varies C and gamma.
param_grid = [
    dict(kernel=['linear'], C=c_vals),
    dict(kernel=['rbf'], C=c_vals, gamma=gamma_vals),
]

# Exhaustive search over both sub-grids on all available cores, scored with SCORING.
svc = GridSearchCV(estimator=SVC(), param_grid=param_grid, scoring=SCORING, n_jobs=-1)
svc.fit(X_train, y_train)

# Score the fitted search on the training and test partitions.
print('train score (%s):' % SCORING, svc.score(X_train, y_train))
print('test score (%s):' % SCORING, svc.score(X_test, y_test))

# Show the winning hyper-parameter combination.
print(svc.best_estimator_)

train score (f1_micro): 0.91313559322
test score (f1_micro): 0.886138613861
SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
CPU times: user 1.66 s, sys: 109 ms, total: 1.77 s
Wall time: 14 s


# We have our estimator; this is how it classifies a random sample of the negative images (some may have been seen during training)

In [15]:
# Classify 10 randomly drawn negative images with the tuned SVM. Each image is
# first converted to a feature vector with the fitted cluster model.
for img_path, label in random.sample(all_negs, 10):
    img_vect = bow.img_to_vect(img_path, cluster_model)
    prediction = svc.predict(img_vect)
    print(img_path, prediction)

101_ObjectCategories/BACKGROUND_Google/image_0460.jpg ['False']
101_ObjectCategories/minaret/image_0032.jpg ['False']
101_ObjectCategories/Motorbikes/image_0370.jpg ['False']
101_ObjectCategories/pagoda/image_0047.jpg ['False']
101_ObjectCategories/sunflower/image_0062.jpg ['False']
101_ObjectCategories/bonsai/image_0066.jpg ['True']
101_ObjectCategories/wheelchair/image_0027.jpg ['False']
101_ObjectCategories/Motorbikes/image_0776.jpg ['True']
101_ObjectCategories/Faces/image_0195.jpg ['False']
101_ObjectCategories/trilobite/image_0054.jpg ['False']


## Uncomment to pickle the best SVC classifier & kmeans

In [None]:
# joblib.dump(svc.best_estimator_, 'pickles/svc/svc.pickle')
# joblib.dump(cluster_model, 'pickles/cluster_model/cluster_model.pickle')

# Try AdaBoost; it's a common choice for SIFT features

In [31]:
%%time

# Hyper-parameter grid for AdaBoost: 4 ensemble sizes x 5 learning rates.
ada_params = dict(
    n_estimators=[100, 250, 500, 750],
    learning_rate=[0.8, 0.9, 1.0, 1.1, 1.2],
)

# Exhaustive search over the grid on all available cores, scored with SCORING.
# (An earlier variant fitted a single AdaBoostClassifier with learning_rate=0.8.)
ada = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=ada_params, scoring=SCORING, n_jobs=-1)
ada.fit(X_train, y_train)

# Score the fitted search on the training and test partitions, then show the
# winning hyper-parameter combination.
print('train score (%s):' % SCORING, ada.score(X_train, y_train))
print('test score (%s):' % SCORING, ada.score(X_test, y_test))
print(ada.best_estimator_)

train score (f1_micro): 1.0
test score (f1_micro): 0.816831683168
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.1, n_estimators=250, random_state=None)
CPU times: user 4.09 s, sys: 97 ms, total: 4.19 s
Wall time: 1min 44s


## Uncomment to pickle the AdaBoostClassifier

In [None]:
# joblib.dump(ada.best_estimator_, 'pickles/ada/ada.pickle');
# print('pickled adaboost')

# TODO

* Separate the clustering from the feature generation. They should be two different functions, and the clustering function should take the SIFT **training** data as an argument. (It has labels already, right?) Then the SIFT data can be saved before clustering. Finally, this makes it possible to grid search across K_CLUSTERS.

* Also it would be cool to graph the above.