### SVM image classification using RGB color space as feature vector

source: https://gist.github.com/gcardone/c49e3f66dc83be33666d

In [96]:
'''Images binary classifier based on scikit-learn SVM classifier.
It uses the RGB color space as feature vector.
'''

from __future__ import division
from __future__ import print_function
from PIL import Image
from sklearn import cross_validation
from sklearn import grid_search
from sklearn import svm
from sklearn import metrics
from io import StringIO
from io import BytesIO
from urllib.parse import urlparse
import pandas as pd
import pickle
import csv
import urllib
import urllib.request
import requests
import sys
import os
import datetime
import time

In [4]:
def process_directory(directory):
    '''Collects the feature vectors of every image file found under a
    directory tree.
    The tree is traversed with os.walk() using its defaults, so symbolic
    links to directories are not followed. Every regular entry reported by
    the walk is handed to process_image_file(); entries for which it returns
    a falsy value (e.g. None for unreadable or non-RGB images) are dropped.
    Args:
      directory (str): root directory to scan.
    Returns:
      list of list of float: one feature vector per usable image, in walk
      order.
    '''
    # Lazily compute a (possibly None) feature vector for each file found.
    extracted = (
        process_image_file(os.path.join(parent, entry))
        for parent, _, entries in os.walk(directory)
        for entry in entries
    )
    # Keep only the files that actually produced a feature vector.
    return [vector for vector in extracted if vector]

In [16]:
def process_image_file(image_path):
    '''Given an image path it returns its feature vector.
    The file is read fully into memory first, so the on-disk file is closed
    before PIL decodes it.
    Args:
      image_path (str): path of the image file to process.
    Returns:
      list of float: feature vector on success, None if the file content
      cannot be decoded as an image (or process_image() returns None).
    Raises:
      IOError: if the file itself cannot be opened or read (e.g. it does not
        exist); only decoding errors are converted to None.
    '''
    # Use a context manager so the file handle is released deterministically
    # instead of being left for the garbage collector.
    with open(image_path, 'rb') as image_file:
        image_fp = BytesIO(image_file.read())
    try:
        image = Image.open(image_fp)
        return process_image(image)
    except IOError:
        # PIL signals unreadable/unsupported image data with IOError.
        return None

In [70]:
def process_image_url(image_url):
    '''Downloads the image at a URL and returns its feature vector.
    Args:
      image_url (str): url of the image to process.
    Returns:
      list of float: feature vector, or None when process_image() rejects
      the image (it returns None for images that are not RGB).
    Raises:
      requests.exceptions.RequestException: on network failures, or when the
        server answers with an HTTP error status.
      IOError: if the downloaded bytes are not an image PIL can open.
    '''
    response = requests.get(image_url)
    # Fail on 4xx/5xx responses here rather than handing an HTML error page
    # to PIL, which would surface as a misleading image-decoding error.
    response.raise_for_status()
    # Wrap the downloaded bytes so PIL can treat them like a file.
    image = Image.open(BytesIO(response.content))
    return process_image(image)

In [7]:
def process_image(image, blocks=4):
    '''Given a PIL Image object it returns its feature vector.
    The RGB cube is divided into blocks**3 equally sized bins; the feature
    vector holds, for each bin, the fraction of the image's pixels whose
    colour falls into it. The bin index is
    r_bin + g_bin * blocks + b_bin * blocks**2.
    Args:
      image (PIL.Image): image to process.
      blocks (int, optional): number of block to subdivide each axis of the
        RGB space into.
    Returns:
      list of float: feature vector of length blocks**3 if successful. None
      if the image is not RGB or contains no pixels.
    '''
    if image.mode != 'RGB':
        return None
    # Width of one bin along a colour axis; constant for the whole scan.
    bin_width = 256 / blocks
    feature = [0] * (blocks * blocks * blocks)
    pixel_count = 0
    for red, green, blue in image.getdata():
        ridx = int(red / bin_width)
        gidx = int(green / bin_width)
        bidx = int(blue / bin_width)
        feature[ridx + gidx * blocks + bidx * blocks * blocks] += 1
        pixel_count += 1
    if pixel_count == 0:
        # A zero-sized image has no colour distribution; report "no feature
        # vector" instead of dividing by zero below.
        return None
    return [count / pixel_count for count in feature]

In [8]:
def show_usage():
    '''Prints the command-line synopsis (using sys.argv[0] as the program
    name) and terminates the process with exit status 1.
    '''
    synopsis = "Usage: %s [class A images directory] [class B images directory]"
    print(synopsis % sys.argv[0])
    sys.exit(1)


In [189]:
def train(training_path_a, training_path_b, print_metrics=True):
    '''Trains an SVM classifier on two directories of images.
    training_path_a and training_path_b must be directories, and neither
    should be a subdirectory of the other. Both are turned into feature
    vectors by process_directory(). Progress messages with timestamps are
    always printed; print_metrics only controls the evaluation report.
    Args:
      training_path_a (str): directory containing sample images of class A
        (labelled 1).
      training_path_b (str): directory containing sample images of class B
        (labelled 0).
      print_metrics  (boolean, optional): if True, print the best parameters
        and a classification report computed on the held-out test split.
    Returns:
      tuple: (classifier, data, target) where classifier is the best
      estimator found by the grid search (sklearn.svm.SVC), data is the list
      of all feature vectors (class A first, then class B) and target is the
      matching list of 1/0 labels.
    Raises:
      IOError: if either path is not a directory.
    '''
    def _announce(label):
        # Sample the clock first, then print a spacer and the stamped label.
        stamp = time.asctime(time.localtime(time.time()))
        print('\n')
        print(label, stamp)

    # Validate both inputs up front, class A first.
    for candidate in (training_path_a, training_path_b):
        if not os.path.isdir(candidate):
            raise IOError('%s is not a directory' % candidate)

    _announce("current/start time :")
    print('processing training path A...')
    training_a = process_directory(training_path_a)

    _announce("current time :")
    print('processing training path B...')
    training_b = process_directory(training_path_b)

    # All feature vectors, class A followed by class B, with labels aligned:
    # 1 for every class A sample and 0 for every class B sample.
    data = training_a + training_b
    target = [1] * len(training_a) + [0] * len(training_b)

    # Hold out 20% of the samples for the evaluation report.
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        data, target, test_size=0.20)

    # Hyper-parameter grid explored by the search.
    parameters = {
        'kernel': ['linear', 'rbf'],
        'C': [1, 10, 100, 1000],
        'gamma': [0.01, 0.001, 0.0001],
    }

    _announce("current time :")
    print('training classifier. grab some coffee...')
    search = grid_search.GridSearchCV(svm.SVC(), parameters)
    clf = search.fit(x_train, y_train)
    classifier = clf.best_estimator_

    if print_metrics:
        print()
        print('Parameters:', clf.best_params_)
        print()
        print('Best classifier score')
        report = metrics.classification_report(
            y_test, classifier.predict(x_test))
        print(report)

    _announce("end time:")
    print('done!')
    return classifier, data, target

In [10]:
def main(training_path_a, training_path_b):
    '''Main function. Trains a classifier and then repeatedly asks for an
    image URL, printing the predicted class for each image downloaded from
    the Internet. An empty line, Ctrl-C or end-of-input ends the loop; any
    other error is reported and the prompt is shown again.
    Args:
      training_path_a (str): directory containing sample images of class A.
      training_path_b (str): directory containing sample images of class B.
    '''
    print('Training classifier...')
    # train() returns (classifier, data, target); only the classifier is
    # needed for interactive prediction.
    classifier, _data, _target = train(training_path_a, training_path_b)
    while True:
        try:
            # input() is the Python 3 replacement for raw_input().
            image_url = input("Input an image url (enter to exit): ")
            if not image_url:
                break
            features = process_image_url(image_url)
            if features is None:
                print('Could not extract a feature vector from that image.')
                continue
            # predict() expects a sequence of samples, so wrap the single
            # feature vector in a list.
            print(classifier.predict([features]))
        except (KeyboardInterrupt, EOFError):
            break
        except Exception as error:
            # Best effort: report the problem and keep prompting.
            print('%s: %s' % (type(error).__name__, error))

In [12]:
# Training-image directories: the tattoo directory is passed to train() as
# class A and the non-tattoo directory as class B in the cells below.
tattoo_directory_path = '/Users/kylefrankovich/Desktop/training_data/tattoo'
non_tattoo_directory_path = '/Users/kylefrankovich/Desktop/training_data/non_tattoo'

### training SVM on full dataset:

In [190]:
# NOTE: the third name rebinds `train`, so after this cell `train` refers to
# the returned target (label) list and no longer to the train() function.
classifier, data, train = train(tattoo_directory_path,non_tattoo_directory_path)



current/start time : Wed Jan 24 18:00:19 2018
processing training path A...


current time : Wed Jan 24 18:22:05 2018
processing training path B...


current time : Wed Jan 24 18:42:49 2018
training classifier. grab some coffee...

Parameters: {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}

Best classifier score
             precision    recall  f1-score   support

          0       0.68      0.63      0.65       197
          1       0.67      0.70      0.68       203

avg / total       0.67      0.67      0.67       400



end time: Wed Jan 24 18:43:00 2018
done!


In [54]:
# classifier trained on half of our data:
# NOTE(review): train() as defined above returns (classifier, data, target),
# so this would bind the whole tuple to `classifier`; presumably this cell was
# run against an earlier version that returned only the classifier - confirm.
classifier = train(tattoo_directory_path, non_tattoo_directory_path)



current/start time : Wed Jan 24 11:39:17 2018
processing training path A...


current time : Wed Jan 24 11:50:38 2018
processing training path B...


current time : Wed Jan 24 11:50:38 2018
training classifier. grab some coffee...

Parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'linear'}

Best classifier score
             precision    recall  f1-score   support

          0       0.74      0.55      0.63       115
          1       0.55      0.74      0.63        85

avg / total       0.66      0.63      0.63       200



In [191]:
# save the model to disk
filename = '/Users/kylefrankovich/Desktop/insight_project/trained_models/svm_model_1000.sav'
# Use a context manager so the file is flushed and closed once the pickle
# has been written.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [86]:
# Download one sample image and compute its feature vector.
image_url = 'https://d3qi0qp55mx5f5.cloudfront.net/www/i/homepage/spotlight/urban-chicago-spotlight.jpg?mtime=1473347326'
features = process_image_url(image_url)

In [87]:
# The single feature vector is wrapped in a list because predict() takes a
# sequence of samples; it returns one label per sample.
print(classifier.predict([features]))

[0]


In [194]:
# `train` is the target list here (rebound when train()'s return value was
# unpacked above), so this is the number of labelled samples.
len(train)

1999

In [195]:
# save features and target:
# test out saving features/target w/ smaller datasets:
# NOTE: despite the .csv extension, both files are written with pickle and
# therefore contain binary pickle data, not CSV text.
features_filename = '/Users/kylefrankovich/Desktop/training_data/features.csv'
target_filename = '/Users/kylefrankovich/Desktop/training_data/target.csv'


# export list (features):
with open(features_filename, 'wb') as fp:
    pickle.dump(data, fp)
    
# export list (target):
# (`train` is the target list here, not the train() function)
with open(target_filename, 'wb') as fp:
    pickle.dump(train, fp)

In [196]:
# read back in:
# Reload the pickled feature list written above to check the round trip.
with open (features_filename, 'rb') as fp:
    itemlist = pickle.load(fp)

In [199]:
# Show the first reloaded feature vector (64 bin fractions, matching the
# default blocks=4 in process_image()).
itemlist[0]

[0.23405692729766803,
 0.01912551440329218,
 1.7146776406035666e-05,
 0.0,
 0.00023662551440329217,
 0.010528120713305899,
 0.0007098765432098765,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.008285322359396434,
 0.0005006858710562414,
 0.0,
 0.0,
 0.006841563786008231,
 0.3069650205761317,
 0.039152949245541836,
 0.0013648834019204389,
 0.0,
 0.0010939643347050755,
 0.019015775034293553,
 0.017421124828532236,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.00013374485596707818,
 0.0043621399176954736,
 0.00038751714677640606,
 0.00018861454046639232,
 0.0,
 0.004945130315500686,
 0.014300411522633744,
 0.05198902606310014,
 0.0,
 0.0,
 8.573388203017832e-05,
 0.004478737997256516,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.02880658436214e-05,
 0.00020576131687242798,
 0.0,
 0.0,
 0.0,
 3.08641975308642e-05,
 0.2535665294924554]

In [146]:
# NOTE(review): `your_list` is not defined in any cell shown here. The
# ValueError below shows the element passed is the string form of a feature
# list rather than a list of floats; presumably it was read back as CSV text
# and would need parsing (and wrapping in a list) before predict() - confirm.
print(classifier.predict(your_list[0][9]))

ValueError: could not convert string to float: '[0.4441337584825443, 0.07742314610416155, 0.012683958793230317, 1.02199329572398e-06, 1.328591284441174e-05, 0.022704603057803942, 0.0034563813261385004, 2.04398659144796e-06, 0.0, 0.0, 0.00020542065244051998, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0025304554002125745, 0.015145940642629384, 0.000972937617529229, 3.8835745237511245e-05, 0.00015432098765432098, 0.20251819148066388, 0.017492437249611643, 0.0019765350339301774, 0.0, 1.02199329572398e-06, 0.0186881694056087, 0.00022688251165072358, 0.0, 0.0, 0.0, 1.02199329572398e-06, 0.0, 1.02199329572398e-06, 1.02199329572398e-05, 0.0, 0.0, 0.005800833946529311, 0.016169977924944814, 9.095740331943422e-05, 0.0, 9.19793966151582e-06, 0.06100482380835582, 0.010996647861990025, 0.0, 0.0, 0.0, 0.039928256070640177, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.04398659144796e-06, 1.02199329572398e-06, 0.0, 0.0, 0.0007787588913416728, 0.0009279699125173739, 0.0, 0.0, 2.248385250592756e-05, 0.043885414111683424]'