In [2]:
import math
import os
import random
import sys
from glob import glob
from multiprocessing import Pool, cpu_count
from os import listdir
from time import time

import cv2
import numpy as np
import skimage as ski
from matplotlib import pyplot as plt
from pylab import imread
from scipy import ndimage
from skimage.feature import hog
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

%matplotlib inline

# Destination directories for the resized images and their label files.
# The trailing slash matters: paths are built by plain string concatenation.
train_folder, test_folder = 'train/', 'test/'

# Target size in pixels; every image is resized to WIDTH x HEIGHT before it
# is written to one of the folders above.
WIDTH = HEIGHT = 256

In [3]:
# Discover the dataset: every JPEG under ./50_categories/<category>/ and the
# category entries themselves.  Both listings are sorted because glob()
# returns entries in arbitrary order and the class ids assigned below depend
# on iteration order.
image_list = sorted(glob('./50_categories/*/*.jpg'))
cat_list = sorted(glob('./50_categories/*'))


def category_name(cat_path):
    """Return the last component of ``cat_path``, i.e. the category name.

    ``os.path`` is used so the separator of the running platform is honoured
    instead of assuming a Windows backslash.
    """
    return os.path.basename(os.path.normpath(cat_path))


categories = []   # category names, in class-id order
cat_class = {}    # category name -> integer class id
class_num = 0     # next free class id
for cat in cat_list:
    cat_name = category_name(cat)
    # De-duplicate on the *name*: testing the full path against a list of
    # names can never match, so such a check would be a no-op.
    if cat_name not in categories:
        categories.append(cat_name)
        cat_class[cat_name] = class_num
        class_num = class_num + 1


In [4]:
# One-off dataset preparation: resize every source image to WIDTH x HEIGHT,
# store it in the train or test folder (roughly 80% / 20%) and record
# "<file name>\t<class id>" in that folder's label file.
if not (os.path.exists(train_folder) and os.path.exists(test_folder)):
    # Create only what is missing: os.makedirs() raises on an existing
    # directory, which would abort the cell if exactly one folder is present.
    for folder in (train_folder, test_folder):
        if not os.path.isdir(folder):
            os.makedirs(folder)

    # Private, seeded generator plus a sorted image list make the split
    # reproducible across runs without touching the global `random` state.
    split_rng = random.Random(0)

    # Each label file is opened once (not once per image) and truncated, so
    # a rebuild after a partially completed run cannot leave duplicate or
    # stale label lines behind.
    with open(train_folder + "train.txt", "w") as train_labels, \
            open(test_folder + "test.txt", "w") as test_labels:
        for im in sorted(image_list):
            # os.path handles the platform's separator; splitting on "\\"
            # only works for Windows-style paths.
            im_file = os.path.basename(im)
            im_cat = os.path.basename(os.path.dirname(im))
            # Look the class up by directory name, which is what the keys
            # of cat_class are derived from.
            im_class = cat_class[im_cat]

            # cv2.imread returns None (it does not raise) for unreadable
            # files; skip those before anything is written.
            img = cv2.imread(im)
            if img is None:
                print("Skipping unreadable image: " + im)
                continue
            img_res = cv2.resize(img, (WIDTH, HEIGHT),
                                 interpolation=cv2.INTER_AREA)

            # 20% test data
            if split_rng.random() < 0.8:
                dest_folder, labels = train_folder, train_labels
            else:
                dest_folder, labels = test_folder, test_labels

            im_dest_file = dest_folder + im_file
            # Record the label only once the image is actually on disk.
            if not cv2.imwrite(im_dest_file, img_res):
                print("Could not write image: " + im_dest_file)
                continue
            labels.write(im_file + "\t" + str(im_class) + "\n")
else:
    print("Data already exist!")

Data already exist!


In [5]:
def extract_features(image_path_list):
    """Compute one feature vector per image path.

    Takes a list and returns a list so that it can be mapped over chunks of
    paths by a worker pool.

    Parameters
    ----------
    image_path_list : iterable of str
        Paths of the images to load with ``imread``.  The pixel data is
        reshaped to ``(-1, 3)``, so each image is assumed to have exactly
        three colour channels -- TODO confirm for inputs not produced by the
        resize step of this notebook.

    Returns
    -------
    list of [str, numpy.ndarray]
        One ``[image_path, feature_vec]`` pair per input path, in input
        order.  ``feature_vec`` concatenates, in this order: per-channel
        mean (3 values), per-channel median (3), the flattened 3x3 channel
        covariance matrix (9), the sum / mean / variance of the scaled local
        entropy map (3) and the HOG descriptor.
    """
    feature_list = []
    for image_path in image_path_list:
        img = imread(image_path)
        # One row per colour channel, one column per pixel.
        RGB = img.reshape((-1, 3)).T

        # mean of each channel
        mean = np.mean(RGB, axis=1)
        # median of each channel
        median = np.median(RGB, axis=1)
        # covariance between channels
        cov = np.cov(RGB).ravel()
        # local entropy of the channel-averaged (grayscale) image, computed
        # over a disk of radius 5 and scaled by the number of array elements
        entropy = ski.filters.rank.entropy(
            np.mean(img, axis=-1).astype('uint16'),
            ski.morphology.disk(5))
        entropy = entropy / float(img.size)
        entropy_sum = np.sum(entropy)
        entropy_mean = np.mean(entropy)
        entropy_var = np.var(entropy)

        # hog: http://scikit-image.org/docs/dev/auto_examples/plot_hog.html
        # Only the descriptor is needed, so the visualisation image is not
        # requested.  This skips rendering work and avoids the `visualise`
        # keyword, which newer scikit-image releases no longer accept.
        img_grey = ski.color.rgb2gray(img)
        fd = hog(img_grey, orientations=8, pixels_per_cell=(16, 16),
                 cells_per_block=(1, 1))

        # concatenate all the features together
        feature_vec = np.concatenate(
            [mean, median, cov, [entropy_sum, entropy_mean, entropy_var], fd])
        feature_list.append([image_path, feature_vec])
    return feature_list

In [6]:
def split_seq(seq, size):
    """Cut ``seq`` into ``size`` consecutive slices of near-equal length.

    The slice boundaries are the rounded multiples of ``len(seq) / size``,
    so the pieces are contiguous and together cover the whole sequence.

    Parameters
    ----------
    seq : sliceable sequence
        The sequence to partition (anything supporting ``len`` and slicing).
    size : int
        Number of slices to produce.

    Returns
    -------
    list
        ``size`` slices of ``seq``, in order.
    """
    step = 1.0/size*len(seq)
    # Cut points 0..size; each pair of neighbours delimits one slice.
    cuts = [int(round(k*step)) for k in range(size + 1)]
    return [seq[lo:hi] for lo, hi in zip(cuts[:-1], cuts[1:])]

In [None]:
train_file = train_folder + 'train.txt'
test_file = test_folder + 'test.txt'


def read_label_file(label_file, image_folder):
    """Read a label file with one tab-separated "file name, class id" per line.

    Parameters
    ----------
    label_file : str
        Path of the label file to read.
    image_folder : str
        Prefix prepended (by string concatenation) to every file name.

    Returns
    -------
    (list of str, dict)
        The image paths in file order, and a mapping from each image path
        to its integer class id.
    """
    image_paths = []
    label_by_path = {}
    with open(label_file) as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line:
                continue  # tolerate blank lines
            img_file, class_id = line.split('\t')[:2]
            image_path = image_folder + img_file
            image_paths.append(image_path)
            label_by_path[image_path] = int(class_id)
    return image_paths, label_by_path


# The lookups are stored under the names that are actually initialised
# (ymap / ytmap).  Parsing inside a function also keeps the loop variables
# from overwriting notebook globals such as `class_num`.
image_paths_train, ymap = read_label_file(train_file, train_folder)
image_paths_test, ytmap = read_label_file(test_file, test_folder)


In [None]:
numprocessors = cpu_count()


def extract_features_parallel(image_paths, n_procs):
    """Run ``extract_features`` over ``image_paths`` with ``n_procs`` workers.

    The paths are cut into ``n_procs`` chunks with ``split_seq`` and each
    chunk is handed to one pool task.  Timing is printed so the speed-up can
    be judged.

    Parameters
    ----------
    image_paths : list of str
        Paths of the images to process.
    n_procs : int
        Number of worker processes (and of chunks).

    Returns
    -------
    list
        The per-image results of all chunks, flattened into a single list in
        the original path order.
    """
    # NOTE(review): when worker processes are started by spawning a fresh
    # interpreter (e.g. on Windows), functions defined in a notebook cell may
    # not be importable by the workers -- confirm on the target platform.
    chunks = split_seq(image_paths, n_procs)

    start_time = time()
    pool = Pool(n_procs)
    try:
        per_proc_results = pool.map(extract_features, chunks)
    finally:
        # Always release the worker processes, even if a task failed.
        pool.close()
        pool.join()
    elapsed = time() - start_time

    # Guard against an empty path list when computing the per-image time.
    per_image = elapsed / max(len(image_paths), 1)
    print("Finished extracting features. Total time: " +
          str(round(elapsed, 3)) + " s, or " +
          str(round(per_image, 5)) + " s/image.")

    return [single_image_result
            for single_proc_result in per_proc_results
            for single_image_result in single_proc_result]


# train data
combined_result_train = extract_features_parallel(image_paths_train,
                                                  numprocessors)

# test data
combined_result_test = extract_features_parallel(image_paths_test,
                                                 numprocessors)
        

In [None]:
# Extract X and y: turn the [image_path, feature_vec] pairs produced by the
# feature extraction into the arrays consumed by the classifier cell.  The
# labels are looked up by image path so they stay aligned with the rows.
X = np.array([feature_vec for image_path, feature_vec in combined_result_train])
y = np.array([ymap[image_path] for image_path, feature_vec in combined_result_train])

XT = np.array([feature_vec for image_path, feature_vec in combined_result_test])
yt = np.array([ytmap[image_path] for image_path, feature_vec in combined_result_test])

In [None]:
# Random forest baseline: fit on (X, y), rank the features by importance and
# report out-of-bag error, test error and F1 on (XT, yt).

# NOTE(review): the forest size was previously taken from an undefined name;
# 100 is a placeholder value -- tune as needed.
N_ESTIMATORS = 100

rf = RandomForestClassifier(n_estimators=N_ESTIMATORS, n_jobs=2, oob_score=True)
rf.fit(X, y)
importances = rf.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
# Feature indices ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for rank, feature_idx in enumerate(indices, start=1):
    if importances[feature_idx] <= 0.01:
        # indices is sorted by descending importance, so no later feature
        # can exceed the threshold either.
        break
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

oob_error = 1 - rf.oob_score_

yts = rf.predict(XT)
# accuracy_score avoids the integer-division pitfall of sum(...)/len(...).
ets = 1 - accuracy_score(yt, yts)
# f1_score expects (y_true, y_pred); there are many classes, so an explicit
# averaging mode is required.
F1 = f1_score(yt, yts, average='weighted')
print('%d %5.3f %5.3f %5.3f' % (N_ESTIMATORS, oob_error, ets, F1))
    
