In [1]:
import matplotlib.pyplot as plt
import numpy as np
from keras.preprocessing import image
from scipy import misc
import glob
import os
import time
import xgboost as xgb
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from skimage import feature

Using Theano backend.


In [2]:
def random_flip(img, u=0.5):
    if np.random.random() < u:
        img = image.flip_axis(img, 1)
    return img

def random_brightness(img, limit=(-0.3, 0.3), u=0.5):
    if np.random.random() < u:
        alpha = 1.0 + np.random.uniform(limit[0], limit[1])
        img = alpha * img
        img = np.clip(img, 0., 1.)
    return img

def random_augmentation(img):
    img = random_brightness(img, limit=(-0.5, 0.5), u=0.5)
    img = random_flip(img, u=0.3)
    return img

In [3]:
def extract_features(imgs, augmented_count = 5):
    features = []
    for img in imgs:
        image = misc.imresize(misc.imread(img), (50, 50))
        features.append(feature.canny(image))
        #features.append(image)
        for i in range(augmented_count):
            file_features = random_augmentation(image)
            features.append(feature.canny(file_features))
            #features.append(file_features)
    return features

In [4]:
husein = glob.glob('Husein/*')
unknown = glob.glob('Unknown/*')
print('len husein:', len(husein))
print('len unknown:', len(unknown))

len husein: 122
len unknown: 347


In [5]:
husein_features = extract_features(husein)
print ('positive samples:', len(husein_features))

positive samples: 732


In [6]:
unknown_features = extract_features(unknown)
print ('positive samples:', len(unknown_features))

positive samples: 2082


In [7]:
X = np.vstack((husein_features, unknown_features)).astype(np.float64)
X = X.reshape([X.shape[0], -1])

pca = PCA(n_components = 100).fit(X)
scaled_X = pca.transform(X)

y = np.hstack((np.ones(len(husein_features)), np.zeros(len(unknown_features))))
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size = 0.2, random_state=22)

In [8]:
params_xgd = {
    'min_child_weight': 10.0,
    'objective': 'binary:logistic',
    'max_depth': 7,
    'max_delta_step': 1.8,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'eta': 0.01,
    'gamma': 0.65,
    'num_boost_round' : 700
    }

d_train = xgb.DMatrix(X_train, y_train)
d_valid = xgb.DMatrix(X_test, y_test)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
t=time.time()
clf = xgb.train(params_xgd, d_train, 1600, watchlist, early_stopping_rounds=100,
                maximize=False, verbose_eval=5)
print(round(time.time()-t, 3), 'Seconds to train xgb')
output = clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit)
print(np.mean(y_test == np.around(output)))

[0]	train-error:0.139049	valid-error:0.134991
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 100 rounds.
[5]	train-error:0.124833	valid-error:0.12611
[10]	train-error:0.125722	valid-error:0.12611
[15]	train-error:0.11817	valid-error:0.124334
[20]	train-error:0.109285	valid-error:0.115453
[25]	train-error:0.112839	valid-error:0.117229
[30]	train-error:0.107952	valid-error:0.113677
[35]	train-error:0.108396	valid-error:0.115453
[40]	train-error:0.110173	valid-error:0.115453
[45]	train-error:0.112394	valid-error:0.117229
[50]	train-error:0.109729	valid-error:0.115453
[55]	train-error:0.108841	valid-error:0.115453
[60]	train-error:0.107508	valid-error:0.113677
[65]	train-error:0.107508	valid-error:0.113677
[70]	train-error:0.107508	valid-error:0.113677
[75]	train-error:0.107508	valid-error:0.113677
[80]	train-error:0.107508	valid-error:0.113677
[85]	train-error:0.107508	valid-error:0.113677
[90]	train-

In [10]:
params_xgd2 = {
    'objective': 'binary:logistic',
    'max_depth': 4,
    'max_delta_step': 1.8,
    'min_child_weight': 6,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'eta': 0.07,
    'gamma': 10,
    'reg_alpha':8,
    'reg_lambda':1.3
    }
t=time.time()
clf2 = xgb.train(params_xgd2, d_train, 1600, watchlist, early_stopping_rounds=100,
                maximize=False, verbose_eval=5)
print(round(time.time()-t, 3), 'Seconds to train xgb')
output = clf2.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit)
print(np.mean(y_test == np.around(output)))

[0]	train-error:0.127943	valid-error:0.134991
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 100 rounds.
[5]	train-error:0.126166	valid-error:0.12611
[10]	train-error:0.124833	valid-error:0.12611
[15]	train-error:0.123056	valid-error:0.127886
[20]	train-error:0.119502	valid-error:0.124334
[25]	train-error:0.114171	valid-error:0.120782
[30]	train-error:0.113727	valid-error:0.120782
[35]	train-error:0.111506	valid-error:0.119005
[40]	train-error:0.110618	valid-error:0.115453
[45]	train-error:0.111506	valid-error:0.120782
[50]	train-error:0.112839	valid-error:0.120782
[55]	train-error:0.113283	valid-error:0.120782
[60]	train-error:0.113283	valid-error:0.120782
[65]	train-error:0.113727	valid-error:0.122558
[70]	train-error:0.113283	valid-error:0.120782
[75]	train-error:0.113283	valid-error:0.120782
[80]	train-error:0.113283	valid-error:0.120782
[85]	train-error:0.113283	valid-error:0.120782
[90]	train

In [11]:
params_xgd3 = {
    'min_child_weight': 10.0,
    'objective': 'binary:logistic',
    'subsample': 0.8,
    'eta': 0.025,
    'gamma': 0.65,
    'num_boost_round' : 700
    }

t=time.time()
clf3 = xgb.train(params_xgd3, d_train, 1600, watchlist, early_stopping_rounds=100,
                maximize=False, verbose_eval=5)
print(round(time.time()-t, 3), 'Seconds to train xgb')
output = clf3.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit)
print(np.mean(y_test == np.around(output)))

[0]	train-error:0.134163	valid-error:0.131439
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 100 rounds.
[5]	train-error:0.130164	valid-error:0.124334
[10]	train-error:0.130164	valid-error:0.124334
[15]	train-error:0.130609	valid-error:0.12611
[20]	train-error:0.125722	valid-error:0.124334
[25]	train-error:0.122612	valid-error:0.122558
[30]	train-error:0.11195	valid-error:0.115453
[35]	train-error:0.111506	valid-error:0.113677
[40]	train-error:0.111506	valid-error:0.115453
[45]	train-error:0.109285	valid-error:0.113677
[50]	train-error:0.107952	valid-error:0.113677
[55]	train-error:0.107952	valid-error:0.113677
[60]	train-error:0.107952	valid-error:0.115453
[65]	train-error:0.108396	valid-error:0.113677
[70]	train-error:0.108396	valid-error:0.117229
[75]	train-error:0.109285	valid-error:0.117229
[80]	train-error:0.109285	valid-error:0.117229
[85]	train-error:0.109285	valid-error:0.117229
[90]	train

In [12]:
import pickle
with open('pca.p', 'wb') as fopen:
    pickle.dump(pca, fopen, protocol=2)
with open('xgb.p', 'wb') as fopen:
    pickle.dump(clf, fopen, protocol=2)
with open('xgb2.p', 'wb') as fopen:
    pickle.dump(clf2, fopen, protocol=2)
with open('xgb3.p', 'wb') as fopen:
    pickle.dump(clf3, fopen, protocol=2)

In [13]:
from sklearn import metrics
print(metrics.classification_report(y_test, np.around(clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit)), target_names = ['Unknown', 'Husein']))

             precision    recall  f1-score   support

    Unknown       0.87      1.00      0.93       416
     Husein       0.99      0.57      0.72       147

avg / total       0.90      0.89      0.88       563



In [14]:
print(metrics.classification_report(y_test, np.around(clf.predict(xgb.DMatrix(X_test), ntree_limit=clf2.best_ntree_limit)), target_names = ['Unknown', 'Husein']))

             precision    recall  f1-score   support

    Unknown       0.87      1.00      0.93       416
     Husein       0.99      0.57      0.72       147

avg / total       0.90      0.89      0.88       563



In [15]:
print(metrics.classification_report(y_test, np.around(clf3.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit)), target_names = ['Unknown', 'Husein']))

             precision    recall  f1-score   support

    Unknown       0.86      0.99      0.92       416
     Husein       0.96      0.56      0.71       147

avg / total       0.89      0.88      0.87       563



In [16]:
from scipy.stats.mstats import hmean

In [25]:
out_T = np.vstack([clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit), clf2.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit),clf3.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit)]).T

In [31]:
print(metrics.classification_report(y_test, np.around(hmean(out_T, axis=1)), target_names = ['Unknown', 'Husein']))

             precision    recall  f1-score   support

    Unknown       0.87      1.00      0.93       416
     Husein       0.99      0.56      0.72       147

avg / total       0.90      0.88      0.87       563



In [34]:
np.around(hmean(out_T, axis=1)).astype('int')[0]

0