#import package

In [1]:
from time import time
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC

from skimage import data, color
from skimage.transform import rescale, resize, downscale_local_mean
import matplotlib.pyplot as plt
import PIL
import scipy.misc
import glob
import os
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning) 
%matplotlib inline

In [2]:
# Load image
def convert_to_lfw(image_file_name, width=47, height=62):
    """
    Convert a given PIL-compatible file to an LFW grayscale
    image with the requested width and height.
    """
    # Load, grayscale, downscale
    image = PIL.Image.open(image_file_name)\
        .convert(mode="L")\
        .resize((width, height), PIL.Image.LANCZOS)
    
    # Return 1D np array
    return np.array(image).ravel(order="C")


def load_lfw_directory(path, width=47, height=62):
    """
    Load all JPG files from given path into LFW
    format np array.
    """
    # Iterate through all files
    lfw_array = np.array(())
    lfw_labels = []
    for file_name in os.listdir(path):
        if file_name.lower().endswith("jpg") or file_name.lower().endswith("jpeg") or file_name.lower().endswith("png"):
            lfw_labels.append(file_name.split("_")[0])
            if lfw_array.shape[0] == 0:
                lfw_array = convert_to_lfw(os.path.join(path,  file_name), width=width, height=height)
            else:
                lfw_array = np.vstack([lfw_array, convert_to_lfw(os.path.join(path, file_name),
                                                                    width=width, height=height)])
    
    # Return
    return lfw_array, lfw_labels

In [4]:
#fetching data from sklearn. data include people with more than 15 facial images in size reduced into 50%
lfw_people = fetch_lfw_people(min_faces_per_person=20, resize=0.5)

# n_samples, images height & width
n_samples, h, w = lfw_people.images.shape

# for machine learning we use the 2 data directly (as relative pixel
# positions info is ignored by this model)
X = lfw_people.data
#n_featrues = h * w
n_features = X.shape[1]

# id number of targets
y = lfw_people.target
# target names
target_names = lfw_people.target_names
# the number of people
n_classes = target_names.shape[0]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)
print(target_names)
print(y)
print(h,w)

Total dataset size:
n_samples: 3023
n_features: 2914
n_classes: 62
['Alejandro Toledo' 'Alvaro Uribe' 'Amelie Mauresmo' 'Andre Agassi'
 'Angelina Jolie' 'Ariel Sharon' 'Arnold Schwarzenegger'
 'Atal Bihari Vajpayee' 'Bill Clinton' 'Carlos Menem' 'Colin Powell'
 'David Beckham' 'Donald Rumsfeld' 'George Robertson' 'George W Bush'
 'Gerhard Schroeder' 'Gloria Macapagal Arroyo' 'Gray Davis'
 'Guillermo Coria' 'Hamid Karzai' 'Hans Blix' 'Hugo Chavez' 'Igor Ivanov'
 'Jack Straw' 'Jacques Chirac' 'Jean Chretien' 'Jennifer Aniston'
 'Jennifer Capriati' 'Jennifer Lopez' 'Jeremy Greenstock' 'Jiang Zemin'
 'John Ashcroft' 'John Negroponte' 'Jose Maria Aznar' 'Juan Carlos Ferrero'
 'Junichiro Koizumi' 'Kofi Annan' 'Laura Bush' 'Lindsay Davenport'
 'Lleyton Hewitt' 'Luiz Inacio Lula da Silva' 'Mahmoud Abbas'
 'Megawati Sukarnoputri' 'Michael Bloomberg' 'Naomi Watts'
 'Nestor Kirchner' 'Paul Bremer' 'Pete Sampras' 'Recep Tayyip Erdogan'
 'Ricardo Lagos' 'Roh Moo-hyun' 'Rudolph Giuliani' 'Saddam Hus

In [5]:
our_image_matrix, our_image_labels = load_lfw_directory('/Users/jun/Downloads/drive-download-20171202T075306Z-001/')

In [6]:
our_names = {'NATASHA':len(target_names),'JUNG':len(target_names)+1,'ELIJAH':len(target_names)+2,'ANDREW':len(target_names)+3}
our_index = np.array([our_names[i] for i in our_image_labels])

In [7]:
for name in our_names.keys():
    target_names = np.append(target_names, name)

array(['Alejandro Toledo', 'Alvaro Uribe', 'Amelie Mauresmo',
       'Andre Agassi', 'Angelina Jolie', 'Ariel Sharon',
       'Arnold Schwarzenegger', 'Atal Bihari Vajpayee', 'Bill Clinton',
       'Carlos Menem', 'Colin Powell', 'David Beckham', 'Donald Rumsfeld',
       'George Robertson', 'George W Bush', 'Gerhard Schroeder',
       'Gloria Macapagal Arroyo', 'Gray Davis', 'Guillermo Coria',
       'Hamid Karzai', 'Hans Blix', 'Hugo Chavez', 'Igor Ivanov',
       'Jack Straw', 'Jacques Chirac', 'Jean Chretien', 'Jennifer Aniston',
       'Jennifer Capriati', 'Jennifer Lopez', 'Jeremy Greenstock',
       'Jiang Zemin', 'John Ashcroft', 'John Negroponte',
       'Jose Maria Aznar', 'Juan Carlos Ferrero', 'Junichiro Koizumi',
       'Kofi Annan', 'Laura Bush', 'Lindsay Davenport', 'Lleyton Hewitt',
       'Luiz Inacio Lula da Silva', 'Mahmoud Abbas',
       'Megawati Sukarnoputri', 'Michael Bloomberg', 'Naomi Watts',
       'Nestor Kirchner', 'Paul Bremer', 'Pete Sampras',
       'Rece

In [8]:
# split into a training and testing set, test_size = 25% of total, random seed = 24
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=24)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2267, 2914)
(756, 2914)
(2267,)
(756,)


In [9]:
our_image_train, our_image_test, our_index_train, our_index_test = train_test_split(
    our_image_matrix, our_index, test_size=0.25)

print(our_image_train.shape)
print(our_image_test.shape)
print(our_index_train.shape)
print(our_index_test.shape)

(62, 2914)
(21, 2914)
(62,)
(21,)


In [10]:
X_train = np.vstack([X_train, our_image_train])
y_train = np.append(y_train, our_index_train)
X_test = np.vstack([X_test, our_image_test])
y_test = np.append(y_test, our_index_test)
print(y_train.shape)
print(X_train.shape)

(2329,)
(2329, 2914)


In [61]:
#PCA components
n_components = 200

print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
t0 = time()
#using PCA module in sklearn, we are fitting trainset
pca = PCA(n_components=n_components, svd_solver='randomized', whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))

#constructing eigenfaces using PCA fitted with trainset
eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()

#converting train and test values using PCA fitted with trainset
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(our_image_test)
print("done in %0.3fs" % (time() - t0))

Extracting the top 200 eigenfaces from 2329 faces
done in 0.516s
Projecting the input data on the eigenfaces orthonormal basis
done in 0.023s


In [63]:
pca.components_.shape

(200, 2914)

In [59]:
manual_pca = np.matmul(our_image_test, np.transpose(pca.components_))
manual_pca.shape

(21, 200)

In [62]:
X_test_pca == manual_pca

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ..., 
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]], dtype=bool)

In [25]:
print("Fitting the classifier to the training set")
t0 = time()

#parameter grid, cost to be 1000, 5000, 10000, 50000, 100000
# gamma to be 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
#constructing classfication model using Support Vector Machine with kernel Radial basis function
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)

#fitting pca train set with id of people in trainset
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")

#best estimator with each cost and gamma parameter
print(clf.best_estimator_)

Fitting the classifier to the training set
done in 276.561s
Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0005, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [44]:
print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))
print(classification_report(our_index_test, y_pred, target_names = target_names[[i for i in sorted(set(y_pred))]]))

Predicting people's names on the test set
done in 0.019s
                      precision    recall  f1-score   support

        Ariel Sharon       0.00      0.00      0.00         0
Atal Bihari Vajpayee       0.00      0.00      0.00         0
      Lleyton Hewitt       0.00      0.00      0.00         0
        Winona Ryder       0.00      0.00      0.00         0
             NATASHA       1.00      0.43      0.60         7
                JUNG       1.00      1.00      1.00         4
              ELIJAH       1.00      1.00      1.00         5
              ANDREW       1.00      1.00      1.00         5

         avg / total       1.00      0.81      0.87        21



  'recall', 'true', average, warn_for)


In [43]:
x = [i for i in set(y_pred)]
sorted(x)


[5, 7, 39, 61, 62, 63, 64, 65]

In [29]:
y_pred

array([64, 64, 62, 64,  7, 63, 63, 39, 65, 64, 65, 65, 63,  5, 61, 62, 63,
       62, 64, 65, 65])

In [30]:
our_index_test

array([64, 64, 62, 64, 62, 63, 63, 62, 65, 64, 65, 65, 63, 62, 62, 62, 63,
       62, 64, 65, 65])

In [16]:
y_pred

array([64, 64, 62, 64, 14, 63, 63, 39, 65, 64, 65, 65, 63,  6, 61, 62, 63,
       62, 64, 65, 65])

In [27]:
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits"""
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        plt.xticks(())
        plt.yticks(())


# plot the result of the prediction on a portion of the test set

def title(y_pred, y_test, target_names, i):
    pred_name = target_names[int(y_pred[i])].rsplit(' ', 1)[-1]
    true_name = target_names[int(our_index_test[i])].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)



In [71]:
prediction_titles = [title(y_pred, y_test, target_names, i) for i in range(y_pred.shape[0])]

plot_gallery(our_image_test, prediction_titles, h, w)

# plot the gallery of the most significative eigenfaces

eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)



'prediction_titles = [title(y_pred, y_test, target_names, i) for i in range(y_pred.shape[0])]\n\nplot_gallery(our_image_test, prediction_titles, h, w)\n\n# plot the gallery of the most significative eigenfaces\n\neigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]\nplot_gallery(eigenfaces, eigenface_titles, h, w)\n'