
# Faces recognition example using eigenfaces and SVMs


The dataset used in this example is a preprocessed excerpt of the
"Labeled Faces in the Wild", aka LFW_:

  http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz (233MB)

.. _LFW: http://vis-www.cs.umass.edu/lfw/

Expected results for the 7 most represented people in the dataset:

================== ============ ======= ========== =======
                   precision    recall  f1-score   support
================== ============ ======= ========== =======
     Ariel Sharon       0.67      0.92      0.77        13
     Colin Powell       0.75      0.78      0.76        60
  Donald Rumsfeld       0.78      0.67      0.72        27
    George W Bush       0.86      0.86      0.86       146
Gerhard Schroeder       0.76      0.76      0.76        25
      Hugo Chavez       0.67      0.67      0.67        15
       Tony Blair       0.81      0.69      0.75        36

      avg / total       0.80      0.80      0.80       322
================== ============ ======= ========== =======



In [1]:
from __future__ import print_function

from time import time
import logging
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC

#from sklearn.cross_validation import KFold
import numpy as np

# Echo the module docstring; inside a notebook this is IPython's
# auto-generated "Automatically created module ..." text.
print(__doc__)

# Emit progress logs at INFO level and above, each prefixed with a timestamp
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

Automatically created module for IPython interactive environment


Download the data, if not already on disk, and load it as NumPy arrays


In [2]:
# Fetch the faces (downloaded only if not already on disk), keeping the
# people who have at least 70 pictures; images are resized by a factor 0.4.
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# The image stack is 3-D: (n_samples, height, width). The height and width
# are kept so that flat vectors can be reshaped into pictures for plotting.
n_samples, h, w = lfw_people.images.shape

# The model is trained on the flat per-image feature vectors; the relative
# position of the pixels is ignored by this model.
X = lfw_people.data

# The label to predict is the integer id of the person;
# target_names maps each id back to a readable name.
y = lfw_people.target
target_names = lfw_people.target_names

n_features = X.shape[1]
n_classes = target_names.shape[0]

# Summary of what was loaded
print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)
print('height: ', h)
print('width: ', w)

Total dataset size:
n_samples: 1288
n_features: 1850
n_classes: 7
height:  50
width:  37


Split into a training set (75%) and a test set (25%) using a seeded random split


In [3]:
# Hold out 25% of the samples for testing; the fixed random_state makes
# the shuffle (and therefore the split) identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
dataset): unsupervised feature extraction / dimensionality reduction


In [4]:
# Number of principal components (eigenfaces) to keep
n_components = 150

print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train.shape[0]))
t0 = time()
# The randomized SVD solver is stochastic: without a fixed random_state the
# components (and every score computed downstream) change from run to run.
# Seeding it keeps "Restart & Run All" reproducible.
pca = PCA(n_components=n_components, svd_solver='randomized',
          whiten=True, random_state=42).fit(X_train)
print("done in %0.3fs" % (time() - t0))

# Each component is a flat vector with one value per pixel; reshape it to
# (h, w) so that it can be displayed as an image.
eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
# The PCA is fitted on the training set only and then applied to both sets,
# so no information from the test set leaks into the projection.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

Extracting the top 150 eigenfaces from 966 faces
done in 0.293s
Projecting the input data on the eigenfaces orthonormal basis
done in 0.026s


Train an SVM classification model


In [6]:
print("Fitting the classifier to the training set")
t0 = time()

# Candidate values shared by the sub-grids below
C_values = [1e3, 5e3, 1e4, 5e4, 1e5]
gamma_values = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]

# `gamma` is only used by the 'poly', 'rbf' and 'sigmoid' kernels. Crossing
# it with 'linear' would refit the exact same linear model once per gamma
# value, so the linear kernel gets its own sub-grid without gamma. The set
# of distinct models searched is unchanged.
param_grid = [
    {'kernel': ['linear'], 'C': C_values},
    {'kernel': ['poly', 'rbf', 'sigmoid'],
     'C': C_values,
     'gamma': gamma_values},
]
clf = GridSearchCV(SVC(class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")

# The handle `fo` is left open on purpose: the evaluation cell further down
# writes the prediction rows to the same file through it.
fo = open('y_test_2.txt', 'w')
fo.write(str(clf.best_estimator_) + "\n")
# Flush so this line reaches the disk even if no later cell is executed
fo.flush()
print(clf.best_estimator_)

Fitting the classifier to the training set
done in 76.883s
Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.005, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


Quantitative evaluation of the model quality on the test set


In [7]:
print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

# Per-class precision / recall / f1 on the held-out test set
print(classification_report(y_test, y_pred, target_names=target_names))

# `fo` is the handle on 'y_test_2.txt' opened by the grid-search cell.
# Write one "index,predicted class id" row per test sample, then close the
# handle (even if a write fails) so the buffered rows are flushed to disk
# and the file descriptor is released.
try:
    fo.write("ImageId,PredictedClass" + "\n")
    for i, predicted_class in enumerate(y_pred):
        row = str(i) + "," + str(predicted_class)
        fo.write(row + "\n")
        print(row)
finally:
    fo.close()

Predicting people's names on the test set
done in 0.057s
                   precision    recall  f1-score   support

     Ariel Sharon       0.75      0.46      0.57        13
     Colin Powell       0.79      0.87      0.83        60
  Donald Rumsfeld       0.89      0.59      0.71        27
    George W Bush       0.84      0.98      0.91       146
Gerhard Schroeder       0.95      0.80      0.87        25
      Hugo Chavez       0.89      0.53      0.67        15
       Tony Blair       0.97      0.81      0.88        36

      avg / total       0.86      0.85      0.84       322

0,3
1,3
2,6
3,3
4,3
5,3
6,4
7,1
8,3
9,3
10,3
11,3
12,3
13,6
14,3
15,3
16,3
17,3
18,3
19,4
20,1
21,3
22,3
23,3
24,3
25,1
26,1
27,3
28,3
29,3
30,2
31,3
32,3
33,3
34,3
35,3
36,3
37,1
38,3
39,1
40,3
41,1
42,3
43,1
44,1
45,1
46,4
47,3
48,3
49,3
50,3
51,3
52,0
53,3
54,6
55,2
56,3
57,3
58,5
59,3
60,1
61,1
62,0
63,4
64,3
65,1
66,6
67,4
68,1
69,3
70,1
71,6
72,3
73,3
74,3
75,2
76,1
77,6
78,4
79,4
80,3
81,0
82,4
83,3

Qualitative evaluation of the predictions using matplotlib


In [None]:
# NOTE: every line of plotting code in this cell is commented out, so running
# the cell draws nothing. Uncomment the code lines to render the galleries.

#def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
#    # Helper function to plot a gallery of portraits
#    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
#    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
#    for i in range(n_row * n_col):
#        plt.subplot(n_row, n_col, i + 1)
#        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
#        plt.title(titles[i], size=12)
#        plt.xticks(())
#        plt.yticks(())


# plot the result of the prediction on a portion of the test set

#def title(y_pred, y_test, target_names, i):
#    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
#    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
#    return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)

#prediction_titles = [title(y_pred, y_test, target_names, i)
#                     for i in range(y_pred.shape[0])]

#plot_gallery(X_test, prediction_titles, h, w)

# plot the gallery of the most significant eigenfaces

#eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
#plot_gallery(eigenfaces, eigenface_titles, h, w)

#plt.show()