In [1]:
import math
import numpy as np
from numpy import expand_dims
from matplotlib import pyplot
from PIL import Image
from numpy import asarray
from mtcnn.mtcnn import MTCNN
from keras_vggface.vggface import VGGFace
from keras_vggface.utils import preprocess_input
from keras_vggface.utils import decode_predictions
import os
import pickle

Using TensorFlow backend.


In [2]:
# extract a single face from a given photograph
def extract_face(filename, required_size=(224, 224)):
    """Detect the first face in an image file and return it as a resized pixel array.

    Parameters
    ----------
    filename : str
        Path of the image; it is read with ``pyplot.imread``.
    required_size : tuple
        Target size handed to ``Image.resize``. The default matches the
        224x224 input shape of the model built in this file.

    Returns
    -------
    numpy.ndarray
        Pixels of the first detected face, resized to ``required_size``.

    Raises
    ------
    IndexError
        If the detector does not report any face for this image.
    """
    # load image from file
    pixels = pyplot.imread(filename)
    # detect faces with the shared detector (building one per image is costly)
    results = _get_face_detector().detect_faces(pixels)
    if not results:
        # same exception type as indexing the empty result list, but with
        # a message that says which file is affected
        raise IndexError("no face detected in {}".format(filename))

    # bounding box of the first face
    x1, y1, width, height = results[0]['box']
    x2, y2 = x1 + width, y1 + height
    # The reported box can start outside the image (negative x/y). A negative
    # slice start would wrap around and yield an empty crop, so clamp the
    # top-left corner while keeping the original bottom-right corner.
    x1, y1 = max(x1, 0), max(y1, 0)
    face = pixels[y1:y2, x1:x2]

    # resize pixels to the model size
    image = Image.fromarray(face).resize(required_size)
    return asarray(image)


# Lazily created detector shared by every extract_face call.
_FACE_DETECTOR = None


def _get_face_detector():
    """Return the shared MTCNN detector, creating it on first use."""
    global _FACE_DETECTOR
    if _FACE_DETECTOR is None:
        # default weights, as before
        _FACE_DETECTOR = MTCNN()
    return _FACE_DETECTOR

# extract faces and calculate face embeddings for a list of photo files
def get_embeddings(filenames):
    """Extract one face per file and compute its VGGFace (resnet50) embedding.

    Files for which ``extract_face`` raises ``ValueError`` are reported by
    printing their name and are then skipped.

    IMPORTANT: because of that skipping, the result can have fewer rows than
    ``filenames`` has entries; the rows follow the order of the files that
    succeeded. Callers that pair rows with file names by position must
    check ``len(result) == len(filenames)`` first.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the photos to process.

    Returns
    -------
    numpy.ndarray
        Output of ``model.predict`` for the extracted faces. When no face
        could be extracted at all, an empty ``float32`` array of shape
        ``(0, 0)`` is returned and the model is not run.
    """
    faces = []

    # extract faces (best effort: report and skip files that fail)
    for f_name in filenames:
        try:
            faces.append(extract_face(f_name))
        except ValueError:
            print(f_name)

    if not faces:
        # nothing to embed; an empty batch must not reach preprocessing/model
        return np.empty((0, 0), dtype='float32')

    # convert into an array of samples
    samples = asarray(faces, 'float32')
    # prepare the faces for the model, e.g. center pixels
    samples = preprocess_input(samples, version=2)
    # perform prediction with the shared model
    return _get_vggface_model().predict(samples)


# Lazily created embedding model shared by every get_embeddings call.
_VGGFACE_MODEL = None


def _get_vggface_model():
    """Return the shared VGGFace model, building it on first use."""
    global _VGGFACE_MODEL
    if _VGGFACE_MODEL is None:
        # same configuration as before; built once instead of once per call
        _VGGFACE_MODEL = VGGFace(model='resnet50', include_top=False,
                                 input_shape=(224, 224, 3), pooling='avg')
    return _VGGFACE_MODEL

# get all the paths of all files (including subdirectories) of given format
def get_paths_of_files_of_format(path, file_format="jpg"):
    """Collect the paths of all files below ``path`` that have a given extension.

    Parameters
    ----------
    path : str
        Root directory; sub-directories are searched as well.
    file_format : str
        File extension, with or without the leading dot. It is compared
        case-insensitively against the end of the file name, so ``"jpg"``
        matches ``a.jpg`` and ``a.JPG`` but not ``jpg_notes.txt``. An empty
        string matches every file.

    Returns
    -------
    list of str
        Matching paths in a deterministic order (directories and file names
        are visited in sorted order).
    """
    suffix = "." + file_format.lstrip(".").lower() if file_format else ""

    files = []
    for root, dirs, filenames in os.walk(path):
        # sorting in place makes os.walk descend in a reproducible order
        dirs.sort()
        for filename in sorted(filenames):
            if filename.lower().endswith(suffix):
                files.append(os.path.join(root, filename))

    return files

# extracts the person name encoded in each image file name and returns the name parts joined by a space (e.g. "Aaron Eckhart")
def extract_full_names(path_files, dataset="lfw"):
    """Derive a person name from every image path.

    For the ``"lfw"`` dataset the file name is expected to look like
    ``First_Second_0001.jpg``. The extension and a trailing all-digit index
    are dropped, and the first two remaining underscore-separated tokens are
    joined with a space. A file with a single name token yields just that
    token.

    Parameters
    ----------
    path_files : iterable of str
        Image paths; both ``\\`` and ``/`` are accepted as separators.
    dataset : str
        Naming scheme to apply; only ``"lfw"`` is implemented.

    Returns
    -------
    list of str
        One name per input path, in input order.

    Raises
    ------
    NotImplementedError
        If a path has to be processed for a dataset other than ``"lfw"``.
    """
    names = []
    for f in path_files:
        if dataset != "lfw":
            raise NotImplementedError("Only code for the lfw dataset have been implemented")

        # take the last path component regardless of the separator style
        fname = f.replace("\\", "/").rsplit("/", 1)[-1]
        stem = os.path.splitext(fname)[0]
        parts = [part for part in stem.split("_") if part]
        # drop the running image index ("0001") that follows the name
        if len(parts) > 1 and parts[-1].isdigit():
            parts = parts[:-1]
        names.append(" ".join(parts[:2]))
    return names

# averages vectors to associated name; expects numpy vectors as input
class NamedAvgVector:
    """Running average of numpy vectors stored under a name.

    Attributes
    ----------
    name : the label passed to the constructor.
    vec  : the current average, or the string placeholder (default ``"a"``)
           while no vector has been added. Other code in this file detects
           "no value yet" by checking whether ``vec`` is a ``str``.
    n    : number of vectors that went into ``vec``.
    """

    def __init__(self, name, numpy_vec="a"):
        """Create an entry, optionally seeded with a first vector.

        Any string passed as ``numpy_vec`` is treated as "no vector yet".
        """
        self.name = name
        # isinstance avoids comparing an array against a string, which is
        # either elementwise or ambiguous depending on the numpy version
        if isinstance(numpy_vec, str):
            self.vec = numpy_vec
            self.n = 0
        else:
            self.vec = self._as_float_array(numpy_vec)
            self.n = 1

    @staticmethod
    def _as_float_array(values):
        """Return a private floating-point copy of ``values``."""
        arr = np.array(values)  # np.array copies, so the caller's data is never touched
        if not np.issubdtype(arr.dtype, np.floating):
            # averaging integer vectors needs a float representation
            arr = arr.astype(float)
        return arr

    def update(self, numpy_vec):
        """Fold one more vector into the running average."""
        new_vec = self._as_float_array(numpy_vec)
        if self.n != 0:
            # mean_new = (mean_old * n + x) / (n + 1), computed out of place
            self.vec = (self.vec * self.n + new_vec) / (self.n + 1)
            self.n += 1
        else:
            self.n = 1
            self.vec = new_vec
            
# creates a dictionary with all the first names as keys and associated empty NamedAvgVector data structures
def create_initial_dict(full_names):
    """Map every first name to a fresh, still empty ``NamedAvgVector``.

    The first name is the part of each entry before the first space. When a
    first name occurs several times, a single key remains.
    """
    first_names = (full_name.split(" ")[0] for full_name in full_names)
    return {first: NamedAvgVector(first) for first in first_names}

# updates the dictionary of NamedAvgVectors with the embeddings
# assumes that the first name of every processed full_names entry is already a key of the dictionary
# i_0_f_names : is the starting point of the calculated embeddings in full_names
def update_dict(d, full_names, i_0_f_names, emb):
    """Feed each embedding into the dictionary entry of its first name.

    ``emb[k]`` is paired with ``full_names[i_0_f_names + k]``; the text
    before the first space of that name selects the entry in ``d``, whose
    ``update`` method receives the embedding as a new numpy array.
    """
    for offset, vector in enumerate(emb):
        first_name = full_names[i_0_f_names + offset].split(" ")[0]
        d[first_name].update(np.array(vector))
        
# calculates the averaged embeddings for all the first names given a dataset
def calculate_embeddings(path_dataset, step_size=100, file_format="jpg", dataset_name="lfw"):
    """Average the face embeddings per first name over a whole dataset.

    The images below ``path_dataset`` are processed in batches of
    ``step_size`` files, including a final, smaller batch. After every batch
    the dictionary built so far is pickled to ``dict_<batch index>.pkl`` in
    the working directory, so the file with the highest index holds the
    final result.

    Parameters
    ----------
    path_dataset : str
        Root directory of the image files.
    step_size : int
        Number of images embedded per batch.
    file_format : str
        Image file extension to look for.
    dataset_name : str
        Naming scheme used to derive person names from file names.
    """
    im_paths = get_paths_of_files_of_format(path_dataset, file_format)
    full_names = extract_full_names(im_paths, dataset_name)
    dict_navecs = create_initial_dict(full_names)

    # ceil instead of floor so the last partial batch is not dropped
    n_batches = math.ceil(len(im_paths) / step_size)
    for i in range(n_batches):
        print("calculate_embeddings i:", i)
        start = i * step_size
        batch_paths = im_paths[start:start + step_size]
        for offset, embedding in _embed_batch_aligned(batch_paths):
            update_dict(dict_navecs, full_names, start + offset, [embedding])

        # save progress so far
        with open("dict_{}.pkl".format(i), "wb") as f:
            pickle.dump(dict_navecs, f, pickle.HIGHEST_PROTOCOL)


def _embed_batch_aligned(batch_paths):
    """Return ``(offset, embedding)`` pairs whose offsets index ``batch_paths``.

    ``get_embeddings`` skips files whose face extraction raises
    ``ValueError``, so its rows cannot always be matched to the input files
    by position. When rows are missing, the files are probed one by one to
    find out which of them are usable, and only those are embedded.
    """
    embeddings = get_embeddings(batch_paths)
    if len(embeddings) == len(batch_paths):
        return list(enumerate(embeddings))

    usable = []
    for offset, path in enumerate(batch_paths):
        try:
            extract_face(path)
        except ValueError:
            continue
        usable.append(offset)
    if not usable:
        return []

    embeddings = get_embeddings([batch_paths[offset] for offset in usable])
    if len(embeddings) != len(usable):
        # refuse to credit embeddings to the wrong person
        raise RuntimeError("could not align embeddings with their image files")
    return list(zip(usable, embeddings))

In [49]:
# returns the maximal coordinate value of the embeddings in the dictionary and prints the number of entries w/o any value
def get_max_coor_val(dic):
    """Return the largest coordinate found in any stored vector.

    Entries whose ``vec`` is an ``int`` or a ``str`` hold no vector; each is
    reported on stdout, and their total is printed at the end.

    Returns
    -------
    The maximum over all coordinates of all vectors (which may be negative),
    or ``0`` when no entry holds a vector.
    """
    largest = None  # no seed value, so all-negative data is handled correctly
    missing = 0
    for key, entry in dic.items():
        vec = entry.vec
        if isinstance(vec, int):
            print("{} : int".format(key))
            missing += 1
            continue
        if isinstance(vec, str):
            print("{} : str".format(key))
            missing += 1
            continue
        candidate = max(vec)
        if largest is None or candidate > largest:
            largest = candidate
    print("no entry #", missing)
    return 0 if largest is None else largest

# returns the average number of used embeddings per name
def get_avg_n(dic):
    """Return the mean ``n`` over all entries that hold a vector.

    Entries whose ``vec`` is an ``int`` or a ``str`` (no vector assigned)
    are ignored. Returns ``0.0`` when no entry holds a vector.
    """
    counts = [val.n for val in dic.values() if not isinstance(val.vec, (int, str))]
    if not counts:
        # avoid dividing by zero for an empty or all-placeholder dictionary
        return 0.0
    return sum(counts) / len(counts)

# returns the fraction (0.0 to 1.0, not a percentage) of dictionary entries that used fewer than n embeddings; entries without any embedding are counted as well
def perct_less_than(dic, n):
    """Return the fraction of entries built from fewer than ``n`` embeddings.

    Entries whose ``vec`` is an ``int`` or a ``str`` (no vector assigned)
    are counted as well. The result lies between 0.0 and 1.0; an empty
    dictionary yields ``0.0``.
    """
    total = len(dic)
    if total == 0:
        # avoid dividing by zero
        return 0.0
    below = sum(
        1 for val in dic.values()
        if isinstance(val.vec, (int, str)) or val.n < n
    )
    return below / total

# prints the keys in the dictionary which used more than n embeddings
def names_more_than(dic, n):
    """Print every key whose entry holds a vector and has ``n`` above the given threshold.

    Entries whose ``vec`` is an ``int`` or a ``str`` are never printed.
    """
    for name, entry in dic.items():
        has_vector = not isinstance(entry.vec, (int, str))
        if has_vector and entry.n > n:
            print(name)

In [94]:
# returns the closest name in dictionary given an embedding
def find_closest_name(dic, vec_emb_numpy):
    """Return the key whose stored vector is nearest to the given embedding.

    Distance is the Euclidean norm of the difference. Entries whose ``vec``
    is an ``int`` or a ``str`` (no vector assigned) are skipped. If several
    entries are equally close, the one iterated last is returned.

    Raises
    ------
    ValueError
        If no entry of ``dic`` holds a vector.
    """
    best_name = None
    best_dist = None
    # iterate the dictionary that was passed in, not a notebook global
    for key, val in dic.items():
        if isinstance(val.vec, (int, str)):        # no value has been assigned
            continue
        dist = np.linalg.norm(val.vec - vec_emb_numpy)
        if best_dist is None or dist <= best_dist:
            best_name, best_dist = key, dist

    if best_dist is None:
        raise ValueError("dictionary contains no entry with an embedding")
    return best_name

# predicts a name for each file provided in the given list based on the provided dictionary
def predict_name(dic, filenames):
    """Return, for every embedding computed from ``filenames``, the closest name in ``dic``.

    The embeddings come from ``get_embeddings``; each one is looked up with
    ``find_closest_name``. The result is a list with one name per embedding.
    """
    return [
        find_closest_name(dic, np.array(embedding))
        for embedding in get_embeddings(filenames)
    ]

In [108]:
# Load the pickled dictionary of averaged embeddings.
# The file name contains no placeholder, so no formatting is needed.
# NOTE(review): calculate_embeddings writes "dict_<i>.pkl"; this file is
# presumably a renamed final checkpoint - confirm.
# Only unpickle files you created yourself: pickle.load can execute
# arbitrary code from an untrusted file.
with open("dict_lfw.pkl", "rb") as f:
    dic = pickle.load(f)

In [98]:
# Self-consistency check: look up every stored average embedding in the
# dictionary it comes from and count how often a different name is returned.
cnt_wrong = 0
for key in dic.keys():
    val = dic[key]
    if isinstance(val.vec, int) or isinstance(val.vec, str):        # if no value has been assigned
        continue
    # find_closest_name is the lookup function defined in this notebook
    if key != find_closest_name(dic, val.vec):
        cnt_wrong += 1

In [99]:
# NOTE(review): the preceding cell looks up each stored average embedding in
# the same dictionary, where its distance to itself is 0. A count of 0 thus
# shows that the dictionary is self-consistent; it does not measure how well
# names are predicted for new photos.
print(cnt_wrong)                # --> 0; i.e. every stored embedding maps back to its own name

0


In [None]:
filenames = []                    # add file names
# Skip the model call while the list above is still empty: there is nothing
# to predict yet.
predictions = predict_name(dic, filenames) if filenames else []
for name in predictions:
    print(name)