In [1]:
import os
from PIL import Image
import face_recognition
import numpy as np
import matplotlib.pyplot as plt
import wget
import tarfile
import pandas as pd

# Load dataset

In [2]:
# wget.download('http://vis-www.cs.umass.edu/lfw/lfw.tgz')

In [3]:
# Unpack the LFW archive and rename the extracted `lfw` folder to `dataset`.
# Skipped when `dataset` already exists so the notebook survives
# Restart & Run All (renaming onto the existing, populated folder would fail).
if not os.path.isdir('dataset'):
    with tarfile.open('lfw.tgz') as tar:
        # NOTE(review): extractall trusts the member paths stored in the
        # archive; only run this on an archive from a trusted source.
        tar.extractall()
    os.rename('lfw', 'dataset')

# Collect names with at least 2 images

In [4]:
# Keep every identity folder in `dataset` that holds at least two files.
# Each entry gets a trailing newline so the list can be written out line by line.
names = [
    f'{person}\n'
    for person in os.listdir('dataset')
    if len(os.listdir(f'dataset/{person}')) >= 2
]
len(names)

1680

In [5]:
# Persist the selected identities; every entry already ends with '\n',
# so joining them yields one name per line.
with open('names.txt', 'w') as names_file:
    names_file.write(''.join(names))


# Collect names with at least 2 images containing a detected face


In [7]:
# Crop the first detected face of every image listed under names.txt and save
# it as face_images/<name>/<file>. Images with no detection are skipped.
detector = face_recognition.face_locations

with open('names.txt') as f:
    names = [line.rstrip() for line in f]

# exist_ok keeps the cell re-runnable: existing folders no longer raise.
os.makedirs('face_images', exist_ok=True)
for name in names:
    os.makedirs(f'face_images/{name}', exist_ok=True)
    for f_name in os.listdir(f'dataset/{name}'):
        path = f'dataset/{name}/{f_name}'
        # The context manager closes the file handle promptly; forcing RGB
        # guarantees a 3-axis array so the crop below cannot fail on
        # grayscale or palette images.
        with Image.open(path) as pil_img:
            img = np.array(pil_img.convert('RGB'))

        boxes = detector(img)
        if boxes:
            # Only the first detection is used; it unpacks as
            # (top, right, bottom, left).
            top, right, bottom, left = boxes[0]
            # Loosen the box: top/left are scaled by 0.9 and right by 1.1,
            # bottom is kept, and everything is clamped to the image bounds.
            # NOTE(review): the margin grows with the coordinate value rather
            # than with the face size -- confirm this is intended.
            top = max(0, int(0.9*top))
            bottom = min(img.shape[0], int(bottom))
            left = max(0, int(0.9*left))
            right = min(img.shape[1], int(1.1*right))

            crop_img = img[top:bottom, left:right, :]
            Image.fromarray(crop_img).save(f'face_images/{name}/{f_name}')

In [8]:
# Re-filter the identities: keep only folders in `face_images` that hold at
# least two files. Entries carry a trailing newline for line-wise writing.
names = []
for person in os.listdir('face_images/'):
    n_crops = len(os.listdir(f'face_images/{person}'))
    if n_crops >= 2:
        names.append(f'{person}\n')
len(names)

1671

In [9]:
# Overwrite names.txt with the current list of names; each entry already
# ends with its own newline.
with open('names.txt', 'w') as out:
    for entry in names:
        out.write(entry)

# Collect triplets for training

In [10]:
# Build one (anchor, positive, negative) triplet of image paths per face crop:
#   anchor   - the image itself,
#   positive - a different image of the same person,
#   negative - a random image of a different person.

# Fixed seed so the sampled triplets are reproducible across kernel restarts.
rng = np.random.default_rng(42)

with open('names.txt') as f:
    names = [line.rstrip() for line in f]

# List each person's images once (sorted for a deterministic order) instead of
# hitting the filesystem again for every sampled negative.
images_by_name = {
    name: sorted(os.listdir(f'face_images/{name}')) for name in names
}

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and is quadratic inside a loop.
records = []
for name_idx, name in enumerate(names):
    f_names = images_by_name[name]
    for f_idx, f_name in enumerate(f_names):
        anchor_path = f'face_images/{name}/{f_name}'

        # Draw from the other len-1 positions, then shift past the anchor's
        # own index so the positive can never be the anchor itself.
        pos_idx = rng.integers(len(f_names) - 1)
        if pos_idx >= f_idx:
            pos_idx += 1
        pos_path = f'face_images/{name}/{f_names[pos_idx]}'

        # Same skip-one trick to pick a different person uniformly.
        neg_idx = rng.integers(len(names) - 1)
        if neg_idx >= name_idx:
            neg_idx += 1
        neg_name = names[neg_idx]
        neg_f_names = images_by_name[neg_name]
        neg_f_name = neg_f_names[rng.integers(len(neg_f_names))]
        neg_path = f'face_images/{neg_name}/{neg_f_name}'

        records.append({
            'anchor_path': anchor_path,
            'positive_path': pos_path,
            'negative_path': neg_path,
        })

# Explicit columns keep the schema stable even if no rows were collected.
df = pd.DataFrame(records, columns=['anchor_path', 'positive_path', 'negative_path'])
df

Unnamed: 0,anchor_path,positive_path,negative_path
0,face_images/Aaron_Peirsol/Aaron_Peirsol_0001.jpg,face_images/Aaron_Peirsol/Aaron_Peirsol_0002.jpg,face_images/Jay_Rasulo/Jay_Rasulo_0002.jpg
1,face_images/Aaron_Peirsol/Aaron_Peirsol_0002.jpg,face_images/Aaron_Peirsol/Aaron_Peirsol_0004.jpg,face_images/Anne_McLellan/Anne_McLellan_0001.jpg
2,face_images/Aaron_Peirsol/Aaron_Peirsol_0003.jpg,face_images/Aaron_Peirsol/Aaron_Peirsol_0004.jpg,face_images/James_Blake/James_Blake_0010.jpg
3,face_images/Aaron_Peirsol/Aaron_Peirsol_0004.jpg,face_images/Aaron_Peirsol/Aaron_Peirsol_0003.jpg,face_images/Amanda_Beard/Amanda_Beard_0002.jpg
4,face_images/Aaron_Sorkin/Aaron_Sorkin_0001.jpg,face_images/Aaron_Sorkin/Aaron_Sorkin_0002.jpg,face_images/Oswaldo_Paya/Oswaldo_Paya_0001.jpg
...,...,...,...
9117,face_images/Zinedine_Zidane/Zinedine_Zidane_00...,face_images/Zinedine_Zidane/Zinedine_Zidane_00...,face_images/Mark_Richt/Mark_Richt_0001.jpg
9118,face_images/Zoran_Djindjic/Zoran_Djindjic_0001...,face_images/Zoran_Djindjic/Zoran_Djindjic_0002...,face_images/Roy_Moore/Roy_Moore_0003.jpg
9119,face_images/Zoran_Djindjic/Zoran_Djindjic_0002...,face_images/Zoran_Djindjic/Zoran_Djindjic_0004...,face_images/Hashim_Thaci/Hashim_Thaci_0001.jpg
9120,face_images/Zoran_Djindjic/Zoran_Djindjic_0003...,face_images/Zoran_Djindjic/Zoran_Djindjic_0004...,face_images/Adam_Sandler/Adam_Sandler_0003.jpg


In [11]:
# Save the triplet table; the row index is written as the first (unnamed) column.
df.to_csv('data.csv', index=True)