**This file sets up the LFW dataset for training with a Siamese network**

Download the Labeled Faces in the Wild (LFW) dataset:
http://vis-www.cs.umass.edu/lfw/#download

Then, run this notebook in the same directory as your dataset to create the following files: 

- `lfw_names.csv`: a CSV file containing names of all people that have more than one image in the LFW dataset, as well as a list of all files for each name
- `LFW_pos_pairs.csv`: a CSV file listing info for randomly selected positive (matching) image pairs (i.e., two different images of the same person)
- `LFW_neg_pairs.csv`: a CSV file listing info for randomly selected negative (nonmatching) image pairs
- `LFW_info.csv`: a CSV file that combines `LFW_neg_pairs.csv` and `LFW_pos_pairs.csv`, and labels them according to whether or not they're of the same person, and whether or not they're in the training set (80% train : 20% test)
- `train_info.csv`: the training part of `LFW_info.csv`
- `test_info.csv`: the test / validation part of `LFW_info.csv`

This notebook will also create the following subdirectories:

- `/faces`: a folder containing all images listed in `lfw_names.csv`
- `/LFW_train`: a folder containing all images in the training set, as specified in `train_info.csv`
- `/LFW_test`: a folder containing all images in the testing or validation set, as specified in `test_info.csv`


**Note**: re-running this file will overwrite any folders or files in this directory that have the same name as those above

The folders and files needed for training are:

- `train_info.csv`
- `test_info.csv`
- `/LFW_train`
- `/LFW_test`

In [1]:
# unzip LFW dataset
# !tar zxvf lfw.tgz

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from subprocess import call
import os

In [3]:
path = Path("./lfw")
folders = os.listdir(path)

len(folders)

5749

In [7]:
# remove folders with less than 2 photos
for folder in folders:
    length = len(os.listdir(path/folder))
    if length < 2:
        call(f"rm -r {path}/{folder}".split()) # remove folder

folders = os.listdir(path)
len(folders)

1680

In [36]:
lfw_names = pd.DataFrame(pd.Series(folders), columns=["names"])

lfw_names.head()

Unnamed: 0,names
0,Sharon_Davis
1,Lauren_Killian
2,Hassan_Nasrallah
3,Rick_Wagoner
4,Javier_Solana


In [37]:
os.listdir(path/folders[0])

['Sharon_Davis_0001.jpg', 'Sharon_Davis_0002.jpg']

In [38]:
# create a list files for each name
files = []
for folder in folders:
    files.append(os.listdir(path/folder))

In [41]:
# add the list to lfw_names
lfw_names["files"] = files

In [42]:
lfw_names

Unnamed: 0,names,files
0,Sharon_Davis,"[Sharon_Davis_0001.jpg, Sharon_Davis_0002.jpg]"
1,Lauren_Killian,"[Lauren_Killian_0001.jpg, Lauren_Killian_0002...."
2,Hassan_Nasrallah,"[Hassan_Nasrallah_0002.jpg, Hassan_Nasrallah_0..."
3,Rick_Wagoner,"[Rick_Wagoner_0001.jpg, Rick_Wagoner_0002.jpg]"
4,Javier_Solana,"[Javier_Solana_0006.jpg, Javier_Solana_0002.jp..."
...,...,...
1675,Nancy_Pelosi,"[Nancy_Pelosi_0006.jpg, Nancy_Pelosi_0011.jpg,..."
1676,John_McEnroe,"[John_McEnroe_0002.jpg, John_McEnroe_0001.jpg]"
1677,Martin_Scorsese,"[Martin_Scorsese_0001.jpg, Martin_Scorsese_000..."
1678,Ernesto_Zedillo,"[Ernesto_Zedillo_0002.jpg, Ernesto_Zedillo_000..."


In [43]:
lfw_names.to_csv("lfw_names.csv", index=False) # save to csv

In [58]:
# create an empty folder `./faces`
call("rm -r faces".split())
call("mkdir faces".split()) 

0

In [71]:
# move all files to folder `./faces`
for (i, folder) in enumerate(folders):
    filelist = lfw_names.iloc[i,1]
    for file in filelist:
        call(f"cp {path}/{folder}/{file} faces".split())

In [74]:
len(os.listdir("faces"))

9164

In [121]:
total_samples = 21796 # numbers based on Asian Politician Dataset
total_neg_pairs = 10898
total_pos_pairs = 10898

# MAKE NEG PAIRS DF
length = len(lfw_names)
names = lfw_names["names"].tolist()

neg_pairs = []

for _ in range(total_neg_pairs):
    # get files for one name
    idx1 = np.random.randint(0, high=length) # name1 id
    name1_files = files[idx1]
    
    # get files for another name
    idx2 = np.random.randint(0, high=length) # name2 id
    
    # choose name2 id again if matching name1 id
    while (idx1 == idx2):
        idx2 = np.random.randint(0, high=length)
        
    name2_files = files[idx2]
        
    # randomly select a file for each name
    file_id1 = np.random.randint(0, high=len(name1_files))
    file_id2 = np.random.randint(0, high=len(name2_files))
    
    file1 = name1_files[file_id1]
    file2 = name2_files[file_id2]
    neg_pairs.append([file1, file2, False])
    

In [123]:
neg_df = pd.DataFrame(neg_pairs, columns=["file1","file2","label"])

In [124]:
neg_df

Unnamed: 0,file1,file2,label
0,Jan-Michael_Gambill_0001.jpg,Kristin_Davis_0002.jpg,False
1,Jean-David_Levitte_0002.jpg,Geoff_Hoon_0003.jpg,False
2,Vince_Gill_0001.jpg,Don_Nickles_0002.jpg,False
3,Ronaldo_Luis_Nazario_de_Lima_0003.jpg,Kofi_Annan_0002.jpg,False
4,Tiger_Woods_0003.jpg,Aron_Ralston_0002.jpg,False
...,...,...,...
10893,Lindsay_Benko_0002.jpg,Paul_McCartney_0002.jpg,False
10894,Jelena_Dokic_0002.jpg,Jacques_Chirac_0027.jpg,False
10895,Pete_Rose_0002.jpg,Sebastien_Grosjean_0001.jpg,False
10896,Andre_Agassi_0013.jpg,Rebekah_Chantay_Revels_0004.jpg,False


In [130]:
neg_df.to_csv("LFW_neg_pairs.csv", index=False)

In [133]:
total_samples = 21796
total_neg_pairs = 10898
total_pos_pairs = 10898

# MAKE POS PAIRS DF
length = len(lfw_names)
names = lfw_names["names"].tolist()

pos_pairs = []

for _ in range(total_pos_pairs):
    # get files for one name
    idx = np.random.randint(0, high=length) # name1 id
    name_files = files[idx]
        
    # randomly select two files for that name
    file_id1 = np.random.randint(0, high=len(name_files))
    file_id2 = np.random.randint(0, high=len(name_files))
    
    # ensure no duplicates
    while (file_id1 == file_id2):
        file_id2 = np.random.randint(0, high=len(name_files))
    
    file1 = name_files[file_id1]
    file2 = name_files[file_id2]
    pos_pairs.append([file1, file2, True])
    

In [134]:
pos_df = pd.DataFrame(pos_pairs, columns=["file1","file2","label"])
pos_df.head()

Unnamed: 0,file1,file2,label
0,Norm_Coleman_0003.jpg,Norm_Coleman_0007.jpg,True
1,Carlos_Moya_0015.jpg,Carlos_Moya_0011.jpg,True
2,Michael_Ballack_0001.jpg,Michael_Ballack_0002.jpg,True
3,Anthony_Hopkins_0002.jpg,Anthony_Hopkins_0001.jpg,True
4,Angela_Bassett_0006.jpg,Angela_Bassett_0005.jpg,True


In [135]:
pos_df.to_csv("LFW_pos_pairs.csv", index=False)

In [138]:
combined = pd.concat([pos_df, neg_df], ignore_index=True)
combined

Unnamed: 0,file1,file2,label
0,Norm_Coleman_0003.jpg,Norm_Coleman_0007.jpg,True
1,Carlos_Moya_0015.jpg,Carlos_Moya_0011.jpg,True
2,Michael_Ballack_0001.jpg,Michael_Ballack_0002.jpg,True
3,Anthony_Hopkins_0002.jpg,Anthony_Hopkins_0001.jpg,True
4,Angela_Bassett_0006.jpg,Angela_Bassett_0005.jpg,True
...,...,...,...
21791,Lindsay_Benko_0002.jpg,Paul_McCartney_0002.jpg,False
21792,Jelena_Dokic_0002.jpg,Jacques_Chirac_0027.jpg,False
21793,Pete_Rose_0002.jpg,Sebastien_Grosjean_0001.jpg,False
21794,Andre_Agassi_0013.jpg,Rebekah_Chantay_Revels_0004.jpg,False


In [139]:
combined["train"] = True

In [143]:
# select 20% of combined neg and pos pairs, and declare them as part of the test/validation set
combined.loc[combined.sample(frac=0.2).index, "train"] = False

In [148]:
# grab all file names for training and validation, respectively
file1_valid = combined[combined["train"] == False]["file1"]
file2_valid = combined[combined["train"] == False]["file2"]

file1_train = combined[combined["train"] == True]["file1"]
file2_train = combined[combined["train"] == True]["file2"]

In [151]:
valid_files = pd.concat([file1_valid, file2_valid]).drop_duplicates()
train_files = pd.concat([file1_train, file2_train]).drop_duplicates()

In [164]:
# create empty folders `/LFW_train` and `/LFW_test`
call("rm -r LFW_train".split())
call("mkdir LFW_train".split())

call("rm -r LFW_test".split())
call("mkdir LFW_test".split())



0

In [166]:
# copy all files not in training to `/LFW_test`
for valid_file in valid_files:
    call(f"cp faces/{valid_file} LFW_test".split())

In [167]:
# copy all files in training to `/LFW_train`
for train_file in train_files:
    call(f"cp faces/{train_file} LFW_train".split())

In [175]:
# shuffle combined
combined = combined.sample(frac=1).reset_index(drop=True)

In [176]:
# save info to CSV
combined.to_csv("LFW_info.csv", index=False)

In [168]:
# check folder sizes
len(os.listdir("LFW_train")), len(os.listdir("LFW_test"))

(6713, 4368)

In [169]:
len(train_files), len(valid_files)

(6713, 4368)

In [177]:
# Save info for test and train folders separately as well

mask = combined["train"] == True
train_info = combined[mask]

mask = combined["train"] == False
test_info = combined[mask]

train_info = train_info.drop(["train"], axis=1).reset_index(drop=True)
test_info = test_info.drop(["train"], axis=1).reset_index(drop=True)

train_info.to_csv("train_info.csv")
test_info.to_csv("test_info.csv")

test_info.head()

Unnamed: 0,file1,file2,label
0,Valentino_Rossi_0003.jpg,Valentino_Rossi_0001.jpg,True
1,Kevin_Costner_0004.jpg,Kevin_Costner_0007.jpg,True
2,Vladimiro_Montesinos_0003.jpg,Samira_Makhmalbaf_0001.jpg,False
3,Cate_Blanchett_0003.jpg,Oscar_Elias_Biscet_0002.jpg,False
4,Jorge_Arce_0001.jpg,Jorge_Arce_0002.jpg,True


In [178]:
train_info.head()

Unnamed: 0,file1,file2,label
0,Ludivine_Sagnier_0001.jpg,Robert_De_Niro_0003.jpg,False
1,John_Timoney_0002.jpg,Chok_Tong_Goh_0002.jpg,False
2,Sam_Bith_0002.jpg,Bob_Geldof_0002.jpg,False
3,Joan_Claybrook_0001.jpg,Spencer_Abraham_0003.jpg,False
4,Jacqueline_Obradors_0001.jpg,Jacqueline_Obradors_0002.jpg,True
