<a href="https://colab.research.google.com/github/hurricane195/Intro-to-Machine-Learning/blob/Final-Project/faceScrub_dataset_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import random
import shutil
import os
import numpy as np
from pathlib import Path
from PIL import Image
from random import randrange
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
import pandas as pd
import torch
from torch.utils.data import DataLoader, ConcatDataset
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from google.colab import drive

In [None]:
# Google Drive is mounted at this location so the notebook can read the raw files.
path_drive = '/content/drive'
drive.mount(path_drive)

# Input locations on the mounted drive.
path_dataset = '{}/MyDrive/faceScrub'.format(path_drive)  # path to the original faceScrub dataset
path_actors_faces = '{}/faces/'.format(path_dataset)  # folder scanned for one sub-folder per actor
path_faces_file = '{}/faces.txt'.format(path_dataset)  # tab-separated listing of the scanned faces

# Output location: change this to whatever directory you wish to store your output dataset.
path_output_dataset = './data'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def create_directory(dataset, dataset_dir, resolution, verbose=1):
    """Resize every face in ``dataset`` and save it as a PNG under ``dataset_dir``.

    Each image is written to ``<dataset_dir>/<actor name>/<source stem>.png``,
    i.e. one sub-folder per actor.

    Args:
        dataset: iterable of dicts with the keys ``'name'`` (actor name, used as
            the sub-folder name) and ``'face'`` (path of the source image).
        dataset_dir: directory to create for this split (parents are created).
        resolution: two-item size passed to ``PIL.Image.Image.resize``.
        verbose: falsy = silent, truthy = announce the directory,
            ``2`` = additionally announce every image.

    Raises:
        FileExistsError: if ``dataset_dir`` already exists. This is deliberate:
            writing into a stale split folder would silently mix old and new
            images.

    Side effects:
        Every dict in ``dataset`` gains an entry keyed by the actor name whose
        value is the path of the saved PNG (see the review note below).
    """
    split_dir = Path(dataset_dir)
    if verbose:
        print('Creating {} directory...'.format(dataset_dir))
    split_dir.mkdir(parents=True)

    target_size = tuple(resolution)
    for face in dataset:
        actor_name = face['name']
        source_path = Path(face['face'])

        # Several faces share one actor folder, so an existing folder is fine.
        # Any other failure (permissions, invalid name, ...) is no longer hidden.
        actor_dir = split_dir / actor_name
        actor_dir.mkdir(exist_ok=True)

        # The context manager closes the source file once the resized copy exists.
        with Image.open(face['face']) as source_img:
            if verbose == 2:
                print('Resizing {} '.format(str(source_path.name)))
            # The previous suffix test (`!= '.jpg' or != '.jpeg'`) was always
            # true, so every image was converted. Checking the mode produces the
            # same RGB output and skips a redundant copy for RGB sources.
            rgb_img = source_img if source_img.mode == 'RGB' else source_img.convert('RGB')
            resized_img = rgb_img.resize(target_size)

        save_path = str(actor_dir / source_path.stem) + '.png'
        resized_img.save(save_path, 'PNG')

        # NOTE(review): the saved path is stored under a key named after the
        # actor, not under a fixed key. Kept as-is for backward compatibility;
        # confirm whether a fixed key was intended.
        face[actor_name] = save_path



def create_info_file(dataset, dataset_dir, file_name, verbose=1):
    """Dump ``dataset`` as a tab-separated table at ``dataset_dir/file_name``.

    The column names are the keys of the first record; a header row is written
    first, followed by one row per record.

    Args:
        dataset: non-empty sequence of dicts that all share the first record's keys.
        dataset_dir: directory in which the file is written.
        file_name: name of the file to create (overwritten if present).
        verbose: when truthy, announce the file being created.
    """
    if verbose:
        print('Creating {} file...'.format(file_name))

    target = Path(dataset_dir) / file_name
    # newline='' lets the csv module control the line endings itself.
    with target.open('w', newline='') as handle:
        table = csv.DictWriter(handle, fieldnames=dataset[0].keys(), delimiter='\t')
        table.writeheader()
        table.writerows(dataset)



def _is_rgb_image(image_path):
    """Return True when PIL reports mode ``'RGB'`` for the image at ``image_path``."""
    # The context manager releases the file handle right away, so scanning
    # thousands of faces does not accumulate open files.
    with Image.open(image_path) as img:
        return img.mode == 'RGB'


def _take_faces(faces, start, size, excluded_actors=frozenset()):
    """Pick up to ``size`` RGB faces from ``faces[start:]``, skipping excluded actors.

    Returns ``(selected, next_start)`` where ``next_start`` is the index of the
    first face that was not examined, so consecutive calls never overlap.
    """
    selected = []
    position = start
    while position < len(faces) and len(selected) < size:
        face = faces[position]
        position += 1
        # The actor check comes first so excluded faces are never opened.
        if face['name'] not in excluded_actors and _is_rgb_image(face['face']):
            selected.append(face)
    return selected, position


def create_datasets(train_size, val_size, test_size, resolution, verbose=1, seed=None):
    """Build resized train/validation/test image folders from the faceScrub faces.

    The list of faces is read from ``path_faces_file`` when that file exists;
    otherwise ``path_actors_faces`` is scanned (one sub-folder per actor) and the
    result is cached to ``path_faces_file``. The list is shuffled and consumed
    front to back: first the test set, then the validation set, then the
    training set. Only images whose PIL mode is ``'RGB'`` are used. Validation
    and training faces never belong to an actor present in the test set
    (validation and training may share actors). If too few faces qualify, a
    split is simply smaller than requested.

    Images are written by ``create_directory`` to the ``test``, ``valid`` and
    ``train`` sub-folders of ``path_output_dataset``; existing folders with
    those names are removed first.

    Args:
        train_size: maximum number of training faces.
        val_size: maximum number of validation faces.
        test_size: maximum number of test faces.
        resolution: two-item size passed on to ``create_directory``.
        verbose: verbosity level, forwarded to the helper functions.
        seed: optional seed for a reproducible shuffle. ``None`` (the default)
            uses the global ``random`` state, as before.

    Returns:
        Tuple ``(training_set, validation_set, test_set)`` of lists of face dicts.

    Raises:
        ValueError: if no faces are found at all.
    """
    # Read download directory
    if verbose:
        print('\nReading actors folder...')

    faces = []  # one dict per image: {'name': actor, 'face': image path}

    faces_file = Path(path_faces_file)
    if faces_file.is_file():
        # A previous run already cached the directory listing.
        with open(path_faces_file, newline='') as faces_handle:
            faces.extend(csv.DictReader(faces_handle, delimiter='\t'))
    else:
        # it takes some minutes to scan the whole directory!
        for actor_entry in tqdm(Path(path_actors_faces).iterdir(), desc ="Reading faces"):
            if actor_entry.is_dir():  # read only directories
                for face_entry in actor_entry.iterdir():
                    faces.append({'name': actor_entry.name, 'face': str(face_entry)})

    if not faces:
        # Previously this surfaced as an IndexError/NameError further down.
        raise ValueError(
            'No faces found in {} or {}'.format(path_faces_file, path_actors_faces))

    if not faces_file.is_file():
        # Cache the scan so later runs can skip it.
        create_info_file(faces, str(faces_file.parent), faces_file.name, verbose=verbose)

    # Shuffle list
    if seed is None:
        random.shuffle(faces)
    else:
        random.Random(seed).shuffle(faces)

    # Clear stale output. Only the three split folders under the configured
    # output directory are removed (previously a hard-coded 'data' folder was
    # wiped, which ignored `path_output_dataset`).
    for split_name in ('test', 'valid', 'train'):
        split_dir = Path(path_output_dataset) / split_name
        if split_dir.is_dir():
            shutil.rmtree(split_dir)

    if verbose:
        print('\nCreating test set...')
    test_set, cursor = _take_faces(faces, 0, test_size)
    test_actors = {face['name'] for face in test_set}  # set: O(1) membership tests
    create_directory(test_set, path_output_dataset + '/test/', resolution, verbose=verbose)

    if verbose:
        print('\nCreating validation set...')
    # Continue after the last face examined for the test set; no test actor allowed.
    validation_set, cursor = _take_faces(faces, cursor, val_size, test_actors)
    create_directory(validation_set, path_output_dataset + '/valid/', resolution, verbose=verbose)

    if verbose:
        print('\nCreating training set...')
    # Continue after the validation faces; again no test actor allowed.
    training_set, cursor = _take_faces(faces, cursor, train_size, test_actors)
    create_directory(training_set, path_output_dataset + '/train/', resolution, verbose=verbose)

    if verbose:
        print('\nDatasets created sucessfully!\n')

    return training_set, validation_set, test_set


In [None]:
# Create dataset: 1200 training, 300 validation and 300 test faces at 128x128.
train_set, valid_set, test_set = create_datasets(
    train_size = 1200,
    val_size = 300,
    test_size = 300,
    resolution = (128, 128))
# The training list used to be bound to the misspelled name `tain_set`;
# keep that name available for any cell that still refers to it.
tain_set = train_set


Reading actors folder...

Creating test set...
Creating ./data/test/ directory...

Creating validation set...
Creating ./data/valid/ directory...

Creating training set...
Creating ./data/train/ directory...

Datasets created sucessfully!



In [None]:
 # Load data from folders
train_dir = path_output_dataset + '/train'
valid_dir = path_output_dataset + '/valid'

# Without a transform ImageFolder yields PIL images, which the DataLoader's
# default collate function cannot stack into a batch. ToTensor converts every
# image to a tensor so the loaders below can be iterated.
# NOTE(review): each ImageFolder derives its class indices from its own
# sub-folder names, so labels only line up between the two datasets if both
# folders contain the same actors -- confirm before training on them.
train_dataset = datasets.ImageFolder(
    train_dir,
    transform=transforms.ToTensor()
)
valid_dataset = datasets.ImageFolder(
    valid_dir,
    transform=transforms.ToTensor()
)

# Create data loaders (only the training data is shuffled)
train_loader = DataLoader(
    train_dataset, batch_size=128, shuffle=True,
    num_workers=0)
valid_loader = DataLoader(
    valid_dataset, batch_size=128, shuffle=False,
    num_workers=0)

In [None]:
# Display the ImageFolder summary of the training split (datapoint count, root folder).
train_dataset

Dataset ImageFolder
    Number of datapoints: 1200
    Root location: ./data/train

In [None]:
# Display the ImageFolder summary of the validation split (datapoint count, root folder).
valid_dataset

Dataset ImageFolder
    Number of datapoints: 300
    Root location: ./data/valid