In [201]:
from PIL import Image
import os
import random
import sys
import shutil

import pandas as pd
import numpy as np
import torch

import torchvision
from torchvision import transforms

### IMFDB Data - Preliminary

Organize the raw IMFDB images into one folder per emotion label, split into training, validation, and test sets.

In [199]:
def _split_actors(actors, seed=50):
    """Map every actor name to 'training' (80%), 'validation' (10%) or 'test' (10%)."""
    # Sort before shuffling: os.listdir order is arbitrary, so the seed alone
    # would not give the same split on every machine.
    shuffled = sorted(actors)
    random.seed(seed)
    random.shuffle(shuffled)

    train_end = int(len(shuffled) * 0.8)
    validation_end = int(len(shuffled) * 0.9)

    assignment = {}
    for actor in shuffled[:train_end]:  # 80% training dataset
        assignment[actor] = 'training'
    for actor in shuffled[train_end:validation_end]:  # 10% validation dataset
        assignment[actor] = 'validation'
    for actor in shuffled[validation_end:]:  # 10% test dataset
        assignment[actor] = 'test'
    return assignment


def _find_emotion_column(first_row, known_emotions, start=10):
    """Return the first column position >= start whose first-row value is a known emotion, else None."""
    for position in range(start, len(first_row)):
        if first_row.iloc[position] in known_emotions:
            return position
    return None


def org_IMFDB(raw_data_path, output_path=None):
    """
    Copy the raw IMFDB images into split/label folders.

    The raw data is expected as ``raw_data_path/<actor>/<movie>/<movie>.txt``
    (tab-separated annotations, no header) next to
    ``raw_data_path/<actor>/<movie>/images/``. The following is done:

    - remove Disgust and Surprise images
    - split into each label
    - split into training, test, and validation datasets (by actor, so one
      actor never appears in two splits)

    Parameters
    ----------
    raw_data_path : str
        Folder containing one sub-folder per actor.
    output_path : str, optional
        Destination root. Defaults to ``<cwd>/data_IMFDB``. Images are copied
        to ``output_path/<split>/<label>/<image file>``; missing folders are
        created.

    Returns
    -------
    None

    Notes
    -----
    Processing is best-effort: a movie whose annotation file cannot be read or
    parsed is reported on stdout and skipped.
    """

    allowed_emotions = ['HAPPINESS', 'NEUTRAL', 'SADNESS', 'FEAR', 'ANGER']
    emotion_mapping = {'HAPPINESS':'Happy', 'NEUTRAL':'Neutral', 'SADNESS':'Sad', 'FEAR':'Fear','ANGER':'Anger'}
    # Labels that may appear in the annotation files, including the two we drop.
    known_emotions = allowed_emotions + ['SURPRISE', 'DISGUST']

    if output_path is None:
        output_path = os.path.join(os.getcwd(), 'data_IMFDB')

    # Only directories are actors; stray files (e.g. OS metadata) are ignored.
    actors = [entry for entry in os.listdir(raw_data_path)
              if os.path.isdir(os.path.join(raw_data_path, entry))]

    # First divide actors into train/validation/test
    actor_dict = _split_actors(actors)

    # Iterate through each actor
    for actor in actors:
        actor_path = os.path.join(raw_data_path, actor)
        # Iterate through each movie
        for movie in os.listdir(actor_path):
            movie_path = os.path.join(actor_path, movie)
            if not os.path.isdir(movie_path):
                continue

            text_file = os.path.join(movie_path, movie+'.txt')
            image_folder = os.path.join(movie_path, 'images')

            data = None  # reset so the error report never shows a previous movie's table
            try:
                data = pd.read_csv(text_file, header=None, sep='\t')

                # The emotion label normally sits in column 10 and the image
                # name in column 1, but some files carry extra leading columns
                # that shift both by the same amount. Find the emotion column
                # from the first row; the image column is always 9 to its left.
                emotion_col = _find_emotion_column(data.iloc[0], known_emotions)
                if emotion_col is None:
                    raise ValueError("no emotion column found in " + text_file)
                image_col = emotion_col - 9

                emotion_dict = dict(zip(data.iloc[:, image_col], data.iloc[:, emotion_col]))

                for image_file in os.listdir(image_folder):
                    emotion = emotion_dict.get(image_file)
                    if emotion in allowed_emotions:
                        # copy to new folder
                        target_folder = os.path.join(output_path,
                                                     actor_dict[actor],
                                                     emotion_mapping[emotion])
                        os.makedirs(target_folder, exist_ok=True)
                        shutil.copy(os.path.join(image_folder, image_file),
                                    os.path.join(target_folder, image_file))
            except Exception as exc:
                # Report and move on to the next movie.
                if data is not None:
                    print(data.head())
                print(actor, movie, type(exc))

    print("Data successfully moved from raw files folder split by actor and movie to label folders split by training, validation, and testing.")

### IMFDB Data - Cleaning

Load the organized images into datasets and apply the cleaning transforms.

In [207]:
torch.manual_seed(50)
def clean_IMFDB(raw_data_path):
    """
    Load the organised IMFDB folders as three image datasets.

    Every image goes through the same preprocessing:

    - convert to greyscale (1 channel)
    - resize and centre-crop to 48x48 pixels
    - convert to a tensor with pixel values in the 0-1 range

    `raw_data_path` must contain 'training', 'validation' and 'test'
    sub-folders. Returns the (train, validation, test) datasets, in that order.
    """

    preprocessing = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize(48),      # bring the image down to the 48-pixel scale
        transforms.CenterCrop(48),  # square 48x48 aspect
        transforms.ToTensor(),      # pixels to 0-1 range
    ])

    def load_split(split_folder):
        # One ImageFolder per split, all sharing the same preprocessing.
        return torchvision.datasets.ImageFolder(
            root=os.path.join(raw_data_path, split_folder),
            transform=preprocessing,
        )

    return tuple(load_split(split_folder)
                 for split_folder in ('training', 'validation', 'test'))

In [217]:
# Raw IMFDB download, resolved relative to the current working directory.
IMFDB_raw_path = os.path.join(os.getcwd(), "raw_facial_data_IMFDB")
# Organisation step, currently disabled; it copies images into data_IMFDB.
# org_IMFDB(IMFDB_raw_path)

# Organised copy (training/validation/test sub-folders) consumed by clean_IMFDB.
IMFDB_path = os.path.join(os.getcwd(), "data_IMFDB")
IMFDB_train, IMFDB_val, IMFDB_test = clean_IMFDB(IMFDB_path)

In [218]:
# Report how many images ended up in each IMFDB split.
for caption, split_dataset in (
    ("Number of IMFDB training images: ", IMFDB_train),
    ("Number of IMFDB validation images: ", IMFDB_val),
    ("Number of IMFDB test images: ", IMFDB_test),
):
    print(caption, len(split_dataset))

Number of IMFDB training images:  16392
Number of IMFDB validation images:  2147
Number of IMFDB test images:  2170


In [225]:
from collections import Counter
# Count samples per class index for each split.
# ImageFolder stores the class index of every sample in `.targets`, in sample
# order, so the labels can be counted without iterating the dataset (which
# would open and transform every image just to read its label).
train_counts = dict(Counter(IMFDB_train.targets))
val_counts = dict(Counter(IMFDB_val.targets))
test_counts = dict(Counter(IMFDB_test.targets))

['Anger', 'Fear', 'Happy', 'Neutral', 'Sad']
Number of train images in each class {0: 1975, 1: 471, 2: 5566, 3: 5859, 4: 2521}
Number of validation images in each class {0: 307, 1: 79, 2: 566, 3: 940, 4: 255}
Number of test images in each class {0: 213, 1: 17, 2: 651, 3: 855, 4: 434}


In [230]:
# Class names first (their position is the class index), then per-split counts.
print(IMFDB_train.classes)
for caption, class_counts in (
    ('Number of train images in each IMFDB class \t\t', train_counts),
    ('Number of validation images in each IMFDB class \t', val_counts),
    ('Number of test images in each IMFDB class \t\t', test_counts),
):
    print(caption, class_counts)

['Anger', 'Fear', 'Happy', 'Neutral', 'Sad']
Number of train images in each IMFDB class 		 {0: 1975, 1: 471, 2: 5566, 3: 5859, 4: 2521}
Number of validation images in each IMFDB class 	 {0: 307, 1: 79, 2: 566, 3: 940, 4: 255}
Number of test images in each IMFDB class 		 {0: 213, 1: 17, 2: 651, 3: 855, 4: 434}


### Kaggle Data

In [262]:
def convert_str_to_array(row, normalize=False, shape=(48, 48)):
    """
    Convert a string of space-separated pixel values in row major order into a 2D numpy array.

    Parameters
    ----------
    row : str
        Pixel values separated by single spaces, e.g. "0 255 17 ...".
    normalize : bool, default False
        When True, divide every value by 255 so the result is a float array
        in the 0-1 range; otherwise the integer values are returned as-is.
    shape : tuple of int, default (48, 48)
        (rows, columns) of the output image.

    Returns
    -------
    numpy.ndarray
        Array of the requested shape.

    Raises
    ------
    ValueError
        If the number of values in `row` does not match `shape`.
    """

    # Convert into 1D numpy array
    arr_1D = np.fromstring(row, dtype=int, sep=" ")

    # Convert into 2D numpy array with the requested shape (48 x 48 by default)
    arr_2D = np.reshape(arr_1D, shape)

    if normalize:
        arr_2D = arr_2D/255

    return arr_2D
    
def clean_Kaggle(raw_data_path):
    """
    Load and clean the Kaggle facial-expression CSV at `raw_data_path`.

    To clean the Kaggle data, we need to do the following:

    - remove Disgust and Surprise datasets
    - relabel the remaining emotions to the indices 0-4
    - normalize the pixels
    - set training data to training data
    - set public test data to validation data
    - set private test data to test data

    The CSV must provide the columns 'emotion', ' pixels' and ' Usage'
    (the latter two with a leading space).

    NOTE(review): the three split frames are built but only the training
    pixel arrays are printed; the function returns None, so the validation
    and test frames are currently discarded.
    """

    # Read from the path that was passed in, not from a notebook-level global.
    kaggle_raw = pd.read_csv(raw_data_path)

    # Remove disgust (1) and surprise(5) data
    kaggle_df = kaggle_raw.loc[(kaggle_raw.emotion != 1) & (kaggle_raw.emotion != 5)]

    # Relabel only the emotion column. The target indices follow the naming
    # used by print_stats (0 Angry, 1 Fear, 2 Happy, 3 Neutral, 4 Sad);
    # presumably the Kaggle codes are 0 angry, 2 fear, 3 happy, 4 sad,
    # 6 neutral -- TODO confirm against the dataset description.
    emotion_labels = {0:0, 2:1, 3:2, 4:4, 6:3}
    kaggle_df = kaggle_df.assign(emotion=kaggle_df['emotion'].map(emotion_labels))

    # Convert pixel values from space-separated pixel values in row major order to numpy arrays and normalize
    kaggle_df['normalized_pixels'] = kaggle_df[' pixels'].apply(lambda row: convert_str_to_array(row, normalize=True))

    # Split into training, validation, and test datasets
    train_df = kaggle_df[['emotion', 'normalized_pixels']].loc[kaggle_df[' Usage']=='Training']
    val_df = kaggle_df[['emotion', 'normalized_pixels']].loc[kaggle_df[' Usage']=='PublicTest']
    test_df = kaggle_df[['emotion', 'normalized_pixels']].loc[kaggle_df[' Usage']=='PrivateTest']

    train_values = train_df.normalized_pixels.values
    print(train_values)

    return

In [259]:
# Peek at the first training sample only: an (image tensor, class index) pair.
for i in IMFDB_train:
    print(i)
    break

(tensor([[[0.2039, 0.2314, 0.2549,  ..., 0.0902, 0.0824, 0.0784],
         [0.1608, 0.2000, 0.2314,  ..., 0.0980, 0.0902, 0.0863],
         [0.1137, 0.1569, 0.2039,  ..., 0.1098, 0.1020, 0.0980],
         ...,
         [0.1059, 0.1294, 0.1569,  ..., 0.1608, 0.1608, 0.1608],
         [0.0706, 0.1059, 0.1451,  ..., 0.1529, 0.1529, 0.1569],
         [0.0667, 0.1020, 0.1412,  ..., 0.1529, 0.1529, 0.1529]]]), 0)


In [263]:
# Kaggle CSV location, resolved relative to the current working directory.
kaggle_raw_path = os.path.join(os.getcwd(), "raw_facial_data_Kaggle","icml_face_data.csv")

# Runs the Kaggle cleaning step; it prints the training pixel arrays and returns nothing.
clean_Kaggle(kaggle_raw_path)

[array([[0.2745098 , 0.31372549, 0.32156863, ..., 0.20392157, 0.16862745,
        0.16078431],
       [0.25490196, 0.23921569, 0.22745098, ..., 0.21960784, 0.20392157,
        0.17254902],
       [0.19607843, 0.16862745, 0.21176471, ..., 0.19215686, 0.21960784,
        0.18431373],
       ...,
       [0.35686275, 0.25490196, 0.16470588, ..., 0.28235294, 0.21960784,
        0.16862745],
       [0.30196078, 0.32156863, 0.30980392, ..., 0.41176471, 0.2745098 ,
        0.18039216],
       [0.30196078, 0.28235294, 0.32941176, ..., 0.41568627, 0.42745098,
        0.32156863]])
 array([[0.59215686, 0.58823529, 0.57647059, ..., 0.50588235, 0.54901961,
        0.47058824],
       [0.59215686, 0.58431373, 0.58431373, ..., 0.47843137, 0.55294118,
        0.5372549 ],
       [0.59215686, 0.59215686, 0.61176471, ..., 0.42745098, 0.48235294,
        0.57254902],
       ...,
       [0.7372549 , 0.7372549 , 0.4745098 , ..., 0.7254902 , 0.7254902 ,
        0.72941176],
       [0.7372549 , 0.73333333, 0

In [248]:
# NOTE(review): kaggle_train_df is not assigned in any visible cell -- presumably
# left over from an earlier kernel session; confirm it exists before running.
kaggle_train_df.head()

Unnamed: 0,emotion,normalized_pixels
0,0,"[[0.27450980392156865, 0.3137254901960784, 0.3..."
1,0,"[[0.592156862745098, 0.5882352941176471, 0.576..."
2,1,"[[0.9058823529411765, 0.8313725490196079, 0.61..."
3,4,"[[0.09411764705882353, 0.12549019607843137, 0...."
4,3,"[[0.01568627450980392, 0.0, 0.0, 0.0, 0.0, 0.0..."


In [231]:
def print_stats(data, dataset):
    """
    Print the number of rows per emotion in a Kaggle split.

    `data` is a DataFrame with an integer `emotion` column (0-4); `dataset`
    is the split name inserted into the printed message.
    """
    label_names = {0:'Angry', 1:'Fear', 2:'Happy', 3:'Neutral', 4:'Sad'}

    # One entry per label, in label order; labels with no rows report 0.
    counts_by_name = {
        name: len(data[data.emotion == code])
        for code, name in label_names.items()
    }

    print("Number of", dataset, "images in Kaggle dataset:\t", counts_by_name)

In [251]:
# Report how many rows each Kaggle split holds.
for caption, split_frame in (
    ("Number of Kaggle training images: ", kaggle_train_df),
    ("Number of Kaggle validation images: ", kaggle_val_df),
    ("Number of Kaggle test images: ", kaggle_test_df),
):
    print(caption, len(split_frame))

Number of Kaggle training images:  25102
Number of Kaggle validation images:  3118
Number of Kaggle test images:  3118


In [234]:
# Per-emotion counts for each Kaggle split.
for split_frame, split_name in (
    (kaggle_train_df, 'training'),
    (kaggle_val_df, 'validation'),
    (kaggle_test_df, 'test'),
):
    print_stats(split_frame, split_name)

Number of training images in Kaggle dataset:	 {'Angry': 3995, 'Fear': 4097, 'Happy': 7215, 'Sad': 4830, 'Neutral': 4965}
Number of validation images in Kaggle dataset:	 {'Angry': 467, 'Fear': 496, 'Happy': 895, 'Sad': 653, 'Neutral': 607}
Number of test images in Kaggle dataset:	 {'Angry': 491, 'Fear': 528, 'Happy': 879, 'Sad': 594, 'Neutral': 626}
