In [1]:
from PIL import Image
import os
import random
import sys
import shutil

import pandas as pd
import numpy as np
import torch


from PIL import Image
from torch.utils.data import DataLoader, Dataset, ConcatDataset

import torchvision
from torchvision import transforms

### IMFDB Data - Preliminary

Organize the raw IMFDB images into `data_IMFDB/<split>/<emotion>` folders, where the split is training, validation, or test and every actor belongs to exactly one split.

In [2]:
def _read_IMFDB_labels(text_file):
    """
    Parse one IMFDB annotation file into an ``{image file name: emotion}`` dict.

    The file is read as tab-separated text without a header. The image name is
    expected in column 1 and the emotion in column 10, but some files carry
    extra leading columns; in that case both columns are shifted right by the
    same amount. The shift is detected on the first row by looking for the
    first column (starting at 10) that holds a known emotion label.

    Raises:
        ValueError: if no column of the first row holds a known emotion.
        Any exception raised by ``pandas.read_csv`` is propagated.
    """
    known_emotions = ('HAPPINESS', 'NEUTRAL', 'SADNESS', 'FEAR', 'ANGER', 'SURPRISE', 'DISGUST')

    data = pd.read_csv(text_file, header=None, sep='\t')
    first_row = data.iloc[0]

    # With header=None the column labels are the integer positions, so the
    # shifted columns can be addressed directly instead of being renamed.
    for shift in range(len(data.columns) - 10):
        if first_row.iloc[10 + shift] in known_emotions:
            return dict(zip(data[1 + shift], data[10 + shift]))

    raise ValueError("no emotion column found in " + text_file)


def org_IMFDB(raw_data_path):
    """
    Organize the raw IMFDB data into ``data_IMFDB/<split>/<emotion>`` folders.

    - remove Disgust and Surprise images (only the allowed emotions are copied)
    - split into each label
    - split into training, validation, and test datasets (80% / 10% / 10% of
      the actors, so one actor never appears in two splits)

    Args:
        raw_data_path: folder containing one sub-folder per actor; each actor
            folder contains one sub-folder per movie with ``<movie>.txt`` and
            an ``images`` folder.

    Side effects:
        Copies the selected images below ``<cwd>/data_IMFDB`` (folders are
        created on demand) and prints one line per movie that could not be
        processed. Such movies are skipped; the function does not raise for
        them.
    """

    allowed_emotions = ['HAPPINESS', 'NEUTRAL', 'SADNESS', 'FEAR', 'ANGER']
    emotion_mapping = {'HAPPINESS':'Happy', 'NEUTRAL':'Neutral', 'SADNESS':'Sad', 'FEAR':'Fear','ANGER':'Anger'}
    output_root = os.path.join(os.getcwd(), 'data_IMFDB')

    random.seed(50)

    # First divide actors into train/validation/test.
    # os.listdir returns entries in arbitrary order, so sort before the seeded
    # shuffle to get the same split on every machine; stray files are ignored.
    actors = sorted(
        entry for entry in os.listdir(raw_data_path)
        if os.path.isdir(os.path.join(raw_data_path, entry))
    )
    random.shuffle(actors)

    train_end = int(len(actors) * 0.8)
    val_end = int(len(actors) * 0.9)

    actor_dict = {}
    for actor in actors[:train_end]:         # 80% training dataset
        actor_dict[actor] = 'training'
    for actor in actors[train_end:val_end]:  # 10% validation dataset
        actor_dict[actor] = 'validation'
    for actor in actors[val_end:]:           # 10% test dataset
        actor_dict[actor] = 'test'

    # Iterate through each actor
    for actor in actors:
        actor_path = os.path.join(raw_data_path, actor)

        # Iterate through each movie
        for movie in os.listdir(actor_path):
            movie_path = os.path.join(actor_path, movie)
            if not os.path.isdir(movie_path):
                continue

            text_file = os.path.join(movie_path, movie + '.txt')
            image_folder = os.path.join(movie_path, 'images')

            try:
                emotion_dict = _read_IMFDB_labels(text_file)

                for image_file in os.listdir(image_folder):
                    emotion = emotion_dict.get(image_file)
                    if emotion not in allowed_emotions:
                        continue

                    # copy to new folder, creating it on first use
                    target_dir = os.path.join(output_root, actor_dict[actor], emotion_mapping[emotion])
                    os.makedirs(target_dir, exist_ok=True)
                    shutil.copy(os.path.join(image_folder, image_file),
                                os.path.join(target_dir, image_file))
            except Exception as exc:
                # Best effort: report the movie that failed and carry on.
                print(actor, movie, type(exc), exc)

    print("Data successfully moved from raw files folder split by actor and movie to label folders split by training, validation, and testing.")

### IMFDB Data - Cleaning

Load the organized images into datasets and apply the cleaning transforms (grayscale, 48x48 resize and crop, tensor conversion).

In [31]:
class FacialDataset(Dataset):
    """
    Dataset backed by a DataFrame whose first column (position 0) holds the
    label and whose second column (position 1) holds the image.
    """

    def __init__(self, df, transform=None):
        # Optional callable applied to the image each time a sample is read.
        self.transform = transform
        self.data = df

    def __len__(self):
        """Number of samples, i.e. rows of the backing frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the row at position ``index``."""
        sample = self.data.iloc[index, 1]
        target = self.data.iloc[index, 0]

        if self.transform is None:
            return sample, target

        return self.transform(sample), target

In [65]:
torch.manual_seed(50)


def _load_IMFDB_split(split_dir, data_transforms):
    """
    Load one split folder (one sub-folder per class) into a FacialDataset.

    Every image is run through ``data_transforms`` once and cached in a
    DataFrame as ``(label, image array)`` rows.
    """
    image_folder = torchvision.datasets.ImageFolder(root=split_dir, transform=data_transforms)

    # data_transforms yields a (1, H, W) tensor. Drop the channel axis before
    # caching: ToTensor reads a 3-D ndarray as H x W x C, so a (1, 48, 48)
    # array would come back as (48, 1, 48). A 2-D array comes back as
    # (1, 48, 48), which matches the Kaggle samples.
    records = [(label, image.squeeze(0).numpy()) for image, label in image_folder]

    return FacialDataset(pd.DataFrame(records), transform=transforms.ToTensor())


def clean_IMFDB(raw_data_path):
    """
    To clean the IMFDB data, we need to do the following:

    - resize datasets to 48x48 pixels
    - convert to greyscale (1 channel)
    - normalize the pixel values

    Args:
        raw_data_path: folder containing the ``training``, ``validation`` and
            ``test`` sub-folders, each laid out as one folder per class.

    Returns:
        ``(train_dataset, val_dataset, test_dataset)`` as FacialDataset objects.
    """

    data_transforms = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize(48), # Scale the shorter side to 48 pixels
        transforms.CenterCrop(48), # Crop to a 48x48 square
        transforms.ToTensor() # Convert pixels to 0-1 range
    ])

    train_dataset, val_dataset, test_dataset = (
        _load_IMFDB_split(os.path.join(raw_data_path, split), data_transforms)
        for split in ('training', 'validation', 'test')
    )

    return train_dataset, val_dataset, test_dataset

In [66]:
# Raw IMFDB download, resolved relative to the current working directory.
IMFDB_raw_path = os.path.join(os.getcwd(), "raw_facial_data_IMFDB")
# NOTE(review): the organisation step is disabled here; presumably it was run
# once to populate data_IMFDB and must be re-enabled on a fresh checkout - confirm.
# org_IMFDB(IMFDB_raw_path)

# Organized images (training/validation/test sub-folders) read by clean_IMFDB.
IMFDB_path = os.path.join(os.getcwd(), "data_IMFDB")
IMFDB_train, IMFDB_val, IMFDB_test = clean_IMFDB(IMFDB_path)

In [67]:
# Report how many samples ended up in each IMFDB split.
for _heading, _subset in (
    ("Number of IMFDB training images: ", IMFDB_train),
    ("Number of IMFDB validation images: ", IMFDB_val),
    ("Number of IMFDB test images: ", IMFDB_test),
):
    print(_heading, len(_subset))

Number of IMFDB training images:  16392
Number of IMFDB validation images:  2147
Number of IMFDB test images:  2170


In [68]:
from collections import Counter
# Tally labels per split; each sample is an (image, label) pair.
train_counts = dict(Counter(label for _, label in IMFDB_train))
val_counts = dict(Counter(label for _, label in IMFDB_val))
test_counts = dict(Counter(label for _, label in IMFDB_test))

In [69]:
# Show the per-class totals for each IMFDB split.
for _heading, _counts in (
    ('Number of train images in each IMFDB class \t\t', train_counts),
    ('Number of validation images in each IMFDB class \t', val_counts),
    ('Number of test images in each IMFDB class \t\t', test_counts),
):
    print(_heading, _counts)

Number of train images in each IMFDB class 		 {0: 1975, 1: 471, 2: 5566, 3: 5859, 4: 2521}
Number of validation images in each IMFDB class 	 {0: 307, 1: 79, 2: 566, 3: 940, 4: 255}
Number of test images in each IMFDB class 		 {0: 213, 1: 17, 2: 651, 3: 855, 4: 434}


### Kaggle Data

In [8]:
def convert_str_to_array(row, shape=(48, 48)):
    """
    Convert a string of space-separated pixel values in row major order into a
    normalized PIL image.

    Args:
        row: string of space-separated integer pixel values.
        shape: ``(height, width)`` of the image; defaults to 48 x 48. The
            number of values in ``row`` must equal ``height * width``.

    Returns:
        A PIL image built from the reshaped pixel values divided by 255.

    Raises:
        ValueError: from ``np.reshape`` if the values do not fit ``shape``.
    """

    # Convert into 1D numpy array
    arr_1D = np.fromstring(row, dtype=int, sep=" ")

    # Convert into normalized 2D numpy array with the requested shape
    arr_2D = np.reshape(arr_1D, shape)/255

    return Image.fromarray(arr_2D)

    
    
def clean_Kaggle(raw_data_path):
    """
    To clean the Kaggle data, we need to do the following:

    - remove Disgust and Surprise datasets
    - normalize the pixels
    - set training data to training data
    - set public test data to validation data
    - set private test data to test data

    Args:
        raw_data_path: path of the Kaggle CSV file; the code reads its
            ``emotion``, `` pixels`` and `` Usage`` columns (the last two
            names start with a space).

    Returns:
        ``(train_dataset, val_dataset, test_dataset)`` as FacialDataset objects.
    """

    # Read from the path that was passed in, not from a notebook-level global.
    kaggle_raw = pd.read_csv(raw_data_path)

    # Remove disgust (1) and surprise(5) data
    kaggle_df = kaggle_raw.loc[(kaggle_raw.emotion != 1) & (kaggle_raw.emotion != 5)].copy()

    # Renumber the remaining emotions. Only the label column is rewritten, so
    # the replacement never has to scan the pixel strings.
    emotion_labels = {0:0, 2:1, 3:2, 4:4, 6:3}
    kaggle_df['emotion'] = kaggle_df['emotion'].replace(emotion_labels)

    # Convert pixel values from space-separated pixel values in row major order to normalized PIL images
    kaggle_df['pil_imgs'] = kaggle_df[' pixels'].apply(convert_str_to_array)

    data_transforms = transforms.Compose([
        transforms.ToTensor() # Convert to tensors
    ])

    def _split(usage):
        # Rows whose ' Usage' value equals ``usage``, reduced to (label, image).
        subset = kaggle_df.loc[kaggle_df[' Usage'] == usage, ['emotion', 'pil_imgs']]
        return FacialDataset(subset, transform=data_transforms)

    # Split into training, validation, and test datasets
    train_dataset = _split('Training')
    val_dataset = _split('PublicTest')
    test_dataset = _split('PrivateTest')

    return train_dataset, val_dataset, test_dataset

In [9]:
# Kaggle CSV, resolved relative to the current working directory.
kaggle_raw_path = os.path.join(os.getcwd(), "raw_facial_data_Kaggle","icml_face_data.csv")

# Build the three Kaggle splits from that file.
Kaggle_train, Kaggle_val, Kaggle_test = clean_Kaggle(kaggle_raw_path)

In [10]:
# Report how many samples ended up in each Kaggle split.
for _heading, _subset in (
    ("Number of Kaggle training images: ", Kaggle_train),
    ("Number of Kaggle validation images: ", Kaggle_val),
    ("Number of Kaggle test images: ", Kaggle_test),
):
    print(_heading, len(_subset))

Number of Kaggle training images:  25102
Number of Kaggle validation images:  3118
Number of Kaggle test images:  3118


In [11]:
# Tally labels per Kaggle split; each sample is an (image, label) pair.
train_counts = dict(Counter(label for _, label in Kaggle_train))
val_counts = dict(Counter(label for _, label in Kaggle_val))
test_counts = dict(Counter(label for _, label in Kaggle_test))

for _heading, _counts in (
    ('Number of train images in each Kaggle class \t\t', train_counts),
    ('Number of validation images in each Kaggle class \t', val_counts),
    ('Number of test images in each Kaggle class \t\t', test_counts),
):
    print(_heading, _counts)

  img = torch.from_numpy(np.array(pic, np.float32, copy=False))


Number of train images in each Kaggle class 		 {0: 3995, 1: 4097, 4: 4830, 3: 4965, 2: 7215}
Number of validation images in each Kaggle class 	 {0: 467, 4: 653, 3: 607, 2: 895, 1: 496}
Number of test images in each Kaggle class 		 {0: 491, 3: 626, 4: 594, 1: 528, 2: 879}


### Combine the Datasets

In [72]:
# Pair the Kaggle and IMFDB datasets split by split (Kaggle first).
train_datasets, val_datasets, test_datasets = (
    [Kaggle_train, IMFDB_train],
    [Kaggle_val, IMFDB_val],
    [Kaggle_test, IMFDB_test],
)

# Concatenate each pair into a single dataset.
train_dataset, val_dataset, test_dataset = map(
    ConcatDataset, (train_datasets, val_datasets, test_datasets)
)

In [73]:
# Report the size of each combined split.
for _heading, _subset in (
    ("Number of training images: ", train_dataset),
    ("Number of validation images: ", val_dataset),
    ("Number of test images: ", test_dataset),
):
    print(_heading, len(_subset))

Number of training images:  41494
Number of validation images:  5265
Number of test images:  5288


In [75]:
# Tally labels per combined split; each sample is an (image, label) pair.
train_counts = dict(Counter(label for _, label in train_dataset))
val_counts = dict(Counter(label for _, label in val_dataset))
test_counts = dict(Counter(label for _, label in test_dataset))

for _heading, _counts in (
    ('Number of train images in each class \t\t', train_counts),
    ('Number of validation images in each class \t', val_counts),
    ('Number of test images in each class \t\t', test_counts),
):
    print(_heading, _counts)

Number of train images in each class 		 {0: 5970, 1: 4568, 4: 7351, 3: 10824, 2: 12781}
Number of validation images in each class 	 {0: 774, 4: 908, 3: 1547, 2: 1461, 1: 575}
Number of test images in each class 		 {0: 704, 3: 1481, 4: 1028, 1: 545, 2: 1530}


### Save Datasets

In [None]:
# torch.save does not create missing folders, so make sure the target exists
# before writing the three combined datasets.
os.makedirs('clean_facial_data', exist_ok=True)
torch.save(train_dataset, 'clean_facial_data/train_dataset')
torch.save(val_dataset, 'clean_facial_data/val_dataset')
torch.save(test_dataset, 'clean_facial_data/test_dataset')

### Load Datasets

In [16]:
# Read back the combined training dataset written by the save cell.
# NOTE(review): the file holds a pickled ConcatDataset, so loading presumably
# needs FacialDataset defined in this kernel and, on recent PyTorch releases,
# may need weights_only=False - confirm.
train_set = torch.load('clean_facial_data/train_dataset')

In [20]:
# Shuffled mini-batches of 64 samples.
# NOTE(review): this wraps IMFDB_train, not the train_set loaded in this
# section - confirm whether the combined dataset was intended.
train_loader = DataLoader(IMFDB_train, batch_size=64, shuffle=True)