# Importing libs

In [1]:
import os
import tarfile
import random
import copy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import tensorflow as tf

from PIL import Image

# Globals

In [24]:
PATH = os.path.join('..', 'dataset')
BATCH_SIZE = 32

# Preprocessing

In [3]:
arch = tarfile.open(os.path.join( PATH, 'crop_part1.tar.gz'), 'r')
arch.getnames()[:3]

['crop_part1/24_1_2_20170104020224692.jpg.chip.jpg',
 'crop_part1/3_1_3_20161219230106056.jpg.chip.jpg',
 'crop_part1/35_0_0_20170105162448427.jpg.chip.jpg']

In [4]:
landmarks_df = pd.read_csv(os.path.join(PATH, 'landmark_list_part1.txt'), sep= ' ', header= None)
landmarks_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,128,129,130,131,132,133,134,135,136,137
0,1_0_2_20161219140530307.jpg,-4,71,-4,96,-3,120,-1,144,9,...,136,130,135,108,139,98,140,88,139,
1,1_0_2_20161219140525218.jpg,13,76,13,96,15,117,18,137,25,...,137,121,141,102,141,94,142,85,143,
2,1_0_2_20161219140540938.jpg,11,62,14,84,18,105,23,127,33,...,135,135,136,109,147,99,148,90,146,


In [5]:
landmarks_df = landmarks_df.drop(columns= [137])

In [6]:
file_names = []
for name in arch.getnames():
    name = name.replace('crop_part1/', '')
    name = name.replace('.chip.jpg', '')
    file_names.append(name)
file_names[:3]

['24_1_2_20170104020224692.jpg',
 '3_1_3_20161219230106056.jpg',
 '35_0_0_20170105162448427.jpg']

In [7]:
for name in file_names:
    if name not in landmarks_df[0].values:
        print(name)

crop_part1


In [8]:
file_names.remove('crop_part1')

In [9]:
len(file_names)

9780

In [10]:
landmarks_df[landmarks_df[0] == '24_1_2_20170104020224692.jpg'].values[0][1:]

array([14, 48, 15, 70, 18, 93, 22, 114, 31, 132, 44, 147, 60, 159, 77,
       168, 93, 170, 108, 168, 120, 157, 131, 144, 142, 128, 150, 111,
       157, 93, 162, 73, 165, 53, 30, 31, 43, 23, 58, 22, 73, 25, 87, 31,
       116, 30, 128, 24, 141, 21, 154, 22, 163, 31, 101, 56, 101, 73, 102,
       91, 102, 108, 85, 114, 93, 117, 101, 119, 108, 117, 115, 113, 45,
       57, 55, 52, 67, 53, 77, 63, 66, 65, 53, 64, 120, 62, 129, 52, 140,
       50, 149, 56, 142, 62, 130, 64, 69, 133, 82, 132, 93, 131, 100, 132,
       106, 130, 114, 131, 125, 131, 114, 140, 106, 145, 99, 147, 92, 146,
       82, 142, 74, 134, 93, 137, 99, 137, 106, 136, 120, 132, 106, 136,
       99, 137, 93, 137], dtype=object)

## Collect neccesarry metrics (mean, std)

In [20]:
np_imgs = []
for f_name in arch.getnames():
    try:
        img = arch.extractfile(f_name)
        img = Image.open(img)
        img = np.asarray(img)
        np_imgs.append(img)
    except:
        print(type(img))

<class 'NoneType'>


In [184]:
np_imgs = np.array(np_imgs)
np_imgs.shape

(9780, 200, 200, 3)

In [22]:
# uncomment for long calcs
#print(f'Mean: {np_imgs.mean()}, Std: {np_imgs.std()}')

Mean: 131.10104298994546, Std: 63.88147475116352


## FacesDataset

In [185]:
class UTKFacesDataset:
    def __init__(self, 
                 mode, 
                 array, 
                 landmarks_df, 
                 mean, std, 
                 transforms = None):
        self.mode = mode
        self.imgarr = copy.deepcopy(array)
        self.labels = copy.deepcopy(landmarks_df)
        self.mean = mean
        self.std = std
        self.transforms = transforms

    def __len__(self):
        return len(self.imgarr)

    def __getitem__(self,idx):
        x = self.imgarr[idx]
        x = arch.extractfile(x)
        x = Image.open(x)
        x = self.augment(x)

        f_name = self.imgarr[idx]
        f_name = name.replace('crop_part1/', '')
        f_name = name.replace('.chip.jpg', '')

        y = landmarks_df[landmarks_df[0] == f_name].values[0][1:]
        y = np.array(y, dtype= np.int32)
        return x, y
    
    def __call__(self):
        for i in range(self.__len__()):
            yield self.__getitem__(i)
            
            if i == self.__len__()-1:
                self.on_epoch_end()
    
    def normalize(self, x):
        return tf.image.per_image_standardization(x)
            
    #shuffles the dataset at the end of each epoch
    def on_epoch_end(self):
        reidx = random.sample(population = list(range(self.__len__())),k = self.__len__())
        self.imgarr = self.imgarr[reidx]
    
    #applies randomly selected augmentations to each clip (same for each frame in the clip)
    def augment(self, x):
        if self.transforms is not None:
            x = self.transforms(x)
        x = self.normalize(x)
        return x

In [186]:
datagen = UTKFacesDataset('train', arch.getnames(), landmarks_df,  mean= 131.10, std= 63.88)

In [187]:
out_shape = (tf.TensorShape([200, 200, 3]), tf.TensorShape([136]))
out_type = (tf.float32, tf.float32)

dataset = tf.data.Dataset.from_generator(datagen, output_shapes= out_shape, output_types= out_type)
dataset = dataset.batch(BATCH_SIZE)

TypeError: The dataset length is unknown.

In [191]:
cnt = 0
tmp = []
for batch in dataset:
    x,y = batch
    tmp = x
    cnt+= 1
    if cnt == 1:
        break





In [183]:
len(np.unique(tmp, axis= 0))

32