# 0. Set-up
Mount Google Drive, import packages and import data.

In [0]:
# Mount Google Drive inside the Colab filesystem at /content/gdrive.
# NOTE(review): later cells use relative 'gdrive/My Drive/...' paths, which only
# resolve to this mount if the working directory is /content - confirm.
from google.colab import drive
drive.mount('/content/gdrive')

## Importing Packages

In [0]:
from keras import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
import random
from keras.layers import ZeroPadding2D, Convolution2D,Activation
from keras.preprocessing.image import load_img,img_to_array,ImageDataGenerator
from keras.applications.vgg16 import preprocess_input
from keras.utils import np_utils
import pickle
from random import shuffle
from tqdm import tqdm
from keras.applications.vgg16 import VGG16
from keras import Sequential
from keras.layers import Dense, MaxPooling2D, Conv2D, Flatten, Dropout, Input
from keras import *
from keras.optimizers import *
from keras import metrics
from keras.callbacks import *
from sklearn.metrics import accuracy_score, confusion_matrix
import h5py # to save best models

## Functions to get the data

In [0]:


def get_images(path,max_folders = 1000,vgg_processing = False):
  '''Loops through the subfolders of `path` and loads every .jpg as a 224x224 array.

  path: the path to the main folder in which subfolders containing images are located
  max_folders: the maximum number of subfolders to load (necessary as RAM can't handle everything)
  vgg_processing: if True, the raw pixel values are passed through the VGG16
    `preprocess_input` function instead of being divided by 255

  Returns (images, IDs): `images` is a list of arrays with a leading axis of size 1,
  `IDs` is the matching list of file names with their first 22 characters removed.
  '''
  images = []
  IDs = []
  folders_loaded = 0
  for foldername in tqdm(os.listdir(path)):
    folder_path = os.path.join(path,foldername)
    # Only real subfolders count towards max_folders; stray files are skipped
    if not os.path.isdir(folder_path):
      continue
    # Stop once exactly max_folders subfolders have been loaded
    if folders_loaded >= max_folders:
      break
    folders_loaded += 1
    for filename in os.listdir(folder_path):
      if not filename.endswith(".jpg"):
        continue
      img = load_img(os.path.join(folder_path,filename),target_size = (224,224))
      img = img_to_array(img)
      # Add a leading axis of size 1 so the arrays can later be concatenated along axis 0
      img = img.reshape((1,img.shape[0],img.shape[1],img.shape[2]))
      if vgg_processing:
        # preprocess_input works on the raw pixel values, so no division by 255 here
        img = preprocess_input(img)
      else:
        img = img/255

      images.append(img)
      # presumably the first 22 characters are a fixed file-name prefix in front of
      # the actual image ID - verify against the data set
      IDs.append(os.path.basename(filename)[22:])
  return(images,IDs)

def get_data(path,max_folders,age_dict,val_split,vgg_processing):
  '''Load images, attach their labels, shuffle and split into train/validation sets.

  path: the path leading to the data
  max_folders: how many folders to get
  age_dict: dictionary which keeps the age group for a certain image ID
  val_split: fraction (0-1) of the labelled images to keep for validation
  vgg_processing: whether to preprocess for vgg_16 (passed on to get_images)

  Returns (train_x, train_y, val_x, val_y, image_ids). train_y is one-hot encoded,
  val_y is a tuple of the raw labels, and image_ids holds the shuffled IDs of all
  labelled images (training part first, validation part after it).

  Raises ValueError if none of the loaded images has an entry in age_dict.
  '''

  images, IDs = get_images(path,max_folders,vgg_processing)

  # Keep only the images whose ID has a label; images without one are dropped
  labelled = [(age_dict[image_id],image,image_id)
              for image_id,image in zip(IDs,images)
              if image_id in age_dict]
  if not labelled:
    raise ValueError("None of the images loaded from " + str(path) + " has a label in age_dict")

  # shuffle labels, images and IDs together so they stay aligned
  random.shuffle(labelled)
  _y, imgs, image_ids = zip(*labelled)

  # Index of the first validation sample
  n_train = round(len(imgs)*(1-val_split))

  def _stack(batch):
    # np.concatenate rejects an empty sequence, so build the empty array explicitly
    if len(batch) == 0:
      return np.empty((0,) + imgs[0].shape[1:],dtype = imgs[0].dtype)
    return np.concatenate(batch,axis = 0)

  train_x = _stack(imgs[:n_train])
  print(train_x.shape)
  # Derive the one-hot width from all labels, so it does not depend on which
  # classes happen to land in the training part after shuffling
  train_y = utils.np_utils.to_categorical(_y[:n_train],num_classes = int(max(_y)) + 1)

  val_x = _stack(imgs[n_train:])
  # NOTE(review): unlike train_y, val_y is returned as raw labels - confirm callers expect this
  val_y = _y[n_train:]

  return(train_x,train_y,val_x,val_y,image_ids)

## Functions to get image labels

In [0]:

# Define a binary label for each image ID
def older_than_21(a_df,image_column,age_column):
  '''Build a dictionary mapping image IDs to a binary age label.

  Parameters:
  a_df: a dataframe from which the data is taken
  image_column: column label, or positional index, of the column holding the image IDs
  age_column: column label, or positional index, of the column holding the age groups

  Returns a dict {image ID: 0 or 1}: 0 for the age groups '(0, 2)', '(4, 6)',
  '(8, 13)' and '(15, 20)', 1 for '(25, 32)', '(38, 43)', '(48, 53)' and '(60, 100)'.
  Rows with any other age value are left out. If an image ID occurs more than once,
  the last row wins.
  '''
  younger_groups = ('(4, 6)', '(0, 2)', '(8, 13)', '(15, 20)')
  older_groups = ('(25, 32)','(38, 43)','(48, 53)','(60, 100)')

  def _column(key):
    # A key that names an existing column is used as a label, anything else as a
    # position. Integer keys on a row Series are no longer reliably positional.
    if key in a_df.columns:
      return a_df[key]
    return a_df.iloc[:, key]

  a_dict = {}
  for image_id, age in zip(_column(image_column),_column(age_column)):
    if age in younger_groups:
      a_dict[image_id] = 0
    elif age in older_groups:
      a_dict[image_id] = 1

  return(a_dict)

# Get the metadata on all images
def get_text(path):
  '''Read every tab-separated .txt file in `path` into a single dataframe.

  path: folder that contains the .txt files

  Returns the concatenated dataframe with an extra 'ID' column of the form
  '<face_id>.<original_image>'.

  Raises FileNotFoundError if `path` contains no .txt file.
  '''
  # Sorted so the row order does not depend on the order os.listdir happens to return
  frames = [pd.read_csv(os.path.join(path,filename),sep = '\t')
            for filename in sorted(os.listdir(path))
            if filename.endswith(".txt")]
  if not frames:
    raise FileNotFoundError("No .txt files found in " + str(path))

  # DataFrame.append no longer exists in pandas 2.x; concat keeps each file's own
  # row index, exactly as append did
  file = pd.concat(frames)

  # get actual ID
  file['ID'] = file['face_id'].astype(str) + '.' + file['original_image']

  return(file)

# 1. Preprocess and store data

In [0]:
# Load the text files: get_text reads every .txt file in this folder into one
# dataframe and adds an 'ID' column of the form '<face_id>.<original_image>'
text_files = get_text('gdrive/My Drive/data/aligned/')

In [0]:
# Add a 'complete_id' column of the form '<face_id>.<original_image>' (modifies text_files in place).
# NOTE(review): get_text already stores the same value in the 'ID' column, so this
# column looks redundant - confirm whether anything downstream reads 'complete_id'.
text_files['complete_id'] = text_files['face_id'].astype(str) + '.' + text_files['original_image']
text_files.head()

In [0]:
# Number of metadata rows per distinct value of the 'age' column
text_files['age'].value_counts()

The counts above show the different age labels; we will use the 8 most frequent ones in this study.

Now we recode these labels into a binary variable of older/younger than 21.

In [0]:
# Map every image ID to its raw age-group label.
# Columns are addressed by name instead of by position (previously 12 and 3):
# integer keys on a row Series are no longer reliably positional in pandas, and
# names do not silently shift when columns are added.
# NOTE(review): assumes position 12 was the 'ID' column and position 3 the 'age' column - confirm.
age_group_dict = dict(zip(text_files['ID'],text_files['age']))

# Binary label per image ID: 0 for the age groups up to '(15, 20)', 1 for '(25, 32)' and older
age_dict = older_than_21(text_files,'ID','age')

In [0]:
# Also keep a dataframe with only the rows whose age label is '(15, 20)' or '(25, 32)'
selected_age_labels = ['(25, 32)', '(15, 20)']
IDs_18_32 = text_files[text_files['age'].isin(selected_age_labels)]
IDs_18_32.head()

In [0]:
# Label every image ID in the '(15, 20)' / '(25, 32)' subset: 1 for '(25, 32)', 0 otherwise.
# Columns are addressed by name instead of by position (previously 12 and 3):
# integer keys on a row Series are no longer reliably positional in pandas.
# NOTE(review): assumes position 12 was the 'ID' column and position 3 the 'age' column - confirm.
age_dict_18_32 = {
  image_id: (1 if age == '(25, 32)' else 0)
  for image_id, age in zip(IDs_18_32['ID'],IDs_18_32['age'])
}

In [0]:
# Load images from the subfolders, but only those whose ID is a key of the given dictionary
def get_images_18_32(path, age_dict):
  '''Load the .jpg images under `path` whose ID appears in `age_dict`.

  path: folder whose subfolders contain the images
  age_dict: dictionary mapping image IDs to labels

  Returns (images, IDs, Labels): three parallel lists holding, per kept image,
  the pixel array (scaled by 1/255, with a leading axis of size 1), the image ID
  (file name without its first 22 characters) and the label from `age_dict`.
  '''
  images, IDs, Labels = [], [], []
  for foldername in tqdm(os.listdir(path)):
    folder_path = os.path.join(path,foldername)
    if not os.path.isdir(folder_path):
      continue
    for filename in os.listdir(folder_path):
      if not filename.endswith(".jpg"):
        continue
      img_ID = os.path.basename(filename)[22:]
      # Skip files without a label before paying the cost of loading them
      if img_ID not in age_dict:
        continue
      pixels = img_to_array(load_img(os.path.join(path,foldername,filename),target_size = (224,224)))
      height, width, channels = pixels.shape[0], pixels.shape[1], pixels.shape[2]
      images.append(pixels.reshape((1,height,width,channels))/255)
      IDs.append(img_ID)
      Labels.append(age_dict[img_ID])

  return(images,IDs,Labels)

In [0]:
%%time
path = 'gdrive/My Drive/data/aligned/'
Labels = []
images = []
IDs = []

# This loads all data of people between 18 and 32 years of age
for i in ['A','B','C','D']:
    img, ids, label = get_images_18_32(path + i,age_dict_18_32)
    images.extend(img)
    IDs.extend(ids)
    Labels.extend(label)

In [0]:
# Cut images and labels at the same index so that X and y stay aligned
# (previously every label was encoded while only the first 99% of the images were kept)
n_kept = round(len(images)*(0.99))
train_x2 = np.concatenate(images[:n_kept],axis = 0)
train_y2 = utils.np_utils.to_categorical(Labels[:n_kept])

In [0]:
# Shape of the stacked image array; the first axis counts the images, since the
# per-image arrays were concatenated along axis 0
train_x2.shape

In [0]:
# Adding up the one-hot rows gives the number of samples per class
sum(train_y2)

In [0]:
# Persist the arrays with pickle; this cell was run several times to also store
# parts of all data for training/holdout etc.
pickle_targets = (
    ('gdrive/My Drive/Data/preprocessed/_18_32_All_X.txt', train_x2),
    ('gdrive/My Drive/Data/preprocessed/_18_32_All_y.txt', train_y2),
)
for target_path, payload in pickle_targets:
    with open(target_path, 'wb') as fp:
        pickle.dump(payload,fp)


### I pickle dump these values so they do not need to be loaded again.

In [0]:
# Pickle five objects (A, B, C, D, ID) to the 'preprocessed' folder on Drive.
# NOTE(review): A, B, C, D and ID are not defined anywhere in the visible notebook -
# presumably they held the five values returned by get_data in a cell that no longer
# exists. On a fresh kernel this cell raises NameError; confirm and restore the assignment.
# NOTE(review): the last file is suffixed '_C' while the other four use '_D' - verify
# which folder these objects actually belong to.
with open('gdrive/My Drive/Data/preprocessed/x_D.txt','wb') as fp:
    pickle.dump(A,fp)
    
with open('gdrive/My Drive/Data/preprocessed/y_D.txt','wb') as fp:
    pickle.dump(B,fp)

with open('gdrive/My Drive/Data/preprocessed/val_x_D.txt','wb') as fp:
    pickle.dump(C,fp)
  
with open('gdrive/My Drive/Data/preprocessed/val_y_D.txt','wb') as fp:
    pickle.dump(D,fp)
    
with open('gdrive/My Drive/Data/preprocessed/own_ID_C.txt','wb') as fp:
   pickle.dump(ID,fp)

## Also make datasets using all data

In [0]:
%%time
path = 'gdrive/My Drive/data/aligned/B'
own_x_vgg,own_y_vgg,own_v_x_vgg,own_v_y_vgg,own_ID_vgg = get_data(path,max_folders = 140,age_dict = age_dict, val_split = 0.1,vgg_processing = True)