## AGE GENDER And RACE DETECTION

### Data Preparation

In [None]:
# !pip install opencv-python

In [1]:
import os
import pandas as pd
import numpy as np
from scipy import ndimage
import cv2
from tqdm import tqdm

In [2]:
faces_path = '../../age_detection/age_gender_race/'
faces_image_names = os.listdir(faces_path)

In [3]:
print(faces_image_names[0])#name of image => age , gender , race , date
len(faces_image_names) #no of image 

65_0_0_20170103183632050.jpg.chip.jpg


23708

In [4]:
def label_name_split(image_name): #functions to split the name into labels
    image_labels = image_name.split('_')
    age = image_labels[0]
    gender = image_labels[1]
    race = image_labels[2]
    return age, gender, race

print(label_name_split(faces_image_names[0]))

('65', '0', '0')


In [5]:
#dictionary to map image name to their labels
faces_age_labels = np.array([])
faces_ages_df = pd.DataFrame(columns = ['image','age'],dtype='int')
for i , image in enumerate(faces_image_names):
    age, gender, race = label_name_split(image)
    faces_age_labels = np.append(faces_age_labels,age)
    faces_ages_df.loc[i,'image'] = faces_path + image
    faces_ages_df.loc[i,'age'] = age
    faces_ages_df.loc[i,'target_age'] = 0
    faces_ages_df.loc[i,'target_gender'] = gender
    faces_ages_df.loc[i,'target_race'] = race

In [6]:
unique_ages , unique_ages_count = np.unique(faces_age_labels,return_counts=True) #unqiue age and how many images are there

In [7]:
#dictionary ages and how many image of that image
ages_count = {}
for age,count in zip(unique_ages,unique_ages_count):
    ages_count[age] = count
images_df = pd.DataFrame(ages_count.values(), index=ages_count.keys(), columns=['images'])
print(images_df.head)

<bound method NDFrame.head of      images
1      1123
10      156
100      11
101       2
103       1
..      ...
92       13
93        5
95        9
96       17
99        9

[104 rows x 1 columns]>


### Label for age and Class Imabalance 

Age-ranges (classes)<br>
0  1 - 4	
1   5 - 18	
2	19 - 24	
3	25 - 26	
4	27 - 30	
5	31 - 35	
6	36 - 42	
7	43 - 53	
8	54 - 65	
9	66 - 116

In [8]:
import pandas as pd

In [9]:
def split_classes(ser, n_classes):
     
    ages_label = {}
    
    n_images = int(sum(ser) / n_classes)

    classes_df = pd.DataFrame()

    age_index = 0

    for i in range(n_classes):
        if age_index<=103:
            age_start = ser.index[age_index]
            age_current = ser.index[age_index]
        else:
            break

        class_images = 0
        
        while class_images < n_images:
            class_images += ser[age_current]
            age_index += 1

            if age_index<=103:
                age_current = ser.index[age_index]
            else:
                break
                
        if age_index<=104:
            age_end = ser.index[age_index-1]
        else:
            break
            
        ages_label[i] = {
            'start':age_start,
            'end':age_end
        }
        classes_df.loc[i, 'age(start)'] = age_start 
        classes_df.loc[i, 'age(end)'] = age_end
        classes_df.loc[i, 'class balance'] = round((class_images / sum(ser)) * 100, 2)
    
    return classes_df , ages_label

In [10]:
a,label_age = split_classes(images_df['images'],11)
print(a)
label_age

  age(start) age(end)  class balance
0          1       16           9.14
1         17       23          10.34
2         24       26          15.99
3         27        3          10.09
4         30       34           9.66
5         35       39           9.48
6          4       47           9.35
7         48       56           9.90
8         57       70           9.22
9         71       99           6.84


{0: {'start': '1', 'end': '16'},
 1: {'start': '17', 'end': '23'},
 2: {'start': '24', 'end': '26'},
 3: {'start': '27', 'end': '3'},
 4: {'start': '30', 'end': '34'},
 5: {'start': '35', 'end': '39'},
 6: {'start': '4', 'end': '47'},
 7: {'start': '48', 'end': '56'},
 8: {'start': '57', 'end': '70'},
 9: {'start': '71', 'end': '99'}}

### save ages dataset

In [11]:
faces_ages_df.head(10)

Unnamed: 0,image,age,target_age,target_gender,target_race
0,../../age_detection/age_gender_race/65_0_0_201...,65,0.0,0,0
1,../../age_detection/age_gender_race/71_0_0_201...,71,0.0,0,0
2,../../age_detection/age_gender_race/38_0_1_201...,38,0.0,0,1
3,../../age_detection/age_gender_race/16_0_0_201...,16,0.0,0,0
4,../../age_detection/age_gender_race/1_1_4_2017...,1,0.0,1,4
5,../../age_detection/age_gender_race/60_0_3_201...,60,0.0,0,3
6,../../age_detection/age_gender_race/26_1_2_201...,26,0.0,1,2
7,../../age_detection/age_gender_race/22_1_2_201...,22,0.0,1,2
8,../../age_detection/age_gender_race/1_0_2_2016...,1,0.0,0,2
9,../../age_detection/age_gender_race/32_0_0_201...,32,0.0,0,0


In [12]:
def class_labels(age):
    if 1 <= age <= 2:
        return 0
    elif 3 <= age <= 9:
        return 1
    elif 10 <= age <= 20:
        return 2
    elif 21 <= age <= 27:
        return 3
    elif 28 <= age <= 45:
        return 4
    elif 46 <= age <= 65:
        return 5
    else:
        return 6
faces_ages_df['age'] = faces_ages_df['age'].astype(int)
faces_ages_df['target_age'] = faces_ages_df['age'].map(class_labels)

In [13]:
faces_ages_df.head(10)

Unnamed: 0,image,age,target_age,target_gender,target_race
0,../../age_detection/age_gender_race/65_0_0_201...,65,5,0,0
1,../../age_detection/age_gender_race/71_0_0_201...,71,6,0,0
2,../../age_detection/age_gender_race/38_0_1_201...,38,4,0,1
3,../../age_detection/age_gender_race/16_0_0_201...,16,2,0,0
4,../../age_detection/age_gender_race/1_1_4_2017...,1,0,1,4
5,../../age_detection/age_gender_race/60_0_3_201...,60,5,0,3
6,../../age_detection/age_gender_race/26_1_2_201...,26,3,1,2
7,../../age_detection/age_gender_race/22_1_2_201...,22,3,1,2
8,../../age_detection/age_gender_race/1_0_2_2016...,1,0,0,2
9,../../age_detection/age_gender_race/32_0_0_201...,32,4,0,0


### Data Augmentation

In [14]:
# Making a new folder to save all augmented images as a new dataset.
new_path = "../../age_detection/new_face_dataset"
path = '../../age_detection/'
train_df = faces_ages_df
train_aug_df = pd.DataFrame(columns=train_df.columns)
train_aug_df

Unnamed: 0,image,age,target_age,target_gender,target_race


In [15]:
# Running a for-loop through all the images in train_df and augmenting them.

for i in tqdm(range(train_df.shape[0])):

    # Reading the image filepath, name, age and target values.
    img_path = train_df.loc[i, 'image']
    img_name = img_path.split("/")[-1]
    img_age = train_df.loc[i, 'age']
    img_target_race = train_df.loc[i,'target_race']
    img_target_age = train_df.loc[i, 'target_age']
    img_target_gender = train_df.loc[i,'target_gender']

    # Reading the image from the filepath and converting it to proper colour format in cv2.
    img = cv2.imread(img_path)
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

    # Creating the augmented versions of the image as shown above.
    img_rot_pos40 = ndimage.rotate(img, 40, reshape=False)
    img_rot_pos20 = ndimage.rotate(img, 20, reshape=False)
    img_rot_neg20 = ndimage.rotate(img, -20, reshape=False)
    img_rot_neg40 = ndimage.rotate(img, -40, reshape=False)
    img_fliplr = np.fliplr(img)
    img_fliplr_rot_pos40 = ndimage.rotate(img_fliplr, 40, reshape=False)
    img_fliplr_rot_pos20 = ndimage.rotate(img_fliplr, 20, reshape=False)
    img_fliplr_rot_neg20 = ndimage.rotate(img_fliplr, -20, reshape=False)
    img_fliplr_rot_neg40 = ndimage.rotate(img_fliplr, -40, reshape=False)

    # Creating new image names for the augmented images.
    img_name_wo_jpg = img_name.split(".")[0]
    img_name2 = img_name_wo_jpg+"_rot_pos40.jpg"
    img_name3 = img_name_wo_jpg+"_rot_pos20.jpg"
    img_name4 = img_name_wo_jpg+"_rot_neg20.jpg"
    img_name5 = img_name_wo_jpg+"_rot_neg40.jpg"
    img_name6 = img_name_wo_jpg+"_fliplr.jpg"
    img_name7 = img_name_wo_jpg+"_fliplr_rot_pos40.jpg"
    img_name8 = img_name_wo_jpg+"_fliplr_rot_pos20.jpg"
    img_name9 = img_name_wo_jpg+"_fliplr_rot_neg20.jpg"
    img_name10 = img_name_wo_jpg+"_fliplr_rot_neg40.jpg"

    # Saving the original image and all the augmented images into a new folder combined_faces_train_augmented.
    img_dest1 = os.path.join(new_path, img_name)
    cv2.imwrite(img_dest1, img, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
    img_dest2 = os.path.join(new_path, img_name2)
    cv2.imwrite(img_dest2, img_rot_pos40, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
    img_dest3 = os.path.join(new_path, img_name3)
    cv2.imwrite(img_dest3, img_rot_pos20, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
    img_dest4 = os.path.join(new_path, img_name4)
    cv2.imwrite(img_dest4, img_rot_neg20, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
    img_dest5 = os.path.join(new_path, img_name5)
    cv2.imwrite(img_dest5, img_rot_neg40, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
    img_dest6 = os.path.join(new_path, img_name6)
    cv2.imwrite(img_dest6, img_fliplr, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
    img_dest7 = os.path.join(new_path, img_name7)
    cv2.imwrite(img_dest7, img_fliplr_rot_pos40, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
    img_dest8 = os.path.join(new_path, img_name8)
    cv2.imwrite(img_dest8, img_fliplr_rot_pos20, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
    img_dest9 = os.path.join(new_path, img_name9)
    cv2.imwrite(img_dest9, img_fliplr_rot_neg20, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
    img_dest10 = os.path.join(new_path, img_name10)
    cv2.imwrite(img_dest10, img_fliplr_rot_neg40, [int(cv2.IMWRITE_JPEG_QUALITY), 100])

    # Creating a new temp_df for the augmented images (similar to train_df).
    temp_df = pd.DataFrame(columns=train_df.columns)
    temp_df.loc[1] = [path+img_dest1, img_age, img_target_age,img_target_gender,img_target_race]
    temp_df.loc[2] = [path+img_dest2, img_age, img_target_age,img_target_gender,img_target_race]
    temp_df.loc[3] = [path+img_dest3, img_age, img_target_age,img_target_gender,img_target_race]
    temp_df.loc[4] = [path+img_dest4, img_age, img_target_age,img_target_gender,img_target_race]
    temp_df.loc[5] = [path+img_dest5, img_age, img_target_age,img_target_gender,img_target_race]
    temp_df.loc[6] = [path+img_dest6, img_age, img_target_age,img_target_gender,img_target_race]
    temp_df.loc[7] = [path+img_dest7, img_age, img_target_age,img_target_gender,img_target_race]
    temp_df.loc[8] = [path+img_dest8, img_age, img_target_age,img_target_gender,img_target_race]
    temp_df.loc[9] = [path+img_dest9, img_age, img_target_age,img_target_gender,img_target_race]
    temp_df.loc[10] = [path+img_dest10, img_age, img_target_age,img_target_race,img_target_gender]

    # Concatenating temp_df with the new train_aug_df.
    train_aug_df = pd.concat([train_aug_df, temp_df], axis=0, ignore_index=True)

    # Keeping track of progress and printing relevant statements for the user.
    if (i+1) % 500 == 0:
        print(f"Images augmented: {i+1} of {train_df.shape[0]}")

print("\nDone augmenting all training dataset images and saved them into combined_faces_train_augmented.")

  2%|▏         | 501/23708 [01:23<1:00:43,  6.37it/s]

Images augmented: 500 of 23708


  4%|▍         | 1001/23708 [02:46<1:04:58,  5.83it/s]

Images augmented: 1000 of 23708


  6%|▋         | 1500/23708 [04:15<1:07:09,  5.51it/s]

Images augmented: 1500 of 23708


  8%|▊         | 2001/23708 [05:43<59:36,  6.07it/s]  

Images augmented: 2000 of 23708


 11%|█         | 2500/23708 [07:15<59:03,  5.99it/s]  

Images augmented: 2500 of 23708


 13%|█▎        | 3000/23708 [09:11<1:27:01,  3.97it/s]

Images augmented: 3000 of 23708


 15%|█▍        | 3501/23708 [11:33<1:37:28,  3.46it/s]

Images augmented: 3500 of 23708


 17%|█▋        | 4000/23708 [14:06<2:34:44,  2.12it/s]

Images augmented: 4000 of 23708


 19%|█▉        | 4500/23708 [16:32<2:02:30,  2.61it/s]

Images augmented: 4500 of 23708


 21%|██        | 5000/23708 [19:07<1:23:35,  3.73it/s]

Images augmented: 5000 of 23708


 23%|██▎       | 5500/23708 [21:33<1:26:56,  3.49it/s]

Images augmented: 5500 of 23708


 25%|██▌       | 6001/23708 [24:21<1:09:59,  4.22it/s]

Images augmented: 6000 of 23708


 27%|██▋       | 6500/23708 [26:49<1:18:20,  3.66it/s]

Images augmented: 6500 of 23708


 30%|██▉       | 7000/23708 [29:23<1:23:58,  3.32it/s]

Images augmented: 7000 of 23708


 32%|███▏      | 7501/23708 [31:52<1:22:32,  3.27it/s]

Images augmented: 7500 of 23708


 34%|███▎      | 8000/23708 [34:42<1:18:05,  3.35it/s]

Images augmented: 8000 of 23708


 36%|███▌      | 8500/23708 [37:33<1:16:57,  3.29it/s]

Images augmented: 8500 of 23708


 38%|███▊      | 9000/23708 [40:23<1:07:00,  3.66it/s]

Images augmented: 9000 of 23708


 40%|████      | 9500/23708 [43:18<1:20:48,  2.93it/s]

Images augmented: 9500 of 23708


 42%|████▏     | 10000/23708 [46:00<1:17:39,  2.94it/s]

Images augmented: 10000 of 23708


 44%|████▍     | 10500/23708 [48:53<1:15:58,  2.90it/s]

Images augmented: 10500 of 23708


 46%|████▋     | 11000/23708 [52:04<1:14:57,  2.83it/s]

Images augmented: 11000 of 23708


 49%|████▊     | 11500/23708 [55:09<51:31,  3.95it/s]  

Images augmented: 11500 of 23708


 51%|█████     | 12000/23708 [58:00<48:09,  4.05it/s]  

Images augmented: 12000 of 23708


 53%|█████▎    | 12500/23708 [1:00:52<1:34:59,  1.97it/s]

Images augmented: 12500 of 23708


 55%|█████▍    | 13000/23708 [1:03:55<1:00:39,  2.94it/s]

Images augmented: 13000 of 23708


 57%|█████▋    | 13500/23708 [1:07:18<50:29,  3.37it/s]  

Images augmented: 13500 of 23708


 59%|█████▉    | 14000/23708 [1:10:37<2:11:09,  1.23it/s]

Images augmented: 14000 of 23708


 61%|██████    | 14500/23708 [1:13:41<49:51,  3.08it/s]  

Images augmented: 14500 of 23708


 63%|██████▎   | 15000/23708 [1:16:46<38:46,  3.74it/s]  

Images augmented: 15000 of 23708


 65%|██████▌   | 15500/23708 [1:19:54<48:49,  2.80it/s]  

Images augmented: 15500 of 23708


 67%|██████▋   | 16000/23708 [1:23:19<59:53,  2.14it/s]  

Images augmented: 16000 of 23708


 70%|██████▉   | 16500/23708 [1:26:45<1:00:23,  1.99it/s]

Images augmented: 16500 of 23708


 72%|███████▏  | 17000/23708 [1:30:27<39:10,  2.85it/s]  

Images augmented: 17000 of 23708


 74%|███████▍  | 17500/23708 [1:33:28<36:43,  2.82it/s]  

Images augmented: 17500 of 23708


 76%|███████▌  | 18000/23708 [1:36:18<33:28,  2.84it/s]  

Images augmented: 18000 of 23708


 78%|███████▊  | 18500/23708 [1:39:52<53:31,  1.62it/s]  

Images augmented: 18500 of 23708


 80%|████████  | 19000/23708 [1:43:21<22:34,  3.48it/s]  

Images augmented: 19000 of 23708


 82%|████████▏ | 19500/23708 [1:46:45<18:51,  3.72it/s]  

Images augmented: 19500 of 23708


 84%|████████▍ | 20000/23708 [1:50:05<34:58,  1.77it/s]  

Images augmented: 20000 of 23708


 86%|████████▋ | 20500/23708 [1:53:19<34:12,  1.56it/s]

Images augmented: 20500 of 23708


 89%|████████▊ | 21000/23708 [1:56:51<16:59,  2.66it/s]

Images augmented: 21000 of 23708


 91%|█████████ | 21500/23708 [2:00:24<20:06,  1.83it/s]

Images augmented: 21500 of 23708


 93%|█████████▎| 22000/23708 [2:03:42<13:24,  2.12it/s]

Images augmented: 22000 of 23708


 95%|█████████▍| 22500/23708 [2:07:08<07:22,  2.73it/s]

Images augmented: 22500 of 23708


 97%|█████████▋| 23000/23708 [2:10:48<04:23,  2.69it/s]

Images augmented: 23000 of 23708


 99%|█████████▉| 23500/23708 [2:14:31<00:57,  3.60it/s]

Images augmented: 23500 of 23708


100%|██████████| 23708/23708 [2:15:54<00:00,  2.91it/s]


Done augmenting all training dataset images and saved them into combined_faces_train_augmented.





In [17]:
train_aug_df.head(100)

Unnamed: 0,image,age,target_age,target_gender,target_race
0,../../age_detection/../../age_detection/new_fa...,65,5,0,0
1,../../age_detection/../../age_detection/new_fa...,65,5,0,0
2,../../age_detection/../../age_detection/new_fa...,65,5,0,0
3,../../age_detection/../../age_detection/new_fa...,65,5,0,0
4,../../age_detection/../../age_detection/new_fa...,65,5,0,0
...,...,...,...,...,...
95,../../age_detection/../../age_detection/new_fa...,32,4,0,0
96,../../age_detection/../../age_detection/new_fa...,32,4,0,0
97,../../age_detection/../../age_detection/new_fa...,32,4,0,0
98,../../age_detection/../../age_detection/new_fa...,32,4,0,0


In [18]:
train_aug_df.shape

(237080, 5)

In [20]:
from sklearn.utils import shuffle
train_aug_df = shuffle(train_aug_df).reset_index(drop=True)

In [21]:
train_aug_df.head(10)

Unnamed: 0,image,age,target_age,target_gender,target_race
0,../../age_detection/../../age_detection/new_fa...,36,4,1,0
1,../../age_detection/../../age_detection/new_fa...,2,0,1,3
2,../../age_detection/../../age_detection/new_fa...,1,0,1,1
3,../../age_detection/../../age_detection/new_fa...,10,2,1,3
4,../../age_detection/../../age_detection/new_fa...,70,6,0,1
5,../../age_detection/../../age_detection/new_fa...,28,4,1,3
6,../../age_detection/../../age_detection/new_fa...,24,3,1,4
7,../../age_detection/../../age_detection/new_fa...,11,2,0,4
8,../../age_detection/../../age_detection/new_fa...,26,3,1,4
9,../../age_detection/../../age_detection/new_fa...,13,2,1,4


In [25]:
train_aug_df.to_csv("train.csv", index=False)

In [23]:
def class_labels(age):
    return age[20:]
train_aug_df['image'] = train_aug_df['image'].map(class_labels)

In [24]:
train_aug_df.head(5)

Unnamed: 0,image,age,target_age,target_gender,target_race
0,../../age_detection/new_face_dataset/36_1_0_20...,36,4,1,0
1,../../age_detection/new_face_dataset/2_1_3_201...,2,0,1,3
2,../../age_detection/new_face_dataset/1_1_1_201...,1,0,1,1
3,../../age_detection/new_face_dataset/10_1_3_20...,10,2,1,3
4,../../age_detection/new_face_dataset/70_0_1_20...,70,6,0,1
