In [1]:
import collections
import json
import multiprocessing
import re
import warnings
from io import BytesIO

import numpy as np
import pandas as pd
from pyagender import PyAgender
from PIL import Image
import requests
import cv2

Using TensorFlow backend.


In this notebook we estimate the age and gender of a person based on their picture.
For this we consider only a subset of all images, those containing people that have a name associated with them.

we do the following processing:

1. get ids for photos where only one person is in the image
2. get list of images associated with one person
3. use py-agender to get the age and gender

finally we do some evaluation

### 1. get ids of photos where only one person is in the image

In [2]:
# Load the table of images that have a named subject attached.
# NOTE: unpickling can execute arbitrary code -- only load pickle files produced by this project.
df = pd.read_pickle('named_subject.pkl')

In [3]:
# Preview the table (pandas truncates long frames to their first and last rows).
df

Unnamed: 0,descriptor,id,19e,Portraits,caricature,subject_name,named_subject
0,"Auguez, Mathilde (1868-1955) -- Portraits",https://gallica.bnf.fr/ark:/12148/btv1b53168872x,False,True,False,"[Auguez, Mathilde]","Auguez, Mathilde"
2,"Auguez, Mathilde (1868-1955) -- Portraits",https://gallica.bnf.fr/ark:/12148/btv1b531688701,False,True,False,"[Auguez, Mathilde]","Auguez, Mathilde"
4,"Bruant, Aristide (1851-1925) -- Portraits",https://gallica.bnf.fr/ark:/12148/btv1b53171770j,False,True,False,"[Bruant, Aristide]","Bruant, Aristide"
6,"Auguez, Mathilde (1868-1955) -- Portraits",https://gallica.bnf.fr/ark:/12148/btv1b53168873c,False,True,False,"[Auguez, Mathilde]","Auguez, Mathilde"
8,"Brasseur, Albert (1862-1932) -- Portraits",https://gallica.bnf.fr/ark:/12148/btv1b531651494,False,True,False,"[Brasseur, Albert]","Brasseur, Albert"
...,...,...,...,...,...,...,...
9589,Cham (1818-1879) -- Oeuvres -- Dessin,https://gallica.bnf.fr/ark:/12148/btv1b531189073,False,False,False,[Cham],Cham
9980,"Carvalho, Léon (1825-1897) -- Tombes",https://gallica.bnf.fr/ark:/12148/btv1b53117431d,False,False,False,"[Carvalho, Léon]","Carvalho, Léon"
11180,"Sand, George (1804-1876) -- Statues",https://gallica.bnf.fr/ark:/12148/btv1b530916961,False,False,False,"[Sand, George]","Sand, George"
11795,"Sand, George (1804-1876) -- Statues",https://gallica.bnf.fr/ark:/12148/btv1b53091695k,False,False,False,"[Sand, George]","Sand, George"


In [4]:
# Count the named subjects attached to each image id.
person_per_image = df.groupby('id')['named_subject'].count()

In [5]:
# Keep the ids of the images that have exactly one named subject.
individual_portraits = person_per_image.loc[person_per_image.eq(1)].index

How many pictures do we have of one person?

In [6]:
# Number of single-person images available for each named subject.
df.loc[df['id'].isin(individual_portraits)].groupby('named_subject')['id'].count()

named_subject
Abbatucci, Séverin               5
Abbott, Emma                    50
Abbéma, Louise                  12
Abney, William de Wiveleslie     3
Abott, Bessie                    1
                                ..
Édouard VII                     25
Énault, Louis                    1
Éon, Charles de Beaumont d'      1
Étex, Antoine                    2
Étiévant, Henri                  1
Name: id, Length: 1903, dtype: int64

17 people (1903 − 1886) don't have portraits. That's okay, we focus on the people that do.

In [7]:
# Same count, restricted to the images flagged as portraits.
df.loc[df['id'].isin(individual_portraits) & df['Portraits']].groupby('named_subject')['id'].count()

named_subject
Abbatucci, Séverin               5
Abbott, Emma                    50
Abbéma, Louise                  12
Abney, William de Wiveleslie     3
Abott, Bessie                    1
                                ..
Édouard VII                     25
Énault, Louis                    1
Éon, Charles de Beaumont d'      1
Étex, Antoine                    2
Étiévant, Henri                  1
Name: id, Length: 1886, dtype: int64

### 2. get list of images associated with each person

In [8]:
# For every named subject, collect the ids of their single-person portraits
# into one list, giving a frame with the columns `named_subject` and `id`.
personal_portrait_image = (
    df.loc[df['id'].isin(individual_portraits) & df['Portraits']]
    .groupby('named_subject')['id']
    .apply(list)
    .rename('id')
    .reset_index()
)

In [9]:
# One row per named subject; `id` holds the list of that person's portrait URLs.
personal_portrait_image

Unnamed: 0,named_subject,id
0,"Abbatucci, Séverin",[https://gallica.bnf.fr/ark:/12148/btv1b530716...
1,"Abbott, Emma",[https://gallica.bnf.fr/ark:/12148/btv1b531637...
2,"Abbéma, Louise",[https://gallica.bnf.fr/ark:/12148/btv1b530715...
3,"Abney, William de Wiveleslie",[https://gallica.bnf.fr/ark:/12148/btv1b531651...
4,"Abott, Bessie",[https://gallica.bnf.fr/ark:/12148/btv1b531175...
...,...,...
1881,Édouard VII,[https://gallica.bnf.fr/ark:/12148/btv1b530980...
1882,"Énault, Louis",[https://gallica.bnf.fr/ark:/12148/btv1b530653...
1883,"Éon, Charles de Beaumont d'",[https://gallica.bnf.fr/ark:/12148/btv1b530664...
1884,"Étex, Antoine",[https://gallica.bnf.fr/ark:/12148/btv1b531271...


In [10]:
# Build the detector once; get_age_gender_estimates reads it as a notebook-level global.
agender = PyAgender()

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


### 3. get age-gender labels

The API not only gives us an age and gender estimate, but also the rectangle locating the face. We keep that, as it can be used for the face map.

In [11]:
def get_image(doc, timeout=30):
    """Download the ``/f1.highres.jpg`` rendition of a document as an RGB array.

    Parameters
    ----------
    doc : str
        Base URL of the document (a value of the ``id`` column).
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the whole run.

    Returns
    -------
    numpy.ndarray
        The downloaded image, converted to RGB.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    OSError
        If the response body cannot be decoded as an image.
    """
    url = doc + '/f1.highres.jpg'
    response = requests.get(url, timeout=timeout)
    # Report HTTP errors as such instead of as an opaque image-decoding failure.
    response.raise_for_status()
    # Decode once and normalise to RGB (the original decoded the payload twice).
    img = Image.open(BytesIO(response.content)).convert('RGB')
    return np.array(img)

def get_age_gender_estimates(image_docs, max_retries=5):
    """Return the age/gender estimate from the first image in which a face is found.

    The images are tried in order. An image that cannot be downloaded or
    decoded is skipped with a warning, so that a single bad image does not
    abort the run over all people.

    Parameters
    ----------
    image_docs : list of str
        Document URLs of the portraits of one person.
    max_retries : int, optional
        Maximum number of detection attempts per image.

    Returns
    -------
    dict
        The first detection of the first image that yielded any, extended
        with ``'number'`` (how many detections that image produced) and
        ``'id'`` (the URL of that image). An empty dict if no image yielded
        a detection.
    """
    for doc in image_docs:
        try:
            img = get_image(doc)
        except OSError as err:
            # Both requests' exceptions and Pillow's decoding errors derive from OSError.
            warnings.warn('skipping {}: {}'.format(doc, err))
            continue

        estimates = []
        for _ in range(max_retries):
            estimates = agender.detect_genders_ages(img)
            if len(estimates) > 0:
                break

        if len(estimates) > 0:
            # use first estimate as it is the most likely one;
            # copy it so the detector's own result is not modified
            result = dict(estimates[0])
            result['number'] = len(estimates)
            result['id'] = doc
            return result

    return {}

In [12]:
if True:
    # Expensive step: downloads images and runs the detector for every person,
    # then stores the resulting table on disk.
    age_gender_lables = pd.DataFrame(
        [get_age_gender_estimates(docs) for docs in personal_portrait_image['id']]
    ).assign(name=personal_portrait_image['named_subject'])
    age_gender_lables.to_json('age_gender_labeles.json')

In [63]:
#age_gender_lables = pd.read_json('age_gender_labeles.json')

# evaluation of method


evaluation of algorithm itself is presented on wikipage of py-agender: https://github.com/yu4u/age-gender-estimation

In [80]:
# One row per person: face rectangle, gender score, age, number of detections, image id and name.
age_gender_lables

Unnamed: 0,left,top,right,bottom,width,height,gender,age,number,id,name
0,269.0,268.0,748.0,747.0,479.0,479.0,0.116277,47.741472,1.0,https://gallica.bnf.fr/ark:/12148/btv1b53071608k,"Abbatucci, Séverin"
1,433.0,316.0,580.0,463.0,147.0,147.0,0.585720,27.465579,2.0,https://gallica.bnf.fr/ark:/12148/btv1b531622212,"Abbott, Emma"
2,250.0,718.0,403.0,871.0,153.0,153.0,0.377803,40.358822,3.0,https://gallica.bnf.fr/ark:/12148/btv1b53071525p,"Abbéma, Louise"
3,155.0,134.0,872.0,851.0,717.0,717.0,0.104841,44.487285,1.0,https://gallica.bnf.fr/ark:/12148/btv1b53165139q,"Abney, William de Wiveleslie"
4,415.0,184.0,615.0,384.0,200.0,200.0,0.389818,27.194442,1.0,https://gallica.bnf.fr/ark:/12148/btv1b53117510h,"Abott, Bessie"
...,...,...,...,...,...,...,...,...,...,...,...
1803,266.0,367.0,641.0,742.0,375.0,375.0,0.011937,42.445805,1.0,https://gallica.bnf.fr/ark:/12148/btv1b53066482v,Émile
1804,355.0,310.0,596.0,551.0,241.0,241.0,0.033713,45.716630,1.0,https://gallica.bnf.fr/ark:/12148/btv1b53065399p,"Énault, Louis"
1805,288.0,437.0,583.0,732.0,295.0,295.0,0.358721,26.078685,1.0,https://gallica.bnf.fr/ark:/12148/btv1b530664193,"Éon, Charles de Beaumont d'"
1806,310.0,214.0,739.0,643.0,429.0,429.0,0.108013,47.732863,1.0,https://gallica.bnf.fr/ark:/12148/btv1b531271467,"Étex, Antoine"


can't get labels for 71 people

In [None]:
# People for whom no estimate could be obtained. `age_gender_lables` is a
# DataFrame at this point, so a failed person shows up as a row with a missing `age`.
personal_portrait_image[age_gender_lables['age'].isna()]

Inspection of 10 images gives three reasons why the algorithm doesn't work: the image is too dark, the face is shown in profile, or the subject is wearing glasses.

This is an issue with the model and the underlying training data used.

In [None]:
# Last image of every person without an estimate, for manual inspection.
# Failed people are the rows of the `age_gender_lables` DataFrame with a missing `age`.
personal_portrait_image[age_gender_lables['age'].isna()].id.map(lambda x: x[-1])

labels that we do get:

In [None]:
# `age_gender_lables` is already a DataFrame (it has no `.tolist()`), so work on a copy of it.
# Rows stay aligned with `personal_portrait_image`; people without an estimate are NaN rows.
df_evaluate = age_gender_lables.copy()

number of faces that we got:

In [None]:
# Distribution of the number of detections in the image that was used.
df_evaluate['number'].value_counts()

In [None]:
# Histogram of the estimated ages.
df_evaluate['age'].plot.hist(bins=100)

mostly men

In [None]:
# Histogram of the gender score (the plotting cell labels scores above .5 as 'Woman').
df_evaluate['gender'].plot.hist(bins=100)

Clearly, given the name something went wrong

In [None]:
# Draw 20 people whose gender score is below .5; the seed is fixed so that
# the inspection described below refers to the same people on every run.
sample = personal_portrait_image[df_evaluate.gender < .5].sample(20, random_state=0)

In [None]:
# The 20 sampled people with a gender score below .5.
sample

In [None]:
# `sample.index` holds index labels, so select by label rather than by position.
df_evaluate.loc[sample.index, 'age']

checking the image, gender predictions seem iffy, but age seems okay! maybe there were not enough old women in the dataset ... 

In [None]:
# `sample.index` holds index labels, so select by label rather than by position.
df_evaluate.loc[sample.index, 'id']

## Example of multiple matches or mismatches

In [None]:
# `age_gender_estimates` is never defined in this notebook; the estimates live in
# `age_gender_lables`. NOTE(review): assumed to be the intended table -- confirm.
age_gender_lables.loc[1564]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Preview the evaluation table.
df_evaluate

In [None]:
# Draw the detected face rectangle and its age/gender label on one example image.
EXAMPLE_ROW = 1296
font = {'family': 'serif',
        'color':  'yellow',
        'weight': 'normal',
        'size': 16,
        }

# `age_gender_lables` is a DataFrame, so fetch the example by row label
# (plain `[...]` indexing would look for a column named 1296).
detect = age_gender_lables.loc[EXAMPLE_ROW]
img = get_image(detect['id'])

# cv2.rectangle needs integer pixel coordinates; the table stores them as floats.
left, top, right, bottom = (int(detect[side]) for side in ('left', 'top', 'right', 'bottom'))
gender = 'Woman' if detect['gender'] > .5 else 'Man'

plt.figure(figsize=(10, 10))
# int() keeps the whole years for any age, unlike slicing the first two characters.
plt.text(left, top - 10, '{} {}'.format(int(detect['age']), gender), fontdict=font)
plt.imshow(cv2.rectangle(img, (left, top), (right, bottom), (255, 255, 0), 3));