# Objective
- To classify the gender of a person in an image using a convolutional neural network

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm
%matplotlib inline

Let's read the data file, which contains the URLs of the images and their corresponding labels (male, female or unsure). We'll check the data distribution and then proceed to download the images.

In [21]:
# Read the data file into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="./data_source.csv")
df.head(n=5)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,please_select_the_gender_of_the_person_in_the_picture,please_select_the_gender_of_the_person_in_the_picture:confidence,image_url,please_select_the_gender_of_the_person_in_the_picture_gold,user_id
0,1023132475,False,finalized,1,8/19/2016 17:00:25,male,1.0,https://d1qb2nb5cznatu.cloudfront.net/users/40...,,40
1,1023132476,False,finalized,1,8/19/2016 17:00:48,male,1.0,https://d1qb2nb5cznatu.cloudfront.net/users/42...,,42
2,1023132477,False,finalized,1,8/19/2016 17:01:43,male,1.0,https://d1qb2nb5cznatu.cloudfront.net/users/44...,,44
3,1023132478,False,finalized,1,8/19/2016 17:01:04,male,1.0,https://d1qb2nb5cznatu.cloudfront.net/users/47...,,47
4,1023132479,False,finalized,1,8/19/2016 17:00:48,male,1.0,https://d1qb2nb5cznatu.cloudfront.net/users/50...,,50


In [20]:
# Number of rows currently held in df
print("Total records = ", df.shape[0])

Total records =  64084


There are many columns that we don't care about, and the column names are very long as well. We'll select only a subset of the columns and rename them. Then we'll keep only the rows that have a confidence of 1. pandas makes this very easy.

In [22]:
# Map the columns we are interested in from their verbose source names to the
# short names used in the rest of the notebook.
# rename() ignores mapping keys that are not present in the frame, so running
# this cell a second time (when df already has the short names) is harmless
# instead of raising a KeyError on the long column names.
column_names = {
    "_unit_id": "id",
    "please_select_the_gender_of_the_person_in_the_picture": "gender",
    "please_select_the_gender_of_the_person_in_the_picture:confidence": "confidence",
    "image_url": "url",
}

# rename first, then keep only the renamed columns (in mapping order)
df = df.rename(columns=column_names)[list(column_names.values())]

# only select the rows that have a confidence of 1.0
df = df[df["confidence"] == 1]

print("Total records = ", len(df))

Total records =  64075


Let's check how many samples we have for each gender

In [29]:
# Count the non-null values of every remaining column, per gender label
df.groupby(by="gender").count()

Unnamed: 0_level_0,id,confidence,url
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,7364,7364,7364
male,47592,47592,47592
unsure,9119,9119,9119


There are a lot of images for "male". There are also some images with the gender "unsure". We'll visualize a few data samples from each category and then sample the data in such a way that each category has more or less the same number of samples. This is important to make sure that the model learns equally about all categories.

In [43]:
# helper function to display image urls
from IPython.display import HTML, display
def display_images(df, category_name="male", count=12):
    """Show a random selection of images from one gender category inline.

    Args:
        df: DataFrame with a "gender" column and a "url" column.
        category_name: value of the "gender" column whose images are shown.
        count: maximum number of images to show; fewer are shown when the
            category has fewer rows.

    Returns:
        None. The markup is rendered through ``display(HTML(...))``.
    """
    # local import: keeps the helper self-contained within this cell
    from html import escape

    filtered_df = df[df["gender"] == category_name]
    # random row positions within the category, truncated to `count`
    chosen = np.random.permutation(len(filtered_df))[:count]
    img_style = "width:180px; margin:0px; float:left;border:1px solid black;"
    # The URLs come straight from the data file. Escape them before putting
    # them inside the quoted src attribute so that a quote or angle bracket
    # in a URL cannot break out of the attribute and inject markup.
    images_list = "".join(
        "<img style='{}' src='{}'/>".format(img_style, escape(str(u), quote=True))
        for u in filtered_df.iloc[chosen].url
    )
    display(HTML(images_list))

In [47]:
# 15 random samples labelled "male"
display_images(df, "male", 15)

In [45]:
# 15 random samples labelled "female"
display_images(df, "female", 15)

In [46]:
# 15 random samples labelled "unsure"
display_images(df, "unsure", 15)

Images in the "unsure" category are either not images of a person, contain more than one person, or show a person whose face is not facing the camera. There are also some images which could perfectly well be labelled as male or female. Similarly, in the "male" and "female" categories we can see some images of a cartoon or just text. For now we'll just ignore those for simplicity. If our model does not perform well, we'll revisit the data cleaning part.

In [57]:
df_male = df[df["gender"] == "male"]
df_female = df[df["gender"] == "female"]

# to make both categories have an equal number of samples we take as many
# rows from each as the smaller category contains
min_samples = min(len(df_male), len(df_female))

# Draw the rows at random from the WHOLE category. Indexing with
# np.random.permutation(min_samples) would only reorder the first
# `min_samples` rows of each frame, so the larger category would always
# contribute its first rows instead of a random sample.
# A fixed seed keeps the selection the same across re-runs.
SAMPLE_SEED = 42
df_male = df_male.sample(n=min_samples, random_state=SAMPLE_SEED)
df_female = df_female.sample(n=min_samples, random_state=SAMPLE_SEED)

print("Total male samples = ", len(df_male))
print("Total female samples = ", len(df_female))

# from here on df only holds the balanced male/female rows
df = pd.concat([df_male, df_female])

Total male samples =  7364
Total female samples =  7364


Now let's download the images.

In [None]:
import os
import requests
from io import BytesIO
from PIL import Image

def download_images(df, data_dir="./data", timeout=30):
    """Download every image listed in ``df`` to ``<data_dir>/<gender>/<id>.jpg``.

    Files that already exist are skipped, so the function can be re-run to
    resume an interrupted download. A failed download is reported and
    skipped; it does not stop the loop.

    Args:
        df: DataFrame with "id", "gender" and "url" columns.
        data_dir: root directory; one sub-directory per gender is created.
        timeout: seconds to wait for the server before giving up on one
            image, so a stalled connection cannot hang the loop.

    Returns:
        None.
    """
    for g in df["gender"].unique():
        os.makedirs("{}/{}".format(data_dir, g), exist_ok=True)

    # iterrows() has no length, so pass the total for a meaningful progress bar
    for index, row in tqdm.tqdm_notebook(df.iterrows(), total=len(df)):
        filepath = "{}/{}/{}.jpg".format(data_dir, row["gender"], row["id"])
        if os.path.exists(filepath):
            continue
        try:
            resp = requests.get(row["url"], timeout=timeout)
            # treat HTTP error responses as failures instead of parsing their body
            resp.raise_for_status()
            im = Image.open(BytesIO(resp.content))
            # JPEG cannot store palette/alpha modes; convert so such images
            # are saved instead of failing
            if im.mode != "RGB":
                im = im.convert("RGB")
            im.save(filepath)
        except Exception:
            # `Exception` (not a bare except) so that interrupting the kernel
            # still stops the loop
            print("Error while downloading %s" % row["url"])
            # do not leave a partially written file behind: it would be
            # skipped as "already downloaded" on the next run
            if os.path.exists(filepath):
                os.remove(filepath)

# Root folder that receives the downloaded images
DATA_DIR = "./data"
download_images(df, DATA_DIR)

Let's split the data into training and testing sets. Some images were not downloaded properly and are corrupted; we will leave those out.

In [85]:
import glob

TRAIN_DIR = "{}/train".format(DATA_DIR)
TEST_DIR = "{}/test".format(DATA_DIR)

# Make sure a <split>/<gender> folder exists for every split and gender label
split_gender_dirs = [
    "{}/{}".format(split_dir, label)
    for split_dir in (TRAIN_DIR, TEST_DIR)
    for label in df["gender"].unique()
]
for target in split_gender_dirs:
    if not os.path.exists(target):
        os.makedirs(target)

In [86]:
from random import shuffle
import math
import shutil

# Fraction of each gender's image files that goes to the training set;
# the remaining files form the test set.
split_ratio = 0.7

def validate_and_move(files, target_dir):
    """Copy every readable image in ``files`` into ``target_dir``.

    Despite the name, the source files are copied, not moved. A file that
    cannot be opened and fully decoded as an image is reported and left out.

    Args:
        files: iterable of image file paths.
        target_dir: directory that receives the copies.

    Returns:
        None.
    """
    for f in tqdm.tqdm_notebook(files):
        try:
            # Image.open() is lazy and only identifies the file; load() reads
            # the pixel data, which is what exposes a truncated or corrupted
            # file. The with-block closes the file handle again.
            with Image.open(f) as im:
                im.load()
        except Exception:
            print("Skipping unreadable image %s" % f)
            continue
        # copy outside the try block so a real copy problem (for example a
        # missing target directory) is not silently swallowed
        shutil.copy(f, target_dir)

# Files are copied (not moved) into the train/test folders. If the shuffle
# differed between runs, re-running this cell would leave the same image in
# both the training and the test folder. A fixed seed on a sorted file list
# yields the same split on every run over the same set of files.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

for gender in df["gender"].unique():
    gender_dir = "{}/{}".format(DATA_DIR, gender)
    pattern = "{}/*.jpg".format(gender_dir)
    # glob returns the files in arbitrary order; sort before shuffling so the
    # seeded shuffle starts from the same sequence every time
    all_files = sorted(glob.glob(pattern))
    split_rng.shuffle(all_files)

    # the first `split_ratio` share of the shuffled files is the training set
    train_up_to = math.ceil(len(all_files) * split_ratio)
    train_files = all_files[:train_up_to]
    test_files = all_files[train_up_to:]

    validate_and_move(train_files, TRAIN_DIR + "/" + gender)
    validate_and_move(test_files, TEST_DIR + "/" + gender)







So far we did some basic visualization and prepared our dataset. We'll build and train a model in the next part.