In [10]:
# Common imports
import numpy as np
import os
import seaborn as sns
from sklearn.preprocessing import Imputer
import matplotlib.image as mpimg

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = ""
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [2]:
import pandas as pd
import scipy.io
imdb_mat = scipy.io.loadmat('./datasets/imdb_crop/imdb.mat')
wiki_mat = scipy.io.loadmat('./datasets/wiki_crop/wiki.mat')

In [3]:
wiki = pd.DataFrame(np.hstack((wiki_mat['wiki'])))
imdb = pd.DataFrame(np.hstack((imdb_mat['imdb'])))

In [4]:
for c in wiki.columns:
    print(c, wiki[c][0][0].shape)
for c in imdb.columns:
    print(c, imdb[c][0][0].shape)
data_wiki = pd.DataFrame({c:wiki[c][0][0] for c in wiki.columns})
data_imdb = pd.DataFrame({c:imdb[c][0][0] for c in imdb.columns if (c !='celeb_names')})

('dob', (62328,))
('photo_taken', (62328,))
('full_path', (62328,))
('gender', (62328,))
('name', (62328,))
('face_location', (62328,))
('face_score', (62328,))
('second_face_score', (62328,))
('dob', (460723,))
('photo_taken', (460723,))
('full_path', (460723,))
('gender', (460723,))
('name', (460723,))
('face_location', (460723,))
('face_score', (460723,))
('second_face_score', (460723,))
('celeb_names', (20284,))
('celeb_id', (460723,))


In [5]:
print(data_wiki.shape, data_imdb.shape)

((62328, 8), (460723, 9))


Remove unuseful columns:

In [6]:
data_imdb.drop('face_location', axis=1, inplace=True)
data_imdb.drop('face_score', axis=1, inplace=True)
data_imdb.drop('second_face_score', axis=1, inplace=True)
data_imdb.drop('celeb_id', axis=1, inplace=True)
data_wiki.drop('face_location', axis=1, inplace=True)
data_wiki.drop('face_score', axis=1, inplace=True)
data_wiki.drop('second_face_score', axis=1, inplace=True)

Customize strings:

In [7]:
data_wiki['full_path'] = data_wiki['full_path'].astype(str).str.replace("]", "")
data_wiki['full_path'] = data_wiki['full_path'].astype(str).str.replace("'", "")
data_wiki['full_path'] = data_wiki['full_path'].astype(str).str.replace("u", "")
data_wiki['full_path'] = data_wiki['full_path'].astype(str).str.replace("[", "./datasets/wiki/")
data_wiki['full_path_crop'] = data_wiki['full_path'].astype(str).str.replace("wiki", "wiki_crop")
data_imdb['full_path'] = data_imdb['full_path'].astype(str).str.replace("]", "")
data_imdb['full_path'] = data_imdb['full_path'].astype(str).str.replace("'", "")
data_imdb['full_path'] = data_imdb['full_path'].astype(str).str.replace("u", "")
data_imdb['full_path'] = data_imdb['full_path'].astype(str).str.replace("[", "./datasets/imdb/")
data_imdb['full_path_crop'] = data_imdb['full_path'].astype(str).str.replace("imdb", "imdb_crop")

In [8]:
data_wiki['name'] = data_wiki['name'].astype(str).str.replace("]", "")
data_wiki['name'] = data_wiki['name'].astype(str).str.replace("[", "")
data_imdb['name'] = data_imdb['name'].astype(str).str.replace("]", "")
data_imdb['name'] = data_imdb['name'].astype(str).str.replace("[", "")

Add time information:

In [9]:
# Cut on date of birth to avoid overflow on timedate
data_imdb = data_imdb[data_imdb["dob"]>500000]

In [10]:
data_imdb[data_imdb['dob']>500000].describe()

Unnamed: 0,dob,gender,photo_taken
count,460594.0,452132.0,460594.0
mean,719189.034768,0.581996,2005.460894
std,5566.78265,0.493231,9.054951
min,632653.0,0.0,1961.0
25%,716375.0,0.0,2004.0
50%,719935.0,1.0,2008.0
75%,723073.0,1.0,2011.0
max,734963.0,1.0,2015.0


In [11]:
from datetime import timedelta, datetime

data_wiki['birthdate'] = data_wiki['dob'].apply(lambda x: datetime.fromordinal(x) + timedelta(days=x%1) - timedelta(days = 366))
data_imdb['birthdate'] = data_imdb['dob'].apply(lambda x: datetime.fromordinal(x) + timedelta(days=x%1) - timedelta(days = 366))

In [12]:
data_wiki['birthdate'] = pd.to_datetime(data_wiki['birthdate'], errors='coerce')
data_wiki["year_birth"] = data_wiki["birthdate"].dt.year
data_wiki["month_birth"] = data_wiki["birthdate"].dt.month
data_wiki["weekday_birth"] = data_wiki["birthdate"].dt.weekday
data_wiki["day_birth"] = data_wiki["birthdate"].dt.day

data_imdb['birthdate'] = pd.to_datetime(data_imdb['birthdate'], errors='coerce')
data_imdb["year_birth"] = data_imdb["birthdate"].dt.year
data_imdb["month_birth"] = data_imdb["birthdate"].dt.month
data_imdb["weekday_birth"] = data_imdb["birthdate"].dt.weekday
data_imdb["day_birth"] = data_imdb["birthdate"].dt.day


Compute age:

In [13]:
data_wiki["age"] = data_wiki['photo_taken']-data_wiki["year_birth"]
data_imdb["age"] = data_imdb['photo_taken']-data_imdb["year_birth"]

Compute photo size:

In [14]:
import os
data_wiki['filesize'] = data_wiki['full_path'].apply(lambda x: os.path.getsize(x))
data_wiki['filesize_crop'] = data_wiki['full_path_crop'].apply(lambda x: os.path.getsize(x))
data_imdb['filesize_crop'] = data_imdb['full_path_crop'].apply(lambda x: os.path.getsize(x))

Image shape:

In [16]:
import matplotlib.image as mpimg

data_imdb['imsize'] = data_imdb['full_path_crop'].apply(lambda x: len(mpimg.imread(x).shape))
data_wiki['imsize'] = data_wiki['full_path_crop'].apply(lambda x: len(mpimg.imread(x).shape))

KeyboardInterrupt: 

In [246]:
data_imdb

Unnamed: 0,dob,full_path,gender,name,photo_taken,full_path_crop,birthdate,year_birth,month_birth,weekday_birth,day_birth,age,filesize_crop
0,693726,./datasets/imdb/01/nm0000001_rm124825600_1899-...,1.0,u'Fred Astaire',1968,./datasets/imdb_crop/01/nm0000001_rm124825600_...,1899-05-10 00:00:00,1899,5,2,10,69,11975
1,693726,./datasets/imdb/01/nm0000001_rm3343756032_1899...,1.0,u'Fred Astaire',1970,./datasets/imdb_crop/01/nm0000001_rm3343756032...,1899-05-10 00:00:00,1899,5,2,10,71,8914
2,693726,./datasets/imdb/01/nm0000001_rm577153792_1899-...,1.0,u'Fred Astaire',1968,./datasets/imdb_crop/01/nm0000001_rm577153792_...,1899-05-10 00:00:00,1899,5,2,10,69,23480
3,693726,./datasets/imdb/01/nm0000001_rm946909184_1899-...,1.0,u'Fred Astaire',1968,./datasets/imdb_crop/01/nm0000001_rm946909184_...,1899-05-10 00:00:00,1899,5,2,10,69,22495
4,693726,./datasets/imdb/01/nm0000001_rm980463616_1899-...,1.0,u'Fred Astaire',1968,./datasets/imdb_crop/01/nm0000001_rm980463616_...,1899-05-10 00:00:00,1899,5,2,10,69,17269
5,702986,./datasets/imdb/02/nm0000002_rm1075631616_1924...,0.0,u'Lauren Bacall',1991,./datasets/imdb_crop/02/nm0000002_rm1075631616...,1924-09-16,1924,9,1,16,67,27281
6,702986,./datasets/imdb/02/nm0000002_rm1346607872_1924...,0.0,u'Lauren Bacall',2004,./datasets/imdb_crop/02/nm0000002_rm1346607872...,1924-09-16,1924,9,1,16,80,5731
7,702986,./datasets/imdb/02/nm0000002_rm1363385088_1924...,0.0,u'Lauren Bacall',2004,./datasets/imdb_crop/02/nm0000002_rm1363385088...,1924-09-16,1924,9,1,16,80,7478
8,702986,./datasets/imdb/02/nm0000002_rm1411175936_1924...,0.0,u'Lauren Bacall',1991,./datasets/imdb_crop/02/nm0000002_rm1411175936...,1924-09-16,1924,9,1,16,67,22915
9,702986,./datasets/imdb/02/nm0000002_rm1447271168_1924...,0.0,u'Lauren Bacall',2004,./datasets/imdb_crop/02/nm0000002_rm1447271168...,1924-09-16,1924,9,1,16,80,1849


# Clean dataset

In [247]:
data_wiki.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62328 entries, 0 to 62327
Data columns (total 14 columns):
dob               62328 non-null int32
full_path         62328 non-null object
gender            59685 non-null float64
name              62328 non-null object
photo_taken       62328 non-null uint16
full_path_crop    62328 non-null object
birthdate         62303 non-null datetime64[ns]
year_birth        62303 non-null float64
month_birth       62303 non-null float64
weekday_birth     62303 non-null float64
day_birth         62303 non-null float64
age               62303 non-null float64
filesize          62328 non-null int64
filesize_crop     62328 non-null int64
dtypes: datetime64[ns](1), float64(6), int32(1), int64(2), object(3), uint16(1)
memory usage: 6.1+ MB


In [248]:
data_imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 460594 entries, 0 to 460722
Data columns (total 13 columns):
dob               460594 non-null int32
full_path         460594 non-null object
gender            452132 non-null float64
name              460594 non-null object
photo_taken       460594 non-null uint16
full_path_crop    460594 non-null object
birthdate         460594 non-null datetime64[ns]
year_birth        460594 non-null int64
month_birth       460594 non-null int64
weekday_birth     460594 non-null int64
day_birth         460594 non-null int64
age               460594 non-null int64
filesize_crop     460594 non-null int64
dtypes: datetime64[ns](1), float64(1), int32(1), int64(6), object(3), uint16(1)
memory usage: 44.8+ MB


In [249]:
data_wiki[data_wiki["age"].isnull()].count()

dob               25
full_path         25
gender            25
name              25
photo_taken       25
full_path_crop    25
birthdate          0
year_birth         0
month_birth        0
weekday_birth      0
day_birth          0
age                0
filesize          25
filesize_crop     25
dtype: int64

We have 25 instances without date information.
We have many corrupted photos.

Let's cut all files with photos having less than 10 kB size:

In [250]:
data_wiki_clean = data_wiki[data_wiki["filesize"]>10000]

In [251]:
data_wiki.shape

(62328, 14)

In [252]:
data_wiki_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52486 entries, 0 to 62327
Data columns (total 14 columns):
dob               52486 non-null int32
full_path         52486 non-null object
gender            51429 non-null float64
name              52486 non-null object
photo_taken       52486 non-null uint16
full_path_crop    52486 non-null object
birthdate         52475 non-null datetime64[ns]
year_birth        52475 non-null float64
month_birth       52475 non-null float64
weekday_birth     52475 non-null float64
day_birth         52475 non-null float64
age               52475 non-null float64
filesize          52486 non-null int64
filesize_crop     52486 non-null int64
dtypes: datetime64[ns](1), float64(6), int32(1), int64(2), object(3), uint16(1)
memory usage: 5.5+ MB


In [253]:
data_wiki_clean = data_wiki_clean.dropna(axis=0)
data_wiki_clean = data_wiki_clean.drop(['dob'], axis=1)
data_imdb_clean = data_imdb.dropna(axis=0)
data_imdb_clean = data_imdb_clean.drop(['dob'], axis=1)

In [None]:
data_imdb_clean['imsize'] = data_imdb_clean['full_path_crop'].apply(lambda x: len(mpimg.imread(x).shape))


In [254]:
data_wiki_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51418 entries, 0 to 62327
Data columns (total 13 columns):
full_path         51418 non-null object
gender            51418 non-null float64
name              51418 non-null object
photo_taken       51418 non-null uint16
full_path_crop    51418 non-null object
birthdate         51418 non-null datetime64[ns]
year_birth        51418 non-null float64
month_birth       51418 non-null float64
weekday_birth     51418 non-null float64
day_birth         51418 non-null float64
age               51418 non-null float64
filesize          51418 non-null int64
filesize_crop     51418 non-null int64
dtypes: datetime64[ns](1), float64(6), int64(2), object(3), uint16(1)
memory usage: 5.2+ MB


In [255]:
data_imdb_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 452132 entries, 0 to 460722
Data columns (total 12 columns):
full_path         452132 non-null object
gender            452132 non-null float64
name              452132 non-null object
photo_taken       452132 non-null uint16
full_path_crop    452132 non-null object
birthdate         452132 non-null datetime64[ns]
year_birth        452132 non-null int64
month_birth       452132 non-null int64
weekday_birth     452132 non-null int64
day_birth         452132 non-null int64
age               452132 non-null int64
filesize_crop     452132 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(6), object(3), uint16(1)
memory usage: 42.3+ MB


In [257]:
data_imdb_clean

Unnamed: 0,full_path,gender,name,photo_taken,full_path_crop,birthdate,year_birth,month_birth,weekday_birth,day_birth,age,filesize_crop
0,./datasets/imdb/01/nm0000001_rm124825600_1899-...,1.0,u'Fred Astaire',1968,./datasets/imdb_crop/01/nm0000001_rm124825600_...,1899-05-10 00:00:00,1899,5,2,10,69,11975
1,./datasets/imdb/01/nm0000001_rm3343756032_1899...,1.0,u'Fred Astaire',1970,./datasets/imdb_crop/01/nm0000001_rm3343756032...,1899-05-10 00:00:00,1899,5,2,10,71,8914
2,./datasets/imdb/01/nm0000001_rm577153792_1899-...,1.0,u'Fred Astaire',1968,./datasets/imdb_crop/01/nm0000001_rm577153792_...,1899-05-10 00:00:00,1899,5,2,10,69,23480
3,./datasets/imdb/01/nm0000001_rm946909184_1899-...,1.0,u'Fred Astaire',1968,./datasets/imdb_crop/01/nm0000001_rm946909184_...,1899-05-10 00:00:00,1899,5,2,10,69,22495
4,./datasets/imdb/01/nm0000001_rm980463616_1899-...,1.0,u'Fred Astaire',1968,./datasets/imdb_crop/01/nm0000001_rm980463616_...,1899-05-10 00:00:00,1899,5,2,10,69,17269
5,./datasets/imdb/02/nm0000002_rm1075631616_1924...,0.0,u'Lauren Bacall',1991,./datasets/imdb_crop/02/nm0000002_rm1075631616...,1924-09-16,1924,9,1,16,67,27281
6,./datasets/imdb/02/nm0000002_rm1346607872_1924...,0.0,u'Lauren Bacall',2004,./datasets/imdb_crop/02/nm0000002_rm1346607872...,1924-09-16,1924,9,1,16,80,5731
7,./datasets/imdb/02/nm0000002_rm1363385088_1924...,0.0,u'Lauren Bacall',2004,./datasets/imdb_crop/02/nm0000002_rm1363385088...,1924-09-16,1924,9,1,16,80,7478
8,./datasets/imdb/02/nm0000002_rm1411175936_1924...,0.0,u'Lauren Bacall',1991,./datasets/imdb_crop/02/nm0000002_rm1411175936...,1924-09-16,1924,9,1,16,67,22915
9,./datasets/imdb/02/nm0000002_rm1447271168_1924...,0.0,u'Lauren Bacall',2004,./datasets/imdb_crop/02/nm0000002_rm1447271168...,1924-09-16,1924,9,1,16,80,1849


Save files:

In [258]:
data_wiki_clean.to_csv( './datasets/data_wiki_clean.csv',index=False)
data_imdb_clean.to_csv( './datasets/data_imdb_clean.csv',index=False)


In [12]:
import pandas as pd
data_imdb = pd.read_csv("./datasets/data_imdb_clean2.csv", parse_dates=['birthdate'])
width = 299
height = 299
channels = 3

In [36]:
data_imdb['full_path_crop'][1]

'./datasets/imdb_crop/01/nm0000001_rm3343756032_1899-5-10_1970.jpg'

In [34]:
 mpimg.imread('./datasets/imdb_crop/01/nm0000001_rm124825600_1899-5-10_1968.jpg').shape[1]

257

In [35]:
data_imdb['imheight'] =0
data_imdb.loc[data_imdb['imsize']==3, 'imheight'] = data_imdb['full_path_crop'].apply(lambda x: mpimg.imread(x).shape[0])

In [37]:
data_imdb['imwidth'] =0
data_imdb.loc[data_imdb['imsize']==3, 'imwidth'] = data_imdb['full_path_crop'].apply(lambda x: mpimg.imread(x).shape[1])

In [38]:
data_imdb

Unnamed: 0,full_path,gender,name,photo_taken,full_path_crop,birthdate,year_birth,month_birth,weekday_birth,day_birth,age,filesize_crop,imsize,imheight,imwidth
0,./datasets/imdb/01/nm0000001_rm124825600_1899-...,1.0,u'Fred Astaire',1968,./datasets/imdb_crop/01/nm0000001_rm124825600_...,1899-05-10 00:00:00,1899,5,2,10,69,11975,3,257,257
1,./datasets/imdb/01/nm0000001_rm3343756032_1899...,1.0,u'Fred Astaire',1970,./datasets/imdb_crop/01/nm0000001_rm3343756032...,1899-05-10 00:00:00,1899,5,2,10,71,8914,3,263,263
2,./datasets/imdb/01/nm0000001_rm577153792_1899-...,1.0,u'Fred Astaire',1968,./datasets/imdb_crop/01/nm0000001_rm577153792_...,1899-05-10 00:00:00,1899,5,2,10,69,23480,3,500,500
3,./datasets/imdb/01/nm0000001_rm946909184_1899-...,1.0,u'Fred Astaire',1968,./datasets/imdb_crop/01/nm0000001_rm946909184_...,1899-05-10 00:00:00,1899,5,2,10,69,22495,3,401,400
4,./datasets/imdb/01/nm0000001_rm980463616_1899-...,1.0,u'Fred Astaire',1968,./datasets/imdb_crop/01/nm0000001_rm980463616_...,1899-05-10 00:00:00,1899,5,2,10,69,17269,3,340,340
5,./datasets/imdb/02/nm0000002_rm1075631616_1924...,0.0,u'Lauren Bacall',1991,./datasets/imdb_crop/02/nm0000002_rm1075631616...,1924-09-16,1924,9,1,16,67,27281,3,500,500
6,./datasets/imdb/02/nm0000002_rm1346607872_1924...,0.0,u'Lauren Bacall',2004,./datasets/imdb_crop/02/nm0000002_rm1346607872...,1924-09-16,1924,9,1,16,80,5731,3,210,211
7,./datasets/imdb/02/nm0000002_rm1363385088_1924...,0.0,u'Lauren Bacall',2004,./datasets/imdb_crop/02/nm0000002_rm1363385088...,1924-09-16,1924,9,1,16,80,7478,3,259,259
8,./datasets/imdb/02/nm0000002_rm1411175936_1924...,0.0,u'Lauren Bacall',1991,./datasets/imdb_crop/02/nm0000002_rm1411175936...,1924-09-16,1924,9,1,16,67,22915,3,482,500
9,./datasets/imdb/02/nm0000002_rm1447271168_1924...,0.0,u'Lauren Bacall',2004,./datasets/imdb_crop/02/nm0000002_rm1447271168...,1924-09-16,1924,9,1,16,80,1849,3,67,67


In [39]:
data_imdb.to_csv( './datasets/data_imdb_clean3.csv',index=False)
