# Age prediction using street view images
Link: https://www.kaggle.com/code/gcdatkin/age-prediction-from-images-cnn-regression/notebook 

In [125]:
import numpy as np
import pandas as pd
from pathlib import Path
import os.path

from sklearn.model_selection import train_test_split

import tensorflow as tf

from sklearn.metrics import r2_score

In [126]:
image_dir = Path('/workspace/workspace/ufo-prediction/image_data')

In [127]:
filepaths = pd.Series(list(image_dir.glob('**/*.jpg')), name='Filepath').astype(str)

def extract_age_and_name(filepath):
    filename = os.path.basename(filepath)  # Get the filename from the filepath
    age_and_name = filename.split('.')[0]  # Split by dot and take the first part
    age = ''.join(filter(str.isdigit, age_and_name))[:4]  # Extract first 4 digits for age
    if age:  # Ensure age string is not empty
        age = int(age)
    else:  # Default age if no digits found
        age = 0
    return age

ages = pd.Series(filepaths.apply(lambda x: extract_age_and_name(x)), name='Age')

# Filter to include only ages above 1900
filtered_ages = ages[ages > 1900]

# Ensure we only work with filepaths that have a corresponding age above 1900
filtered_filepaths = filepaths[ages > 1900]

images = pd.concat([filtered_filepaths, filtered_ages], axis=1).sample(frac=1.0, random_state=1).reset_index(drop=True)

In [128]:
images

Unnamed: 0,Filepath,Age
0,/workspace/workspace/ufo-prediction/image_data...,1965
1,/workspace/workspace/ufo-prediction/image_data...,1994
2,/workspace/workspace/ufo-prediction/image_data...,2009
3,/workspace/workspace/ufo-prediction/image_data...,1980
4,/workspace/workspace/ufo-prediction/image_data...,2011
...,...,...
868,/workspace/workspace/ufo-prediction/image_data...,1996
869,/workspace/workspace/ufo-prediction/image_data...,1953
870,/workspace/workspace/ufo-prediction/image_data...,2008
871,/workspace/workspace/ufo-prediction/image_data...,2012


In [129]:
# Split into train and test set
train_df, test_df = train_test_split(images, train_size=0.7, shuffle=True, random_state=1)

In [130]:
train_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2
)

test_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255
)

In [131]:
train_df

Unnamed: 0,Filepath,Age
502,/workspace/workspace/ufo-prediction/image_data...,1970
730,/workspace/workspace/ufo-prediction/image_data...,2003
604,/workspace/workspace/ufo-prediction/image_data...,1977
246,/workspace/workspace/ufo-prediction/image_data...,1996
494,/workspace/workspace/ufo-prediction/image_data...,1991
...,...,...
715,/workspace/workspace/ufo-prediction/image_data...,2004
767,/workspace/workspace/ufo-prediction/image_data...,1985
72,/workspace/workspace/ufo-prediction/image_data...,2009
235,/workspace/workspace/ufo-prediction/image_data...,1975


In [132]:
train_images = train_generator.flow_from_dataframe(
    dataframe=train_df,
    x_col='Filepath',
    y_col='Age',
    target_size=(120, 120),
    color_mode='rgb',
    class_mode='raw',
    batch_size=32,
    shuffle=True,
    seed=42,
    subset='training'
)

val_images = train_generator.flow_from_dataframe(
    dataframe=train_df,
    x_col='Filepath',
    y_col='Age',
    target_size=(120, 120),
    color_mode='rgb',
    class_mode='raw',
    batch_size=32,
    shuffle=True,
    seed=42,
    subset='validation'
)

test_images = test_generator.flow_from_dataframe(
    dataframe=test_df,
    x_col='Filepath',
    y_col='Age',
    target_size=(120, 120),
    color_mode='rgb',
    class_mode='raw',
    batch_size=32,
    shuffle=False
)

Found 489 validated image filenames.
Found 122 validated image filenames.
Found 262 validated image filenames.


# Training

In [133]:
inputs = tf.keras.Input(shape=(120, 120, 3))
x = tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3), activation='relu')(inputs)
x = tf.keras.layers.MaxPool2D()(x)
x = tf.keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu')(x)
x = tf.keras.layers.MaxPool2D()(x)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='linear')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer='adam',
    loss='mse'
)

history = model.fit(
    train_images,
    validation_data=val_images,
    epochs=1000,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=14,
            restore_best_weights=True
        )
    ]
)

Epoch 1/1000


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000


# Results

In [135]:
predicted_ages = np.squeeze(model.predict(test_images))
true_ages = test_images.labels

rmse = np.sqrt(model.evaluate(test_images, verbose=0))
print("     Test RMSE: {:.5f}".format(rmse))

r2 = r2_score(true_ages, predicted_ages)
print("Test R^2 Score: {:.5f}".format(r2))

     Test RMSE: 34.16974
Test R^2 Score: -0.19040


In [136]:
null_rmse = np.sqrt(np.sum((true_ages - np.mean(true_ages))**2) / len(true_ages))
print("Null/Baseline Model Test RMSE: {:.5f}".format(null_rmse))

Null/Baseline Model Test RMSE: 31.31807
