In [1]:
import os
import sys
import pandas as pd
from tqdm import tqdm
import shutil
import torch
from torchvision.transforms import transforms

from PIL import Image

from sklearn.model_selection import train_test_split

# Define the module path so that we can call our own helper functions
module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.utilities.utils import get_image_paths

In [2]:
# Setting device on GPU if available, else CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

# 1.1-Data-Cleaning-and-Dataset-Creation

The focus of this notebook is to clean the raw dataset and process it into a dataset ready to train a model.

From the first Notebook (1.0-Data-Analysis.ipynb), we extracted and recorded some parameters to help clean the dataset. The major parameters that we found to be the most influential in "good" quality imagery was the presence of blur, the level of brightness (under or over) and the presence of faces (detected via YuNet) in the imagery. This will be our criteria to "clean" the dataset


### Load the image statistics .csv

In [3]:
csv_path = os.path.join(module_path, "data", "image_statistics.csv")
df = pd.read_csv(csv_path)

In [4]:
df

Unnamed: 0,image_id,filename,path,height,width,size,"image_mean([R,G,B])","st_dev([R,G,B])",image_mode,age,aspect_ratio,laplacian_var,brightness,contrast,face
0,0,122542,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,8652,"[0.6380450129508972, 0.31967222690582275, 0.28...","[0.1845678985118866, 0.18805555999279022, 0.17...",RGB,20,1.0,1113.515213,Normal,249,[[30.868725 25.827965 57.02202 72.232376 ...
1,1,122550,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,9781,"[0.3353889286518097, 0.2767695486545563, 0.202...","[0.275545597076416, 0.19257135689258575, 0.186...",RGB,20,1.0,1639.238871,Normal,229,[[33.812958 22.145706 58.017765 76.09771...
2,2,122573,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,7770,"[0.6544163227081299, 0.6315654516220093, 0.548...","[0.25599220395088196, 0.2756023705005646, 0.24...",RGB,20,1.0,377.888782,Normal,250,[[ 43.162086 34.870895 47.159107 57.5873...
3,3,122642,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,4804,"[0.22364862263202667, 0.18860989809036255, 0.1...","[0.10604064166545868, 0.08799631893634796, 0.0...",RGB,20,1.0,185.784095,Normal,128,[[34.818657 19.407864 57.301003 76.56014 ...
4,4,122646,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,11211,"[0.44937387108802795, 0.43740808963775635, 0.3...","[0.2650805115699768, 0.27240049839019775, 0.28...",RGB,20,1.0,2360.912407,Normal,253,[[32.418243 21.702755 60.584217 74.31904...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29965,29965,99814,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,7910,"[0.17300859093666077, 0.12947949767112732, 0.1...","[0.20211130380630493, 0.16090622544288635, 0.1...",RGB,50,1.0,1620.302140,Normal,231,[[34.110657 22.911034 45.725872 65.63528...
29966,29966,99819,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,8559,"[0.1891694962978363, 0.1776348054409027, 0.218...","[0.23028673231601715, 0.21333780884742737, 0.2...",RGB,50,1.0,2070.926402,Normal,244,[[41.159447 27.54914 44.419582 61.93242...
29967,29967,99907,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,6876,"[0.5533868670463562, 0.5229092836380005, 0.292...","[0.2174087017774582, 0.2374659925699234, 0.233...",RGB,50,1.0,599.539258,Normal,255,[[44.19874 19.792328 53.73103 73.69324 ...
29968,29968,99983,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,10369,"[0.48215118050575256, 0.3947139084339142, 0.33...","[0.3137766718864441, 0.267193078994751, 0.2336...",RGB,50,1.0,2070.207980,Normal,253,[[36.84456 20.456741 52.961735 73.94894 ...


The two columns of interest that we want to use are the "laplacian_var" and brightness columns. The "laplacian_var" column contains the laplacian variance, which we will use to identify blur in the imagery. The brightness column contains labels on whether the image is too bright, too dark or normal. While this column contains classifications instead of values, we can change the classifications by adjusting the thresholds in the previous notebook. 

### Clean the dataset

From the previous notebook, we determined a good threshold for blur was 100. Interestingly, while the imagery labelled "too dark" exhibited examples that for the majority were too dark, the imagery that was considered "too bright" could be the result of being old imagery, drawings or an image containing generally bright colours (but not overexposed). Curious to see it's influence on training the model, we will include this set of data.

In [5]:
# Clean the dataset 
clean_df = df[(df.laplacian_var >= 100) & (df.brightness != "Too Dark") & (df.face.notna()) & (df.image_mode != "L")]
clean_df

Unnamed: 0,image_id,filename,path,height,width,size,"image_mean([R,G,B])","st_dev([R,G,B])",image_mode,age,aspect_ratio,laplacian_var,brightness,contrast,face
0,0,122542,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,8652,"[0.6380450129508972, 0.31967222690582275, 0.28...","[0.1845678985118866, 0.18805555999279022, 0.17...",RGB,20,1.0,1113.515213,Normal,249,[[30.868725 25.827965 57.02202 72.232376 ...
1,1,122550,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,9781,"[0.3353889286518097, 0.2767695486545563, 0.202...","[0.275545597076416, 0.19257135689258575, 0.186...",RGB,20,1.0,1639.238871,Normal,229,[[33.812958 22.145706 58.017765 76.09771...
2,2,122573,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,7770,"[0.6544163227081299, 0.6315654516220093, 0.548...","[0.25599220395088196, 0.2756023705005646, 0.24...",RGB,20,1.0,377.888782,Normal,250,[[ 43.162086 34.870895 47.159107 57.5873...
3,3,122642,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,4804,"[0.22364862263202667, 0.18860989809036255, 0.1...","[0.10604064166545868, 0.08799631893634796, 0.0...",RGB,20,1.0,185.784095,Normal,128,[[34.818657 19.407864 57.301003 76.56014 ...
4,4,122646,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,11211,"[0.44937387108802795, 0.43740808963775635, 0.3...","[0.2650805115699768, 0.27240049839019775, 0.28...",RGB,20,1.0,2360.912407,Normal,253,[[32.418243 21.702755 60.584217 74.31904...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29965,29965,99814,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,7910,"[0.17300859093666077, 0.12947949767112732, 0.1...","[0.20211130380630493, 0.16090622544288635, 0.1...",RGB,50,1.0,1620.302140,Normal,231,[[34.110657 22.911034 45.725872 65.63528...
29966,29966,99819,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,8559,"[0.1891694962978363, 0.1776348054409027, 0.218...","[0.23028673231601715, 0.21333780884742737, 0.2...",RGB,50,1.0,2070.926402,Normal,244,[[41.159447 27.54914 44.419582 61.93242...
29967,29967,99907,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,6876,"[0.5533868670463562, 0.5229092836380005, 0.292...","[0.2174087017774582, 0.2374659925699234, 0.233...",RGB,50,1.0,599.539258,Normal,255,[[44.19874 19.792328 53.73103 73.69324 ...
29968,29968,99983,C:\Users\Jared\projects\python\braineye-age-pr...,128,128,10369,"[0.48215118050575256, 0.3947139084339142, 0.33...","[0.3137766718864441, 0.267193078994751, 0.2336...",RGB,50,1.0,2070.207980,Normal,253,[[36.84456 20.456741 52.961735 73.94894 ...


In [6]:
rejected_count = len(df) - len(clean_df)
print(f"Reject examples: {rejected_count}")

Reject examples: 677


## Dataset Creation

Now that we have cleaned the dataset, let's create the cleaned dataset. For this dataset, we need to create a copy separate to our raw dataset. In addition to this, we wish to split this dataset into three groups train, test and val. With all these changes, we also need to record the chages and any mappings

### Check output folders exist and create if not

In [7]:
# Check if the interrim folder exists. If not, create folder
interrim_path = os.path.join(module_path, "data", "interrim")
if not os.path.isdir(interrim_path):
    os.makedirs(interrim_path)

### Record mappings

In [8]:
dataset = clean_df[["path", "filename", "age"]]
dataset

Unnamed: 0,path,filename,age
0,C:\Users\Jared\projects\python\braineye-age-pr...,122542,20
1,C:\Users\Jared\projects\python\braineye-age-pr...,122550,20
2,C:\Users\Jared\projects\python\braineye-age-pr...,122573,20
3,C:\Users\Jared\projects\python\braineye-age-pr...,122642,20
4,C:\Users\Jared\projects\python\braineye-age-pr...,122646,20
...,...,...,...
29965,C:\Users\Jared\projects\python\braineye-age-pr...,99814,50
29966,C:\Users\Jared\projects\python\braineye-age-pr...,99819,50
29967,C:\Users\Jared\projects\python\braineye-age-pr...,99907,50
29968,C:\Users\Jared\projects\python\braineye-age-pr...,99983,50


In [9]:
# Rename path and age
dataset.rename(columns = {"path": "original_path", "age": "target"}, inplace=True)
dataset

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.rename(columns = {"path": "original_path", "age": "target"}, inplace=True)


Unnamed: 0,original_path,filename,target
0,C:\Users\Jared\projects\python\braineye-age-pr...,122542,20
1,C:\Users\Jared\projects\python\braineye-age-pr...,122550,20
2,C:\Users\Jared\projects\python\braineye-age-pr...,122573,20
3,C:\Users\Jared\projects\python\braineye-age-pr...,122642,20
4,C:\Users\Jared\projects\python\braineye-age-pr...,122646,20
...,...,...,...
29965,C:\Users\Jared\projects\python\braineye-age-pr...,99814,50
29966,C:\Users\Jared\projects\python\braineye-age-pr...,99819,50
29967,C:\Users\Jared\projects\python\braineye-age-pr...,99907,50
29968,C:\Users\Jared\projects\python\braineye-age-pr...,99983,50


In [10]:
dataset["updated_path"] = dataset["original_path"].apply(lambda x: os.path.join(interrim_path, os.path.split(x)[-1]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["updated_path"] = dataset["original_path"].apply(lambda x: os.path.join(interrim_path, os.path.split(x)[-1]))


### Copy the "cleaned" dataset

In [11]:
for index, row in tqdm(dataset.iterrows(), total=len(dataset)):
    # Check if the file already exists
    if os.path.isfile(row["updated_path"]):
        pass
    else:
        shutil.copyfile(src=row["original_path"], dst=row["updated_path"])

100%|██████████| 29293/29293 [03:40<00:00, 132.75it/s]


## Train-test split dataset

Since there are close to 30,000 images and 30 different years present in the dataset, we will go with a 80:20 slpit. Usually with this number of images, we may get away with training more and having a larger train set, but due to the large age range, we will settle with a smaller split.

In [12]:
train_set, val_set = train_test_split(dataset, random_state=42, test_size=0.2, stratify=dataset["target"])

In [13]:
# Check if the processed/train folder exists. If not, create folder
train_path = os.path.join(module_path, "data", "processed", "train")
val_path = os.path.join(module_path, "data", "processed", "val")

# Since the dataset is stratified, we can expect all ages to appear in the training set (larger set)
for target in tqdm(train_set.target.unique(), total=len(train_set.target.unique())):
    # Set up the paths
    target_folder = str(target)
    train_target_path = os.path.join(train_path, target_folder)
    val_target_path = os.path.join(val_path, target_folder)
    
    # Check if directories exist. If not, create
    if not os.path.isdir(train_target_path):
        os.makedirs(train_target_path)
        
    if not os.path.isdir(val_target_path):
        os.makedirs(val_target_path)


100%|██████████| 31/31 [00:00<00:00, 1328.72it/s]


In [14]:
# Move the train_set images to .../processed/train
train_set["processed_path"] = train_set["original_path"].apply(lambda x: os.path.join(train_path, "\\".join(x.split("\\")[-2:])))
val_set["processed_path"] = val_set["original_path"].apply(lambda x: os.path.join(val_path, "\\".join(x.split("\\")[-2:])))



### Create train

In [15]:
# Copy the interrim images in the train set to processed 
for index, row in tqdm(train_set.iterrows(), total=len(train_set)):
    # Check if the file already exists
    if os.path.isfile(row["processed_path"]):
        pass
    else:
        shutil.copyfile(src=row["updated_path"], dst=row["processed_path"])

100%|██████████| 23434/23434 [02:47<00:00, 139.63it/s]


In [16]:
# Copy the interrim images in the test set to processed 
for index, row in tqdm(val_set.iterrows(), total=len(val_set)):
    # Check if the file already exists
    if os.path.isfile(row["processed_path"]):
        pass
    else:
        shutil.copyfile(src=row["updated_path"], dst=row["processed_path"])

100%|██████████| 5859/5859 [00:45<00:00, 129.61it/s]


### Save mappings

In [17]:
mappings = pd.concat([train_set, val_set], axis=0)
mappings

Unnamed: 0,original_path,filename,target,updated_path,processed_path
17875,C:\Users\Jared\projects\python\braineye-age-pr...,165164,35,C:\Users\Jared\projects\python\braineye-age-pr...,C:\Users\Jared\projects\python\braineye-age-pr...
15182,C:\Users\Jared\projects\python\braineye-age-pr...,165534,33,C:\Users\Jared\projects\python\braineye-age-pr...,C:\Users\Jared\projects\python\braineye-age-pr...
7983,C:\Users\Jared\projects\python\braineye-age-pr...,144035,28,C:\Users\Jared\projects\python\braineye-age-pr...,C:\Users\Jared\projects\python\braineye-age-pr...
13853,C:\Users\Jared\projects\python\braineye-age-pr...,169976,32,C:\Users\Jared\projects\python\braineye-age-pr...,C:\Users\Jared\projects\python\braineye-age-pr...
24457,C:\Users\Jared\projects\python\braineye-age-pr...,152405,41,C:\Users\Jared\projects\python\braineye-age-pr...,C:\Users\Jared\projects\python\braineye-age-pr...
...,...,...,...,...,...
23423,C:\Users\Jared\projects\python\braineye-age-pr...,155489,40,C:\Users\Jared\projects\python\braineye-age-pr...,C:\Users\Jared\projects\python\braineye-age-pr...
25820,C:\Users\Jared\projects\python\braineye-age-pr...,169198,42,C:\Users\Jared\projects\python\braineye-age-pr...,C:\Users\Jared\projects\python\braineye-age-pr...
1013,C:\Users\Jared\projects\python\braineye-age-pr...,32069,20,C:\Users\Jared\projects\python\braineye-age-pr...,C:\Users\Jared\projects\python\braineye-age-pr...
4745,C:\Users\Jared\projects\python\braineye-age-pr...,146064,24,C:\Users\Jared\projects\python\braineye-age-pr...,C:\Users\Jared\projects\python\braineye-age-pr...


In [18]:
mappings_path = os.path.join(module_path, "data", "mappings.csv")
mappings.to_csv(mappings_path)

## Image mean and standard deviation of cleaned dataset

Since the dataset has been cleaned, let's quickly go through and collect the new dataset mean and standard deviation

In [19]:
image_list = get_image_paths(interrim_path)

image_means = torch.Tensor([0., 0. ,0.]).to(device)
image_stds = torch.Tensor([0., 0., 0.]).to(device)

In [20]:
# Iterate through all the image to extract their information
for path in tqdm(image_list):
    # Load the next image
    image = Image.open(path)
   
    # Some examples are by default grayscale (image_mode = "L"), let's convert these to RGB but also record it's initial mode. 
    image_mode = image.mode
    if image_mode == "L":
        image = image.convert("RGB")
    
    # Convert the image to shape (channel, height, width). Automatically divides values by 255.
    image_tensor = transforms.ToTensor()(image).to(device)
    
    # Extract the mean and standard deviation
    mean = image_tensor.mean(dim=(1,2))
    image_means += mean
    
    st_dev = image_tensor.view(3, -1).std(dim=-1)
    image_stds += st_dev 
    
    

100%|██████████| 29293/29293 [01:23<00:00, 351.18it/s]


In [21]:
print("Mean: ")
print(image_means / len(image_list))
print("Standard Deviation: ")
print(image_stds / len(image_list))

Mean: 
tensor([0.4519, 0.3799, 0.3389], device='cuda:0')
Standard Deviation: 
tensor([0.2496, 0.2254, 0.2166], device='cuda:0')
