In [2]:
import sys
sys.path.append('/usr/local/lib/python3.6/site-packages') # For cv2 finding
import os, glob, math, cv2, time
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from keras.utils import np_utils
from tqdm import tqdm

In [3]:
# Relative directories of the image data: training image paths are built under
# data/imgs/train, and test images are globbed from test_folder further down.
train_folder = 'data/imgs/train'
test_folder = 'data/imgs/test'

### load the csv file describing the training dataset into a pandas dataframe

In [4]:
# Index of the training set; the cells below rely on its subject, classname and img columns.
driver_images = pd.read_csv('data/driver_imgs_list.csv')

### add path to image and target value series to pandas dataframe describing the training set

In [5]:
# Build each image path as <train_folder>/<classname>/<img> using vectorized string
# concatenation instead of a per-row Python lambda.
driver_images['img_path'] = train_folder + '/' + driver_images['classname'] + '/' + driver_images['img']
# Class names look like 'c0'; dropping the leading character leaves the integer label.
driver_images['target'] = driver_images['classname'].str[1:].astype(int)
# One-hot encode the integer labels over 10 classes.
targets = np_utils.to_categorical(np.array(driver_images['target']), 10)
driver_images.head()

Unnamed: 0,subject,classname,img,img_path,target
0,p002,c0,img_44733.jpg,data/imgs/train/c0/img_44733.jpg,0
1,p002,c0,img_72999.jpg,data/imgs/train/c0/img_72999.jpg,0
2,p002,c0,img_25094.jpg,data/imgs/train/c0/img_25094.jpg,0
3,p002,c0,img_69092.jpg,data/imgs/train/c0/img_69092.jpg,0
4,p002,c0,img_92629.jpg,data/imgs/train/c0/img_92629.jpg,0


In [6]:
def getDatasets(driver_images, num_classes=10):
    """Extract the image paths and one-hot encoded targets from a dataframe.

    Args:
        driver_images: dataframe providing an 'img_path' column and an integer
            'target' column.
        num_classes: width of each one-hot vector; defaults to 10, the value the
            notebook uses everywhere.

    Returns:
        A tuple (image_files, target_categories). image_files is a numpy array of
        the image paths; target_categories is the array produced by
        np_utils.to_categorical, one row per image.
    """
    image_files = np.array(driver_images['img_path'])
    integer_targets = np.array(driver_images['target'])
    target_categories = np_utils.to_categorical(integer_targets, num_classes)
    return (image_files, target_categories)

In [7]:
def splitTrainingSetWithShuffle(driver_images, train_part = .8, random_state=None):
    """Split a dataset into training and validation parts by driver, then shuffle each part.

    All rows of a given driver (the 'subject' column) end up in exactly one of the
    two parts. The distinct drivers are sorted and the first
    round(n_drivers * (1 - train_part)) of them form the validation part, so the
    driver assignment itself is deterministic; only the row order is randomized.

    Args:
        driver_images: dataframe with a 'subject' column identifying the driver.
        train_part: fraction of the distinct drivers assigned to the training
            part; must lie in [0, 1].
        random_state: optional seed forwarded to DataFrame.sample for both
            shuffles. The default None keeps the original unseeded behaviour.

    Returns:
        A tuple (driver_images_train, driver_images_valid), each shuffled and
        re-indexed from 0.

    Raises:
        ValueError: if train_part is outside [0, 1].
    """
    if not 0 <= train_part <= 1:
        raise ValueError('train_part must be between 0 and 1, got %r' % (train_part,))

    distinct_drivers = driver_images['subject'].sort_values().unique()
    valid_drivers = int(round(len(distinct_drivers) * (1 - train_part)))

    drivers_valid = distinct_drivers[:valid_drivers]
    drivers_train = distinct_drivers[valid_drivers:]

    driver_images_valid = driver_images.loc[driver_images['subject'].isin(drivers_valid)]
    driver_images_train = driver_images.loc[driver_images['subject'].isin(drivers_train)]

    # Shuffle the validation part first, then the training part, so an externally
    # seeded global RNG is consumed in the same order as before.
    driver_images_valid = driver_images_valid.sample(frac=1, random_state=random_state).reset_index(drop=True)
    driver_images_train = driver_images_train.sample(frac=1, random_state=random_state).reset_index(drop=True)
    return (driver_images_train, driver_images_valid)

In [4]:
from keras.preprocessing import image                  


def path_to_tensor(img_path, size):
    """
    Load the image stored at img_path, resized to size x size pixels, and return it
    as a 4D tensor of shape (1, size, size, 3).
    img_path - path of the image file to load
    size - edge length, in pixels, of the square image to produce
    """
    # PIL image, resized to the requested square on load
    pil_image = image.load_img(img_path, target_size=(size, size))
    # 3D array (size, size, 3) gains a leading batch axis -> (1, size, size, 3)
    return np.expand_dims(image.img_to_array(pil_image), axis=0)

def paths_to_tensor(img_paths, size=224):
    """
    Convert every image path in img_paths with path_to_tensor and stack the results
    along the first axis into a single tensor. A tqdm progress bar tracks the loop.
    img_paths - iterable of image file paths
    size - edge length, in pixels, passed through to path_to_tensor
    """
    per_image_tensors = []
    for single_path in tqdm(img_paths):
        per_image_tensors.append(path_to_tensor(single_path, size))
    return np.vstack(per_image_tensors)

### Obtain the training and validation dataframes

In [9]:
# Driver-wise split using the default train_part of 0.8; both parts come back shuffled.
driver_images_train, driver_images_valid = splitTrainingSetWithShuffle(driver_images)

### obtain file paths and target one-hot vectors for the training and validation sets

In [10]:
# Pull the path arrays and one-hot targets out of each split.
train_files, train_targets = getDatasets(driver_images_train)
valid_files, valid_targets = getDatasets(driver_images_valid)

# Report the size of every array; paths and targets should agree within a split.
for template, items in (
    ('There are %d training images.', train_files),
    ('There are %d validation images.', valid_files),
    ('There are %d train target.', train_targets),
    ('There are %d validation targets.', valid_targets),
):
    print(template % len(items))

There are 18047 training images.
There are 4377 validation images.
There are 18047 train target.
There are 4377 validation targets.


### create the tensors for the training and validation datasets and save to storage

In [11]:
# Load every image at 299x299, cast to float32 and divide the pixel values by 255.
# NOTE(review): both full tensors stay in memory at once; presumably sized for the
# available RAM — confirm before re-running on a smaller machine.
train_data = paths_to_tensor(train_files, 299).astype('float32')/255
valid_data = paths_to_tensor(valid_files, 299).astype('float32')/255

100%|██████████| 18047/18047 [01:47<00:00, 168.62it/s]
100%|██████████| 4377/4377 [00:26<00:00, 166.24it/s]


In [12]:
# Persist the 299x299 tensors and the one-hot targets for later use.
# Note: np.save appends '.npy' to names that do not already end in it, so these are
# written as e.g. 'data/train_data_299.txt.npy' (binary NumPy format, not text).
np.save('data/train_data_299.txt', train_data)
np.save('data/valid_data_299.txt', valid_data)
np.save('data/train_targets.txt', train_targets)
np.save('data/valid_targets.txt', valid_targets)

In [13]:
# Repeat the preprocessing at 224x224. train_data / valid_data are rebound here, so the
# 299x299 arrays are no longer reachable through these names after this cell.
train_data = paths_to_tensor(train_files, 224).astype('float32')/255
valid_data = paths_to_tensor(valid_files, 224).astype('float32')/255
# As with the other np.save calls, '.npy' is appended to these '.txt' names on disk.
np.save('data/train_data_224.txt', train_data)
np.save('data/valid_data_224.txt', valid_data)



100%|██████████| 18047/18047 [01:36<00:00, 186.99it/s]
100%|██████████| 4377/4377 [00:22<00:00, 190.37it/s]


### Prepare and save a csv file listing the test image file paths and image names, to drive further processing

In [4]:
import ntpath

# Collect the test images in sorted order so the generated csv is reproducible.
path = os.path.join(test_folder, '*.jpg')
files = np.sort(glob.glob(path))
# Image name = file name without its directory or extension. splitext strips only the
# final extension, so a name containing extra dots is kept intact.
imgNameArray = [ntpath.splitext(ntpath.basename(file_path))[0] for file_path in files]
d = {'file_names': files, 'image_names': imgNameArray}
df = pd.DataFrame(data=d)
# Write under data/: save_test_set_chunk reads the list from 'data/test_imgs_list.csv',
# so writing it to the working directory left the reader without its input.
df.to_csv('data/test_imgs_list.csv', index=False)

In [5]:
def save_test_set_chunk(chunk_size=20000, size=224,
                        csv_path='data/test_imgs_list.csv',
                        out_root='data/tensors/testing'):
    """
    Read the test image list csv in chunks, convert each chunk of images into a
    tensor scaled by 1/255, and save one file per chunk as
    <out_root>/test_data_<size>/chunk_<k>.txt, with k starting at 1.
    np.save appends '.npy' to that name on disk.

    chunk_size - number of images to load and save per file
    size - edge length, in pixels, of the square images
    csv_path - csv with a 'file_names' column holding the image paths
    out_root - directory under which the test_data_<size> folder is created
    """
    out_dir = os.path.join(out_root, 'test_data_' + str(size))
    # np.save does not create missing directories, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)

    chunk_reader = pd.read_csv(csv_path, chunksize=chunk_size)
    for chunk, gm_chunk in enumerate(chunk_reader, start=1):
        test_data = paths_to_tensor(gm_chunk['file_names'], size).astype('float32')/255
        np.save(os.path.join(out_dir, 'chunk_' + str(chunk) + '.txt'), test_data)

### Save the pre-processed test set images in batches of 10000 per file (size 299 x 299)

In [6]:
# Convert the images listed in data/test_imgs_list.csv to 299x299 tensors, 10000 images per saved file.
save_test_set_chunk (chunk_size=10000, size=299)

100%|██████████| 10000/10000 [00:55<00:00, 181.67it/s]
100%|██████████| 10000/10000 [01:01<00:00, 163.47it/s]
100%|██████████| 10000/10000 [01:00<00:00, 165.81it/s]
100%|██████████| 10000/10000 [00:59<00:00, 168.75it/s]
100%|██████████| 10000/10000 [00:58<00:00, 169.99it/s]
100%|██████████| 10000/10000 [01:02<00:00, 160.19it/s]
100%|██████████| 10000/10000 [01:01<00:00, 163.16it/s]
100%|██████████| 9726/9726 [01:00<00:00, 161.27it/s]


### Save the pre-processed test set images in batches of 10000 per file (size 224 x 224)

In [7]:
# Convert the images listed in data/test_imgs_list.csv to 224x224 tensors, 10000 images per saved file.
save_test_set_chunk (chunk_size=10000, size=224)

100%|██████████| 10000/10000 [02:42<00:00, 61.46it/s]
100%|██████████| 10000/10000 [02:30<00:00, 66.23it/s]
100%|██████████| 10000/10000 [02:39<00:00, 60.79it/s]
100%|██████████| 10000/10000 [02:37<00:00, 79.67it/s]
100%|██████████| 10000/10000 [02:21<00:00, 70.53it/s]
100%|██████████| 10000/10000 [02:16<00:00, 73.00it/s]
100%|██████████| 10000/10000 [02:39<00:00, 62.87it/s]
100%|██████████| 9726/9726 [02:27<00:00, 65.74it/s]
