### Description
This notebook generates a dataset for binary classification of whether an image is a traffic sign or not. Classes:
- 0: traffic sign
- 1: not traffic sign

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, random
import glob
import cv2
import pickle
from sklearn.model_selection import train_test_split
from PIL import Image

### Configuration

In [3]:
# --- Output pickles written by this notebook ---
destiny_data_dir = 'E:\\Data\\Traffic_Signs_Preprocessed_Bin\\data.pickle'
destiny_labels_dir = 'E:\\Data\\Traffic_Signs_Preprocessed_Bin\\labels.pickle'

# --- Inputs ---
# Pickled sign dataset; its 'x_train', 'x_test' and 'x_validation' entries are read below.
source_sign_data_dir = 'E:\\Data\\Traffic_Signs_Preprocessed_Class\\data.pickle'
# Folder with the images to crop from; image names should follow the pattern 'image (NUMBER).jpg'.
source_data_dir = 'E:\\Data\\Crop_Images'

# --- Sizes ---
# Number of source images in source_data_dir (numbered 1..source_data_count).
source_data_count = 100
# Number of crops to generate; also the number of sign samples kept and of labels created per class.
gen_data_count = 104010
# Target shape (rows, cols) of every generated crop.
img_rows, img_cols = 32, 32

### Preparing env

In [None]:
def _to_channels_last(batch):
    """Move axis 1 of a 4-D batch to the end and return the result as float64.

    Produces the same values as the chain this cell used before,
    ``np.flip(np.rot90(batch.transpose(0, 3, 2, 1), axes=(1, 2)), axis=1)``:
    the rot90 and the flip cancel each other except for swapping axes 1 and 2,
    so the whole chain reduces to a single ``transpose(0, 2, 3, 1)``.
    """
    return np.asarray(batch).transpose(0, 2, 3, 1).astype('float64')


# Read the images to crop from. This load used to be commented out, which left
# `images` undefined for the generate_data(images) call further down.
images = []
for index in range(1, source_data_count + 1):
    image_path = f"{source_data_dir}\\image ({index}).jpg"
    bgr_image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if bgr_image is None:
        # cv2.imread signals an unreadable/missing file by returning None, not by raising.
        raise FileNotFoundError(f"Could not read source image: {image_path}")
    # OpenCV decodes to BGR channel order; convert to RGB.
    images.append(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB))

# read data with 43 classes
signs_data = pd.read_pickle(source_sign_data_dir)
x_train = _to_channels_last(signs_data['x_train'])
x_test = _to_channels_last(signs_data['x_test'])
x_valid = _to_channels_last(signs_data['x_validation'])

In [None]:
def preprocess_img(img):
    """Return ``img`` as a float64 array with every value divided by 255."""
    as_float = np.array(img, dtype=np.float64)
    return as_float / 255.0

In [None]:
def generate_data(images, destiny_shape=(img_rows, img_cols), count=gen_data_count, seed=None):
    """Cut random square windows out of ``images`` and resize them to one shape.

    Every image contributes ``count // len(images)`` windows; the last image
    additionally supplies whatever is still missing so that exactly ``count``
    windows are returned. Each window is a square whose side is a random
    5%-60% of the image's shorter dimension, taken at a random position.

    Parameters
    ----------
    images : sequence of array-like images exposing ``.shape`` (rows, cols, ...)
        and supporting 2-D slicing.
    destiny_shape : (rows, cols) of every returned window.
    count : total number of windows to generate.
    seed : optional seed. ``None`` (default) draws from the global ``random``
        module exactly as before; any other value uses a private
        ``random.Random(seed)`` so the crops are reproducible.

    Returns
    -------
    float64 array of the ``count`` resized windows, with values divided by 255.

    Raises
    ------
    ValueError
        If ``images`` is empty.
    """
    images_count = len(images)
    if images_count == 0:
        raise ValueError("generate_data needs at least one source image")

    rng = random if seed is None else random.Random(seed)
    target_rows, target_cols = destiny_shape
    per_image_windows = count // images_count
    result = []

    for index, image in enumerate(images):
        print(f"Progress: {index+1}/{images_count}")
        # Only the shape is needed here; normalisation happens once at the end,
        # so no per-image float copy is made.
        height, width = image.shape[0], image.shape[1]
        min_dim = min(height, width)

        iterations = per_image_windows
        if index == images_count - 1:
            # The last image absorbs the remainder of the integer division.
            iterations = count - len(result)

        for _ in range(iterations):
            scale = rng.randint(5, 60) / 100.0
            # Never let the window collapse to 0 px on small images.
            side = max(1, int(min_dim * scale))

            x = rng.randint(0, width - side)
            y = rng.randint(0, height - side)

            cropped = image[y:y + side, x:x + side]
            # cv2.resize expects dsize as (width, height), i.e. (cols, rows).
            result.append(cv2.resize(cropped, (target_cols, target_rows)))

    return np.array(result, dtype='float64') / 255.0

### Generating

In [None]:
# Cut random crops out of the source images; further down these are paired with
# the label 1 ("not traffic sign").
generated_data = generate_data(images)

In [None]:
# Cache the generated crops on disk; a later cell reloads them from this file.
with open('E:\\Data\\Traffic_Signs_Preprocessed_Bin\\generated.pickle', mode='wb') as generated_file:
    pickle.dump(generated_data, generated_file)

In [None]:
# Reload the sign dataset from its source pickle.
# This cell used to bind the very same pickle to `generated_data` as well,
# which replaced the freshly generated crops with the sign dataset; that
# assignment is removed so `generated_data` keeps holding the crops.
signs_data = pd.read_pickle(source_sign_data_dir)

In [None]:
# Stack the three sign arrays into one and keep only the first gen_data_count samples.
# NOTE(review): run this before the train/test split cells further down, which
# rebind x_train / x_test / x_valid to the binary dataset.
signs_data = np.concatenate([x_train, x_test, x_valid], axis=0)[:gen_data_count]

In [None]:
# Cache the truncated sign samples on disk; a later cell reloads them from this file.
with open('E:\\Data\\Traffic_Signs_Preprocessed_Bin\\signs.pickle', mode='wb') as signs_file:
    pickle.dump(signs_data, signs_file)

In [None]:
# Reload both cached arrays (generated crops first, then sign samples).
generated_pickle_path = 'E:\\Data\\Traffic_Signs_Preprocessed_Bin\\generated.pickle'
signs_pickle_path = 'E:\\Data\\Traffic_Signs_Preprocessed_Bin\\signs.pickle'
generated_data = pd.read_pickle(generated_pickle_path)
signs_data = pd.read_pickle(signs_pickle_path)

In [4]:
# Label vectors: 0.0 for every sign sample, 1.0 for every generated (non-sign) crop.
# NOTE(review): both are sized by gen_data_count, which assumes each of the two
# sample arrays really holds that many rows - confirm.
y_sign_data, y_not_sign_data = np.zeros(gen_data_count), np.ones(gen_data_count)

In [19]:
# Position in this list matches the numeric class value used for the labels.
labels = [
    "traffic sign",      # class 0
    "not traffic sign",  # class 1
]

In [None]:
# Persist the class-name list next to the dataset.
with open(destiny_labels_dir, mode='wb') as labels_file:
    pickle.dump(labels, labels_file)

In [None]:
# Sign samples first, generated crops second; labels are stacked in the same
# order so row i of x_data matches entry i of y_data.
x_data = np.concatenate([signs_data, generated_data], axis=0)
y_data = np.concatenate([y_sign_data, y_not_sign_data], axis=0)

In [None]:
# Cache the combined samples on disk.
# Going by the shapes printed further down (208020 rows of 32x32x3 float64),
# x_data is roughly 5 GiB. Pickle protocols below 4 cannot serialise objects
# larger than 4 GiB, so the protocol is pinned to 4 instead of relying on the
# interpreter default (which is only 4 from Python 3.8 onwards).
with open('E:\\Data\\Traffic_Signs_Preprocessed_Bin\\x_data.pickle', 'wb') as handle:
    pickle.dump(x_data, handle, protocol=4)

In [None]:
# Cache the combined label vector on disk.
with open('E:\\Data\\Traffic_Signs_Preprocessed_Bin\\y_data.pickle', mode='wb') as y_file:
    pickle.dump(y_data, y_file)

In [2]:
# Reload the cached samples and labels (samples first, then labels).
x_pickle_path = 'E:\\Data\\Traffic_Signs_Preprocessed_Bin\\x_data.pickle'
y_pickle_path = 'E:\\Data\\Traffic_Signs_Preprocessed_Bin\\y_data.pickle'
x_data = pd.read_pickle(x_pickle_path)
y_data = pd.read_pickle(y_pickle_path)

In [3]:
# Hold out 10% of the samples as the test set.
# random_state is fixed so re-running the notebook reproduces the same split.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.1, random_state=42)

In [16]:
# Carve 5% of the remaining training samples off as the validation set.
# random_state is fixed so re-running the notebook reproduces the same split.
# NOTE: this rebinds x_train / y_train, so run the cell once per test split.
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.05, random_state=42)

In [17]:
# Print the shape of every split, samples then labels: train, test, validation.
for split_array in (x_train, y_train, x_test, y_test, x_valid, y_valid):
    print(split_array.shape)

(177857, 32, 32, 3)
(177857,)
(20802, 32, 32, 3)
(20802,)
(9361, 32, 32, 3)
(9361,)


In [20]:
# Bundle all splits plus the class names into the single dict that gets pickled below.
data = dict(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    x_valid=x_valid,
    y_valid=y_valid,
    labels=labels,
)

In [25]:
# Write the final dataset dict.
# Going by the printed shapes, x_train alone (177857 rows of 32x32x3 float64)
# is over 4 GiB. Pickle protocols below 4 cannot serialise objects that large,
# so the protocol is pinned to 4 instead of relying on the interpreter default
# (which is only 4 from Python 3.8 onwards).
with open(destiny_data_dir, 'wb') as handle:
    pickle.dump(data, handle, protocol=4)