# Clear dataset folder

In [1]:
import os
import shutil

DATASET_PATH = './datasets'

# Remove every sub-folder (i.e. every previously generated dataset) under DATASET_PATH.
# Plain files that sit directly inside DATASET_PATH are left untouched.
# The isdir guard turns this cell into a no-op when DATASET_PATH does not exist yet
# (e.g. on a fresh checkout) instead of failing with FileNotFoundError in os.listdir.
if os.path.isdir(DATASET_PATH):
    for folder in os.listdir(DATASET_PATH):
        folder_path = os.path.join(DATASET_PATH, folder)
        if os.path.isdir(folder_path):
            shutil.rmtree(folder_path)

# BDD100K dataset
- Download dataset from [BDD100K](https://bdd-data.berkeley.edu/portal.html#download)
- Move downloaded zip file to the same folder as this notebook
- Set the constant parameters in the next cell
- Run this part of the code (the cells below)

In [1]:
# --- zip archives downloaded from the BDD100K portal (expected next to this notebook) ---
LABEL_ZIP_PATH = './bdd100k_labels_release.zip'
DATASET_ZIP_PATH = './bdd100k_images_10k.zip'

# --- folder both archives are extracted into ---
UNZIP_PATH = './raw_unzip'

# --- locations inside the extracted tree, used when copying images to train and test folders ---
_bdd_root = os.path.join(UNZIP_PATH, 'bdd100k')
_bdd_labels_dir = os.path.join(_bdd_root, 'labels')
TRAIN_LABEL_PATH = os.path.join(_bdd_labels_dir, 'bdd100k_labels_images_train.json')
VAL_LABEL_PATH = os.path.join(_bdd_labels_dir, 'bdd100k_labels_images_val.json')
SOURCE_PATH = os.path.join(_bdd_root, 'images', '10K')

# --- output dataset name and the chance for an image to land in the train split ---
DATASET_NAME = 'BDD100K'
TRAIN_TEST_SPLIT_RATIO = 0.8

## Extract zip file

In [2]:
import zipfile

In [3]:
# Unpack the label archive first, then the image archive, both into UNZIP_PATH.
for archive_path in (LABEL_ZIP_PATH, DATASET_ZIP_PATH):
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(UNZIP_PATH)

## Copy images to dataset folder
- folders ending in `A` (`trainA`, `testA`) hold night images
- folders ending in `B` (`trainB`, `testB`) hold day images

In [4]:
# load label json file
import json
import os
from tqdm import tqdm
import shutil
import random

In [5]:
# Delete any previous build of this dataset so the copy step starts from an empty tree.
previous_build_dir = os.path.join('datasets', DATASET_NAME)
if os.path.exists(previous_build_dir):
    shutil.rmtree(previous_build_dir)

In [6]:
# Build datasets/<DATASET_NAME>/{trainA,trainB,testA,testB}, creating only what is missing.
dataset_root = os.path.join('datasets', DATASET_NAME)
required_dirs = ['datasets', dataset_root]
required_dirs += [
    os.path.join(dataset_root, split_name)
    for split_name in ('trainA', 'trainB', 'testA', 'testB')
]
for path in required_dirs:
    if os.path.exists(path):
        continue
    os.makedirs(path)

In [7]:
# Load the BDD100K train and val label files.
# `with` closes each file handle as soon as it has been parsed; the bare open()
# calls left both handles open until garbage collection.
# The files are read as UTF-8 explicitly so parsing does not depend on the
# platform's default text encoding.
with open(TRAIN_LABEL_PATH, encoding='utf-8') as label_file:
    train_label = json.load(label_file)
with open(VAL_LABEL_PATH, encoding='utf-8') as label_file:
    val_label = json.load(label_file)

In [8]:
# Lookup table: image file name -> its `attributes` entry from the label files.
# Train entries are inserted first, so a name that appears in both files ends up
# with the attributes from the val file.
image_label = {}
for label_set in (train_label, val_label):
    image_label.update((entry['name'], entry['attributes']) for entry in label_set)

# The raw label lists are not needed once the lookup table exists.
del train_label, val_label

In [9]:
# Copy every labelled day/night image into the A/B dataset layout:
#   timeofday == 'night'   -> trainA / testA
#   timeofday == 'daytime' -> trainB / testB
# Images without a label entry, or with any other `timeofday` value, are skipped.
# Each copied image goes to the train split with probability TRAIN_TEST_SPLIT_RATIO.
SPLIT_SEED = 42  # fixed seed so re-running the notebook reproduces the same split
DOMAIN_BY_TIMEOFDAY = {'night': 'A', 'daytime': 'B'}

random.seed(SPLIT_SEED)
# Listings are sorted because os.listdir returns entries in arbitrary order,
# which would defeat the fixed seed.
for subdir in sorted(os.listdir(SOURCE_PATH)):
    curr_path = os.path.join(SOURCE_PATH, subdir)
    if not os.path.isdir(curr_path):
        # a stray file next to the image folders would make os.listdir(curr_path) fail
        continue

    looper = tqdm(sorted(os.listdir(curr_path)), desc=subdir, unit='image')
    for image in looper:
        # set_description refreshes the bar by default, so no separate refresh() is needed
        looper.set_description(f'{subdir} - {image}')

        if image not in image_label:
            continue

        domain = DOMAIN_BY_TIMEOFDAY.get(image_label[image]['timeofday'])
        if domain is None:
            continue

        # the random draw happens only for images that are actually copied
        split = 'train' if random.random() < TRAIN_TEST_SPLIT_RATIO else 'test'
        shutil.copy(
            os.path.join(curr_path, image),
            os.path.join('datasets', DATASET_NAME, split + domain, image),
        )

test - fffc7bba-f05de9f5.jpg: 100%|██████████| 2000/2000 [00:04<00:00, 471.59image/s]
train - ff3da814-c3463a43.jpg: 100%|██████████| 7000/7000 [00:41<00:00, 167.50image/s]
val - ff7b98c7-3cb964ac.jpg: 100%|██████████| 1000/1000 [00:02<00:00, 480.08image/s]


## Clear unused folder

In [10]:
# Remove the raw unzip folder once the images have been copied into the dataset.
# The existence check keeps the cell re-runnable: a second run would otherwise
# fail with FileNotFoundError because the folder is already gone.
if os.path.isdir(UNZIP_PATH):
    shutil.rmtree(UNZIP_PATH)

# Aj.'s dataset (AJDATASET01)

- Built from images of the video (VDO) `2021_0607_184742_013.MOV`

## Set up constant parameters

In [25]:
# Inputs: the image pool folder and the pickled per-image info dictionary.
_extracted_root = 'E:/indiv_vdo/extracted'
IMAGE_POOL_PATH = f'{_extracted_root}/image_pool/'
IMAGE_INFO_DICT_PATH = f'{_extracted_root}/dict/info_dict.pkl'

# Output: the dataset is written to <TARGET_DATASET_PATH>/<DATASET_NAME>.
# NOTE(review): the AJDATASET02 section and the final summary use './datasets' -
# confirm that this absolute target folder is intended.
TARGET_DATASET_PATH, DATASET_NAME = 'E:/indiv_vdo/datasets', 'AJDATASET01'

# Chance for each image to be placed in the train split rather than the test split.
TRAIN_TEST_SPLIT_RATIO = 0.9


## Move image to dataset folder

In [26]:
import pickle
import os
import tqdm
import random

In [27]:
# Load the per-image info dictionary (looked up by image file name in the copy step).
# `with` closes the file handle right after loading; the bare open() left it open.
# NOTE: pickle.load can execute arbitrary code - only load a pickle file you trust.
with open(IMAGE_INFO_DICT_PATH, 'rb') as info_file:
    image_info_dict = pickle.load(info_file)

In [29]:
# Start from a clean <TARGET_DATASET_PATH>/<DATASET_NAME> tree: drop the old one, then rebuild it.
dataset_root = os.path.join(TARGET_DATASET_PATH, DATASET_NAME)
if os.path.exists(dataset_root):
    shutil.rmtree(dataset_root)

required_dirs = [TARGET_DATASET_PATH, dataset_root]
for split_name in ('trainA', 'trainB', 'testA', 'testB'):
    required_dirs.append(os.path.join(dataset_root, split_name))

# Parents come first in the list; anything that already exists is left alone.
for path in required_dirs:
    if not os.path.exists(path):
        os.makedirs(path)


In [30]:
# Copy the images of a single video from the image pool into the A/B dataset layout:
#   day_night == 'night' -> trainA / testA
#   day_night == 'day'   -> trainB / testB
# Pool files without an info entry, from another video, or with any other
# `day_night` value are skipped. Each copied image goes to the train split with
# probability TRAIN_TEST_SPLIT_RATIO.
SOURCE_VDO_NAME = '2021_0607_184742_013.MOV'
SPLIT_SEED = 42  # fixed seed so re-running the notebook reproduces the same split
DOMAIN_BY_DAY_NIGHT = {'night': 'A', 'day': 'B'}

random.seed(SPLIT_SEED)
# Sorted listing: os.listdir returns entries in arbitrary order, which would defeat the fixed seed.
looper = tqdm.tqdm(sorted(os.listdir(IMAGE_POOL_PATH)), desc='copy image', unit='image')
for image_name in looper:
    if image_name not in image_info_dict:
        continue

    image_info = image_info_dict[image_name]
    if image_info['vdo_name'] != SOURCE_VDO_NAME:
        continue

    domain = DOMAIN_BY_DAY_NIGHT.get(image_info['day_night'])
    if domain is None:
        continue

    # the random draw happens only for images that are actually copied
    split = 'train' if random.random() < TRAIN_TEST_SPLIT_RATIO else 'test'
    shutil.copy(
        os.path.join(IMAGE_POOL_PATH, image_name),
        os.path.join(TARGET_DATASET_PATH, DATASET_NAME, split + domain, image_name),
    )


copy image: 100%|██████████| 264/264 [00:00<00:00, 608.30image/s]


# Aj.'s dataset (AJDATASET02)

- Built from images of the videos (VDOs) `2021_0607_184742_013.MOV` and `Top.MOV`
- Up to 150 randomly selected images each of day and night are used for training
- Up to 50 randomly selected images each of day and night are used for testing

*Observation: the images from these 2 VDOs have a very similar structure, so I decided to merge them together.*

## Set up constant parameters

In [32]:
# Where the source material lives: the image pool and the pickled info dictionary.
_EXTRACTED_DIR = 'E:/indiv_vdo/extracted'
IMAGE_POOL_PATH = _EXTRACTED_DIR + '/image_pool/'
IMAGE_INFO_DICT_PATH = _EXTRACTED_DIR + '/dict/info_dict.pkl'

# Where the dataset is written: <TARGET_DATASET_PATH>/<DATASET_NAME>.
DATASET_NAME = 'AJDATASET02'
TARGET_DATASET_PATH = './datasets'

# Chance for an image to be offered to the train split before the test split.
TRAIN_TEST_SPLIT_RATIO = 0.9


## Move image to dataset folder

In [33]:
import pickle
import os
import tqdm
import random
import shutil

In [34]:
# Load the per-image info dictionary (looked up by image file name in the copy step).
# `with` closes the file handle right after loading; the bare open() left it open.
# NOTE: pickle.load can execute arbitrary code - only load a pickle file you trust.
with open(IMAGE_INFO_DICT_PATH, 'rb') as info_file:
    image_info_dict = pickle.load(info_file)

In [35]:
# Wipe any earlier build of this dataset, then lay out the empty folder tree again.
dataset_root = os.path.join(TARGET_DATASET_PATH, DATASET_NAME)
if os.path.exists(dataset_root):
    shutil.rmtree(dataset_root)

split_dirs = [
    os.path.join(dataset_root, prefix + domain)
    for prefix in ('train', 'test')
    for domain in ('A', 'B')
]
# Order is trainA, trainB, testA, testB - parents are listed before their children.
for path in (TARGET_DATASET_PATH, dataset_root, *split_dirs):
    if os.path.exists(path):
        continue
    os.makedirs(path)


In [36]:
# Copy a capped random selection of pool images into the A/B dataset layout:
#   day_night == 'night' -> trainA / testA
#   day_night == 'day'   -> trainB / testB
# For each eligible image a random draw below TRAIN_TEST_SPLIT_RATIO sends it to the
# train folder while that folder holds fewer than TRAIN_QUOTA images; otherwise it
# goes to the test folder while that holds fewer than TEST_QUOTA images; otherwise
# it is skipped.
# NOTE(review): no `vdo_name` filter is applied here, so every pool image with an
# info entry is eligible - confirm the pool only holds the intended videos.
SPLIT_SEED = 42    # fixed seed so re-running the notebook reproduces the same selection
TRAIN_QUOTA = 150  # max images per train folder
TEST_QUOTA = 50    # max images per test folder
DOMAIN_BY_DAY_NIGHT = {'night': 'A', 'day': 'B'}

copied_count = {'trainA': 0, 'trainB': 0, 'testA': 0, 'testB': 0}

random.seed(SPLIT_SEED)
# Sort before shuffling: os.listdir returns entries in arbitrary order, which
# would defeat the fixed seed.
looper = sorted(os.listdir(IMAGE_POOL_PATH))
random.shuffle(looper)
looper = tqdm.tqdm(looper, desc='copy image', unit='image')
for image_name in looper:
    if image_name not in image_info_dict:
        continue

    domain = DOMAIN_BY_DAY_NIGHT.get(image_info_dict[image_name]['day_night'])
    if domain is None:
        continue

    train_dir, test_dir = 'train' + domain, 'test' + domain
    # the random draw is made for every day/night image, even when the train quota is full
    if random.random() < TRAIN_TEST_SPLIT_RATIO and copied_count[train_dir] < TRAIN_QUOTA:
        target_dir = train_dir
    elif copied_count[test_dir] < TEST_QUOTA:
        target_dir = test_dir
    else:
        continue

    shutil.copy(
        os.path.join(IMAGE_POOL_PATH, image_name),
        os.path.join(TARGET_DATASET_PATH, DATASET_NAME, target_dir, image_name),
    )
    copied_count[target_dir] += 1

# Keep the original per-folder counter names available for later cells.
night_train_count = copied_count['trainA']
day_train_count = copied_count['trainB']
night_test_count = copied_count['testA']
day_test_count = copied_count['testB']


copy image: 100%|██████████| 673/673 [00:00<00:00, 807.62image/s]


# Display Dataset Information

In [38]:
# Folder whose entries (one per dataset) are listed and summarised by the following cell.
TARGET_DATASET_PATH = './datasets/'

In [39]:
# Print, for every dataset folder under TARGET_DATASET_PATH, the image count of each
# of its sub-folders plus the resulting train/test totals and shares.
# Only directories are considered (a stray file would make the nested os.listdir
# fail), and listings are sorted because os.listdir returns entries in arbitrary order.
dataset_name_list = sorted(
    name for name in os.listdir(TARGET_DATASET_PATH)
    if os.path.isdir(os.path.join(TARGET_DATASET_PATH, name))
)
print(f'There are {len(dataset_name_list)} dataset in {TARGET_DATASET_PATH}')
for dataset_idx, dataset_name in enumerate(dataset_name_list):
    dataset_path = os.path.join(TARGET_DATASET_PATH, dataset_name)
    folder_list = sorted(
        name for name in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, name))
    )
    print(f'{dataset_idx+1}: {dataset_name} ({len(folder_list)} folders)')
    train_image_amt = 0
    test_image_amt = 0

    for folder_idx, folder_name in enumerate(folder_list):
        image_list = os.listdir(os.path.join(dataset_path, folder_name))
        folder_letter = chr(ord('a') + folder_idx)  # a, b, c, ... as list bullets
        print(f'\t{folder_letter}: {folder_name} - {len(image_list)} images')

        # folders are attributed to a split by their name prefix; others are only listed
        if folder_name.startswith('train'):
            train_image_amt += len(image_list)
        elif folder_name.startswith('test'):
            test_image_amt += len(image_list)

    total_image_amt = train_image_amt + test_image_amt
    # An empty dataset would otherwise raise ZeroDivisionError; report 0.00% instead.
    train_share = train_image_amt / total_image_amt if total_image_amt else 0.0
    test_share = test_image_amt / total_image_amt if total_image_amt else 0.0
    print(f'\t*: Train: {train_image_amt} images({train_share:.2%})')
    print(f'\t*: Test: {test_image_amt} images({test_share:.2%})')

There are 2 dataset in ./datasets/
1: AJDATASET01 (4 folders)
	a: testA - 21 images
	b: testB - 7 images
	c: trainA - 169 images
	d: trainB - 67 images
	*: Train: 236 images(89.39%)
	*: Test: 28 images(10.61%)
2: AJDATASET02 (4 folders)
	a: testA - 40 images
	b: testB - 50 images
	c: trainA - 150 images
	d: trainB - 150 images
	*: Train: 300 images(76.92%)
	*: Test: 90 images(23.08%)
