# split_uecfood256

- Split the UECFOOD256 dataset into training, validation, and test sets with a ratio of 0.7 / 0.2 / 0.1.
- Save the image path, category_id, x1, y1, x2, y2 of every bounding box into the text files train_uec256.txt, val_uec256.txt, and test_uec256.txt.

In [4]:
import random
import itertools
import numpy as np

In [7]:
def _read_rows(path, separator):
    """Yield the fields of each data row in *path*, skipping the header line.

    The first line is always treated as a header. Blank lines are ignored so
    that a trailing empty line does not produce a bogus record.
    """
    with open(path, 'r') as table:
        for line_no, line in enumerate(table):
            if line_no == 0:
                continue
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            yield line.split(separator)


def _write_split_file(out_path, images, dataset_root, occurrences, imgs_format='jpg'):
    """Write one annotation row per bounding box of every image in *images*.

    Each row is ``<image path> <category_id> <x1> <y1> <x2> <y2>``; the file
    starts with a header line naming those columns.
    """
    with open(out_path, 'w') as out:
        out.write('img category_id x1 y1 x2 y2\n')  # header
        for img in images:
            # One image may be annotated in several categories (and several
            # times per category); every occurrence gets its own row.
            for category_id, bbox in occurrences[img]:
                img_path = dataset_root + '/' + str(category_id) + '/' + img + '.' + imgs_format
                # Index explicitly so a row with fewer than four coordinates
                # still fails loudly instead of writing a short line.
                img_bbox = ' '.join(str(bbox[k]) for k in range(4))
                out.write(img_path + ' ' + str(category_id) + ' ' + img_bbox + '\n')


def split_dataset(dataset_disk='/Volumes/JS/', split=(0.7, 0.2, 0.1), seed=None):
    """Split UECFOOD256 into train/val/test annotation files.

    Reads ``category.txt`` and each category's ``bb_info.txt`` under
    ``dataset_disk + 'UECFOOD256/UECFOOD256'``, shuffles the images of every
    category and divides them by ``split``. It then writes, into that same
    directory:

    * ``classes.txt`` - one category name per line, in ``category.txt`` order;
    * ``train_uec256.txt``, ``val_uec256.txt``, ``test_uec256.txt`` - one row
      per bounding box: ``img category_id x1 y1 x2 y2``.

    An image name that lands in more than one split (because it is listed in
    several categories) is kept in exactly one of them, with priority
    val > test > train. Images inside each file are sorted by name.

    Args:
        dataset_disk: Prefix of the dataset location; must end with a path
            separator because it is concatenated directly.
        split: ``(train, val, test)`` fractions. Only the first two are used
            to size the splits; the test split receives the remainder.
        seed: Optional seed for a private random generator, giving a
            reproducible split. When ``None`` the module-level ``random``
            state is used.
    """
    uecfood256_path = dataset_disk + 'UECFOOD256/' + 'UECFOOD256'
    category = 'category.txt'
    bbox_info = 'bb_info.txt'
    files_generated = ['train_uec256.txt', 'val_uec256.txt', 'test_uec256.txt']
    rng = random if seed is None else random.Random(seed)

    # First column (id) and second column (name) of category.txt.
    category_ids = []
    category_names = []
    for fields in _read_rows(uecfood256_path + '/' + category, '\t'):
        category_ids.append(int(fields[0]))
        category_names.append(fields[1])

    # image name -> [(category_id, [x1, y1, x2, y2]), ...] in category order,
    # so each image's boxes can be looked up without rescanning every category.
    occurrences = {}
    train_imgs = set()
    val_imgs = set()
    test_imgs = set()
    for category_id in category_ids:
        images = []
        bbox_path = uecfood256_path + '/' + str(category_id) + '/' + bbox_info
        for fields in _read_rows(bbox_path, ' '):
            images.append(fields[0])
            occurrences.setdefault(fields[0], []).append(
                (category_id, list(map(float, fields[1:]))))

        # Divide this category by the requested ratios; test gets the rest.
        n_imgs = len(images)
        n_train = int(np.floor(n_imgs * split[0]))
        n_val = int(np.floor(n_imgs * split[1]))

        shuffled_imgs = rng.sample(images, n_imgs)
        train_imgs.update(shuffled_imgs[:n_train])
        val_imgs.update(shuffled_imgs[n_train:n_train + n_val])
        test_imgs.update(shuffled_imgs[n_train + n_val:])

    # Resolve overlaps between splits: val wins over test, both win over train.
    test_imgs -= val_imgs
    train_imgs -= val_imgs
    train_imgs -= test_imgs

    with open(uecfood256_path + '/classes.txt', 'w') as classes_file:
        for name in category_names:
            classes_file.write(name + '\n')

    # Each file gets its OWN image list (previously all three were written
    # from the training list).
    for file_name, imgs in zip(files_generated, (train_imgs, val_imgs, test_imgs)):
        _write_split_file(uecfood256_path + '/' + file_name, sorted(imgs),
                          uecfood256_path, occurrences)

    print('Done!')

In [8]:
# Run the split: writes classes.txt and the three *_uec256.txt files into the dataset directory.
split_dataset()

Done!


**The generated txt files should look like this:**   
img category_id x1 y1 x2 y2   
/Volumes/JS/UECFOOD256/UECFOOD256/1/1.jpg 1 0.0 143.0 370.0 486.0
/Volumes/JS/UECFOOD256/UECFOOD256/42/1.jpg 42 363.0 91.0 800.0 560.0
/Volumes/JS/UECFOOD256/UECFOOD256/2/100.jpg 2 23.0 0.0 293.0 227.0
/Volumes/JS/UECFOOD256/UECFOOD256/11/1000.jpg 11 11.0 6.0 236.0 203.0
/Volumes/JS/UECFOOD256/UECFOOD256/137/100031.jpg 137 16.0 30.0 456.0 325.0
/Volumes/JS/UECFOOD256/UECFOOD256/137/100060.jpg 137 45.0 28.0 488.0 399.0
/Volumes/JS/UECFOOD256/UECFOOD256/137/100085.jpg 137 56.0 61.0 496.0 383.0
/Volumes/JS/UECFOOD256/UECFOOD256/11/1001.jpg 11 19.0 13.0 231.0 192.0
/Volumes/JS/UECFOOD256/UECFOOD256/153/100125.jpg 153 21.0 18.0 590.0 375.0
/Volumes/JS/UECFOOD256/UECFOOD256/155/100163.jpg 155 20.0 37.0 484.0 481.0
/Volumes/JS/UECFOOD256/UECFOOD256/153/100169.jpg 153 16.0 46.0 502.0 467.0
/Volumes/JS/UECFOOD256/UECFOOD256/123/100208.jpg 123 3.0 8.0 302.0 319.0
/Volumes/JS/UECFOOD256/UECFOOD256/155/100225.jpg 155 45.0 39.0 475.0 435.0
/Volumes/JS/UECFOOD256/UECFOOD256/155/100238.jpg 155 35.0 93.0 456.0 475.0