In [1]:
### OBJECTIVES ###

# Everything below is selected at random
# Select 10 classes
# Generate training samples and test samples
# Generate 1 percent of the data for 10 classes
# Generate 10 percent of the data for 5 classes
# Generate 100 percent of the data for 10 classes
# Generate 10 percent of the data for all classes


import os
from glob import glob
import numpy as np
from distutils.dir_util import copy_tree
#from sklearn.model_selection import train_test_split
import shutil
import random
#import pandas as pd
from collections import ChainMap
from pathlib import Path
import json
from tqdm import tqdm

### Generate classlists

In [2]:
# Root folder of the source images, relative to the notebook's working directory.
# The class-list cell below globs this folder's immediate children
# (presumably one sub-folder per class — confirm against the dataset layout).
dataset_dir = '../../Datasets/food-101/images/'

In [3]:
# Get the classlist: one class per immediate sub-directory of dataset_dir.
# basename/normpath (rather than splitting on '\\') strips the parent path on
# every OS; isdir keeps stray files from being treated as classes; sorted makes
# the order reproducible across runs and machines.
class_list_all = sorted(
    os.path.basename(os.path.normpath(path_))
    for path_ in glob(os.path.join(dataset_dir, '*'))
    if os.path.isdir(path_)
)
class_list_all;

In [4]:
# generate classes     
def generate_classlist_all(dataset_dir):
    '''
    Lists the class names found in a dataset directory.

    Every immediate sub-directory of ``dataset_dir`` is treated as one class and
    its folder name is used as the class name. Plain files are ignored.

    Parameters
    ------
    dataset_dir - dataset directory whose sub-directories are the classes

    Returns
    ------
    Alphabetically sorted list of class (folder) names.
    '''
    class_dirs = [
        path_ for path_ in glob(os.path.join(dataset_dir, '*'))
        if os.path.isdir(path_)
    ]
    # basename/normpath instead of split('\\'): the backslash split only strips
    # the parent path on Windows and returns full paths everywhere else.
    return sorted(os.path.basename(os.path.normpath(path_)) for path_ in class_dirs)

In [5]:
# generate class
def generate_classlist(class_list, num_class):
    '''
    Randomly picks ``num_class`` distinct classes from ``class_list``.

    Parameters
    ------
    class_list - a list of classes
    num_class - amount of classes you want

    Returns
    ------
    List of ``num_class`` unique entries of ``class_list`` in random order.

    Raises
    ------
    ValueError - if ``num_class`` is negative or larger than the number of
                 distinct classes available.
    '''
    # Drop repeated names (keeping first-seen order) so the result is unique.
    candidates = list(dict.fromkeys(class_list))
    if num_class < 0 or num_class > len(candidates):
        raise ValueError(
            f"num_class must be between 0 and {len(candidates)}, got {num_class}"
        )
    if num_class == 0:
        return []
    # Sample positions without replacement in a single draw. The previous loop
    # drew twice per iteration (one value was checked, a different one was
    # appended), which let duplicates through and could spin forever.
    picked_idx = np.random.choice(len(candidates), size=num_class, replace=False)
    return [candidates[i] for i in picked_idx]


In [6]:
def loop_through_files(dataset_dir):
    '''
    Walks a directory tree, prints a per-folder summary and counts the files.

    Parameters
    ------
    dataset_dir(str) - dataset directory

    Returns
    ------
    Total number of files found under ``dataset_dir`` at every depth.
    '''
    total_files = 0
    # Walk the directory that was passed in; the previous body walked an
    # unrelated name (`data_dir`) that is not a parameter of this function.
    for dirpath, dirnames, filenames in os.walk(dataset_dir):
        print(f"There are {len(dirnames)} directories and {len(filenames)} images in '{dirpath}'.")
        total_files += len(filenames)
    return total_files

### Generating a dictionary

In [7]:
# Append each class file in a dictionary list
# get the length of the string
# random choice of 10%

In [8]:
def generate_dict_data(classlist, dataset_dir):
    '''
    Generates a dictionary mapping each class to the file names in its folder.

    Parameters
    ------
    classlist - list of classes (each must be a sub-folder of dataset_dir)
    dataset_dir - dataset directory

    Returns
    ------
    dict of {class name: sorted list of file names inside that class folder}
    '''
    dict_ = {}
    # The progress bar wraps the directory listing, which is the actual work.
    for class_name in tqdm(classlist):
        # os.path.join works whether or not dataset_dir ends with a separator.
        class_dir = os.path.join(dataset_dir, class_name)
        # os.listdir returns entries in arbitrary order; sorting keeps any
        # later positional slicing of these lists reproducible between runs.
        dict_[class_name] = sorted(os.listdir(class_dir))
    return dict_

### Generating Train Test Split

In [9]:
def generate_train_test_data(dict_files, percentage_train):
    '''
    Generates a per-class train-test split with the percentage given.

    The split is positional: for every class the first ``percentage_train``
    percent of its file list goes to train and the remainder goes to test.
    The lists are not shuffled here.

    Parameters
    ------
    dict_files - dictionary of classes along with its list of data
    percentage_train - percent (0-100) of each class's data to put in train

    Returns
    ------
    (train, test) - two dictionaries keyed by class name, in the same class
                    order as ``dict_files``

    Raises
    ------
    ValueError - if ``percentage_train`` is outside 0-100

    Usage
    ----
    train, test = generate_train_test_data(dict_files, percentage_train)
    '''
    if not 0 <= percentage_train <= 100:
        # A negative value would otherwise be used as a negative slice index
        # and silently produce a wrong split.
        raise ValueError(
            f"percentage_train must be between 0 and 100, got {percentage_train}"
        )

    train_data_all = {}
    test_data_all = {}

    for name_, data_ in tqdm(dict_files.items()):
        # Multiply before dividing: 100 * 29 / 100 is exactly 29.0, whereas
        # 100 * (29 / 100) is 28.999... and int() would drop an item.
        cut = int(len(data_) * percentage_train / 100)
        train_data_all[name_] = data_[:cut]
        test_data_all[name_] = data_[cut:]

    return train_data_all, test_data_all

### Take a sample from the train/test split and copy it into our own dataset folders

In [10]:
# Example source/destination locations for the copy helpers defined below.
# NOTE(review): none of the calls visible in this notebook reference these three
# names (they pass literal paths instead) — confirm whether they are still needed.
source_folder = '../../Datasets/food-101/images/'
destination_folder = '../../testfolder/test/'
test_destination_folder = 'Food-Vision-REDO/datasets/food_101_10_percent/test/'

In [11]:
def build_train_test_datafiles(classlist, destination_folder, folder_name):
    '''
    Creates one (possibly empty) folder per class under
    ``destination_folder/folder_name``.

    Folders that already exist are left untouched, so the call can be repeated.

    Parameters
    -----
    classlist - list of classes
    destination_folder - root directory under which the split folder is created
    folder_name - name of the split folder (e.g. 'train', 'test')
    '''
    # Path joining works whether or not destination_folder ends with '/'.
    split_root = Path(destination_folder) / folder_name
    for class_name in classlist:
        (split_root / class_name).mkdir(parents=True, exist_ok=True)
           

In [12]:
def generate_data_locally(classlist, data, source_folder, destination_folder):
    '''
    Copies every listed file of every class into the destination folder.

    For each class in ``classlist`` the files named in ``data[class]`` are
    copied from ``source_folder/class`` to ``destination_folder/class``.
    Names that are not regular files in the source folder are skipped silently.

    Parameters
    ------
    classlist - list of classes
    data - dictionary of train/test data ({class name: list of file names})
    source_folder - location of the data
    destination_folder - location you want to paste it
    '''
    for names_ in tqdm(classlist):
        # os.path.join works whether or not the folders end with a separator.
        source_dir = os.path.join(source_folder, names_)
        destination_dir = os.path.join(destination_folder, names_)
        # shutil.copy does not create missing parent folders, so create the
        # class folder here instead of relying on a separate setup call.
        os.makedirs(destination_dir, exist_ok=True)
        for files in data[names_]:
            source = os.path.join(source_dir, files)
            # copy only files
            if os.path.isfile(source):
                shutil.copy(source, os.path.join(destination_dir, files))
        

In [13]:
def generate_data_locally_percent(classlist, data, source_folder, destination_folder, percentage):
    '''
    Copies the leading ``percentage`` percent of each class's files into the
    destination folder.

    For each class in ``classlist`` the first ``percentage`` percent of
    ``data[class]`` (by list position) is copied from ``source_folder/class``
    to ``destination_folder/class``. Names that are not regular files in the
    source folder are skipped silently.

    Parameters
    ------
    classlist - list of classes
    data - dictionary of train/test data ({class name: list of file names})
    source_folder - location of the data
    destination_folder - location you want to paste it
    percentage - percent (0-100) of each class's listed files to copy

    Raises
    ------
    ValueError - if ``percentage`` is outside 0-100
    '''
    if not 0 <= percentage <= 100:
        # A negative value would otherwise act as a negative slice index and
        # copy almost everything instead of nothing.
        raise ValueError(f"percentage must be between 0 and 100, got {percentage}")

    for names_ in tqdm(classlist):
        class_files = data[names_]
        # Multiply before dividing so exact percentages are not truncated by
        # floating-point error (e.g. 100 * 0.29 == 28.999...).
        keep = int(len(class_files) * percentage / 100)

        # os.path.join works whether or not the folders end with a separator.
        source_dir = os.path.join(source_folder, names_)
        destination_dir = os.path.join(destination_folder, names_)
        # shutil.copy does not create missing parent folders.
        os.makedirs(destination_dir, exist_ok=True)

        for files in class_files[:keep]:
            source = os.path.join(source_dir, files)
            # copy only files
            if os.path.isfile(source):
                shutil.copy(source, os.path.join(destination_dir, files))
        

In [14]:
# Map every class name to the list of file names found in its image folder.
classlist_dict = generate_dict_data(
    class_list_all,
    '../../Datasets/food-101/images/',
)

100%|██████████| 101/101 [00:00<?, ?it/s]


In [15]:
# Per class: the first 75% of the listed files become train, the rest test.
train, test = generate_train_test_data(
    dict_files=classlist_dict,
    percentage_train=75,
)

100%|██████████| 101/101 [00:00<00:00, 50491.62it/s]


In [16]:
# Create the per-class folders for every split that the copy cells below write
# into. Creating only 'train_50' leaves 'train', 'test', 'train_10' and
# 'train_20' missing on a fresh run, and the copies into them then fail.
for split_name in ('train', 'test', 'train_10', 'train_20', 'train_50'):
    build_train_test_datafiles(
        class_list_all,
        destination_folder='./Food-Vision-REDO/datasets/food_101/',
        folder_name=split_name,
    )

In [315]:
# Copy every file of the train split into the full training folder.
generate_data_locally(
    class_list_all,
    train,
    '../../Datasets/food-101/images/',
    './Food-Vision-REDO/datasets/food_101/train/',
)

100%|██████████| 101/101 [16:00<00:00,  9.51s/it]


In [319]:
# Copy every file of the test split into the test folder.
generate_data_locally(
    class_list_all,
    test,
    '../../Datasets/food-101/images/',
    './Food-Vision-REDO/datasets/food_101/test/',
)

100%|██████████| 101/101 [00:28<00:00,  3.53it/s]


In [318]:
# Copy the leading 10% of each class's train files into train_10/.
generate_data_locally_percent(
    class_list_all,
    train,
    '../../Datasets/food-101/images/',
    './Food-Vision-REDO/datasets/food_101/train_10/',
    10,
)

100%|██████████| 101/101 [00:04<00:00, 24.10it/s]


In [28]:
# Copy the leading 20% of each class's train files into train_20/.
generate_data_locally_percent(
    class_list_all,
    train,
    '../../Datasets/food-101/images/',
    './Food-Vision-REDO/datasets/food_101/train_20/',
    20,
)

100%|██████████| 101/101 [00:22<00:00,  4.47it/s]


In [17]:
# Copy the leading 50% of each class's train files into train_50/.
generate_data_locally_percent(
    class_list_all,
    train,
    '../../Datasets/food-101/images/',
    './Food-Vision-REDO/datasets/food_101/train_50/',
    50,
)

100%|██████████| 101/101 [11:22<00:00,  6.76s/it]
