# Import libraries

In [None]:
import os
import zipfile
import random
import shutil
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from shutil import copyfile
import matplotlib.pyplot as plt

# Define path/directory

In [None]:
# Root folder of the raw dataset; it holds one sub-directory per class.
source_path = './baju_adat'

source_path_madura = os.path.join(source_path, 'madura')
source_path_asmat = os.path.join(source_path, 'asmat')
source_path_dayak = os.path.join(source_path, 'dayak')
source_path_minang = os.path.join(source_path, 'minang')
source_path_bali = os.path.join(source_path, 'bali')
source_path_bugis = os.path.join(source_path, 'bugis')

# os.listdir returns a list containing all files under the given path.
# Each message must count its own class directory (previously every line
# after Madura reported the Asmat count).
print(f"There are {len(os.listdir(source_path_madura))} images of Madura.")
print(f"There are {len(os.listdir(source_path_asmat))} images of Asmat.")
print(f"There are {len(os.listdir(source_path_dayak))} images of Dayak.")
print(f"There are {len(os.listdir(source_path_minang))} images of Minang.")
print(f"There are {len(os.listdir(source_path_bali))} images of Bali.")
print(f"There are {len(os.listdir(source_path_bugis))} images of Bugis.")

In [None]:
# Define root directory
# NOTE: the leading '/' makes this an absolute path at the filesystem root,
# unlike the relative source_path used for the raw images.
root_dir = '/baju_adat_preprocess'

# Empty directory to prevent FileExistsError if the function is run several times
# (shutil.rmtree deletes root_dir together with everything inside it)
if os.path.exists(root_dir):
  shutil.rmtree(root_dir)

In [None]:
# FUNCTION: create_train_val_test_dirs
def create_train_val_test_dirs(root_path, list_suku):
  """
  Creates the training, validation and testing directories for every class

  For each name in list_suku three directories are created:
  <root_path>/training/<name>, <root_path>/validation/<name> and
  <root_path>/testing/<name>. Missing intermediate directories
  (including root_path itself) are created as well.

  Args:
    root_path (string) - the base directory path to create subdirectories from
    list_suku (list of string) - class names (e.g. 'madura', 'bali'); each one
      becomes a sub-directory of every split directory

  Returns:
    None

  Raises:
    FileExistsError - if one of the target directories already exists
  """

  ### START CODE HERE

  for suku in list_suku:
    # Pass every path component separately so os.path.join picks the
    # separator instead of a hard-coded '/' inside the string.
    for split in ('training', 'validation', 'testing'):
      os.makedirs(os.path.join(root_path, split, suku))

  ### END CODE HERE

In [None]:
# Create the training/validation/testing directories for the six classes
# below root_dir. os.makedirs raises FileExistsError when a directory is
# already present, which is reported here instead of stopping the notebook.
try:
  create_train_val_test_dirs(root_path=root_dir, 
                             list_suku=['madura','asmat','dayak','minang','bali','bugis'])
except FileExistsError:
  print("You should not be seeing this since the upper directory is removed beforehand")

In [None]:
# Check the created directories as the result of create_train_val_test_dirs function

# os.walk visits root_dir recursively; print the full path of every
# sub-directory it finds (files are ignored).
for rootdir, dirs, files in os.walk(root_dir):
    for subdir in dirs:
        print(os.path.join(rootdir, subdir))

# Split data

In [None]:
# FUNCTION: split_train_val_data
def split_train_val_data(SOURCE_DIR, TRAINING_DIR, VALIDATION_DIR, SPLIT_SIZE):
  """
  Splits the data into train and validation sets

  Zero-length files in SOURCE_DIR are reported and skipped. The remaining
  files are shuffled once and partitioned, so every file lands in exactly
  one of the two sets. Files are copied, so SOURCE_DIR is left unchanged.

  Args:
    SOURCE_DIR (string): directory path containing the images
    TRAINING_DIR (string): directory path to be used for training
    VALIDATION_DIR (string): directory path to be used for validation
    SPLIT_SIZE (float): proportion of the dataset to be used for training

  Returns:
    None
  """
  ### START CODE HERE
  usable_files = []
  for filename in os.listdir(SOURCE_DIR):
    if os.path.getsize(os.path.join(SOURCE_DIR, filename)) == 0:
      print(f'{filename} is zero length, so ignoring.')
    else:
      usable_files.append(filename)

  len_train = int(len(usable_files) * SPLIT_SIZE)

  # Shuffle once and slice. Drawing two independent random samples (as
  # before) lets the same image appear in both training and validation
  # and leaves other images unused.
  shuffled_files = random.sample(usable_files, len(usable_files))
  train_files = shuffled_files[:len_train]
  validation_files = shuffled_files[len_train:]

  for filename in train_files:
    copyfile(os.path.join(SOURCE_DIR, filename), os.path.join(TRAINING_DIR, filename))
  for filename in validation_files:
    copyfile(os.path.join(SOURCE_DIR, filename), os.path.join(VALIDATION_DIR, filename))

  ### END CODE HERE

In [None]:
# FUNCTION: split_val_test_data
def split_val_test_data(VALIDATION_DIR, TESTING_DIR, SPLIT_SIZE):
  """
  Splits an already populated validation set into validation and test sets

  A random subset of the files in VALIDATION_DIR is moved (not copied) into
  TESTING_DIR, so afterwards the two directories share no files.
  Zero-length files are reported and left where they are.

  Args:
    VALIDATION_DIR (string): directory path holding the validation images;
      the files selected for testing are removed from it
    TESTING_DIR (string): directory path that receives the testing images
    SPLIT_SIZE (float): proportion of the files to keep in VALIDATION_DIR

  Returns:
    None
  """
  ### START CODE HERE
  usable_files = []
  for filename in os.listdir(VALIDATION_DIR):
    if os.path.getsize(os.path.join(VALIDATION_DIR, filename)) == 0:
      print(f'{filename} is zero length, so ignoring.')
    else:
      usable_files.append(filename)

  len_validation = int(len(usable_files) * SPLIT_SIZE)
  len_testing = len(usable_files) - len_validation

  testing_files = random.sample(usable_files, len_testing)

  for filename in testing_files:
    # The destination used to be built with `s.path.join`, an undefined
    # name that raised NameError on the first move.
    shutil.move(os.path.join(VALIDATION_DIR, filename), os.path.join(TESTING_DIR, filename))

  ### END CODE HERE

In [None]:
# Test the split_train_val_data / split_val_test_data functions

# Define paths
MADURA_SOURCE_DIR = source_path_madura
ASMAT_SOURCE_DIR = source_path_asmat
DAYAK_SOURCE_DIR = source_path_dayak
MINANG_SOURCE_DIR = source_path_minang
BALI_SOURCE_DIR = source_path_bali
BUGIS_SOURCE_DIR = source_path_bugis

# Derive the split roots from root_dir instead of repeating the literal
# path, so they always point at the tree created by
# create_train_val_test_dirs. The trailing '/' is kept so the resulting
# strings are the same as before.
TRAINING_DIR = os.path.join(root_dir, "training/")
VALIDATION_DIR = os.path.join(root_dir, "validation/")
TESTING_DIR = os.path.join(root_dir, "testing/")

TRAINING_MADURA_DIR = os.path.join(TRAINING_DIR, "madura/")
VALIDATION_MADURA_DIR = os.path.join(VALIDATION_DIR, "madura/")
TESTING_MADURA_DIR = os.path.join(TESTING_DIR, "madura/")

TRAINING_ASMAT_DIR = os.path.join(TRAINING_DIR, "asmat/")
VALIDATION_ASMAT_DIR = os.path.join(VALIDATION_DIR, "asmat/")
TESTING_ASMAT_DIR = os.path.join(TESTING_DIR, "asmat/")

TRAINING_DAYAK_DIR = os.path.join(TRAINING_DIR, "dayak/")
VALIDATION_DAYAK_DIR = os.path.join(VALIDATION_DIR, "dayak/")
TESTING_DAYAK_DIR = os.path.join(TESTING_DIR, "dayak/")

TRAINING_MINANG_DIR = os.path.join(TRAINING_DIR, "minang/")
VALIDATION_MINANG_DIR = os.path.join(VALIDATION_DIR, "minang/")
TESTING_MINANG_DIR = os.path.join(TESTING_DIR, "minang/")

TRAINING_BALI_DIR = os.path.join(TRAINING_DIR, "bali/")
VALIDATION_BALI_DIR = os.path.join(VALIDATION_DIR, "bali/")
TESTING_BALI_DIR = os.path.join(TESTING_DIR, "bali/")

TRAINING_BUGIS_DIR = os.path.join(TRAINING_DIR, "bugis/")
VALIDATION_BUGIS_DIR = os.path.join(VALIDATION_DIR, "bugis/")
TESTING_BUGIS_DIR = os.path.join(TESTING_DIR, "bugis/")

In [None]:
# Empty directories in case you run this cell multiple times.
# One loop replaces eighteen copy-pasted stanzas; the directories are
# visited in the same order as before (training, validation, testing).
# Iterating an empty directory is a no-op, so no length check is needed.
for split_class_dir in (
    TRAINING_MADURA_DIR, TRAINING_ASMAT_DIR, TRAINING_DAYAK_DIR,
    TRAINING_MINANG_DIR, TRAINING_BALI_DIR, TRAINING_BUGIS_DIR,
    VALIDATION_MADURA_DIR, VALIDATION_ASMAT_DIR, VALIDATION_DAYAK_DIR,
    VALIDATION_MINANG_DIR, VALIDATION_BALI_DIR, VALIDATION_BUGIS_DIR,
    TESTING_MADURA_DIR, TESTING_ASMAT_DIR, TESTING_DAYAK_DIR,
    TESTING_MINANG_DIR, TESTING_BALI_DIR, TESTING_BUGIS_DIR,
):
  for file in os.scandir(split_class_dir):
    os.remove(file.path)

In [None]:
# Define proportion of images used for training
# Train:val:test = 90% : 5% : 5%
# (10% of the images are first placed in validation, then half of those
# are moved on to testing)
train_val_split_size = 0.9
val_test_split_size = 0.5

# Run the functions once per class, in the order
# Madura, Asmat, Dayak, Minang, Bali, Bugis.
# NOTE: Messages about zero length images should be printed out
for source_dir, training_dir, validation_dir, testing_dir in (
    (MADURA_SOURCE_DIR, TRAINING_MADURA_DIR, VALIDATION_MADURA_DIR, TESTING_MADURA_DIR),
    (ASMAT_SOURCE_DIR, TRAINING_ASMAT_DIR, VALIDATION_ASMAT_DIR, TESTING_ASMAT_DIR),
    (DAYAK_SOURCE_DIR, TRAINING_DAYAK_DIR, VALIDATION_DAYAK_DIR, TESTING_DAYAK_DIR),
    (MINANG_SOURCE_DIR, TRAINING_MINANG_DIR, VALIDATION_MINANG_DIR, TESTING_MINANG_DIR),
    (BALI_SOURCE_DIR, TRAINING_BALI_DIR, VALIDATION_BALI_DIR, TESTING_BALI_DIR),
    (BUGIS_SOURCE_DIR, TRAINING_BUGIS_DIR, VALIDATION_BUGIS_DIR, TESTING_BUGIS_DIR),
):
  # train val split: copy from the source directory
  split_train_val_data(source_dir, training_dir, validation_dir, train_val_split_size)
  # val test split: move part of validation into testing
  split_val_test_data(validation_dir, testing_dir, val_test_split_size)

In [None]:
# split_train_val_data copies files out of the source directories (only the
# validation -> testing step moves files, and it never touches the sources),
# so the original directories should still contain every image
print(f"\n\nOriginal Madura's directory has {len(os.listdir(MADURA_SOURCE_DIR))} images")
print(f"Original Asmat's directory has {len(os.listdir(ASMAT_SOURCE_DIR))} images\n")
print(f"Original Dayak's directory has {len(os.listdir(DAYAK_SOURCE_DIR))} images\n")
print(f"Original Minang's directory has {len(os.listdir(MINANG_SOURCE_DIR))} images\n")
print(f"Original Bali's directory has {len(os.listdir(BALI_SOURCE_DIR))} images\n")
print(f"Original Bugis's directory has {len(os.listdir(BUGIS_SOURCE_DIR))} images\n")

# Training, validation and testing splits. Check that the number of images matches the expected output.
print(f"There are {len(os.listdir(TRAINING_MADURA_DIR))} images of Madura for training")
print(f"There are {len(os.listdir(TRAINING_ASMAT_DIR))} images of Asmat for training")
print(f"There are {len(os.listdir(TRAINING_DAYAK_DIR))} images of Dayak for training")
print(f"There are {len(os.listdir(TRAINING_MINANG_DIR))} images of Minang for training")
print(f"There are {len(os.listdir(TRAINING_BALI_DIR))} images of Bali for training")
print(f"There are {len(os.listdir(TRAINING_BUGIS_DIR))} images of Bugis for training")
print()
print(f"There are {len(os.listdir(VALIDATION_MADURA_DIR))} images of Madura for validation")
print(f"There are {len(os.listdir(VALIDATION_ASMAT_DIR))} images of Asmat for validation")
print(f"There are {len(os.listdir(VALIDATION_DAYAK_DIR))} images of Dayak for validation")
print(f"There are {len(os.listdir(VALIDATION_MINANG_DIR))} images of Minang for validation")
print(f"There are {len(os.listdir(VALIDATION_BALI_DIR))} images of Bali for validation")
print(f"There are {len(os.listdir(VALIDATION_BUGIS_DIR))} images of Bugis for validation")
print()
print(f"There are {len(os.listdir(TESTING_MADURA_DIR))} images of Madura for testing")
print(f"There are {len(os.listdir(TESTING_ASMAT_DIR))} images of Asmat for testing")
print(f"There are {len(os.listdir(TESTING_DAYAK_DIR))} images of Dayak for testing")
print(f"There are {len(os.listdir(TESTING_MINANG_DIR))} images of Minang for testing")
print(f"There are {len(os.listdir(TESTING_BALI_DIR))} images of Bali for testing")
print(f"There are {len(os.listdir(TESTING_BUGIS_DIR))} images of Bugis for testing")

# Train/validation/test generators incl. image augmentation

In [None]:
# GRADED FUNCTION: train_val_test_generators
def train_val_test_generators(TRAINING_DIR, VALIDATION_DIR, TESTING_DIR,
                              batch_size=20, target_size=(150, 150), class_mode='sparse'):
  """
  Creates the training, validation and testing data generators

  Only the training generator augments its images; the validation and
  testing generators just rescale pixel values by 1/255.

  Args:
    TRAINING_DIR (string): directory path containing the training images
    VALIDATION_DIR (string): directory path containing the validation images
    TESTING_DIR (string): directory path containing the testing images
    batch_size (int): number of images per batch, shared by all three generators
    target_size (tuple): (height, width) every image is resized to
    class_mode (string): label format passed to flow_from_directory.
      The dataset has six classes, so the former 'binary' setting was
      misleading. 'sparse' is documented by Keras as 1-D integer class
      labels, so the label layout stays the same.
      NOTE(review): confirm the model's loss matches, e.g.
      sparse_categorical_crossentropy for 'sparse' or
      categorical_crossentropy for 'categorical'.

  Returns:
    train_generator, validation_generator, test_generator - tuple containing the generators
  """
  ### START CODE HERE

  # Settings shared by the three flow_from_directory calls
  flow_kwargs = dict(batch_size=batch_size,
                     class_mode=class_mode,
                     target_size=target_size)

  # Training images are rescaled and randomly augmented
  train_datagen = ImageDataGenerator(rescale=1./255.,
                                     rotation_range=50,
                                     width_shift_range=0.2,
                                     height_shift_range=0.2,
                                     shear_range=0.2,
                                     zoom_range=0.2,
                                     horizontal_flip=True,
                                     fill_mode='nearest')
  train_generator = train_datagen.flow_from_directory(directory=TRAINING_DIR,
                                                      **flow_kwargs)

  # Validation images are only rescaled
  validation_datagen = ImageDataGenerator(rescale=1./255.)
  validation_generator = validation_datagen.flow_from_directory(directory=VALIDATION_DIR,
                                                                **flow_kwargs)

  # Testing images are only rescaled
  test_datagen = ImageDataGenerator(rescale=1./255.)
  test_generator = test_datagen.flow_from_directory(directory=TESTING_DIR,
                                                    **flow_kwargs)

  ### END CODE HERE
  return train_generator, validation_generator, test_generator

In [None]:
# Test your generators
# Build the three generators from the top-level split directories
# (TRAINING_DIR, VALIDATION_DIR, TESTING_DIR), each of which contains the
# per-class sub-directories created earlier.
train_generator, validation_generator, test_generator = train_val_test_generators(TRAINING_DIR, VALIDATION_DIR, TESTING_DIR)