# 1. Cek Raw Dataset

In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
# This only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os

# Global Variables
# Drive folders holding the raw images and the processed (resized + split) images.
RAW_DATASET_PATH = "/content/drive/MyDrive/Product_Based_Capstone/Team_ML/Raw_Dataset"
CLEAN_DATASET_PATH = "/content/drive/MyDrive/Product_Based_Capstone/Team_ML/Clean_Dataset"
TRAIN_PATH = os.path.join(CLEAN_DATASET_PATH, 'train')
DEV_PATH = os.path.join(CLEAN_DATASET_PATH, 'dev')
# TEST_PATH = os.path.join(CLEAN_DATASET_PATH, 'test')

# One sub-folder per plant. os.listdir() returns entries in arbitrary order and
# also returns stray files and hidden folders (e.g. .ipynb_checkpoints), so keep
# only visible directories and sort them to get a stable, reproducible plant order.
PLANTS_LIST = sorted(
  entry for entry in os.listdir(RAW_DATASET_PATH)
  if not entry.startswith('.')
  and os.path.isdir(os.path.join(RAW_DATASET_PATH, entry))
)
NUM_OF_PLANTS = len(PLANTS_LIST)

print('Total jenis tanaman herbal:', NUM_OF_PLANTS)

Total jenis tanaman herbal: 18


In [None]:
# List gambar per tanaman
def CountPlantImages(data_path):
  """Print how many entries each plant folder holds, followed by the grand total.

  Args:
    data_path: Directory containing one sub-folder for every name in the
      module-level PLANTS_LIST.

  Returns:
    None. The report is written to stdout only.
  """
  total = 0  # named `total` so the built-in sum() is not shadowed
  print('-= Banyak gambar per tanaman =-')
  for index, plant in enumerate(PLANTS_LIST, start=1):
    plant_path = os.path.join(data_path, plant)
    # List each folder exactly once and reuse the result for both the
    # per-plant line and the running total.
    image_count = len(os.listdir(plant_path))
    print('{:2}. {:16} : {}'.format(index, plant, image_count))
    total += image_count
  print('\nTotal gambar:', total)

# Report the per-plant entry counts of the raw dataset.
CountPlantImages(RAW_DATASET_PATH)

-= Banyak gambar per tanaman =-
 1. Sambiloto        : 115
 2. Sembung          : 100
 3. Meniran          : 100
 4. Andong           : 87
 5. Kumis Kucing     : 147
 6. Kemangi          : 100
 7. Pandan           : 139
 8. Serai            : 80
 9. Kelor            : 100
10. Sirih            : 176
11. Talas            : 127
12. Bayam Duri       : 97
13. Singkong         : 101
14. Pepaya           : 100
15. Binahong         : 111
16. Jeruk Nipis      : 75
17. Mint             : 197
18. Cincau Hijau     : 144

Total gambar: 2096


# 2. Preparasi Clean Dataset (Run kalo mau update ke Raw Dataset terbaru)

In [None]:
# Kosongin directory Train-Dev (Train-Dev-Test nanti aja kalo dataset udah agak gede)
from shutil import rmtree

def EmptyCleanDatasetDirectory(clean_data_path, train_path, dev_path): # minus test_path parameter
  """Delete the clean dataset and recreate one empty folder per plant and split.

  Args:
    clean_data_path: Root of the clean dataset; removed recursively if present.
    train_path: Directory that receives one empty sub-folder per plant.
    dev_path: Directory that receives one empty sub-folder per plant.

  Plant names are read from the module-level PLANTS_LIST. os.makedirs is
  called without exist_ok, so a plant folder that still exists raises
  FileExistsError.
  """
  # Start from scratch: drop whatever an earlier run produced.
  if os.path.exists(clean_data_path):
    rmtree(clean_data_path)

  # For each plant create the train folder first, then the dev folder.
  # (No test split is created yet.)
  for plant in PLANTS_LIST:
    for split_root in (train_path, dev_path):
      os.makedirs(os.path.join(split_root, plant))

  print('Finished emptying old data.')

In [None]:
# Resize data and save on temporary folder
from PIL import Image

def ResizeAndRenameData(source, dest, plant_name, image_size):
  """Convert every image in `source` to a resized RGB JPEG stored in `dest`.

  Output files are named `<plant_name><NNNN>.jpeg`, numbered from 0000 in
  sorted source-filename order so repeated runs produce the same names.

  Args:
    source: Directory holding the raw images of one plant.
    dest: Output directory; created (including parents) if missing.
    plant_name: Prefix of every generated file name.
    image_size: Size tuple handed unchanged to Image.resize.

  Entries that are not regular files, or that cannot be opened as an image,
  are reported on stdout and skipped instead of aborting the whole run.
  """
  os.makedirs(dest, exist_ok=True)
  count = 0
  zeros_padding = 4

  # os.listdir() order is arbitrary; sort so the numbering is deterministic.
  for plant_image in sorted(os.listdir(source)):
    image_src_path = os.path.join(source, plant_image)
    if not os.path.isfile(image_src_path):
      print("{} is not a file, so ignoring.".format(plant_image))
      continue

    try:
      # The context manager closes the source file handle as soon as the
      # pixel data has been converted, instead of leaking one handle per image.
      with Image.open(image_src_path) as raw_image:
        rgb_image = raw_image.convert('RGB')
    except OSError:
      # Empty, truncated or non-image files end up here
      # (Pillow's UnidentifiedImageError is a subclass of OSError).
      print("{} is not a readable image, so ignoring.".format(plant_image))
      continue

    jpeg_name = plant_name + str(count).zfill(zeros_padding) + ".jpeg"
    image_dest_path = os.path.join(dest, jpeg_name)
    rgb_image.resize(image_size).save(image_dest_path)
    count += 1  # only successfully written images consume a number

In [None]:
# Data Splitting Function (masih Train-Dev Split jadi masih pake SPLIT_SIZE)
import random
from shutil import move

def SplitData(SOURCE, TRAINING, DEVELOPMENT, SPLIT_SIZE, PLANT_NAME, SEED=None): # later: switch to dev_size / test_size
  """Randomly move the files of SOURCE into TRAINING and DEVELOPMENT.

  Args:
    SOURCE: Directory whose files are split; moved files leave this folder.
    TRAINING: Destination of the first `round(SPLIT_SIZE * n)` shuffled files.
    DEVELOPMENT: Destination of the remaining files.
    SPLIT_SIZE: Fraction of the usable files that goes to TRAINING.
    PLANT_NAME: Only used in the progress message.
    SEED: Optional seed for a reproducible split. With the default None the
      shuffle draws from the global `random` state, exactly as before.

  Zero-length files are reported and left in SOURCE.
  """
  # Sort first: os.listdir() order is arbitrary, which would otherwise make
  # the result differ between runs even with a fixed seed.
  dir_list = sorted(os.listdir(SOURCE))
  shuffler = random if SEED is None else random.Random(SEED)
  randomized_dir_list = shuffler.sample(dir_list, len(dir_list))

  # Remove 0 size images
  final_list = []
  for filename in randomized_dir_list:
    fullpath = os.path.join(SOURCE, filename)
    if os.path.getsize(fullpath) != 0:
      final_list.append(filename)
    else:
      print("{} is zero length, so ignoring.".format(filename))

  # Start Splitting (train-dev split)
  index_split = round(SPLIT_SIZE * len(final_list))
  destinations = (
    (TRAINING, final_list[:index_split]),
    (DEVELOPMENT, final_list[index_split:]),
  )
  for destination, filenames in destinations:
    # Make sure the target exists so move() cannot fail on a missing folder.
    os.makedirs(destination, exist_ok=True)
    for filename in filenames:
      move(os.path.join(SOURCE, filename), os.path.join(destination, filename))

  print('Finished splitting data of:', PLANT_NAME)

In [None]:
# Copy file ke Train-Dev Directory (Train-Dev-Test nanti aja kalo dataset udah gede)

def MakeCleanDataset(data_path, train_path, dev_path, image_size, split_size): # minus test_path parameter
  """Resize every raw plant image and split the results into train/dev folders.

  For each name in the module-level PLANTS_LIST the raw images are converted
  into a staging folder by ResizeAndRenameData and then distributed by
  SplitData.

  Args:
    data_path: Raw dataset root with one sub-folder per plant.
    train_path: Train root; files land in `<train_path>/<plant>`.
    dev_path: Dev root; files land in `<dev_path>/<plant>`.
    image_size: Size tuple forwarded to ResizeAndRenameData.
    split_size: Train fraction forwarded to SplitData.
  """
  temp_folder_path = '/tmp/convert_images'

  for plant in PLANTS_LIST:
    source_path = os.path.join(data_path, plant)
    train_plant_path = os.path.join(train_path, plant)
    dev_plant_path = os.path.join(dev_path, plant)
    # test_plant_path = os.path.join(test_path, plant)

    # The staging folder is shared by all plants. Clear it before every plant
    # so files left behind by an interrupted run (or files SplitData skipped)
    # can never be moved into this plant's folders under the wrong label.
    if os.path.exists(temp_folder_path):
      rmtree(temp_folder_path)

    ResizeAndRenameData(source_path, temp_folder_path, plant, image_size)
    SplitData(temp_folder_path, train_plant_path, dev_plant_path, split_size, plant)

In [None]:
# Size tuple every image is resized to (forwarded down to Image.resize).
IMAGE_SIZE = (384, 384)
# Fraction of each plant's images that goes to the train split; the rest goes to dev.
SPLIT_SIZE = 0.8

# Rebuild the clean dataset: wipe the old folders first, then resize and split the raw images.
EmptyCleanDatasetDirectory(CLEAN_DATASET_PATH, TRAIN_PATH, DEV_PATH)
MakeCleanDataset(RAW_DATASET_PATH, TRAIN_PATH, DEV_PATH, IMAGE_SIZE, SPLIT_SIZE)

Finished emptying old data.
Finished splitting data of: Sambiloto
Finished splitting data of: Sembung
Finished splitting data of: Meniran
Finished splitting data of: Andong
Finished splitting data of: Kumis Kucing
Finished splitting data of: Kemangi
Finished splitting data of: Pandan
Finished splitting data of: Serai
Finished splitting data of: Kelor
Finished splitting data of: Sirih
Finished splitting data of: Talas
Finished splitting data of: Bayam Duri
Finished splitting data of: Singkong
Finished splitting data of: Pepaya
Finished splitting data of: Binahong
Finished splitting data of: Jeruk Nipis
Finished splitting data of: Mint
Finished splitting data of: Cincau Hijau


In [None]:
def CekTotalGambar(folder_name, data_path):
  """Count the files stored in the folders below `data_path` and print the total.

  Args:
    folder_name: Label inserted into the printed message.
    data_path: Root directory that holds the per-plant sub-folders.

  Returns:
    Number of files found in all (possibly nested) sub-folders of
    `data_path`. Files lying directly inside `data_path` and the folders
    themselves are not counted. A missing `data_path` yields 0.
  """
  walker = os.walk(data_path)
  # os.walk yields the top directory first; only its sub-folders hold images.
  next(walker, None)
  # os.walk already reports each folder's files, so no extra os.listdir() is
  # needed, and nested folders are not miscounted as images.
  total = sum(len(files) for _rootdir, _dirs, files in walker)
  print('Total Gambar {}: {}'.format(folder_name, total))
  return total

# Count both splits and report the realised train/dev ratio (rounded to 4 decimals).
train_count = CekTotalGambar('Train', TRAIN_PATH)
dev_count = CekTotalGambar('Dev', DEV_PATH)
total_count = train_count + dev_count
# NOTE(review): if both folders are empty, total_count is 0 and the divisions below raise ZeroDivisionError.
ratio_train = round(train_count/total_count, 4)
ratio_dev = round(dev_count/total_count, 4)
print('Total Gambar Clean Dataset: {}\nRasio Train: {}\nRasio Dev: {}'.format(total_count, ratio_train, ratio_dev))

Total Gambar Train: 1679
Total Gambar Dev: 417
Total Gambar Clean Dataset: 2096
Rasio Train: 0.801
Rasio Dev: 0.199
