In [108]:
import os
from PIL import Image

from imgbeddings import imgbeddings

import csv

In [109]:
# OS metadata entries that must never be treated as dataset content.
files_to_remove = ['.DS_Store']
# Number of image paths handed to the embedding model per call.
batch_size = 64

# The project root is taken to be the parent of the current working directory.
root_folder = os.path.normpath(os.getcwd() + os.sep + os.pardir)
data_folder = '0_data/beanleaf_dataset'
beandataset_folder = os.path.join(root_folder, data_folder)

# os.listdir() returns entries in arbitrary order, but later cells select the
# partitions by position ([0], [1], [2]). Sorting makes that order
# deterministic, and keeping directories only stops a stray file from
# shifting the indices.
# NOTE(review): assumes the partition folders are named so that alphabetical
# order is test, train, validation -- TODO confirm against the dataset layout.
beanleaf_data_partition_folders = sorted(
    entry
    for entry in os.listdir(beandataset_folder)
    if entry not in files_to_remove
    and os.path.isdir(os.path.join(beandataset_folder, entry))
)

## Get files from each folder (test, train & validation)

In [110]:
def run_scandir_by_extn(dir, ext):
    """Recursively collect sub-directories and extension-matched files under ``dir``.

    Args:
        dir: Path of the directory to walk.
        ext: A single extension (e.g. ``'.jpg'``) or an iterable of
            extensions, each including the leading dot. Matching is
            case-insensitive.

    Returns:
        A ``(subfolders, files)`` tuple of lists of paths. ``subfolders``
        holds the direct sub-directories of ``dir`` first, followed by the
        nested ones; ``files`` holds every file, at any depth, whose
        extension is in ``ext``.
    """
    # A bare string would be substring-tested by ``in`` (so '' -- a file with
    # no extension -- would match '.jpg'); wrap it so matching is exact.
    if isinstance(ext, str):
        ext = (ext,)
    # File extensions are lower-cased before comparison, so do the same here.
    wanted = {candidate.lower() for candidate in ext}

    subfolders, files = [], []

    # The context manager closes the directory handle even if iteration fails.
    with os.scandir(dir) as entries:
        for entry in entries:
            if entry.is_dir():
                subfolders.append(entry.path)
            elif entry.is_file():
                if os.path.splitext(entry.name)[1].lower() in wanted:
                    files.append(entry.path)

    # Iterate over a copy because ``subfolders`` grows inside the loop.
    for subfolder in list(subfolders):
        nested_dirs, nested_files = run_scandir_by_extn(subfolder, wanted)
        subfolders.extend(nested_dirs)
        files.extend(nested_files)
    return subfolders, files

def run_scandir_with_exclusion(dir, exclude):
    """Recursively collect sub-directories and files under ``dir``, skipping excluded names.

    Args:
        dir: Path of the directory to walk.
        exclude: A single file name or an iterable of file names (base names,
            not paths) to leave out of the result.

    Returns:
        A ``(subfolders, files)`` tuple of lists of paths. ``subfolders``
        holds the direct sub-directories of ``dir`` first, followed by the
        nested ones; ``files`` holds every file, at any depth, whose name is
        not in ``exclude``.
    """
    # A bare string would be substring-tested by ``in`` (so a file called
    # 'Store' would be dropped by '.DS_Store'); wrap it so matching is exact.
    if isinstance(exclude, str):
        exclude = (exclude,)
    skipped_names = frozenset(exclude)

    subfolders, files = [], []

    # The context manager closes the directory handle even if iteration fails.
    with os.scandir(dir) as entries:
        for entry in entries:
            if entry.is_dir():
                subfolders.append(entry.path)
            elif entry.is_file():
                if entry.name not in skipped_names:
                    files.append(entry.path)

    # Iterate over a copy because ``subfolders`` grows inside the loop.
    for subfolder in list(subfolders):
        nested_dirs, nested_files = run_scandir_with_exclusion(subfolder, skipped_names)
        subfolders.extend(nested_dirs)
        files.extend(nested_files)
    return subfolders, files

# Resolve the three partition folders by their position in
# beanleaf_data_partition_folders: 0 -> test, 1 -> train, 2 -> validation.
test_dataset_folder, train_dataset_folder, validation_dataset_folder = (
    os.path.join(beandataset_folder, beanleaf_data_partition_folders[position])
    for position in range(3)
)

# Walk each partition recursively, leaving out the names in files_to_remove.
test_subfolders, test_files = run_scandir_with_exclusion(
    test_dataset_folder, files_to_remove
)
train_subfolders, train_files = run_scandir_with_exclusion(
    train_dataset_folder, files_to_remove
)
validation_subfolders, validation_files = run_scandir_with_exclusion(
    validation_dataset_folder, files_to_remove
)

In [111]:
# Single imgbeddings instance; the embedding cells call ibed.to_embeddings(...)
# on it for each batch of image paths.
ibed = imgbeddings()

# Column headers written as the first row of each embeddings CSV.
header = ['name', 'url', 'vector']



### Create embeddings for test dataset

In [112]:
# Output location for the per-partition embedding CSV files.
embeddings_folder = '0_data/embeddings'
embeddings_folder_path = os.path.join(root_folder, embeddings_folder)

# The CSV files are opened for writing inside this folder, which fails with
# FileNotFoundError if it is missing; create it up front (no-op if it exists).
os.makedirs(embeddings_folder_path, exist_ok=True)

embeddings_folder_path

'/Users/jaganlalthoppe/workspace/mlops/azure/beanleaf-disease-classifier/0_data/embeddings'

In [115]:
# Write one CSV row (file name, full path, embedding) per test image.
test_file_path = os.path.join(embeddings_folder_path, 'beanleaf_test.csv')
with open(test_file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    test_file_count = len(test_files)
    # Step through the file list in strides of batch_size. The previous
    # range(test_file_count - 1) skipped a partition holding exactly one file
    # and otherwise iterated over many empty slices.
    for start in range(0, test_file_count, batch_size):
        files = test_files[start:start + batch_size]
        test_embeddings = ibed.to_embeddings(files)

        for index, file_path in enumerate(files):
            # NOTE(review): csv.writer stringifies non-string cells with
            # str(); confirm the reader of this CSV can parse the vector
            # column in that form.
            writer.writerow([
                os.path.basename(file_path),
                file_path,
                test_embeddings[index],
            ])


100%|██████████| 64/64 [00:03<00:00, 20.12it/s]
100%|██████████| 64/64 [00:02<00:00, 22.32it/s]


### Create embeddings for train dataset

In [116]:
# Write one CSV row (file name, full path, embedding) per train image.
train_file_path = os.path.join(embeddings_folder_path, 'beanleaf_train.csv')
with open(train_file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    train_file_count = len(train_files)
    # Step through the file list in strides of batch_size. The previous
    # range(train_file_count - 1) skipped a partition holding exactly one file
    # and otherwise iterated over many empty slices.
    for start in range(0, train_file_count, batch_size):
        files = train_files[start:start + batch_size]
        train_embeddings = ibed.to_embeddings(files)

        for index, file_path in enumerate(files):
            # NOTE(review): csv.writer stringifies non-string cells with
            # str(); confirm the reader of this CSV can parse the vector
            # column in that form.
            writer.writerow([
                os.path.basename(file_path),
                file_path,
                train_embeddings[index],
            ])

100%|██████████| 64/64 [00:02<00:00, 22.95it/s]
100%|██████████| 64/64 [00:02<00:00, 25.95it/s]
100%|██████████| 64/64 [00:02<00:00, 26.95it/s]
100%|██████████| 64/64 [00:02<00:00, 27.54it/s]
100%|██████████| 64/64 [00:02<00:00, 29.48it/s]
100%|██████████| 64/64 [00:02<00:00, 24.48it/s]
100%|██████████| 64/64 [00:03<00:00, 20.08it/s]
100%|██████████| 64/64 [00:02<00:00, 23.03it/s]
100%|██████████| 64/64 [00:02<00:00, 26.21it/s]
100%|██████████| 64/64 [00:02<00:00, 26.94it/s]
100%|██████████| 64/64 [00:02<00:00, 25.30it/s]
100%|██████████| 64/64 [00:02<00:00, 24.09it/s]
100%|██████████| 64/64 [00:02<00:00, 25.53it/s]
100%|██████████| 64/64 [00:02<00:00, 25.52it/s]
100%|██████████| 64/64 [00:02<00:00, 24.59it/s]
100%|██████████| 64/64 [00:02<00:00, 21.89it/s]


### Create embeddings for validation dataset

In [117]:
# Write one CSV row (file name, full path, embedding) per validation image.
validation_file_path = os.path.join(embeddings_folder_path, 'beanleaf_validation.csv')
with open(validation_file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    validation_file_count = len(validation_files)
    # Step through the file list in strides of batch_size. The previous
    # range(validation_file_count - 1) skipped a partition holding exactly one
    # file and otherwise iterated over many empty slices.
    for start in range(0, validation_file_count, batch_size):
        files = validation_files[start:start + batch_size]
        validation_embeddings = ibed.to_embeddings(files)

        for index, file_path in enumerate(files):
            # NOTE(review): csv.writer stringifies non-string cells with
            # str(); confirm the reader of this CSV can parse the vector
            # column in that form.
            writer.writerow([
                os.path.basename(file_path),
                file_path,
                validation_embeddings[index],
            ])

100%|██████████| 64/64 [00:02<00:00, 23.23it/s]
100%|██████████| 64/64 [00:02<00:00, 24.54it/s]
