In [72]:
import os
from PIL import Image

from imgbeddings import imgbeddings
import random
from datetime import datetime

import csv

In [73]:
# File names that are filesystem clutter rather than dataset content; they are
# filtered out of every directory listing in this notebook.
files_to_remove = ['.DS_Store']
# Number of image paths handed to the embedding model per call.
batch_size = 64

# The dataset is expected one level above the notebook's working directory:
#   <cwd>/../0_data/beanleaf_dataset
root_folder = os.path.normpath(os.getcwd() + os.sep + os.pardir)
data_folder = '0_data/beanleaf_dataset'
beandataset_folder = os.path.join(root_folder, data_folder)

# os.listdir() returns entries in arbitrary order, but later cells select the
# partitions by position ([0], [1], [2]). Sorting makes those indices stable
# across machines and filesystems.
# NOTE(review): assumes the partition folder names sort as test, train,
# validation, which is how the later cells use the indices -- confirm.
beanleaf_data_partition_folders = sorted(
    name for name in os.listdir(beandataset_folder)
    if name not in files_to_remove
)

## Get files from each folder (test, train & validation)

In [74]:
def run_scandir_by_extn(dir, ext):
    """Recursively list sub-directories and files with a wanted extension.

    Args:
        dir: Directory to start scanning from.
        ext: Container of extensions to keep. Each file's extension is taken
            with ``os.path.splitext`` (so it includes the leading dot),
            lower-cased, and tested with ``in`` against this container.

    Returns:
        A ``(subfolders, files)`` tuple of path lists. Entries found directly
        in ``dir`` come first, followed by the results of each sub-directory
        in turn.
    """
    entries = list(os.scandir(dir))

    subfolders = [entry.path for entry in entries if entry.is_dir()]
    files = [
        entry.path
        for entry in entries
        if entry.is_file() and os.path.splitext(entry.name)[1].lower() in ext
    ]

    # Walk a snapshot of the direct children, because `subfolders` grows
    # while the nested results are merged in.
    for child in tuple(subfolders):
        nested_dirs, nested_files = run_scandir_by_extn(child, ext)
        subfolders += nested_dirs
        files += nested_files

    return subfolders, files

def run_scandir_with_exclusion(dir, exclude):
    """Recursively list sub-directories and files, skipping excluded names.

    Args:
        dir: Directory to start scanning from.
        exclude: Container of file names (not paths) to leave out. The check
            applies to files only; directories are always descended into.

    Returns:
        A ``(subfolders, files)`` tuple of path lists. Entries found directly
        in ``dir`` come first, followed by the results of each sub-directory
        in turn.
    """
    entries = list(os.scandir(dir))

    subfolders = [entry.path for entry in entries if entry.is_dir()]
    files = [
        entry.path
        for entry in entries
        if entry.is_file() and entry.name not in exclude
    ]

    # Walk a snapshot of the direct children, because `subfolders` grows
    # while the nested results are merged in.
    for child in tuple(subfolders):
        nested_dirs, nested_files = run_scandir_with_exclusion(child, exclude)
        subfolders += nested_dirs
        files += nested_files

    return subfolders, files

In [75]:
# Embedding model instance shared by every export cell below.
ibed = imgbeddings()

# Column headers for the CSV files, in the order the row values are written.
header = [
    'name',
    'url',
    'actual_label',
    'predicted_label',
    'prediction_ts',
    'vector',
]



### Set up the embeddings output folder and the CSV writer helper

In [76]:
# Folder (relative to root_folder) that receives the generated CSV files.
embeddings_folder = '0_data/embeddings'
embeddings_folder_path = os.path.join(root_folder, embeddings_folder)

# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists before the export cells run.
os.makedirs(embeddings_folder_path, exist_ok=True)

In [77]:
# Maps a class folder name to the integer label written to the CSV.
class_dict = {
    'angular_leaf_spot': 0,
    'bean_rust': 1,
    'healthy': 2
}

def write_to_csv(files, writer, actual_label, predicted_label, prediction_ts, random_prediction=False,
                 embedder=None, chunk_size=None):
    """Embed image files in batches and write one CSV row per file.

    Each row is ``[basename, path, actual_label, predicted_label,
    prediction_ts, embedding]``, matching the order of ``header``.

    Args:
        files: List of image paths. It is sliced into consecutive batches and
            each batch is passed to ``embedder.to_embeddings``.
        writer: Object with a ``writerow`` method (the callers pass a
            ``csv.writer``).
        actual_label: Class name; must be a key of ``class_dict``.
        predicted_label: Class name; must be a key of ``class_dict``.
        prediction_ts: Value written unchanged into the ``prediction_ts``
            column.
        random_prediction: When True, the predicted label of every row is
            drawn with ``random.choice`` from ``'0'``, ``'1'``, ``'2'``
            instead of using ``predicted_label``.
        embedder: Object exposing ``to_embeddings(list_of_paths)``. Defaults
            to the module-level ``ibed``.
        chunk_size: Number of files per ``to_embeddings`` call. Defaults to
            the module-level ``batch_size``.

    Raises:
        KeyError: If either label is not a key of ``class_dict``.
    """
    if embedder is None:
        embedder = ibed
    if chunk_size is None:
        chunk_size = batch_size

    actual_label = class_dict[actual_label]
    predicted_label = class_dict[predicted_label]
    # Random labels are strings while mapped labels are ints; csv.writer
    # stringifies both, so they look the same in the file.
    classes = ['0', '1', '2']

    # Step through the list one batch at a time. The previous
    # `range(len(files) - 1)` was not a batch count: a single-file list
    # produced no iterations, so nothing was written.
    for start in range(0, len(files), chunk_size):
        batch_files = files[start:start + chunk_size]
        embeddings = embedder.to_embeddings(batch_files)

        for index, path in enumerate(batch_files):
            if random_prediction:
                predicted_label = random.choice(classes)

            writer.writerow([
                os.path.basename(path),
                path,
                actual_label,
                predicted_label,
                prediction_ts,
                embeddings[index],
            ])

### Create embeddings for train dataset

In [78]:
# Timestamp (float seconds) captured when this cell runs; passed to
# write_to_csv as the prediction_ts value.
now_ts = datetime.now().timestamp()

In [79]:
train_file_path = os.path.join(embeddings_folder_path, 'beanleaf_train.csv')

# The train partition is selected by position in the partition listing.
train_dataset_folder = os.path.join(beandataset_folder, beanleaf_data_partition_folders[1])
train_dataset_classes_folder = [
    i for i in os.listdir(train_dataset_folder) if i not in files_to_remove
]

# Open the CSV once, outside the class loop. Mode 'w' truncates the file, so
# opening it per class left only the last class's rows in the output.
with open(train_file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    # write the header
    writer.writerow(header)

    for train_class in train_dataset_classes_folder:
        fullpath_with_class = os.path.join(train_dataset_folder, train_class)
        train_subfolders, train_files = run_scandir_with_exclusion(fullpath_with_class, files_to_remove)
        # The folder name is used as both the actual and the predicted label.
        write_to_csv(train_files, writer, train_class, train_class, now_ts)

100%|██████████| 64/64 [00:02<00:00, 26.59it/s]
100%|██████████| 64/64 [00:02<00:00, 24.90it/s]
100%|██████████| 64/64 [00:02<00:00, 28.07it/s]
100%|██████████| 64/64 [00:02<00:00, 29.81it/s]
100%|██████████| 64/64 [00:02<00:00, 27.50it/s]
100%|██████████| 64/64 [00:02<00:00, 26.25it/s]
100%|██████████| 64/64 [00:02<00:00, 30.42it/s]
100%|██████████| 64/64 [00:02<00:00, 27.45it/s]
100%|██████████| 64/64 [00:02<00:00, 24.53it/s]
100%|██████████| 64/64 [00:02<00:00, 27.31it/s]
100%|██████████| 64/64 [00:02<00:00, 26.70it/s]
100%|██████████| 64/64 [00:02<00:00, 27.87it/s]
100%|██████████| 64/64 [00:02<00:00, 30.75it/s]
100%|██████████| 64/64 [00:02<00:00, 27.41it/s]
100%|██████████| 64/64 [00:02<00:00, 28.58it/s]


### Create embeddings for validation dataset

In [80]:

validation_file_path = os.path.join(embeddings_folder_path, 'beanleaf_validation.csv')

# The validation partition is selected by position in the partition listing.
validation_dataset_folder = os.path.join(beandataset_folder, beanleaf_data_partition_folders[2])
validation_dataset_classes_folder = [
    i for i in os.listdir(validation_dataset_folder) if i not in files_to_remove
]

# Open the CSV once, outside the class loop. Mode 'w' truncates the file, so
# opening it per class left only the last class's rows in the output.
with open(validation_file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    # write the header
    writer.writerow(header)

    for validation_class in validation_dataset_classes_folder:
        fullpath_with_class = os.path.join(validation_dataset_folder, validation_class)
        validation_subfolders, validation_files = run_scandir_with_exclusion(fullpath_with_class, files_to_remove)
        # The folder name is used as both the actual and the predicted label.
        write_to_csv(validation_files, writer, validation_class, validation_class, now_ts)

### Create embeddings for test dataset (treated as the production dataset)

In [81]:
# Fresh timestamp (float seconds) captured when this cell runs, for the test
# dataset export.
now_ts = datetime.now().timestamp()

In [82]:
test_file_path = os.path.join(embeddings_folder_path, 'beanleaf_test.csv')

# The test partition is selected by position in the partition listing.
test_dataset_folder = os.path.join(beandataset_folder, beanleaf_data_partition_folders[0])
test_dataset_classes_folder = [
    i for i in os.listdir(test_dataset_folder) if i not in files_to_remove
]

# Timestamp taken immediately before the export so every row shares it.
now_ts = datetime.timestamp(datetime.now())

# Open the CSV once, outside the class loop. Mode 'w' truncates the file, so
# opening it per class left only the last class's rows in the output.
with open(test_file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    # write the header
    writer.writerow(header)

    for test_class in test_dataset_classes_folder:
        fullpath_with_class = os.path.join(test_dataset_folder, test_class)
        test_subfolders, test_files = run_scandir_with_exclusion(fullpath_with_class, files_to_remove)
        # random_prediction=True replaces the predicted label of every row
        # with a random class id. No seed is set, so the output differs
        # between runs.
        write_to_csv(test_files, writer, test_class, test_class, now_ts, random_prediction=True)