# Imports

In [19]:
import numpy as np
import os
import json
import cv2

# Helper Functions

In [20]:
# define a function to generate the ML subsets
def generate_splits(classes, directory, split_file="../data/processed/split_definition.json",
                    train_frac=0.6, val_frac=0.2, seed=None):
    """Build (or reload) the train / validation / test file lists.

    If ``split_file`` already exists, the three subsets are read from it and
    nothing is regenerated (``train_frac``, ``val_frac`` and ``seed`` are then
    not used). Otherwise the entries of every class folder are shuffled, cut
    into train / validation / test portions, shuffled again across classes,
    and written to ``split_file`` as JSON.

    Parameters
    ----------
    classes : iterable of str
        Names of the class sub-folders located inside ``directory``.
    directory : str
        Folder that contains one sub-folder per class.
    split_file : str, optional
        Path of the JSON file used to cache the split definition.
    train_frac : float, optional
        Fraction of each class assigned to the train subset (default 0.6).
    val_frac : float, optional
        Fraction of each class assigned to the validation subset
        (default 0.2). Whatever remains goes to the test subset.
    seed : int or None, optional
        When given, a dedicated ``np.random.RandomState(seed)`` drives the
        shuffling so the generated split is repeatable. When ``None`` the
        global ``np.random`` state is used, as before.

    Returns
    -------
    tuple
        ``(train_files, val_files, test_files)``; each is a list of
        ``(class_name, file_name)`` tuples.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    # small tolerance so sums such as 0.7 + 0.3 are not rejected by rounding
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError("train_frac and val_frac must be non-negative and sum to at most 1")

    # check if split definition file exists
    if os.path.exists(split_file):
        # load split definition file
        with open(split_file, 'r') as file:
            split_data = json.load(file)
        # JSON has no tuple type, so the stored pairs come back as lists;
        # convert them so both branches return the same element type
        train_files = [tuple(entry) for entry in split_data['Train']]
        val_files = [tuple(entry) for entry in split_data['Validation']]
        test_files = [tuple(entry) for entry in split_data['Test']]
        print("train/validation/test subsets were loaded from a pre-generated file")

    else:
        # initialize empty lists to store train, validation, and test filenames
        train_files, val_files, test_files = [], [], []
        # np.random (module) and RandomState both expose an in-place shuffle()
        rng = np.random if seed is None else np.random.RandomState(seed)

        # iterate over each class
        for class_name in classes:
            # get the directory path for the current class
            class_dir = os.path.join(directory, class_name)
            # os.listdir returns entries in arbitrary order; sort first so the
            # shuffle result depends only on the random state
            files = sorted(os.listdir(class_dir))
            # shuffle the list of files
            rng.shuffle(files)
            # calculate split points
            total_files = len(files)
            train_split = int(total_files * train_frac)
            val_split = int(total_files * val_frac)
            # assign files to train, validation, and test sets
            train_files.extend((class_name, file) for file in files[:train_split])
            val_files.extend((class_name, file) for file in files[train_split:train_split + val_split])
            test_files.extend((class_name, file) for file in files[train_split + val_split:])

        # shuffle the train, validation, and test sets so classes are interleaved
        rng.shuffle(train_files)
        rng.shuffle(val_files)
        rng.shuffle(test_files)

        # make sure the destination folder exists before writing
        split_dir = os.path.dirname(split_file)
        if split_dir:
            os.makedirs(split_dir, exist_ok=True)

        # save split definition to a json file
        with open(split_file, 'w') as file:
            json.dump({'Train': train_files, 'Validation': val_files, 'Test': test_files}, file)
        print("train/validation/test subsets were generated and saved to a file")

    # display the number of files in each set
    print("\tNumber of train files:", len(train_files))
    print("\tNumber of val files:", len(val_files))
    print("\tNumber of test files:", len(test_files))

    return train_files, val_files, test_files

In [21]:
# define a function that determines the mean and standard deviation of each RGB and L*a*b*
# color-space channel for an image
def compute_channel_stats(image_path):
    """Return per-channel mean and standard deviation of an image in two color spaces.

    The image is read with ``cv2.imread`` and also converted to L*a*b* using
    ``cv2.COLOR_BGR2LAB``. Statistics are taken with ``cv2.meanStdDev`` on
    both versions and concatenated into one flat vector laid out as::

        [means of the as-read channels, stds of the as-read channels,
         means of L*, a*, b*,           stds of L*, a*, b*]

    NOTE: OpenCV stores color images in B, G, R order, so the "RGB" part of
    the vector is ordered blue, green, red. The order is left as-is so that
    previously computed feature vectors stay comparable.

    Parameters
    ----------
    image_path : str
        Path of the image file to analyse.

    Returns
    -------
    numpy.ndarray
        1-D array holding the concatenated statistics.

    Raises
    ------
    OSError
        If the image cannot be read (missing file or undecodable content).
    """
    # read the image; cv2.imread signals failure by returning None rather than raising
    img = cv2.imread(image_path)
    if img is None:
        raise OSError("could not read image file: {}".format(image_path))

    # convert image to L*a*b* color space
    lab_img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)

    # compute mean and standard deviation for each color channel (B, G, R and L*a*b*)
    mean_bgr, std_bgr = cv2.meanStdDev(img)
    mean_lab, std_lab = cv2.meanStdDev(lab_img)

    # flatten the results into a feature vector
    channel_stats = np.concatenate(
        (mean_bgr.flatten(), std_bgr.flatten(), mean_lab.flatten(), std_lab.flatten())
    )

    return channel_stats

In [22]:
# define a function that loops through each file to generate a dictionary that contains
# the feature vectors of all images in each class
def generate_feature_vectors(files, directory):
    """Group the channel statistics of every listed image by class.

    Parameters
    ----------
    files : iterable of (class_name, file_name) pairs
        Images to process; each pair locates a file at
        ``directory/class_name/file_name``.
    directory : str
        Folder that contains one sub-folder per class.

    Returns
    -------
    dict
        Maps each class name to a list with one ``compute_channel_stats``
        result per image, in the order the images appear in ``files``.
    """
    vectors_by_class = {}

    for label, image_name in files:
        # build the image location and compute its color statistics
        image_path = os.path.join(directory, label, image_name)
        image_stats = compute_channel_stats(image_path)

        # start a new list the first time a class is seen, then add to it
        vectors_by_class.setdefault(label, []).append(image_stats)

        # TODO:  add additional features to the feature_vector (HOG, SIFT, others?)

    return vectors_by_class

# Import data

In [23]:
# root folder that holds one sub-folder of images per class
directory = '../data/interim/PatternNet/PatternNet/images'

# classes considered for this project (each is a sub-folder of `directory`)
classes = [
    'beach',
    'chaparral',
    'dense_residential',
    'forest',
    'freeway',
    'harbor',
    'overpass',
    'parking_space',
    'river',
    'swimming_pool',
]

# build (or reload from the cached split file) the train, val, and test sets
train_files, val_files, test_files = generate_splits(classes, directory)

train/validation/test subsets were loaded from a pre-generated file
	Number of train files: 4799
	Number of val files: 1599
	Number of test files: 1601


# Generate Feature Vectors

In [24]:
# compute the feature vectors of the training images, grouped by class
feature_vectors = generate_feature_vectors(files=train_files, directory=directory)

In [30]:
# inspections: container type, class keys, then type / size / first vector of one class

print(
    type(feature_vectors),
    feature_vectors.keys(),
    type(feature_vectors['beach']),
    len(feature_vectors['beach']),
    feature_vectors['beach'][0],
    sep='\n',
)

<class 'dict'>
dict_keys(['parking_space', 'beach', 'forest', 'overpass', 'river', 'dense_residential', 'swimming_pool', 'chaparral', 'freeway', 'harbor'])
<class 'list'>
479
[113.95576477 127.46353149 119.84751892  22.51988622  21.80362058
  22.92205582 133.3782196  122.48179626 134.45556641  21.85814155
   1.91485269   1.31443199]
