# Imports

In [1]:
import io
import json
import os
import tarfile

import cv2
import numpy as np
from skimage.feature import hog

# Helper Functions

In [2]:
# define a function to generate the ML subsets
def generate_splits(classes, directory, split_file="../data/processed/split_definition.json",
                    train_frac=0.6, val_frac=0.2, seed=None):
    """Build (or reload) the train/validation/test file lists for the given classes.

    If ``split_file`` already exists, the split stored in it is returned and
    ``classes``, ``directory``, ``train_frac``, ``val_frac`` and ``seed`` are
    ignored. Otherwise every class folder ``directory/<class_name>`` is listed,
    shuffled, cut into three parts, and the result is written to ``split_file``.

    Parameters
    ----------
    classes : iterable of str
        Names of the class sub-folders inside ``directory``.
    directory : str
        Folder that contains one sub-folder per class.
    split_file : str
        JSON file used to persist the split (keys 'Train', 'Validation', 'Test').
    train_frac, val_frac : float
        Fraction of each class assigned to the train and validation subsets;
        the remainder goes to the test subset.
    seed : int or None
        When given, a dedicated ``np.random.RandomState(seed)`` drives the
        shuffling so the split is reproducible. When None, the global
        ``np.random`` state is used (the historical behavior).

    Returns
    -------
    tuple of three lists
        ``(train_files, val_files, test_files)``; every entry is a
        ``(class_name, file_name)`` tuple.

    Raises
    ------
    ValueError
        If a new split has to be generated and the fractions are negative or
        sum to more than 1.
    """
    if os.path.exists(split_file):
        print("train/validation/test subsets were loaded from a pre-generated file")
        with open(split_file, 'r') as file:
            split_data = json.load(file)
        # JSON has no tuple type, so entries come back as 2-element lists;
        # convert them so both code paths return the same kind of entry
        train_files = [tuple(entry) for entry in split_data['Train']]
        val_files = [tuple(entry) for entry in split_data['Validation']]
        test_files = [tuple(entry) for entry in split_data['Test']]

    else:
        if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1:
            raise ValueError(
                "train_frac and val_frac must be non-negative and sum to at most 1 "
                "(got {!r} and {!r})".format(train_frac, val_frac)
            )

        print("train/validation/test subsets were generated and saved to a file")
        train_files, val_files, test_files = [], [], []

        # a private generator keeps a seeded split independent of other random calls
        shuffler = np.random if seed is None else np.random.RandomState(seed)

        for class_name in classes:
            class_dir = os.path.join(directory, class_name)
            # os.listdir order is arbitrary; sort first so a fixed seed always
            # produces the same permutation
            files = sorted(os.listdir(class_dir))
            shuffler.shuffle(files)

            # split points for this class
            total_files = len(files)
            train_split = int(total_files * train_frac)
            val_split = int(total_files * val_frac)

            train_files.extend((class_name, file) for file in files[:train_split])
            val_files.extend((class_name, file) for file in files[train_split:train_split + val_split])
            test_files.extend((class_name, file) for file in files[train_split + val_split:])

        # mix the classes inside each subset
        shuffler.shuffle(train_files)
        shuffler.shuffle(val_files)
        shuffler.shuffle(test_files)

        # make sure the target folder exists before writing the split definition
        split_dir = os.path.dirname(split_file)
        if split_dir:
            os.makedirs(split_dir, exist_ok=True)
        with open(split_file, 'w') as file:
            json.dump({'Train': train_files, 'Validation': val_files, 'Test': test_files}, file)

    # display the number of files in each set
    print("\tNumber of train files:", len(train_files))
    print("\tNumber of val files:", len(val_files))
    print("\tNumber of test files:", len(test_files))

    return train_files, val_files, test_files

In [3]:
# define a function that determines the mean and standard deviation of each RGB and L*a*b*
# color-space channel for an image
def compute_channel_stats(image_path):
    """Return per-channel mean and standard deviation of an image in two color spaces.

    The image is read with ``cv2.imread`` and converted to L*a*b* with
    ``cv2.COLOR_BGR2LAB``. The statistics of the image as loaded and of its
    L*a*b* conversion are concatenated into one 1-D vector in this order:
    means of the loaded channels, standard deviations of the loaded channels,
    L*a*b* means, L*a*b* standard deviations.

    Note: OpenCV loads color images in B, G, R channel order, so the "RGB"
    statistics are ordered blue, green, red.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.

    Returns
    -------
    numpy.ndarray
        Flat feature vector of channel statistics.

    Raises
    ------
    OSError
        If ``cv2.imread`` cannot read the file (it returns None instead of raising).
    """
    img = cv2.imread(image_path)
    # imread signals failure by returning None; fail here with a useful message
    # instead of letting cvtColor raise an opaque assertion error
    if img is None:
        raise OSError("could not read image (missing or unsupported file): {!r}".format(image_path))

    # convert image to L*a*b* color space
    lab_img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)

    # per-channel mean and standard deviation in both color spaces
    mean_bgr, std_bgr = cv2.meanStdDev(img)
    mean_lab, std_lab = cv2.meanStdDev(lab_img)

    # flatten the results into a single feature vector
    channel_stats = np.concatenate(
        (mean_bgr.flatten(), std_bgr.flatten(), mean_lab.flatten(), std_lab.flatten())
    )

    return channel_stats

In [4]:
# define a function that determines the hog descriptors for an image's grayscale representation
def compute_hog_stats(image_path):
    """Compute HOG descriptors for the grayscale version of an image.

    The image is read with ``cv2.imread``, converted to grayscale, and passed to
    ``skimage.feature.hog`` with 4 orientations and 32x32-pixel cells.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.

    Returns
    -------
    tuple
        ``(fd, hog_image)`` exactly as returned by ``hog(..., visualize=True)``:
        the descriptor vector and the HOG visualization image.

    Raises
    ------
    OSError
        If ``cv2.imread`` cannot read the file (it returns None instead of raising).
    """
    img = cv2.imread(image_path)
    # imread signals failure by returning None; fail here with a useful message
    # instead of letting cvtColor raise an opaque assertion error
    if img is None:
        raise OSError("could not read image (missing or unsupported file): {!r}".format(image_path))

    # convert image to grayscale
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # compute HOG features together with their visualization
    fd, hog_image = hog(gray_img, orientations=4, pixels_per_cell=(32, 32), visualize=True)

    return fd, hog_image

In [5]:
# define a function that loops through each file to generate a dictionary that contains
# the feature vectors of all images in each class
def generate_feature_vectors(files, directory):
    """Compute one feature vector and one HOG image per file, grouped by class.

    Parameters
    ----------
    files : iterable of (class_name, file_name) pairs
        Images to process; each image is read from
        ``directory/<class_name>/<file_name>``.
    directory : str
        Folder that contains the class sub-folders.

    Returns
    -------
    tuple of two dicts
        ``(feature_vectors, hog_images)``. Both map a class name to a list, in
        the order the files were given: the first holds the concatenation of
        the color statistics and the HOG descriptor of each image, the second
        holds the HOG visualization of each image.
    """
    vectors_by_class = {}
    hog_by_class = {}

    for label, name in files:
        path = os.path.join(directory, label, name)

        # color statistics followed by HOG descriptor form the feature vector
        color_part = compute_channel_stats(path)
        hog_part, hog_picture = compute_hog_stats(path)
        combined = np.concatenate((color_part, hog_part))

        # setdefault creates the per-class list the first time a class is seen
        vectors_by_class.setdefault(label, []).append(combined)
        hog_by_class.setdefault(label, []).append(hog_picture)

        # TODO:  add additional features to the feature_vector (SIFT, spatial frequencies, texture, others?)

    return vectors_by_class, hog_by_class

In [6]:
# define a function to save the feature vector dictionary to disk
def save_feature_vectors(feature_vectors, filename):
    """Save a ``{class_name: [vector, ...]}`` dictionary as a gzip-compressed tar archive.

    The dictionary is serialized to JSON and stored as the single member of the
    archive. The member is named after the archive's base name with a trailing
    '.tar.gz' (or '.tgz') replaced by '.json'; for any other name, '.json' is
    appended.

    The JSON payload is built in memory, so no temporary file is written next to
    the archive. This also avoids the previous failure mode where a ``filename``
    without '.tar.gz' made the temporary file and the archive the same path,
    which ended with the archive being deleted.

    Parameters
    ----------
    feature_vectors : dict
        Maps each class name to an iterable of vectors (numpy arrays or
        anything ``np.asarray`` accepts).
    filename : str
        Path of the archive to create; an existing file is overwritten.
    """
    # convert every vector to plain Python lists so it is JSON-serializable
    serializable = {
        class_name: [np.asarray(vector).tolist() for vector in vectors]
        for class_name, vectors in feature_vectors.items()
    }

    # derive the member name from the archive's base name only, so directory
    # components that happen to contain '.tar.gz' are never touched
    stem = os.path.basename(filename)
    for suffix in ('.tar.gz', '.tgz'):
        if stem.endswith(suffix):
            stem = stem[:-len(suffix)]
            break
    member_name = stem + '.json'

    payload = json.dumps(serializable).encode('utf-8')
    member = tarfile.TarInfo(name=member_name)
    member.size = len(payload)  # addfile reads exactly this many bytes

    with tarfile.open(filename, 'w:gz') as tar:
        tar.addfile(member, io.BytesIO(payload))

In [7]:
# define a function to load the feature vector dictionary from disk
def load_feature_vectors(filename):
    """Load a ``{class_name: [vector, ...]}`` dictionary from a gzip-compressed tar archive.

    The first regular file in the archive is parsed as JSON and every stored
    list is converted back to a numpy array.

    The member is read directly from the archive, so nothing is extracted to
    disk. This avoids overwriting and then deleting a same-named file in the
    current working directory, and avoids extracting untrusted member paths.

    Parameters
    ----------
    filename : str
        Path of the '.tar.gz' archive to read.

    Returns
    -------
    dict
        Maps each class name to a list of numpy arrays.

    Raises
    ------
    ValueError
        If the archive contains no regular file.
    """
    with tarfile.open(filename, 'r:gz') as tar:
        # the archive is expected to hold a single JSON file; take the first regular one
        member = next((m for m in tar.getmembers() if m.isfile()), None)
        if member is None:
            raise ValueError("no regular file found in archive: {!r}".format(filename))
        with tar.extractfile(member) as f:
            feature_vectors_dict = json.load(f)

    # convert Python lists back to numpy arrays
    feature_vectors = {}
    for class_name, vectors in feature_vectors_dict.items():
        feature_vectors[class_name] = [np.array(vector) for vector in vectors]

    return feature_vectors


# Import data

In [8]:
# folder that holds one sub-folder of images per class
directory = '../data/interim/PatternNet/PatternNet/images'

# classes considered for this project
classes = [
    'beach',
    'chaparral',
    'dense_residential',
    'forest',
    'freeway',
    'harbor',
    'overpass',
    'parking_space',
    'river',
    'swimming_pool',
]

# build (or reload) the train, val, and test sets
train_files, val_files, test_files = generate_splits(classes=classes, directory=directory)

train/validation/test subsets were loaded from a pre-generated file
	Number of train files: 4799
	Number of val files: 1599
	Number of test files: 1601


# Generate Feature Vectors

In [9]:
# build the per-class feature vectors (and HOG visualizations) for the training images
feature_vectors, hog_images = generate_feature_vectors(files=train_files, directory=directory)

In [10]:
# inspections: container type, class names, then type / count / vector length for 'beach'

print(
    type(feature_vectors),
    feature_vectors.keys(),
    type(feature_vectors['beach']),
    len(feature_vectors['beach']),
    len(feature_vectors['beach'][0]),
    sep='\n',
)

<class 'dict'>
dict_keys(['parking_space', 'beach', 'forest', 'overpass', 'river', 'dense_residential', 'swimming_pool', 'chaparral', 'freeway', 'harbor'])
<class 'list'>
479
1308


# Save data

In [11]:
# write the feature vector dictionary to a compressed archive on disk
save_feature_vectors(
    feature_vectors,
    filename="../data/processed/feature_vectors_1.tar.gz",
)

# Reload saved feature vectors

In [12]:
# read the feature vector dictionary back from the compressed archive
feature_vectors = load_feature_vectors(
    filename="../data/processed/feature_vectors_1.tar.gz",
)

In [13]:
# inspections: the reloaded dictionary should report the same values as before saving

print(
    type(feature_vectors),
    feature_vectors.keys(),
    type(feature_vectors['beach']),
    len(feature_vectors['beach']),
    len(feature_vectors['beach'][0]),
    sep='\n',
)

<class 'dict'>
dict_keys(['parking_space', 'beach', 'forest', 'overpass', 'river', 'dense_residential', 'swimming_pool', 'chaparral', 'freeway', 'harbor'])
<class 'list'>
479
1308


In [14]:
#