# Imports

In [None]:
# imports

import os
import numpy as np
import random
import pandas as pd
import json
import pickle
import matplotlib.pyplot as plt
# Important NOTE: Use opencv >=4.4 
import cv2

# Helper Functions

In [None]:
# define a function to generate the ML subsets
def generate_splits(classes, directory, split_file="../data/processed/split_definition.json"):
    """Return the train/validation/test file subsets, creating them if needed.

    If ``split_file`` already exists, the subsets are read back from it.
    Otherwise the entries of each class folder are shuffled and divided
    60% / 20% / remainder into train / validation / test, and the result is
    written to ``split_file`` as JSON so later runs reuse the same split.

    Args:
        classes: Names of the class sub-folders of ``directory`` to include.
        directory: Folder that contains one sub-folder per class.
        split_file: Path of the JSON file used to persist the split.

    Returns:
        Three lists ``(train_files, val_files, test_files)``; every element
        is a ``(class_name, file_name)`` tuple.
    """
    if os.path.exists(split_file):
        print("train/validation/test subsets were loaded from a pre-generated file")
        with open(split_file, 'r') as file:
            split_data = json.load(file)
        # JSON has no tuple type, so entries come back as 2-element lists;
        # convert them so both branches return the same element type
        train_files = [tuple(entry) for entry in split_data['Train']]
        val_files = [tuple(entry) for entry in split_data['Validation']]
        test_files = [tuple(entry) for entry in split_data['Test']]

    else:
        print("train/validation/test subsets were generated and saved to a file")
        train_files, val_files, test_files = [], [], []

        for class_name in classes:
            class_dir = os.path.join(directory, class_name)
            # os.listdir returns entries in arbitrary order; sort first so
            # that a seeded NumPy RNG yields the same split on every machine
            files = sorted(os.listdir(class_dir))
            np.random.shuffle(files)

            # split points: 60% train, 20% validation, the rest is test
            total_files = len(files)
            train_split = int(total_files * 0.6)
            val_split = int(total_files * 0.2)
            val_end = train_split + val_split

            train_files.extend((class_name, name) for name in files[:train_split])
            val_files.extend((class_name, name) for name in files[train_split:val_end])
            test_files.extend((class_name, name) for name in files[val_end:])

        # mix the classes within each subset
        np.random.shuffle(train_files)
        np.random.shuffle(val_files)
        np.random.shuffle(test_files)

        # make sure the target folder exists before writing the definition
        split_dir = os.path.dirname(split_file)
        if split_dir:
            os.makedirs(split_dir, exist_ok=True)
        with open(split_file, 'w') as file:
            json.dump({'Train': train_files, 'Validation': val_files, 'Test': test_files}, file)

    # display the number of files in each set
    print("\tNumber of train files:", len(train_files))
    print("\tNumber of val files:", len(val_files))
    print("\tNumber of test files:", len(test_files))

    return train_files, val_files, test_files

# Import and split data

In [None]:
# root folder holding one sub-folder per class
directory = '../data/interim/PatternNet/images'

# every sub-folder found under the root is an available class
all_classes = [
    entry
    for entry in os.listdir(directory)
    if os.path.isdir(os.path.join(directory, entry))
]

# subset of classes evaluated in this project
classes = ['beach', 'chaparral', 'dense_residential', 'forest', 'freeway', 'harbor', 'overpass', 'parking_space', 'river', 'swimming_pool']

# tabulate how many entries each selected class folder contains
data = [
    [name, len(os.listdir(os.path.join(directory, name)))]
    for name in classes
]
image_count_df = pd.DataFrame(data, columns=['Class', 'Total Image Count'])
print("The following classes were selected for evaluation:")
display(image_count_df)

# build (or reload) the train, val, and test subsets
train_files, val_files, test_files = generate_splits(classes, directory)

# Investigate basics

# Inspect examples

In [None]:
def extract_key_points(img):
    """Detect SIFT keypoints in an image and return a rendering of them.

    The image is converted to grayscale, SIFT keypoints are detected on it,
    and the keypoints are drawn on the grayscale image as "rich" keypoints
    (circles reflecting size and orientation).

    Args:
        img: Colour image array accepted by ``cv2.cvtColor``.

    Returns:
        A new image array with the keypoints drawn; ``img`` itself is left
        untouched.
    """
    # NOTE(review): COLOR_BGR2GRAY assumes BGR channel order (as produced by
    # cv2.imread). generate_sift in this notebook loads images with
    # plt.imread, which yields RGB -- confirm which conversion is intended.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Applying SIFT detector
    sift = cv2.SIFT_create()
    kp = sift.detect(gray, None)

    # Pass None as the output image so OpenCV allocates a fresh array;
    # handing it `img` would overwrite the caller's image in place.
    sift_img = cv2.drawKeypoints(gray, kp, None,
                                 flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)

    return sift_img


In [None]:
# define a function to show a grid of SIFT image in a directory (given a file subset)
def generate_sift(files, directory, images_per_class=3):
    """Show a grid of randomly chosen images per class with SIFT keypoints.

    Each row of the figure belongs to one class: the first column shows the
    class name and the following columns show images of that class after
    being passed through ``extract_key_points``.

    Args:
        files: Iterable of ``(class_name, file_name)`` pairs.
        directory: Folder that contains one sub-folder per class.
        images_per_class: Number of image columns per row. A class with
            fewer files simply leaves its remaining cells blank.

    Returns:
        None. The figure is displayed with ``plt.show()``. Nothing is drawn
        when ``files`` is empty.
    """
    # group file names by class (rows keep first-appearance order); only the
    # images that end up being displayed are read from disk
    names_by_class = {}
    for class_name, file_name in files:
        names_by_class.setdefault(class_name, []).append(file_name)

    num_classes = len(names_by_class)
    if num_classes == 0:
        # plt.subplots cannot build a grid with zero rows
        return

    # squeeze=False keeps `axes` two-dimensional even for a single class
    _, axes = plt.subplots(num_classes, images_per_class + 1,
                           figsize=(12, 3*num_classes), squeeze=False)
    for i, (class_name, file_names) in enumerate(names_by_class.items()):
        # display class name in the first column
        axes[i, 0].text(0.5, 0.5, class_name, fontsize=16, ha='center', va='center')
        axes[i, 0].axis('off')

        # pick random files; slicing tolerates classes with too few files
        random.shuffle(file_names)
        chosen = file_names[:images_per_class]
        for j, file_name in enumerate(chosen):
            img = plt.imread(os.path.join(directory, class_name, file_name))
            axes[i, j+1].imshow(extract_key_points(img))

        # hide cells for which no image was available
        for j in range(len(chosen), images_per_class):
            axes[i, j+1].axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# show a grid with 1 random image per class, annotated with SIFT keypoints
generate_sift(train_files, directory, images_per_class=1)