# Imports

In [None]:
import numpy as np
import os
import json
import pickle
import tarfile
import datetime
import cv2
from skimage.feature import hog
from skimage.feature import graycomatrix, graycoprops
#from skimage.feature import greycomatrix, greycoprops
from skimage import data, exposure
import matplotlib.pyplot as plt
import warnings
from utils import *

In [None]:
# Install a blanket "ignore" filter so warnings issued from here on are not shown.
# NOTE(review): this also hides deprecation warnings from the imaging libraries.
warnings.filterwarnings("ignore")

# Define Helper Functions

In [None]:
# define a function that determines the mean and standard deviation of each RGB
# color-space channel for an image
def compute_channel_stats(image_path):
    """Return the per-channel mean and standard deviation of an image in RGB order.

    Args:
        image_path: Path of the image file to read.

    Returns:
        1-D numpy array of six values laid out as
        ``[r_mean, g_mean, b_mean, r_std, g_std, b_std]``.

    Raises:
        OSError: If OpenCV cannot read the image.
    """
    # cv2.imread reports failure by returning None instead of raising
    img = cv2.imread(image_path)
    if img is None:
        raise OSError(f"Could not read image: {image_path}")

    # OpenCV loads color images in B, G, R channel order
    mean_bgr, std_bgr = cv2.meanStdDev(img)

    # reverse each vector so the statistics line up with the r/g/b feature names
    channel_stats = np.concatenate(
        (mean_bgr.flatten()[::-1], std_bgr.flatten()[::-1])
    )

    return channel_stats

In [None]:
# define a function that computes the per-channel intensity histograms of a
# single image in the requested color space
def compute_channel_distributions(image_path, bins=20, channels='hsv'):
    """Return the concatenated per-channel histograms of an image.

    Args:
        image_path: Path of the image file to read.
        bins: Number of histogram bins per channel.
        channels: Color space to use: ``'rgb'``, ``'lab'`` or ``'hsv'``.

    Returns:
        Tuple ``(ch_distributions, feature_names)`` where ``ch_distributions``
        is a 1-D array of ``3 * bins`` counts (channel 1, then 2, then 3) and
        ``feature_names`` is the matching list of ``"<channel>_<bin>"`` labels
        with bins numbered from 1.

    Raises:
        ValueError: If ``channels`` is not one of the supported color spaces.
    """
    channel_labels = {
        'rgb': ('r', 'g', 'b'),        # Red, Green, Blue
        'lab': ('L*', 'a*', 'b*'),
        'hsv': ('h', 's', 'v'),        # Hue, Saturation, Value
    }
    # fail early with a clear message instead of a NameError further down
    if channels not in channel_labels:
        raise ValueError(
            f"Unsupported channels {channels!r}; expected one of {sorted(channel_labels)}"
        )
    labels = channel_labels[channels]

    # Load the image; matplotlib returns the channels in R, G, B order
    img = plt.imread(image_path)
    # convert with the RGB2* codes: the BGR2* codes would swap red and blue
    if channels == 'lab':
        img = cv2.cvtColor(img, cv2.COLOR_RGB2LAB)  # convert image to L*a*b* color space
    elif channels == 'hsv':
        img = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)  # convert image to HSV color space

    # calculate histograms for each color channel
    # NOTE(review): the fixed (0, 255) range assumes 8-bit pixel values; OpenCV's
    # 8-bit hue channel only spans 0-179, so its upper bins stay empty -- confirm
    # this is intended.
    histograms = [
        np.histogram(img[:, :, idx], bins=bins, range=(0, 255))[0]
        for idx in range(3)
    ]

    # generate a vector that concatenates all 3 channel distributions
    ch_distributions = np.concatenate(histograms)

    # generate a list of feature names, in the same order as the counts
    feature_names = [
        f"{label}_{i}" for label in labels for i in range(1, bins + 1)
    ]

    return ch_distributions, feature_names

In [None]:
# define a function that determines the hog descriptors for an image's grayscale representation
def compute_hog_stats(image_path):
    """Return the HOG descriptor of an image's grayscale representation.

    Args:
        image_path: Path of the image file to read.

    Returns:
        1-D feature vector produced by ``skimage.feature.hog`` with
        4 orientations and 32x32-pixel cells.

    Raises:
        OSError: If OpenCV cannot read the image.
    """
    # cv2.imread reports failure by returning None instead of raising
    img = cv2.imread(image_path)
    if img is None:
        raise OSError(f"Could not read image: {image_path}")

    # convert image to grayscale
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # compute HOG features as a single flattened vector
    fd = hog(gray_img, orientations=4, pixels_per_cell=(32, 32), feature_vector=True)

    return fd

In [None]:
def compute_glcms(image_path):
    """Return gray-level co-occurrence matrix (GLCM) texture features of an image.

    The GLCM is computed on the grayscale image at distance 1 for four angles
    (0, 45, 90 and 135 degrees); each property is averaged over those angles.

    Args:
        image_path: Path of the image file to read.

    Returns:
        List ``[contrast, dissimilarity, homogeneity, energy, correlation]``.

    Raises:
        OSError: If OpenCV cannot read the image.
    """
    # cv2.imread reports failure by returning None instead of raising
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise OSError(f"Could not read image: {image_path}")

    # distance between pixels
    distances = [1]
    # angles for texture computation
    angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]
    glcm = graycomatrix(img, distances, angles, symmetric=True, normed=True)

    # order matters: it must match the texture feature names used downstream
    properties = ('contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation')

    # average every property over all distance/angle combinations
    return [graycoprops(glcm, prop).mean() for prop in properties]

In [None]:
def compute_fft(image_path, filt, freq_bins=25):
    """Return a histogram of the log-magnitude frequency spectrum of an image.

    The image is read as grayscale, passed through ``apply_gaussian_filter``
    with ``filt`` and then through ``fft_image``; both helpers are defined
    outside this block (presumably in ``utils`` -- verify).

    Args:
        image_path: Path of the image file to read.
        filt: Filter handed to ``apply_gaussian_filter``.
        freq_bins: Number of histogram bins (defaults to 25).

    Returns:
        1-D array with ``freq_bins`` counts.

    Raises:
        OSError: If OpenCV cannot read the image.
    """
    # cv2.imread reports failure by returning None instead of raising
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise OSError(f"Could not read image: {image_path}")

    filtered_image = apply_gaussian_filter(img, filt)
    magnitude_spectrum = fft_image(filtered_image)
    # log(1 + x) compresses the dynamic range and stays finite where x is zero
    spec = np.log(1+magnitude_spectrum).ravel()

    # We don't particularly care about the exact bin boundaries.
    # Just need the distribution of freq spectrum.
    # NOTE(review): without an explicit range the bin edges follow each image's
    # own min/max, so bin i is not the same frequency band across images.
    hist, _ = np.histogram(spec, bins=freq_bins)

    return hist

In [None]:
# define a function that loops through each file to generate a dictionary that contains
# the feature vectors of all images in each class
def generate_feature_vectors(files, directory):
    """Build the combined feature vector of every image, grouped by class.

    Args:
        files: Iterable of ``(class_name, file_name)`` pairs.
        directory: Root folder; each image is read from
            ``directory/class_name/file_name``.

    Returns:
        Tuple ``(feature_vectors, ch_dist_names, hog_feature_names,
        frq_feature_names)``. ``feature_vectors`` maps each class name to a
        list of 1-D arrays that concatenate, in order: channel statistics,
        channel distributions, HOG, GLCM and frequency-spectrum features.
        The three name lists label the channel-distribution, HOG and
        frequency features. For empty ``files`` the result is
        ``({}, [], [], [])``.
    """
    # materialize so emptiness can be checked even for one-shot iterables
    files = list(files)
    if not files:
        # with no images the per-image results below would never be bound
        return {}, [], [], []

    feature_vectors = {}

    # The 256x256 was based on EDA on the image size across the various classes.
    # The window size 25 was established by experimentation on image blurring.
    filt = create_gaussian_filter(256, 256, 25)

    # iterate over each file
    for class_name, file_name in files:
        # build the image path
        img_path = os.path.join(directory, class_name, file_name)

        # compute color statistics
        channel_stats = compute_channel_stats(img_path)

        # compute channel distributions
        channel_distributions, ch_dist_names = compute_channel_distributions(img_path)

        # compute HOG features
        hog_stats = compute_hog_stats(img_path)

        # compute GLCM texture features
        glcm_features = compute_glcms(img_path)

        # compute Freq spectrum features
        freq_features = compute_fft(img_path, filt)

        combined_features = np.concatenate(
            (channel_stats, channel_distributions, hog_stats, glcm_features, freq_features)
        )

        # append each combined_features array to the correct class in feature_vectors
        feature_vectors.setdefault(class_name, []).append(combined_features)

    # names are derived from the last processed image
    # NOTE(review): assumes every image yields vectors of the same length -- confirm
    hog_feature_names = [f"hog_{i}" for i in range(hog_stats.shape[0])]
    frq_feature_names = [f"freqbin_{i}" for i in range(freq_features.shape[0])]

    return feature_vectors, ch_dist_names, hog_feature_names, frq_feature_names

In [None]:
# define a function to save the feature vector dictionary to disk
def save_feature_data(feature_vectors, feature_names, file_directory):
    """Write the feature vectors and the feature names to ``file_directory``.

    Two files are produced:
      * ``feature_vectors_2.tar.gz`` -- a gzip tar archive holding a single
        ``feature_vectors_2.json`` member with the vectors as nested lists.
      * ``feature_names_2.pkl`` -- the pickled ``feature_names`` object.

    Args:
        feature_vectors: Mapping of class name to a sequence of feature
            vectors (numpy arrays or plain sequences of numbers).
        feature_names: Object to pickle alongside the vectors.
        file_directory: Output directory; created if it does not exist.
    """
    # make sure the output directory exists ('' means the current directory)
    if file_directory:
        os.makedirs(file_directory, exist_ok=True)

    # build both paths directly instead of string-replacing inside the full
    # path, which could also rewrite a directory component
    vectors_filename = os.path.join(file_directory, 'feature_vectors_2.tar.gz')
    json_filename = os.path.join(file_directory, 'feature_vectors_2.json')

    # convert numpy arrays to Python lists so they are JSON serializable
    feature_vectors_dict = {
        class_name: [np.asarray(vector).tolist() for vector in vectors]
        for class_name, vectors in feature_vectors.items()
    }

    try:
        # save feature_vectors_dict dictionary as JSON
        with open(json_filename, 'w') as f:
            json.dump(feature_vectors_dict, f)

        # create tar.gz file containing only the JSON file
        with tarfile.open(vectors_filename, 'w:gz') as tar:
            tar.add(json_filename, arcname=os.path.basename(json_filename))
    finally:
        # remove the temporary JSON file even when writing or archiving fails
        if os.path.exists(json_filename):
            os.remove(json_filename)

    # save names
    names_filename = os.path.join(file_directory, 'feature_names_2.pkl')
    with open(names_filename, 'wb') as f:
        pickle.dump(feature_names, f)
    
    

# Load data

In [None]:
# root folder of the image data; images are read from <directory>/<class>/<file>
directory = '../data/interim/PatternNet/PatternNet/images'

# classes considered for this project
classes = [
    'beach',
    'chaparral',
    'dense_residential',
    'forest',
    'freeway',
    'harbor',
    'overpass',
    'parking_space',
    'river',
    'swimming_pool',
]

# split the image files of those classes into train, val, and test sets
train_files, val_files, test_files = generate_splits(classes, directory)

# Generate feature data

In [None]:
# time the feature extraction over the training split
start_time = datetime.datetime.now()

# generate the set of feature vectors for all images in each class
feature_vectors, ch_dist_names, hog_feature_names, frq_feature_names = generate_feature_vectors(train_files, directory)

end_time = datetime.datetime.now()
elapsed_time = end_time - start_time
# this cell only extracts features (no model is trained), so label it as such
print("Time taken for feature extraction:", elapsed_time)


In [None]:
# inspections: sanity-check the structure of the extracted feature data
# container type returned by generate_feature_vectors
print(type(feature_vectors))
# class names that received at least one feature vector
print(feature_vectors.keys())
# per-class container type
print(type(feature_vectors['beach']))
# number of feature vectors collected for the 'beach' class
print(len(feature_vectors['beach']))
# length of a single combined feature vector
print(len(feature_vectors['beach'][0]))
# raw values of the first 'beach' feature vector
print(feature_vectors['beach'][0])

In [None]:
# assemble the complete list of feature names; the groups are joined in the
# same order in which generate_feature_vectors concatenates the features
rgb_names = [
    'r_mean', 'g_mean', 'b_mean',
    'r_std', 'g_std', 'b_std',
]  # 6
hsv_names = ch_dist_names  # 60
hog_names = hog_feature_names  # 1296
frq_names = frq_feature_names  # 25
texture_names = [
    'contrast_mean',
    'dissimilarity_mean',
    'homogeneity_mean',
    'energy_mean',
    'correlation_mean',
]  # 5
feature_names = [*rgb_names, *hsv_names, *hog_names, *texture_names, *frq_names]
print(feature_names)

# Save feature data to disk

In [None]:
# persist the feature vectors and their names under the processed-data folder
save_feature_data(
    feature_vectors=feature_vectors,
    feature_names=feature_names,
    file_directory="../data/processed/",
)