# Notebook for Feature Engineering

## Data Loading

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import cv2
import colorsys
import os
import random
from tqdm import tqdm

In [1]:
def load_dataset(data_dir, cl_limit=20):
    """Load the images of the first ``cl_limit`` classes found under ``data_dir``.

    Every sub-directory of ``data_dir`` is treated as one class and every
    readable file inside it as one image of that class.

    Args:
        data_dir: Directory containing one sub-directory per class.
        cl_limit: Maximum number of classes to load; ``None`` loads all of them.

    Returns:
        Tuple ``(X, Y, idx_to_cl)``: the array built from the images returned
        by ``cv2.imread``, the integer class index of each image, and a dict
        mapping class index to class (directory) name.
    """
    import warnings

    X = []
    Y = []
    idx_to_cl = {}

    # os.listdir yields entries in arbitrary order; sorting keeps the
    # index -> class mapping identical between runs and machines. Plain files
    # sitting next to the class folders are not classes and are ignored.
    class_names = sorted(
        name for name in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, name))
    )
    for i, cl in enumerate(class_names):
        if cl_limit is not None and i >= cl_limit:
            break

        print(cl)
        idx_to_cl[i] = cl
        cl_dir = os.path.join(data_dir, cl)
        for fname in tqdm(sorted(os.listdir(cl_dir))):
            path = os.path.join(cl_dir, fname)
            img = cv2.imread(path)
            if img is None:
                # cv2.imread signals an unreadable file by returning None; a
                # None entry would prevent X from stacking into one array.
                warnings.warn(f"skipping unreadable image: {path}")
                continue
            X.append(img)
            Y.append(i)
    return np.array(X), np.array(Y), idx_to_cl


In [3]:
# Read the images, their labels and the index -> class-name mapping from ./data.
X, Y, idx_to_cl = load_dataset(data_dir='data')

Airport


100%|██████████| 360/360 [00:05<00:00, 69.37it/s]


BareLand


100%|██████████| 310/310 [00:04<00:00, 77.21it/s] 


BaseballField


100%|██████████| 220/220 [00:02<00:00, 97.61it/s] 


Beach


100%|██████████| 400/400 [00:04<00:00, 83.37it/s] 


Bridge


100%|██████████| 360/360 [00:03<00:00, 90.56it/s] 


Center


100%|██████████| 260/260 [00:03<00:00, 68.19it/s]


Church


100%|██████████| 240/240 [00:02<00:00, 91.86it/s] 


Commercial


100%|██████████| 350/350 [00:04<00:00, 74.67it/s]


DenseResidential


100%|██████████| 410/410 [00:05<00:00, 72.07it/s]


Desert


100%|██████████| 300/300 [00:03<00:00, 76.60it/s]


Farmland


100%|██████████| 370/370 [00:04<00:00, 89.94it/s] 


Forest


100%|██████████| 250/250 [00:03<00:00, 77.42it/s]


Industrial


100%|██████████| 390/390 [00:04<00:00, 80.21it/s]


Meadow


100%|██████████| 280/280 [00:03<00:00, 90.50it/s] 


MediumResidential


100%|██████████| 290/290 [00:04<00:00, 60.37it/s]


Mountain


100%|██████████| 340/340 [00:03<00:00, 85.48it/s] 


Park


100%|██████████| 350/350 [00:04<00:00, 73.75it/s]


Parking


100%|██████████| 390/390 [00:05<00:00, 66.98it/s]


Playground


100%|██████████| 370/370 [00:04<00:00, 90.88it/s] 


Pond


100%|██████████| 420/420 [00:04<00:00, 84.85it/s] 


Port


100%|██████████| 380/380 [00:04<00:00, 94.10it/s] 


RailwayStation


100%|██████████| 260/260 [00:03<00:00, 74.42it/s]


Resort


100%|██████████| 290/290 [00:03<00:00, 80.60it/s]


River


100%|██████████| 410/410 [00:04<00:00, 91.47it/s] 


School


100%|██████████| 300/300 [00:04<00:00, 73.77it/s]


SparseResidential


100%|██████████| 300/300 [00:06<00:00, 49.20it/s]


Square


100%|██████████| 330/330 [00:04<00:00, 67.70it/s]


Stadium


100%|██████████| 290/290 [00:04<00:00, 71.24it/s]


StorageTanks


100%|██████████| 360/360 [00:05<00:00, 67.84it/s]


Viaduct


100%|██████████| 420/420 [00:05<00:00, 78.35it/s]


## Feature Extraction

### HSV features
Use the mean and variance of each of the three HSV channels as features (six values per image).

In [4]:
def extract_hsv_features(img):
    """Summarise one image by per-channel statistics in HSV colour space.

    Args:
        img: Image array; it is converted with ``cv2.COLOR_BGR2HSV``, i.e.
            interpreted as BGR.

    Returns:
        1-D array holding the per-channel means followed by the per-channel
        variances, both taken over axes 0 and 1 of the converted image.
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    spatial_axes = (0, 1)
    means = np.mean(hsv, axis=spatial_axes)
    variances = np.var(hsv, axis=spatial_axes)
    return np.concatenate((np.ravel(means), np.ravel(variances)))

### HOG vector

In [5]:
def extract_hog_features(
        img,
        cell_size=(8,8),
        block_size=(16,16),
        block_stride=(8,8),
        num_bins=10):
    """Compute a HOG descriptor with a single window covering the whole image.

    Args:
        img: Image array; it is converted to grayscale with
            ``cv2.COLOR_BGR2GRAY``, i.e. interpreted as BGR.
        cell_size: Cell size passed to ``cv2.HOGDescriptor``.
        block_size: Block size passed to ``cv2.HOGDescriptor``.
        block_stride: Block stride passed to ``cv2.HOGDescriptor``.
        num_bins: Number of orientation bins passed to ``cv2.HOGDescriptor``.

    Returns:
        Whatever ``cv2.HOGDescriptor.compute`` returns for the grayscale image.
    """
    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # A numpy shape is (rows, cols) = (height, width), whereas OpenCV sizes are
    # (width, height). Passing the shape unchanged transposes the window for
    # every non-square image.
    height, width = img_gray.shape[:2]
    hog = cv2.HOGDescriptor((width, height), block_size, block_stride, cell_size, num_bins)
    return hog.compute(img_gray)

### Dataset Load Function
Takes a list of feature functions and applies them to each image inside the loading loop, so the features are computed in the same pass that reads the images.

In [6]:
def load_dataset(data_dir, feature_functions=None, cl_limit=None):
    """Load all images under ``data_dir`` and compute their features in one pass.

    Every sub-directory of ``data_dir`` is treated as one class and every
    readable file inside it as one image of that class. Each feature function
    is called on each image as it is read, and the flattened results are
    concatenated into one feature vector per image.

    Note that every image is kept in memory and returned in ``X``.

    Args:
        data_dir: Directory containing one sub-directory per class.
        feature_functions: Iterable of callables ``func(img)`` returning an
            array-like; ``None`` or empty yields zero-length feature vectors.
        cl_limit: Optional maximum number of classes to load (useful for quick
            experiments); ``None`` loads every class.

    Returns:
        Tuple ``(X, Y, idx_to_cl, features)``: the array built from the images
        returned by ``cv2.imread``, the integer class index of each image, a
        dict mapping class index to class (directory) name, and the array of
        per-image feature vectors.
    """
    import warnings

    # Avoid a shared mutable default argument.
    feature_functions = [] if feature_functions is None else list(feature_functions)

    X = []
    Y = []
    idx_to_cl = {}
    features = []

    # os.listdir yields entries in arbitrary order; sorting keeps the
    # index -> class mapping identical between runs and machines. Plain files
    # sitting next to the class folders are not classes and are ignored.
    class_names = sorted(
        name for name in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, name))
    )
    for i, cl in enumerate(class_names):
        if cl_limit is not None and i >= cl_limit:
            break

        print(cl)
        idx_to_cl[i] = cl
        cl_dir = os.path.join(data_dir, cl)
        for fname in tqdm(sorted(os.listdir(cl_dir))):
            path = os.path.join(cl_dir, fname)
            img = cv2.imread(path)
            if img is None:
                # cv2.imread signals an unreadable file by returning None,
                # which the feature functions cannot process.
                warnings.warn(f"skipping unreadable image: {path}")
                continue
            X.append(img)
            Y.append(i)

            # Seeding with an empty float64 array keeps the result dtype and
            # the zero-length case identical to repeated np.append calls, but
            # copies the vector only once.
            parts = [np.empty(0)]
            parts.extend(np.ravel(func(img)) for func in feature_functions)
            features.append(np.concatenate(parts))

    return np.array(X), np.array(Y), idx_to_cl, np.array(features)


In [11]:
# Load the dataset again, this time computing the HSV statistics and the HOG
# descriptor of every image while it is being read.
X, Y, idx_to_cl, features = load_dataset(
    'data',
    feature_functions=[extract_hsv_features, extract_hog_features],
)

Airport


100%|██████████| 360/360 [00:15<00:00, 22.70it/s]


BareLand


100%|██████████| 310/310 [00:12<00:00, 24.63it/s]


BaseballField


100%|██████████| 220/220 [00:08<00:00, 26.24it/s]


Beach


100%|██████████| 400/400 [00:15<00:00, 25.75it/s]


Bridge


100%|██████████| 360/360 [00:14<00:00, 25.05it/s]


Center


100%|██████████| 260/260 [00:11<00:00, 22.73it/s]


Church


100%|██████████| 240/240 [00:09<00:00, 24.63it/s]


Commercial


100%|██████████| 350/350 [00:14<00:00, 23.48it/s]


DenseResidential


100%|██████████| 410/410 [00:17<00:00, 22.85it/s]


Desert


100%|██████████| 300/300 [00:12<00:00, 23.46it/s]


Farmland


100%|██████████| 370/370 [00:15<00:00, 24.35it/s]


Forest


100%|██████████| 250/250 [00:10<00:00, 23.74it/s]


Industrial


100%|██████████| 390/390 [00:16<00:00, 23.52it/s]


Meadow


100%|██████████| 280/280 [00:11<00:00, 24.39it/s]


MediumResidential


100%|██████████| 290/290 [00:13<00:00, 21.37it/s]


Mountain


100%|██████████| 340/340 [00:14<00:00, 23.50it/s]


Park


100%|██████████| 350/350 [00:15<00:00, 22.97it/s]


Parking


100%|██████████| 390/390 [00:18<00:00, 21.21it/s]


Playground


100%|██████████| 370/370 [00:15<00:00, 23.29it/s]


Pond


100%|██████████| 420/420 [00:18<00:00, 23.29it/s]


Port


100%|██████████| 380/380 [00:20<00:00, 18.37it/s]


RailwayStation


100%|██████████| 260/260 [00:15<00:00, 16.59it/s]


Resort


100%|██████████| 290/290 [00:14<00:00, 19.56it/s]


River


100%|██████████| 410/410 [00:19<00:00, 20.84it/s]


School


100%|██████████| 300/300 [00:13<00:00, 22.49it/s]


SparseResidential


100%|██████████| 300/300 [00:25<00:00, 11.77it/s]


Square


100%|██████████| 330/330 [00:50<00:00,  6.48it/s]


Stadium


100%|██████████| 290/290 [00:44<00:00,  6.54it/s]


StorageTanks


100%|██████████| 360/360 [02:16<00:00,  2.63it/s]


Viaduct


100%|██████████| 420/420 [02:36<00:00,  2.69it/s]


### Orchestration

In [10]:
# Shapes of the image array, the label vector and the feature matrix, one per line.
print(X.shape, Y.shape, features.shape, sep="\n")

(10000, 600, 600, 3)
(10000,)
(10000, 0)


In [29]:
# `data` is not defined anywhere in this notebook (left over from an earlier
# draft), so pick the class out of the arrays returned by load_dataset instead.
airport_idx = next(i for i, cl in idx_to_cl.items() if cl == 'Airport')
X[Y == airport_idx].shape

(360, 600, 600, 3)

In [31]:
# `data` is not defined anywhere in this notebook (left over from an earlier
# draft), so select both classes with a mask over the label vector instead.
pair_idx = [i for i, cl in idx_to_cl.items() if cl in ('Airport', 'BareLand')]
X[np.isin(Y, pair_idx)].shape

(670, 600, 600, 3)

In [None]:
# extract_hsv_features takes a single image and returns its feature vector, so
# map it over the loaded images. A separate name is used so the feature matrix
# returned by load_dataset is not overwritten.
hsv_features = np.array([extract_hsv_features(img) for img in X])
hsv_features.shape
