In [6]:
import os
from multiprocessing import Pool, cpu_count

import cv2
import numpy as np
import pandas as pd
from skimage import data
from skimage.feature import hog
from tqdm import tqdm

# Target size (in pixels) every image is resized to, and the channel count.
IMG_WIDTH = 256
IMG_HEIGHT = 256
IMG_CHA = 3
# Number of worker processes used by the multiprocessing pools. Capped at the
# number of CPUs reported by the OS so the notebook does not oversubscribe
# machines that have fewer than 100 cores.
CPU_USED = min(100, cpu_count())
# Seed and shared random generator used for reproducible shuffling.
RANDOM_STATE = 11
RS = np.random.RandomState(RANDOM_STATE)

In [7]:
# Show how many CPUs the OS reports, as a reference for choosing CPU_USED.
cpu_count()

160

## Read Train Val Test dataset

In [9]:
def load_image2np(img_path:str):
    """Read one image below ``../raw_data/`` and return it as a one-item batch.

    Args:
        img_path: Path of the image relative to ``../raw_data/``.

    Returns:
        numpy array with a leading batch axis of length 1, followed by the
        image resized to IMG_HEIGHT rows by IMG_WIDTH columns, as decoded by
        ``cv2.imread`` with its default flags.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file (missing, unreadable
            or not a decodable image).
    """
    full_path = f"../raw_data/{img_path}"
    img = cv2.imread(full_path)
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface as an opaque error inside cv2.resize.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {full_path}")
    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))
    # Add the batch axis so the per-image results can be concatenated on axis 0.
    return np.expand_dims(img, axis=0)

def load_data(meta_path:str):
    """Load every image listed in a split file, in parallel, with its label.

    Each non-blank line of the file is expected to hold an image path and an
    integer label separated by a single space. Images are read by
    ``load_image2np`` in a pool of ``CPU_USED`` worker processes.

    Args:
        meta_path: Path of the text file listing the samples of one split.

    Returns:
        Tuple ``(X, y)``: ``X`` is the per-image arrays concatenated along
        axis 0, ``y`` is a numpy array of the integer labels in file order.

    Raises:
        ValueError: If the file lists no samples.
    """
    # Read the listing first and close the file before the (slow) image loading.
    with open(meta_path, 'r') as f:
        # Skip blank lines (e.g. a trailing newline) which would otherwise
        # break the path/label unpacking below.
        rows = [line.strip().split(' ') for line in f if line.strip()]
    if not rows:
        raise ValueError(f"No samples listed in {meta_path}")
    paths, labels = zip(*rows)

    # The context manager shuts the worker processes down once all results
    # have been collected, so repeated calls do not leak process pools.
    with Pool(CPU_USED) as pool:
        pool_outputs = list(tqdm(pool.imap(load_image2np, paths), total=len(paths)))

    X = np.concatenate(pool_outputs, axis=0)
    y = np.array([int(label) for label in labels])
    return X, y

def union_shuffle(X, y, rng=None):
    """Shuffle samples and labels with the same random permutation.

    Args:
        X: numpy array of samples, indexed along axis 0.
        y: numpy array of labels with one entry per sample in ``X``.
        rng: Optional random generator exposing ``shuffle`` (for example a
            ``np.random.RandomState``). Defaults to the module-level ``RS``.

    Returns:
        Tuple ``(X_shuffled, y_shuffled)`` of new arrays in which row ``i`` of
        both outputs comes from the same original position.

    Raises:
        ValueError: If ``X`` and ``y`` do not hold the same number of samples.
    """
    n_samples = len(X)
    if n_samples != len(y):
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    if rng is None:
        rng = RS
    index = np.arange(n_samples)
    # shuffle() permutes the index array in place and returns None.
    rng.shuffle(index)
    return X[index], y[index]


In [10]:
# Load the images and labels of each split from its listing file.
# Each call reads every listed image into memory via load_data.
train_X, train_y = load_data('../raw_data/train.txt')
val_X, val_y = load_data('../raw_data/val.txt')
test_X, test_y = load_data('../raw_data/test.txt')

In [5]:
# Shuffle each split, keeping images and labels aligned.
# All three calls draw from the shared generator RS, so the order of these
# statements determines the resulting permutations; re-running this cell
# shuffles the already-shuffled data again.
train_X, train_y = union_shuffle(train_X, train_y)
test_X, test_y = union_shuffle(test_X, test_y)
val_X, val_y = union_shuffle(val_X, val_y)

NameError: name 'RS' is not defined

In [None]:
# Report the array shape of every split (only the shape tuples are kept in the
# loop variables, not references to the image arrays).
for split_name, split_shape in (('Train', train_X.shape), ('Val', val_X.shape), ('Test', test_X.shape)):
    print('{:<6} shape: '.format(split_name), split_shape)

Train  shape:  (63325, 256, 256, 3)
Val    shape:  (450, 256, 256, 3)
Test   shape:  (450, 256, 256, 3)


## Transform Features

### Color histogram
[OpenCV-python学习笔记（三）histograms直方图](https://blog.csdn.net/cliukai/article/details/101379638)

In [None]:
def extract_color_hist(img):
    """Build a colour descriptor from pairwise 2-D channel histograms.

    The image is split into its three channels (named blue, green, red here,
    i.e. assuming OpenCV's BGR channel order -- TODO confirm for inputs that
    do not come from cv2.imread). For each of the channel pairs
    (blue, green), (blue, red) and (green, red) a 16x16-bin joint histogram
    over the value range [0, 256) is computed and flattened.

    Args:
        img: Three-channel image array accepted by ``cv2.split``.

    Returns:
        1-D numpy array: the three flattened histograms concatenated in the
        order listed above (3 * 16 * 16 = 768 values).
    """
    blue, green, red = cv2.split(img)
    bins = [16, 16]
    value_ranges = [0, 256, 0, 256]
    channel_pairs = ((blue, green), (blue, red), (green, red))
    pair_hists = [
        cv2.calcHist([first, second], [0, 1], None, bins, value_ranges).reshape((-1, ))
        for first, second in channel_pairs
    ]
    return np.concatenate(pair_hists)

# Try the colour-histogram extractor on the first training image.
X_color_hist = extract_color_hist(train_X[0])

### Histogram of Oriented Gradients (HOG)
[HOG算法以及python实现](https://www.cnblogs.com/Asp1rant/p/16545025.html)

In [14]:
def extract_gradient_hist(img, vis_path=None):
    """Compute the HOG (histogram of oriented gradients) descriptor of an image.

    Uses 9 orientation bins, 32x32-pixel cells and 1x1-cell blocks, with the
    last axis of ``img`` treated as the channel axis.

    Args:
        img: Multichannel image array with channels on the last axis.
        vis_path: Optional file path. When given, the HOG visualisation image
            is also computed and written there with ``cv2.imwrite``. Left as
            ``None`` (the default) no file is written, which avoids the extra
            rendering cost and keeps parallel workers from all overwriting
            the same file.

    Returns:
        1-D numpy array holding the HOG feature descriptor.
    """
    if vis_path is None:
        return hog(img, orientations=9, pixels_per_cell=(32, 32),
                   cells_per_block=(1, 1), channel_axis=-1)

    # Debug path: also render the visualisation and save it to disk.
    fd, hog_img = hog(img, orientations=9, pixels_per_cell=(32, 32), visualize=True,
                      cells_per_block=(1, 1), channel_axis=-1)
    cv2.imwrite(vis_path, hog_img)
    return fd

# Check the length of the HOG descriptor on the first training image.
extract_gradient_hist(train_X[0]).shape

ValueError: Only images with two spatial dimensions are supported. If using with color/multichannel images, specify `channel_axis`.

In [9]:
def combine_features(img):
    """Build the full feature row for one image.

    Concatenates the colour-histogram descriptor and the HOG descriptor (in
    that order) and adds a leading axis of length 1 so that rows from many
    images can be stacked along axis 0.

    Args:
        img: Image array accepted by both ``extract_color_hist`` and
            ``extract_gradient_hist``.

    Returns:
        numpy array of shape ``(1, n_features)``.
    """
    descriptors = (extract_color_hist(img), extract_gradient_hist(img))
    feature_vector = np.concatenate(descriptors, axis=0)
    return feature_vector[np.newaxis, ...]

def calc_features(X):
    """Compute the combined feature row of every image in ``X`` in parallel.

    Args:
        X: Sequence/array of images; each item is passed to
            ``combine_features`` in a pool of ``CPU_USED`` worker processes.

    Returns:
        numpy array with one feature row per image, in input order
        (the per-image ``(1, n_features)`` rows concatenated along axis 0).
    """
    # The context manager shuts the worker processes down once all results
    # have been collected, so repeated calls do not leak process pools.
    with Pool(CPU_USED) as pool:
        pool_outputs = list(tqdm(pool.imap(combine_features, X), total=len(X)))
    return np.concatenate(pool_outputs, axis=0)

In [10]:
# Replace the raw images of each split with their feature matrix.
# NOTE: this rebinds train_X / val_X / test_X, so the cell is not idempotent --
# the image arrays must be reloaded before it can be run again.
train_X = calc_features(train_X)
val_X = calc_features(val_X)
test_X = calc_features(test_X)

100%|██████████| 63325/63325 [00:52<00:00, 1217.49it/s]
100%|██████████| 450/450 [00:00<00:00, 784.06it/s]
100%|██████████| 450/450 [00:00<00:00, 696.74it/s]


In [11]:
# Report the feature-matrix shape of every split.
for split_name, split_shape in (('Train', train_X.shape), ('Val', val_X.shape), ('Test', test_X.shape)):
    print('{:<6} shape: '.format(split_name), split_shape)

Train  shape:  (63325, 1344)
Val    shape:  (450, 1344)
Test   shape:  (450, 1344)


In [12]:
# Persist the features and labels of each split as .npz archives (keys X and y).
# Create the target directory first so a missing folder does not throw away
# the feature computation above.
os.makedirs('output', exist_ok=True)
np.savez('output/train_features.npz', X=train_X, y=train_y)
np.savez('output/val_features.npz', X=val_X, y=val_y)
np.savez('output/test_features.npz', X=test_X, y=test_y)