In [1]:
%load_ext autoreload

In [2]:
%autoreload 0

import os
import numpy as np
import pandas as pd
import zipfile
import time
from scipy.stats import itemfreq
import cv2
from multiprocessing import Pool
from joblib import Parallel, delayed

%autoreload 2

from compute_util import parallel_df, save_df, load_df, chunks

In [3]:
def extract_images(num_images=50, zip_path=None, dest_path=None):
    """Extract the first `num_images` JPEG files from an image zip archive.

    Archive members are ordered by name and only those ending in '.jpg' are
    considered, so directory entries or other files in the listing do not
    use up any of the `num_images` slots.

    Args:
        num_images: maximum number of '.jpg' members to extract.
        zip_path: path of the archive. Defaults to
            os.path.join(DATA_PATH, IMG_ZIP_NAME), read from module globals.
            NOTE(review): IMG_ZIP_NAME is not defined in the visible cells.
        dest_path: directory to extract into. Defaults to the module global
            IMG_PATH. NOTE(review): IMG_PATH is not defined in the visible
            cells.

    Returns:
        None. The files are written below `dest_path`.
    """
    if zip_path is None:
        zip_path = os.path.join(DATA_PATH, IMG_ZIP_NAME)
    if dest_path is None:
        dest_path = IMG_PATH

    with zipfile.ZipFile(zip_path, 'r') as train_zip:
        # Filter before slicing: slicing first would count non-image members.
        jpg_members = sorted(
            member for member in train_zip.namelist() if member.endswith('.jpg')
        )
        for member in jpg_members[:num_images]:
            train_zip.extract(member, path=dest_path)

In [4]:
def get_dominant_color(img, n_colors=5):
    """Return the centroid of the most populated k-means colour cluster.

    The pixel data of `img` is flattened to rows of three float32 values,
    clustered into `n_colors` groups with cv2.kmeans, and the centroid of the
    group holding the most pixels is returned.

    Args:
        img: array whose values can be reshaped to rows of 3 channels.
        n_colors: number of clusters to request from cv2.kmeans.

    Returns:
        A uint8 array of 3 values (the winning centroid, cast to uint8), in
        the same channel order as `img`.
    """
    pixels = np.float32(img).reshape((-1, 3))

    # Stop after 200 iterations or once the centres move by less than 0.1.
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, .1)
    flags = cv2.KMEANS_RANDOM_CENTERS
    _, labels, centroids = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags)

    palette = np.uint8(centroids)

    # Count pixels per label value so that the arg-max *is* the label, even
    # if some cluster received no pixels. Indexing into a table of the unique
    # labels (as scipy.stats.itemfreq produces) would be shifted in that
    # case, and itemfreq is not available in recent SciPy releases.
    counts = np.bincount(labels.ravel(), minlength=len(palette))
    return palette[np.argmax(counts)]


def get_colorfulness(img):
    """Compute a scalar "colorfulness" score for a 3-channel image.

    The channels are unpacked in B, G, R order. Two opponent signals are
    built, |R - G| and |0.5 * (R + G) - B|, and the score is the root of the
    summed squared standard deviations plus 0.3 times the root of the summed
    squared means of those two signals.
    """
    blue, green, red = cv2.split(img.astype("float"))

    # Opponent colour signals (absolute values).
    red_green = np.absolute(red - green)
    yellow_blue = np.absolute(0.5 * (red + green) - blue)

    # Spread and magnitude of both signals, each combined into one number.
    std_root = np.sqrt((np.std(red_green) ** 2) + (np.std(yellow_blue) ** 2))
    mean_root = np.sqrt((np.mean(red_green) ** 2) + (np.mean(yellow_blue) ** 2))

    return std_root + (0.3 * mean_root)


def variance_of_laplacian(image):
    """Return the focus measure of `image`: the variance of its Laplacian.

    The Laplacian is computed by cv2.Laplacian with a 64-bit float output
    depth (cv2.CV_64F).
    """
    laplacian = cv2.Laplacian(image, cv2.CV_64F)
    return laplacian.var()


def get_data_from_image(image_path):
    """Read one image from disk and compute its feature row.

    Args:
        image_path: path of the image file.

    Returns:
        A pandas Series of 8 positional values:
        [image_path, img_id, img_size, img_size_x, img_size_y,
         blurriness, colorfulness, average_color]
        where img_id is the file name without its extension, img_size is
        height * width in pixels, blurriness is the variance of the Laplacian
        of the grayscale image, and average_color is a list with the rounded
        per-channel means in the channel order cv2.imread produced.
        If cv2.imread returns None, a message is printed and every value
        after img_id is None.
    """
    # splitext handles any extension length ('.jpg', '.jpeg', ...); cutting a
    # fixed 4 characters would leave a trailing dot for 'name.jpeg'.
    img_id = os.path.splitext(os.path.basename(image_path))[0]

    cv_img = cv2.imread(image_path)
    if cv_img is None:
        print('%s cannot be converted' % image_path)
        return pd.Series([image_path, img_id, None, None, None, None, None, None])

    # shape is (rows, cols, ...): rows are the y extent, cols the x extent.
    img_size_y, img_size_x = cv_img.shape[0], cv_img.shape[1]
    img_size = img_size_y * img_size_x

    means, _ = cv2.meanStdDev(cv_img)
    average_color = [int(round(means[channel][0])) for channel in range(3)]

    gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
    blurriness = variance_of_laplacian(gray)
    colorfulness = get_colorfulness(cv_img)

    return pd.Series([image_path, img_id, img_size, img_size_x, img_size_y,
                      blurriness, colorfulness, average_color])

### Loading image features and persisting them to disk

In [7]:
# Root folder holding the extracted image directories and the persisted
# feature frames. It can be overridden through the AVITO_DATA_PATH environment
# variable so the notebook is not tied to one machine; the default is the
# path this notebook was originally run with.
DATA_PATH = os.environ.get('AVITO_DATA_PATH', '/Users/ilaif/Desktop/data/avito')
NAMES = ['train_jpg_2', 'train_jpg_3', 'train_jpg_4']

# Number of image paths handed to parallel_df between two checkpoints.
chunk_size = 5000
# Column names, in the order get_data_from_image fills its Series.
columns = ['img_path', 'img_id', 'img_size', 'img_size_x', 'img_size_y', 'img_blurriness', 'img_colorfulness', 'img_color_avg',]


def paralleled_func(data):
    """Compute the feature row of every path in the 'img_path' column of `data`."""
    return data['img_path'].apply(get_data_from_image)


for name in NAMES:
    img_path = os.path.join(DATA_PATH, name)

    # Building the frame from a dict keeps the 'img_path' column even when the
    # folder contains no .jpg files.
    img_paths_df = pd.DataFrame(
        {'img_path': [x.path for x in os.scandir(img_path) if x.path.endswith('.jpg')]}
    )
    img_count = img_paths_df.shape[0]
    print('Total num images for %s: %s' % (name, img_count))

    try:
        df = load_df(DATA_PATH, name)
    except FileNotFoundError:
        print('%s not found, will create new...' % name)
        df = None

    if df is not None:
        # Resume: keep only the paths that are not in the loaded frame yet.
        img_paths_df = img_paths_df[~img_paths_df['img_path'].isin(df['img_path'])]

    for chunk in chunks(img_paths_df, chunk_size):
        start_time = time.time()
        img_df = parallel_df(chunk, paralleled_func, num_cores=6, num_partitions=18)
        img_df.columns = columns
        total_secs = round(time.time() - start_time)

        # Checkpoint after every chunk so an interrupted run can resume.
        df = img_df if df is None else pd.concat([df, img_df])
        save_df(df, DATA_PATH, name)

        # Report against the rows actually produced: the last chunk of a
        # folder may hold fewer than chunk_size images.
        processed = img_df.shape[0]
        perc = round(df.shape[0] * 100 / img_count)
        secs_per_image = round(total_secs / processed, 3) if processed else 0.0
        print('%s%%: %s images took %s secs, that %s secs per image' % (perc, processed, total_secs, secs_per_image))

Total num images for train_jpg_1: 278167
Total num images for train_jpg_2: 278167
train_jpg_2 not found, will create new...
2%: 5000 images took 65 secs, that 0.013 secs per image
4%: 5000 images took 60 secs, that 0.012 secs per image
5%: 5000 images took 57 secs, that 0.011 secs per image
7%: 5000 images took 50 secs, that 0.01 secs per image
9%: 5000 images took 48 secs, that 0.01 secs per image
11%: 5000 images took 46 secs, that 0.009 secs per image
13%: 5000 images took 48 secs, that 0.01 secs per image
14%: 5000 images took 45 secs, that 0.009 secs per image
16%: 5000 images took 45 secs, that 0.009 secs per image
18%: 5000 images took 46 secs, that 0.009 secs per image
20%: 5000 images took 47 secs, that 0.009 secs per image
22%: 5000 images took 746 secs, that 0.149 secs per image
23%: 5000 images took 741 secs, that 0.148 secs per image
25%: 5000 images took 772 secs, that 0.154 secs per image
/Users/ilaif/Desktop/data/avito/train_jpg_2/b98b291bd04c3d92165ca515e00468fd9756af9