# Image preprocessing
The image preprocessing is based on two notebooks: https://www.kaggle.com/maxlenormand/cropping-to-character-resizing-images, combined with https://www.kaggle.com/iafoss/image-preprocessing-128x128.

The images are cropped, scaled to fit the max image size, and resized to 128x128.

In [2]:
import time

from fastai.vision import *
import pandas as pd

from tqdm import tqdm

import matplotlib.pyplot as plt
import numpy as np
import gc

import cv2

In [3]:
# slightly modified from https://www.kaggle.com/maxlenormand/cropping-to-character-resizing-images
def crop_resize_scaled(df, resize_size = 128):
    """Crop each image in ``df`` to its drawn content and resize it to a square.

    Every row of ``df`` is read as one flattened ``137 x 236`` image: the first
    column is skipped and the remaining columns are reshaped to
    ``(HEIGHT, WIDTH)``. For each image the union of the bounding boxes of all
    contours found in its thresholded version is cropped out and stretched to
    ``resize_size x resize_size``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``image_id`` column followed by ``137 * 236`` pixel
        columns. The pixel values are handed to ``cv2.threshold`` unchanged.
    resize_size : int, default 128
        Side length, in pixels, of the square output images.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``image_id``: an ``image_id`` column followed by
        ``resize_size ** 2`` pixel columns named ``'0'``, ``'1'``, ...

    Notes
    -----
    An image in which no contour is found is not cropped; the whole frame is
    resized instead of raising ``ValueError`` on an empty ``min()``.
    """
    HEIGHT = 137
    WIDTH = 236
    CROP_SIZE = resize_size
    original_img_size = HEIGHT * WIDTH
    cropped_img_size = CROP_SIZE * CROP_SIZE

    print(f"Original shape of images: {original_img_size}\nCropped & resized shape of images: {cropped_img_size}")
    print(f"Reduction fatio: {np.round(original_img_size/cropped_img_size, 3)}")
    print(df.shape)
    # Drop the first (id) column and restore the 2-D layout of every image.
    images = df.iloc[:, 1:].values.reshape(-1, HEIGHT, WIDTH)
    # Read the ids positionally so they line up with `images` whatever index
    # `df` carries (a label lookup only works with a default 0..n-1 index).
    image_ids = df['image_id'].values

    cropped_imgs = {}
    for pos in tqdm(range(df.shape[0])):
        img = images[pos]
        # Inverse binary threshold; with THRESH_OTSU set, OpenCV selects the
        # threshold itself rather than using the literal 30.
        _, thresh = cv2.threshold(img, 30, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        # [-2:] keeps (contours, hierarchy) for both the 2- and 3-value
        # return signatures of findContours.
        contours, _ = cv2.findContours(thresh,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)[-2:]

        boxes = [cv2.boundingRect(cnt) for cnt in contours]
        if boxes:
            # Smallest rectangle that encloses every contour's bounding box.
            xmin = min(bx for bx, by, bw, bh in boxes)
            ymin = min(by for bx, by, bw, bh in boxes)
            xmax = max(bx + bw for bx, by, bw, bh in boxes)
            ymax = max(by + bh for bx, by, bw, bh in boxes)
        else:
            # Nothing detected: fall back to the full frame.
            xmin, ymin, xmax, ymax = 0, 0, WIDTH, HEIGHT

        roi = img[ymin:ymax,xmin:xmax]
        resized_roi = cv2.resize(roi, (resize_size, resize_size))
        cropped_imgs[image_ids[pos]] = resized_roi.reshape(-1)

    # Dict keys become the row index after the transpose; move them into a
    # regular `image_id` column and give every column a string name.
    resized = pd.DataFrame(cropped_imgs).T.reset_index()
    resized.columns = resized.columns.astype(str)
    resized.rename(columns={'index':'image_id'},inplace=True)
    return resized

In [4]:
# export

def imagePreprocessing(crop_func, file_in, file_out, resize_size = 128):
    """Read one parquet chunk, preprocess its images and save them as feather.

    Parameters
    ----------
    crop_func : callable
        Called as ``crop_func(df, resize_size=...)``; must return an object
        with a ``to_feather`` method (e.g. a ``pandas.DataFrame``).
    file_in : path-like
        Parquet file to read.
    file_out : path-like
        Destination of the feather file.
    resize_size : int, default 128
        Output image side length forwarded to ``crop_func``.

    Returns
    -------
    None
        The result is only written to ``file_out``.
    """
    # Replace the stored index with a fresh 0..n-1 index before cropping.
    df = pd.read_parquet(file_in).reset_index(drop=True)
    print(df.shape)
    cropped_df = crop_func(df, resize_size = resize_size)
    cropped_df.to_feather(file_out)
    # Release both frames before the caller moves on to the next chunk.
    del df, cropped_df
    gc.collect()
    

In [5]:
%%time
# Directory holding the raw parquet chunks; the cropped feather files are
# written into the same directory.
source = Path('/home/kaggle/bengaliai-cv19/input')

# Preprocess the four test chunks (0..3) one after another.
for i in range(4):
    file_in = source / f'test_image_data_{i}.parquet'
    file_out = source / f'test_image_data_crop_scaled_{i}.feather'
    print(file_in, file_out)
    imagePreprocessing(crop_resize_scaled, file_in, file_out)
    print(f'{i} Done')

/home/kaggle/bengaliai-cv19/input/test_image_data_0.parquet /home/kaggle/bengaliai-cv19/input/test_image_data_crop_scaled_0.feather


100%|██████████| 3/3 [00:00<00:00, 296.47it/s]

(3, 32333)
Original shape of images: 32332
Cropped & resized shape of images: 16384
Reduction fatio: 1.973
(3, 32333)





0 Done
/home/kaggle/bengaliai-cv19/input/test_image_data_1.parquet /home/kaggle/bengaliai-cv19/input/test_image_data_crop_scaled_1.feather


100%|██████████| 3/3 [00:00<00:00, 272.78it/s]

(3, 32333)
Original shape of images: 32332
Cropped & resized shape of images: 16384
Reduction fatio: 1.973
(3, 32333)





1 Done
/home/kaggle/bengaliai-cv19/input/test_image_data_2.parquet /home/kaggle/bengaliai-cv19/input/test_image_data_crop_scaled_2.feather


100%|██████████| 3/3 [00:00<00:00, 391.82it/s]

(3, 32333)
Original shape of images: 32332
Cropped & resized shape of images: 16384
Reduction fatio: 1.973
(3, 32333)





2 Done
/home/kaggle/bengaliai-cv19/input/test_image_data_3.parquet /home/kaggle/bengaliai-cv19/input/test_image_data_crop_scaled_3.feather


100%|██████████| 3/3 [00:00<00:00, 392.28it/s]

(3, 32333)
Original shape of images: 32332
Cropped & resized shape of images: 16384
Reduction fatio: 1.973
(3, 32333)





3 Done
CPU times: user 7min 1s, sys: 3.31 s, total: 7min 4s
Wall time: 1min 54s
