# Import Libraries

In [1]:
# Auto reloads src modules.
%load_ext autoreload
%autoreload 2

# Import the standard-library modules used below to locate the project root.
import os
import sys
from pathlib import Path, PurePath

# Resolve the project root. When the kernel starts in a sub-directory (for
# example "jupyter_notebooks") the root is taken to be the parent directory;
# when it starts in "crnn-pytorch" itself, the root is the current directory.
# proj_root has to be bound on both branches, otherwise the sys.path check
# below raises NameError when the notebook is launched from the project root.
start_dir = Path.cwd()
proj_root = str(start_dir if start_dir.parts[-1] == "crnn-pytorch" else start_dir.parent)

# Make the project's `src` package importable.
if proj_root not in sys.path:
    sys.path.append(proj_root)

# Changes the working directory from a jupyter_notebook to the project directory,
# so that the relative "data/..." paths used later resolve against the project root.
if start_dir.parts[-1] == "jupyter_notebooks":
    os.chdir(proj_root)

from src.dataset import extract_jpg_meta, WbsinImageDataset
from torchvision import transforms
import torch
import matplotlib.pyplot as plt
import pandas as pd
import xml.etree.ElementTree as ET
from PIL import Image
import altair as alt
alt.data_transformers.disable_max_rows()
from tqdm.auto import tqdm
tqdm.pandas()
from PIL import Image
import altair as alt
alt.data_transformers.disable_max_rows()
from tqdm.auto import tqdm
import numpy as np
tqdm.pandas()
import cv2
from PIL import Image as im
from scipy.ndimage import interpolation as inter

# Metadata preprocessing

In [3]:
# Directory holding the intermediate artefacts, relative to the working directory.
interim_path = Path.cwd().joinpath("data", "interim")

# Load the image metadata table.
wbsin_meta_df = pd.read_csv(interim_path.joinpath("wbsin_meta.csv"))

# Derive the annotation path from the image path.
# NOTE(review): str.replace swaps every "jpg" substring in the path, not only
# the file extension — confirm no directory or file stem contains "jpg".
wbsin_meta_df["xml_path"] = wbsin_meta_df["file_path"].map(
    lambda jpg_path: jpg_path.replace("jpg", "xml")
)


In [4]:
# Parse one bounding box per annotation file.
# NOTE(review): extract_bnd_box is neither defined nor imported in the visible
# cells (In [2] is missing) — confirm where it comes from before a
# Restart & Run All.
wbsin_meta_df["bnd_box"] = wbsin_meta_df["xml_path"].apply(extract_bnd_box)

In [5]:
def crop_and_save(file_path, label, xmin, xmax, ymin, ymax, save_dir):
    """Crop one image to its bounding box and write the crop as ``<label>.jpg``.

    Args:
        file_path: Path of the source image to open.
        label: Used as the output file name (without extension).
        xmin: Left edge of the bounding box.
        xmax: Right edge of the bounding box.
        ymin: Upper edge of the bounding box.
        ymax: Lower edge of the bounding box.
        save_dir: Directory that receives the cropped image; created if missing.

    Returns:
        Path of the written file, ``save_dir / f"{label}.jpg"``.
    """
    save_dir = Path(save_dir)
    # Image.save does not create missing parent directories.
    save_dir.mkdir(parents=True, exist_ok=True)
    crop_path = save_dir / f"{label}.jpg"

    # The context manager releases the source file handle deterministically.
    with Image.open(file_path) as og_img:
        # PIL expects the box as (left, upper, right, lower).
        og_img.crop((xmin, ymin, xmax, ymax)).save(crop_path)

    return crop_path

# NOTE(review): the saved output of this cell shows a 6059-row progress bar, but
# the statement that applies crop_and_save is not present, and a later cell reads
# a "crop_path" column that no visible cell creates — restore that call before a
# Restart & Run All.





100%|██████████| 6059/6059 [03:14<00:00, 31.22it/s]


In [6]:
# Derive the crop size from each bounding box: height is ymax - ymin,
# width is xmax - xmin.
for size_column, high_key, low_key in (
    ("crop_height", "ymax", "ymin"),
    ("crop_width", "xmax", "xmin"),
):
    wbsin_meta_df[size_column] = wbsin_meta_df.progress_apply(
        lambda row, hi=high_key, lo=low_key: row["bnd_box"][hi] - row["bnd_box"][lo],
        axis=1,
    )

# Aspect ratio (width over height) of every crop.
wbsin_meta_df["crop_ratio"] = wbsin_meta_df["crop_width"].div(wbsin_meta_df["crop_height"])


100%|██████████| 6059/6059 [00:00<00:00, 49932.58it/s]
100%|██████████| 6059/6059 [00:00<00:00, 49650.65it/s]


In [7]:
# Persist the metadata (now including the crop columns) without the row index.
cropped_meta_csv = Path.cwd().joinpath("data", "interim", "cropped_wbsin_meta.csv")
wbsin_meta_df.to_csv(cropped_meta_csv, index=False)

In [10]:


def preprocess_cropped_image(cropped_image_path, label, save_dir):
    """Binarise, clean up and deskew one cropped image, then save it as ``<label>.jpg``.

    Pipeline: grayscale read -> adaptive Gaussian threshold -> dilation ->
    Gaussian blur + global threshold -> erosion -> pick the rotation angle (from
    -0.5 to 4.0 degrees in 0.05 steps) with the highest row-projection score ->
    rotate by that angle with a white fill and save.

    Args:
        cropped_image_path: Path of the cropped image to read.
        label: Used as the output file name (without extension).
        save_dir: Directory that receives the processed image; created if missing.

    Returns:
        Path of the written file, ``save_dir / f"{label}.jpg"``.

    Raises:
        OSError: If OpenCV cannot read ``cropped_image_path``.
    """
    # scipy.ndimage.interpolation is a deprecated alias namespace; the same
    # function is exposed publicly as scipy.ndimage.rotate.
    from scipy.ndimage import rotate as nd_rotate

    def find_score(arr, angle):
        """Score how sharply the row sums change after rotating ``arr`` by ``angle``."""
        rotated = nd_rotate(arr, angle, reshape=False, order=0)
        # Accumulate in a signed 64-bit type: summing a uint8 image otherwise
        # yields an unsigned accumulator, so the row-to-row differences wrap
        # around and (with a 32-bit platform integer) the squares overflow.
        row_sums = np.sum(rotated, axis=1, dtype=np.int64)
        return int(np.sum(np.diff(row_sums) ** 2))

    # Reads original image as single-channel grayscale (flag 0).
    img = cv2.imread(str(cropped_image_path), 0)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"cv2.imread could not read image: {cropped_image_path}")

    # Applies adaptive threshold image.
    img = cv2.adaptiveThreshold(
        img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blockSize=3, C=3
    )

    # Dilates with a 3x1 elliptical kernel.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 1))
    img = cv2.dilate(img, kernel)

    # Blurs, then re-binarises with a fixed global threshold.
    blur = cv2.GaussianBlur(img, (3, 3), 0)
    img = cv2.threshold(blur, 100, 255, cv2.THRESH_BINARY)[1]

    # Erodes twice with a 3x3 kernel.
    kernel = np.ones((3, 3), np.uint8)
    img = cv2.erode(img, kernel, iterations=2)

    # Fixes skew: try every candidate angle and keep the best-scoring one.
    delta = .05
    limit = 4.0
    angles = np.arange(-.5, limit + delta, delta)
    scores = [find_score(img, angle) for angle in angles]
    # argmax returns the first maximum, matching list.index(max(...)).
    best_angle = float(np.round(angles[int(np.argmax(scores))], 2))

    save_dir = Path(save_dir)
    # Image.save does not create missing parent directories.
    save_dir.mkdir(parents=True, exist_ok=True)
    proc_path = save_dir / f"{label}.jpg"

    # expand=1 grows the canvas so the rotated image is not clipped; the
    # uncovered corners are filled with white.
    rotated_img = im.fromarray(img).rotate(best_angle, expand=1, fillcolor="white")
    rotated_img.save(proc_path)
    return proc_path

# Run the preprocessing on every cropped image and record where each result
# was written.
# NOTE(review): this reads a "crop_path" column that no visible cell creates —
# confirm it is populated before this cell runs.
processed_dir = Path.cwd().joinpath("data", "processed", "wbsin_images")


def _preprocess_row(row):
    """Preprocess the cropped image referenced by one metadata row."""
    return preprocess_cropped_image(str(row["crop_path"]), row["label"], processed_dir)


wbsin_meta_df["proc_path"] = wbsin_meta_df.progress_apply(_preprocess_row, axis=1)


100%|██████████| 6059/6059 [1:04:47<00:00,  1.56it/s]


In [11]:
# Persist the final metadata (including the processed-image paths) without the row index.
processed_meta_csv = Path.cwd().joinpath("data", "processed", "processed_wbsin_meta.csv")
wbsin_meta_df.to_csv(processed_meta_csv, index=False)

In [12]:
# Display the final metadata frame (the rendered output is truncated in the middle).
# NOTE(review): the rendered rows contain absolute local paths — consider
# showing a small, path-free slice before sharing the notebook.
wbsin_meta_df

Unnamed: 0,file_path,label,width,height,xml_path,bnd_box,crop_path,crop_height,crop_width,crop_ratio,proc_path
0,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,3KPA24AD6LE328168,1280,720,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,"{'xmin': 85.0, 'xmax': 967.0, 'ymin': 188.0, '...",c:\Users\KMA62139\OneDrive - Kia\Documents - B...,82.0,882.0,10.756098,c:\Users\KMA62139\OneDrive - Kia\Documents - B...
1,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,3KPA24AD6LE328199,1280,720,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,"{'xmin': 84.0, 'xmax': 968.0, 'ymin': 172.0, '...",c:\Users\KMA62139\OneDrive - Kia\Documents - B...,82.0,884.0,10.780488,c:\Users\KMA62139\OneDrive - Kia\Documents - B...
2,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,3KPA24AD6LE328235,1280,720,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,"{'xmin': 76.0, 'xmax': 960.0, 'ymin': 150.0, '...",c:\Users\KMA62139\OneDrive - Kia\Documents - B...,84.0,884.0,10.523810,c:\Users\KMA62139\OneDrive - Kia\Documents - B...
3,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,3KPA24AD6LE328266,1280,720,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,"{'xmin': 52.0, 'xmax': 1058.0, 'ymin': 231.0, ...",c:\Users\KMA62139\OneDrive - Kia\Documents - B...,134.0,1006.0,7.507463,c:\Users\KMA62139\OneDrive - Kia\Documents - B...
4,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,3KPA24AD6LE328350,1280,720,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,"{'xmin': 84.0, 'xmax': 966.0, 'ymin': 160.0, '...",c:\Users\KMA62139\OneDrive - Kia\Documents - B...,84.0,882.0,10.500000,c:\Users\KMA62139\OneDrive - Kia\Documents - B...
...,...,...,...,...,...,...,...,...,...,...,...
6054,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,3KPA24AD6LE328039,1280,720,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,"{'xmin': 85.0, 'xmax': 969.0, 'ymin': 146.0, '...",c:\Users\KMA62139\OneDrive - Kia\Documents - B...,85.0,884.0,10.400000,c:\Users\KMA62139\OneDrive - Kia\Documents - B...
6055,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,3KPA24AD6LE328042,1280,720,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,"{'xmin': 44.0, 'xmax': 1053.0, 'ymin': 234.0, ...",c:\Users\KMA62139\OneDrive - Kia\Documents - B...,132.0,1009.0,7.643939,c:\Users\KMA62139\OneDrive - Kia\Documents - B...
6056,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,3KPA24AD6LE328056,1280,720,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,"{'xmin': 116.0, 'xmax': 998.0, 'ymin': 182.0, ...",c:\Users\KMA62139\OneDrive - Kia\Documents - B...,85.0,882.0,10.376471,c:\Users\KMA62139\OneDrive - Kia\Documents - B...
6057,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,3KPA241AALE328554,1280,720,C:\Users\KMA62139\OneDrive - Kia\Documents - B...,"{'xmin': 92.0, 'xmax': 976.0, 'ymin': 160.0, '...",c:\Users\KMA62139\OneDrive - Kia\Documents - B...,85.0,884.0,10.400000,c:\Users\KMA62139\OneDrive - Kia\Documents - B...
