### Imports

In [28]:
import pandas as pd
import os
import numpy as np 
from tqdm import tqdm
import json
import matplotlib.pyplot as plt
import cv2 as cv
from itertools import islice

### Functions

In [3]:
def skewCorrection(img, SKEW_RESULTS=True):
    """
    Deskew an image using the median angle of its Hough lines.

    Edges are extracted with Canny, straight lines are detected with the
    standard Hough transform, and the image is rotated about its centre by
    ``median(theta) - 90`` degrees. Regions uncovered by the rotation are
    filled with white.

    Parameters
    ----------
    img : numpy.ndarray
        Grey/Binary image.
    SKEW_RESULTS : bool, default True
        When true, print the number of detected lines and draw the detected
        line whose angle is closest to the median onto ``img`` **in place**
        (debug aid; the returned image is computed before the drawing).

    Returns
    -------
    numpy.ndarray
        The rotated image, or ``img`` itself when no line was detected.
    """
    # Perform edge detection using the Canny algorithm
    edges = cv.Canny(img, 50, 150, apertureSize=3)

    # Apply the Hough Transform to detect lines:
    # rho resolution = 1 px, theta resolution = 1 degree, accumulator threshold = 50
    lines = cv.HoughLines(edges, 1, np.pi / 180, 50)

    # Nothing detected: there is no angle to correct, so hand the input back
    # untouched instead of failing later on `lines.shape` / `median_angle`.
    if lines is None or len(lines) == 0:
        if SKEW_RESULTS:
            print("No. of lines Detected: 0")
        return img

    # Each entry of `lines` is [[rho, theta]]; convert theta to degrees
    angles = [(line[0][1] * 180) / np.pi for line in lines]

    # The median is robust to a few stray lines; subtract 90 to obtain the skew angle
    median_angle = np.median(angles)
    rotation_angle = median_angle - 90

    # Center of image
    height, width = img.shape[:2]
    center = (width // 2, height // 2)

    # Rotate the original image using the calculated angle
    rotation_matrix = cv.getRotationMatrix2D(center, rotation_angle, 1)
    rotated_img = cv.warpAffine(img, rotation_matrix, (width, height), borderValue=(255, 255, 255))

    if SKEW_RESULTS:
        print(f"No. of lines Detected: {len(lines)}")

        # Find the detected line whose angle is closest to the median angle
        closest_angle_index = np.argmin(np.abs(np.array(angles) - median_angle))
        rho, theta = lines[closest_angle_index][0]

        # Convert polar coordinates (rho, theta) to a cartesian reference point
        a = np.cos(theta)
        b = np.sin(theta)
        x0 = a * rho
        y0 = b * rho
        # Two points 1000 px either side of (x0, y0) along the line direction
        x1 = int(x0 + 1000 * (-b))
        y1 = int(y0 + 1000 * (a))
        x2 = int(x0 - 1000 * (-b))
        y2 = int(y0 - 1000 * (a))
        # NOTE: draws on the caller's `img`, not on the returned image
        cv.line(img, (x1, y1), (x2, y2), (0, 0, 255), 2)

    return rotated_img

def imagePre(img_locs):
    """
    Lazily preprocess images for OCR training.

    For every path in ``img_locs`` the image is loaded, converted to
    greyscale, binarised with Otsu's threshold, eroded with a 5x5
    rectangular kernel and finally deskewed with ``skewCorrection``.

    Parameters
    ----------
    img_locs : iterable of str
        Paths of the images to process.

    Yields
    ------
    numpy.ndarray
        The preprocessed (binary, eroded, deskewed) image for each path,
        in input order.

    Raises
    ------
    OSError
        If an image cannot be read (missing file or unsupported format).
    """
    # Loop-invariant: build the structuring element once
    kernel = cv.getStructuringElement(cv.MORPH_RECT, (5, 5))

    for loc in img_locs:
        # Load the image; cv.imread signals failure by returning None
        img = cv.imread(loc)
        if img is None:
            raise OSError(f"Could not read image: {loc}")

        # Greyscale
        grey_img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)

        # No resizing (already taken care of by doctr apparently)

        # Otsu thresholding; cv.threshold returns (threshold value, binary image)
        binary_img = cv.threshold(grey_img, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]

        # Erosion takes the local minimum over the kernel window, so dark
        # pixels grow: for dark text on a light background this thickens
        # the strokes and closes small gaps in them.
        erosion = cv.erode(binary_img, kernel, iterations=1)

        unskewed_img = skewCorrection(erosion, False)

        yield unskewed_img
    
    
def make_dataset(src_dir, df, dest_dir="../data/trainset", no_of_examples=None):
    """
    Preprocess images listed in ``df`` and write them with a label file.

    Each processed image is saved as ``<dest_dir>/images/img_<n>.jpg``
    (``n`` starts at 1, in row order) and ``<dest_dir>/labels.json`` maps
    every new file name to the row's ``tokens`` value.

    Parameters
    ----------
    src_dir : str
        Folder containing the source images.
    df : pandas.DataFrame
        Must provide the columns ``files`` (image file name relative to
        ``src_dir``) and ``tokens`` (label).
    dest_dir : str, default "../data/trainset"
        Output folder; created if missing.
    no_of_examples : int, optional
        Number of leading rows to process. Defaults to all rows.

    Raises
    ------
    OSError
        If a processed image cannot be written.

    Notes
    -----
    ``labels.json`` is written once, after the loop. It is written from a
    ``finally`` clause so that the labels of the images already saved are
    kept when the run is interrupted or fails part-way.
    """
    if no_of_examples is None:
        no_of_examples = len(df)
    # Never promise the progress bar more rows than the frame holds
    total = min(no_of_examples, len(df))

    img_folder = os.path.join(dest_dir, "images")
    os.makedirs(img_folder, exist_ok=True)
    json_loc = os.path.join(dest_dir, "labels.json")

    values = {}
    try:
        rows = islice(df.iterrows(), total)
        for count, (_, row) in enumerate(tqdm(rows, total=total, desc="Progress"), start=1):
            img_loc = os.path.join(src_dir, row["files"])

            std_img = next(imagePre([img_loc]))

            # New filename
            new_filename = f"img_{count}.jpg"
            new_img_loc = os.path.join(img_folder, new_filename)

            # cv.imwrite reports failure through its return value only
            if not cv.imwrite(new_img_loc, std_img):
                raise OSError(f"Could not write image: {new_img_loc}")

            values[new_filename] = row["tokens"]
    finally:
        # Single write instead of re-serialising the growing dict per row
        with open(json_loc, "w", encoding="utf-8") as f:
            json.dump(values, f, ensure_ascii=False, indent=4)

## Location of Dataset

In [12]:
# Take the data folder location as input and list its contents.
src_dir = input("Enter location to Data Folder : ")

# Fail fast: src_dir is consumed by later cells, so a wrong path must not
# pass silently here.
if not os.path.exists(src_dir):
    raise FileNotFoundError(f"Data folder not found: {src_dir}")

dir_ = os.listdir(src_dir)
print(f"Data Folder : {dir_}")

Enter location to Data Folder : ./../data/devanagari/trainset/
Data Folder : ['images', '.DS_Store', 'labels.json', 'unique_words.txt', 'vocab.txt', 'train.txt']


### Load Dataset

In [16]:
# Raw training set (labels.json is read from this folder) and the folder
# that receives the preprocessed copy.
# NOTE(review): train_data is an absolute, machine-specific path.
train_data = '/data/BADRI/IHTR/trainset/devanagari/'
dest_dir = './../data/devanagari/trainset_processed/'

# Read the labels explicitly as UTF-8 rather than with the platform default
# encoding (presumably Devanagari text, going by the folder name).
with open(os.path.join(train_data, 'labels.json'), encoding='utf-8') as json_file:
    data = json.load(json_file)

# labels.json keys become the 'files' column, their values the 'tokens' column
train_df = pd.DataFrame.from_dict(data, orient='index').reset_index()
train_df.columns = ['files', 'tokens']

os.makedirs(dest_dir, exist_ok=True)

In [22]:
# Preview subset: .loc slicing is label-based and inclusive, so this selects
# the rows labelled 0 through 100.
images_dir = train_data + 'images/'
slice_df = train_df.loc[0:100, "files"]
img_locs = [os.path.join(images_dir, name) for name in slice_df]

# imagePre is a generator: nothing is read or processed until it is iterated
img_gen = imagePre(img_locs)

In [None]:
# Build the processed training set from every row of train_df
make_dataset(
    src_dir=train_data + 'images/',
    df=train_df,
    dest_dir=dest_dir,
)

Progress:  69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                  | 47983/69853 [32:31<23:54, 15.24it/s]

In [26]:
# Making validation Dataset
# NOTE(review): val_df is not defined in any visible cell; it must be created
# before this cell runs on a fresh kernel.
# make_dataset always names its outputs img_<n>.jpg and labels.json, so the
# validation set gets its own folder instead of overwriting the training
# output stored in dest_dir.
val_dest_dir = './../data/devanagari/valset_processed/'
make_dataset(src_dir=src_dir, dest_dir=val_dest_dir, df=val_df, no_of_examples=1000)

Progress: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:02<00:00, 417.86it/s]
