In [73]:
#All important library imports go here
import os
import numpy as np
import cv2
from scipy.signal import find_peaks, peak_prominences
import matplotlib.pyplot as plt
from IPython.display import clear_output
import tensorflow as tf

# Line Segmentation

In [74]:
# Names of every entry found in the 'Train' directory (the paragraph images).
filenames = os.listdir('Train')
# The same names with the '.tif' text removed; reused later to build output file names.
filenames_split = [entry.replace('.tif', '') for entry in filenames]

In [75]:
def thresholding(image, threshold): #2 tone
    """Return a two-tone, inverted version of `image`.

    Calls cv2.threshold with cv2.THRESH_BINARY_INV and a maximum value of 255
    and hands back only the thresholded image (the second element of the pair
    that cv2.threshold returns).
    """
    return cv2.threshold(image, threshold, 255, cv2.THRESH_BINARY_INV)[1]

In [76]:
def directionalHistogram(img, direction):
    """Count the pixels equal to 255 along one axis of a 2-D image.

    Parameters
    ----------
    img : 2-D array
        Image whose pixels are compared against 255.
    direction : str
        'V' gives one count per column, 'H' gives one count per row.
        Any other value yields an empty list.

    Returns
    -------
    list of int
        The per-column ('V') or per-row ('H') counts.

    Notes
    -----
    The last row and the last column of `img` are not inspected, so the
    result has one entry fewer than the corresponding image dimension. This
    mirrors the previous loop-based implementation and is kept on purpose so
    that the histogram lengths seen by callers do not change.
    """
    n_rows, n_cols = img.shape

    # Boolean mask of "ink" pixels, with the final row/column left out (see Notes).
    ink = np.asarray(img)[:max(n_rows - 1, 0), :max(n_cols - 1, 0)] == 255

    if direction == 'V':
        # Collapse the rows: one count per column.
        return np.count_nonzero(ink, axis=0).tolist()
    if direction == 'H':
        # Collapse the columns: one count per row.
        return np.count_nonzero(ink, axis=1).tolist()
    return []

In [77]:
def smoothHist(hist,kernel_size):
    """Smooth a histogram with a moving average.

    The histogram is convolved (np.convolve, mode='same') with a flat kernel
    of `kernel_size` equal weights that add up to one.
    """
    uniform_weights = np.ones(kernel_size) / kernel_size
    smoothed = np.convolve(hist, uniform_weights, mode='same')
    return smoothed

In [78]:
def peakinterp(interp_factor, hist, prominence_factor):
    """Upsample a histogram by linear interpolation and locate its peaks.

    Parameters
    ----------
    interp_factor : int
        Number of interpolated samples generated per original sample.
    hist : sequence of numbers
        The histogram to upsample.
    prominence_factor : number
        A peak is kept only if its prominence is at least
        max(interpolated histogram) / prominence_factor.

    Returns
    -------
    tuple
        (peaks, hist_interp, new_pixel_space, pixel_space): the peak indices
        reported by find_peaks on the interpolated histogram, the interpolated
        histogram itself, the fine sample positions and the original integer
        sample positions.
    """
    n_samples = len(hist)
    n_fine = interp_factor * n_samples

    # Original integer grid and the finer grid expressed in the same units.
    pixel_space = np.linspace(0, n_samples - 1, n_samples)
    new_pixel_space = np.linspace(0, n_fine - 1, n_fine) * (1 / interp_factor)

    hist_interp = np.interp(new_pixel_space, pixel_space, hist)

    # Peaks must also be at least 50 interpolated samples wide.
    min_prominence = np.max(hist_interp) / prominence_factor
    peaks, _ = find_peaks(hist_interp, prominence=min_prominence, width=50)

    return (peaks, hist_interp, new_pixel_space, pixel_space)

In [79]:
def gradientSign(hist_interp, resampled_pixel_space, Original_pixel_space):
    """Return the sign pattern of the first derivative of a histogram.

    The derivative is estimated with np.gradient. The result holds 1 wherever
    the derivative is greater than or equal to zero and 0 wherever it is
    negative. `resampled_pixel_space` and `Original_pixel_space` are accepted
    but not used.
    """
    slope = np.gradient(hist_interp)
    non_decreasing = slope >= 0
    return np.where(non_decreasing, 1, 0)

In [80]:
def rle(ia):
    """Run-length encode a 1-D sequence.

    Parameters
    ----------
    ia : array-like
        Sequence of values (e.g. the 0/1 gradient-sign vector). Plain Python
        lists are accepted as well as NumPy arrays.

    Returns
    -------
    tuple
        (run_lengths, start_positions, run_values) as NumPy arrays, one entry
        per run of equal consecutive values. For an empty input the result is
        (None, None, None).
    """
    # Coerce first: element-wise comparison and index-array lookup below only
    # work on arrays (on a list, `!=` would collapse to a single bool).
    ia = np.asarray(ia)

    n = len(ia)
    if n == 0:
        return (None, None, None)

    changes = ia[1:] != ia[:-1]                            # True where the next value differs
    run_ends = np.append(np.flatnonzero(changes), n - 1)   # last index of every run
    run_lengths = np.diff(np.append(-1, run_ends))         # distance between consecutive run ends
    run_starts = np.cumsum(np.append(0, run_lengths))[:-1] # first index of every run
    return (run_lengths, run_starts, ia[run_ends])

In [81]:
def cutPositions(runlengths, startpositions, values, threshold,interp_factor):
    """Suppress short runs in a run-length encoded sign vector and list the cuts.

    Parameters
    ----------
    runlengths, startpositions, values : sequences of equal length
        Run lengths, run start indices and run values (0 or 1).
        `values` is modified IN PLACE by this function.
    threshold : number
        Runs strictly shorter than this take over the value of the run just
        before them (the first run always keeps its own value).
    interp_factor : number
        Accepted but not used here.

    Returns
    -------
    tuple
        (cutpos, new_hist). `cutpos` lists the start position of every run
        where the value switches from 0 to 1, plus a leading 0 when the first
        run has value 1. `new_hist` is a flat list of floats holding, for each
        run, (run length - 1) copies of that run's value.
    """
    # Step 1: overwrite short runs with the (already processed) previous run.
    for idx, run_len in enumerate(runlengths):
        if run_len < threshold:
            values[idx] = values[idx - 1] if idx else values[0]

    # Step 2: expand the cleaned runs into a piecewise-constant list.
    new_hist = []
    for idx in range(len(startpositions)):
        flat_len = runlengths[idx] - 1
        segment = np.ones(flat_len) if values[idx] else np.zeros(flat_len)
        new_hist.extend(segment.tolist())

    # Step 3: collect the positions where the sign goes from 0 to 1.
    cutpos = []
    for idx in range(1, len(startpositions)):
        before, after = values[idx - 1], values[idx]
        if before == 0 and after == 1:
            cutpos.append(startpositions[idx])
        elif before == 1 and idx == 1:
            # The sequence opens with a run of ones: cut at the very beginning.
            cutpos.append(0)

    return (cutpos, new_hist)

In [82]:
def optimalThreshold(cutpos, runlengths, startpositions, values, new_hist, peaks, init_threshold, interp_factor):
    """Raise the run-length threshold until there are as many cuts as peaks.

    Parameters
    ----------
    cutpos, new_hist :
        Result of an earlier cutPositions call; used as the starting point.
    runlengths, startpositions, values :
        Run-length encoding passed on to cutPositions (which modifies
        `values` in place on every call).
    peaks : sequence
        Detected peaks; only its length is used.
    init_threshold : number
        Threshold to start from; increased by `interp_factor` per iteration.
    interp_factor : number
        Step of the threshold search and divisor applied to the final cuts.

    Returns
    -------
    tuple
        (cutpos, new_hist) where `cutpos` is a NumPy array of cut positions
        divided by `interp_factor`.

    Notes
    -----
    The search used to loop forever when the two counts could never be made
    equal. It now stops, with a warning, once the threshold exceeds every run
    length (or when `interp_factor` is not positive) and returns the last
    result it obtained.
    """
    import warnings  # local import so this cell does not depend on the import cell

    longest_run = max(runlengths, default=0)
    stalled = False

    while len(cutpos) != len(peaks):
        if stalled or interp_factor <= 0:
            warnings.warn(
                "optimalThreshold: could not match %d cut(s) to %d peak(s); "
                "returning the last cut positions found" % (len(cutpos), len(peaks))
            )
            break
        init_threshold = init_threshold + interp_factor
        (cutpos, new_hist) = cutPositions(runlengths, startpositions, values, init_threshold, interp_factor)
        # Once the threshold is above every run length, cutPositions overwrites
        # every run, so raising it further cannot change the outcome.
        stalled = init_threshold > longest_run

    # Final pass with the threshold stepped back by one increment, exactly as before.
    # NOTE(review): `values` has already been modified in place by the calls above,
    # so this pass works on the smoothed runs — confirm that is the intent.
    (cutpos, new_hist) = cutPositions(runlengths, startpositions, values, np.abs(init_threshold - interp_factor), interp_factor)
    cutpos = np.array(cutpos) / interp_factor

    return (cutpos, new_hist)

In [83]:
def cropImageToLines(cutpos, image, direction='H'):
    """Cut a 2-D image into consecutive strips starting at the given positions.

    Parameters
    ----------
    cutpos : sequence of int
        Start position of every strip, in increasing order.
    image : 2-D array
        The image to cut.
    direction : str
        'H' (default) cuts along rows, producing horizontal strips; any other
        value cuts along columns.

    Returns
    -------
    list of arrays
        One strip per entry of `cutpos`, in the same order. Strip k runs from
        cutpos[k] up to the next cut position, and the last strip runs to the
        end of the image.

    Notes
    -----
    Previously the first strip was taken between the LAST and the first cut
    (negative index wrap-around), which produced an empty or out-of-order
    strip and dropped the final one whenever the first cut was not 0.
    As before, one pixel is trimmed from the end of every strip and from the
    far edge of the image.
    """
    n_rows, n_cols = image.shape
    along_rows = direction == 'H'

    # Strip boundaries: every cut position followed by the image extent.
    bounds = [int(p) for p in cutpos] + [n_rows if along_rows else n_cols]

    cropped_images = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        # Keep the historical one-pixel trim, but never let the end fall
        # before the start (a negative end would wrap around).
        stop = max(stop - 1, start)
        if along_rows:
            cropped_images.append(image[start:stop, 0:n_cols - 1])
        else:
            cropped_images.append(image[0:n_rows - 1, start:stop])

    return cropped_images

In [84]:
#loop over all paragraph images

# Create the output folder with the standard library: portable, and silent
# when the folder already exists (the `!mkdir` shell call printed an error then).
os.makedirs('Lines', exist_ok=True)
for m in range(len(filenames)):
    #Read the paragraph image in grey-scale and apply thresholding
    image = cv2.imread('Train/'+filenames[m],0)
    if image is None:
        #cv2.imread yields None for entries it cannot decode; skip them
        continue
    (w,h) = image.shape
    thresh1=thresholding(image, 240)

    #obtaining horizontal histogram and smoothing it
    hist_horizontal=directionalHistogram(thresh1, 'H')
    if len(hist_horizontal) == 0:
        #nothing to analyse (the smoothing and peak search need at least one sample)
        continue
    hist_horizontal_smooth=smoothHist(hist_horizontal,17)

    #Obtaining peak locations from the smoothed horizontal histogram
    init_threshold=50
    interp_factor=100
    (peaks,smooth_interp, new_pixel_space, pixel_space)=peakinterp(interp_factor, hist_horizontal_smooth, 8)
    grad_sign = gradientSign(smooth_interp, new_pixel_space, pixel_space)
    #obtaining the piecewise constant function approximating the sign change behavior of the 1st derivative of the horizontal histogram
    runlengths, startpositions, values =rle(grad_sign)
    (cutpos, new_hist)=cutPositions(runlengths, startpositions, values, init_threshold, interp_factor)

    #Removing undesired sign changes from the piecewise function which are the result of noise or numerical artifacts, not the desired peaks
    #(same starting threshold and step as above, passed by name instead of repeating the literals)
    cutpos, new_hist=optimalThreshold(cutpos, runlengths, startpositions, values, new_hist, peaks, init_threshold, interp_factor)

    #saving the lines extracted from the image
    lines= cropImageToLines(cutpos.astype(int), thresh1)
    for i in range(len(lines)):
        if lines[i].size == 0:
            #an empty crop cannot be encoded as an image; keep the index so file names stay stable
            continue
        cv2.imwrite("Lines/"+filenames_split[m]+"_"+str(i)+".tif", lines[i])

# Word Segmentation

In [85]:
# Names of every entry found in the 'Lines/' folder.
filenames = os.listdir('Lines/')
# The same names with the '.tif' text removed, for building output file names.
filenames_split = [entry.replace('.tif', '') for entry in filenames]

In [86]:
def cropLineToWords(viable_sequences, image):
    """Cut a line image into vertical slices between consecutive cut columns.

    Parameters
    ----------
    viable_sequences : sequence of int
        Column positions to cut at. Negative values are interpreted by normal
        slicing rules (e.g. a trailing -1 ends the last slice one column
        before the right edge).
    image : 2-D array
        The line image.

    Returns
    -------
    list of arrays
        With two or more positions: one slice per consecutive pair
        (positions[k] up to positions[k + 1]). With exactly one position: a
        single slice from that column to the right edge of the image. With
        none: an empty list.

    Notes
    -----
    The single-position case used to slice up to len(viable_sequences)
    instead of the image width, which produced an empty or meaningless
    slice. As before, the last row of the image is left out of every slice.
    """
    (n_rows, n_cols) = image.shape

    if len(viable_sequences) == 1:
        # Only a starting column is known: take everything to the right edge.
        return [image[0:n_rows - 1, viable_sequences[0]:n_cols]]

    return [
        image[0:n_rows - 1, left:right]
        for left, right in zip(viable_sequences[:-1], viable_sequences[1:])
    ]

In [87]:
def removeSpaces(words):
    """Return the images from `words` that contain at least one pixel above 0.

    Order is preserved; images whose pixels are all 0 or below are dropped.
    """
    kept = []
    for candidate in words:
        positive_pixels = np.sum(candidate[:, :] > 0)
        if positive_pixels:
            kept.append(candidate)
    return kept

In [88]:
#loop over all the lines in your line images folder

# Create the output folder with the standard library: portable, and silent
# when the folder already exists (the `!mkdir` shell call printed an error then).
os.makedirs('Words', exist_ok=True)

for m in range(len(filenames)):

    filename=filenames_split[m]
    words=[]

    #read the line image in grey-scale
    img=cv2.imread('Lines/'+filenames[m], 0)
    if img is None:
        #cv2.imread yields None for entries it cannot decode; skip them
        continue
    #get dimensions of the image
    (w,h) = img.shape
    #count the pixels equal to 255 in every column of the line image
    hist=directionalHistogram(img, 'V')
    #find the columns where that count is zero (background spaces between words)
    zero_sites=np.where(np.asarray(hist)==0)
    zero_sites=zero_sites[0]

    sequences=[]
    sequence_start=0
    #reset for every image: otherwise the end-of-line branch below could read a
    #value left over from the previous image, or fail with a NameError on the first one
    sequence_end=0

    #get the start and end of zero sequences of black spaces in the vertical histogram
    for i in range(1,len(zero_sites)):
        last_zero=zero_sites[i-1]
        current_zero=zero_sites[i]

        if(current_zero!=last_zero+1): #there is a word between these 2 values
            sequence_end=last_zero
            sequences.append([sequence_start,sequence_end])
            sequence_start=current_zero

        if(current_zero==last_zero+1 and i==len(zero_sites)-1): #end of the line
            #NOTE(review): the start is taken from the previous sequence end, so this
            #final sequence also spans the last word — confirm this is intended
            sequence_start=sequence_end
            sequence_end=current_zero
            sequences.append([sequence_start,sequence_end])

    sequence_lengths=[]
    for i in range(len(sequences)): #length of spaces
        sequence_lengths.append(sequences[i][1]-sequences[i][0]+1)


    #Threshold the size of the zero sequences (whether it is big enough to consider it as
    # an interword spacing or small enough to consider as intraword spacing)
    sequence_ratio = np.asarray(sequence_lengths)/w
    average_sequence_length = np.sum(sequence_lengths[1:len(sequence_lengths)-1])/len(sequence_lengths) #ignore edge gaps
    viable_sequences = []
    overlap_factor = 0.75 * average_sequence_length
    viable_sequences_unrolled = []

    #used 0 instead of average_sequence_length - overlap_factor to breakdown into letters where possible instead of words

    for i in range(len(sequences)):
        if(sequence_lengths[i] >= 0):#average_sequence_length - overlap_factor): #space is between words
            viable_sequences.append(sequences[i])
            viable_sequences_unrolled.append(sequences[i][0])
            viable_sequences_unrolled.append(sequences[i][1])

    #a trailing -1 makes the last slice end one column before the right edge
    viable_sequences_unrolled.append(-1)
    if(viable_sequences_unrolled[0] != 0):
        viable_sequences_unrolled = [0] + viable_sequences_unrolled
    words.append(cropLineToWords(viable_sequences_unrolled, img))

    ordered_words=[]

    #remove the spaces (word images with blank black background)
    for i in range(len(words[0])):
        word = words[0][i]
        #named ink_total so the builtin sum() is not overwritten for the rest of the notebook
        ink_total = np.sum(words[0][i][:,:])
        if(ink_total):
            ordered_words.append(word)
        else:
            ordered_words.append('space')

    #save word images to the word directory
    count=0
    for i in range(len(ordered_words)):
        if(not type(ordered_words[i]) is str):
            count+=1
            cv2.imwrite("Words/"+filename+'_word'+str(count)+".tif", ordered_words[i])

A subdirectory or file Words already exists.


# Image Preprocessing

In [89]:
# Names of every entry found in the 'Words/' folder (the dataset to preprocess).
filenames = os.listdir('Words/')
# The same names with the '.tif' text removed, for building output file names.
filenames_split = [entry.replace('.tif', '') for entry in filenames]

In [90]:
image_width = 64
image_height = 32
def distortion_free_resize(image, img_size=(image_height, image_width)):
    """Resize an image to fit `img_size` without distortion, then pad it.

    The image is resized with tf.image.resize (preserve_aspect_ratio=True),
    padded with tf.pad on the first two axes so that it reaches
    (height, width) = `img_size`, and finally transposed (first two axes
    swapped) and flipped left-to-right. When the padding on an axis is odd,
    the extra unit goes to the top / left side.

    `image` must have three axes (the padding and transpose specify three).
    """
    target_h, target_w = img_size
    image = tf.image.resize(image, size=(target_h, target_w), preserve_aspect_ratio=True)

    # Total padding still missing on each of the first two axes.
    resized_shape = tf.shape(image)
    pad_height = target_h - resized_shape[0]
    pad_width = target_w - resized_shape[1]

    # Split each total in two; the leading side receives the remainder.
    pad_height_bottom = pad_height // 2
    pad_height_top = pad_height - pad_height_bottom
    pad_width_right = pad_width // 2
    pad_width_left = pad_width - pad_width_right

    padding_spec = [
        [pad_height_top, pad_height_bottom],
        [pad_width_left, pad_width_right],
        [0, 0],
    ]
    image = tf.pad(image, paddings=padding_spec)

    image = tf.transpose(image, perm=[1, 0, 2])
    image = tf.image.flip_left_right(image)

    return image

In [91]:
def crop_image(image, direction='H'): #function to crop the image in vertical and horizontal directions
    """Trim an image to the span where its directional histogram is non-zero.

    Images with either dimension below 10 are returned untouched. Otherwise
    the histogram from directionalHistogram(image, direction) is scanned for
    its first and last non-zero entries and the image is sliced between them:
    along rows for 'H', along columns for 'V'. Any other direction returns
    the image unchanged.
    """
    n_rows, n_cols = image.shape
    if n_rows < 10 or n_cols < 10:
        return image

    hist = directionalHistogram(image, direction)
    last_index = len(hist) - 1

    # First non-zero entry scanning forward (the final entry is not examined);
    # 0 when none is found.
    startpos = next((k for k in range(last_index) if hist[k] != 0), 0)
    # First non-zero entry scanning backward (index 0 is not examined);
    # 0 when none is found.
    endpos = next((k for k in range(last_index, 0, -1) if hist[k] != 0), 0)

    # An end at index 0 or 1 is replaced by the last histogram index.
    if endpos in (0, 1):
        endpos = last_index

    # Very narrow spans are widened by four positions on one side.
    if np.abs(startpos - endpos) <= 2:
        if startpos - 4 >= 0:
            startpos -= 4
        else:
            endpos += 4

    # Slice from the smaller to the larger bound.
    if startpos < endpos:
        low, high = startpos, endpos
    else:
        low, high = endpos, startpos

    if direction == 'H':
        return image[low : high, :]
    if direction == 'V':
        return image[:, low : high]
    return image

In [92]:
# Create the output folder with the standard library: portable, and silent
# when the folder already exists (unlike the `!mkdir` shell call).
os.makedirs('Preprocessed', exist_ok=True)

for m in range(len(filenames)):

    # read the image in grey scale
    image = cv2.imread('Words/'+filenames[m],0)
    if image is None:
        # cv2.imread yields None for entries it cannot decode; skip them
        continue

    #crop the image to the letter, first along rows and then along columns

    image_h_cropped=crop_image(image, 'H')
    image_hv_cropped=crop_image(image_h_cropped, 'V')

    nh, nw = image_hv_cropped.shape
    if (nh < 30 and nw < 30) or nh < 6 or nw < 6: #ignore punctuation and accidental small marks
        continue

    #convert image to RGB because 3 dimensions are required to resize
    image_hv_cropped=cv2.cvtColor(image_hv_cropped,cv2.COLOR_GRAY2RGB)

    #distortionless resize with tensorflow
    image=distortion_free_resize(image_hv_cropped)

    #convert image back to np array as grey scale and save it as jpg
    image=cv2.cvtColor(image.numpy(),cv2.COLOR_RGB2GRAY)
    #distortion_free_resize returns the image transposed and mirrored; rot90 turns it upright again
    image=np.rot90(image)

    cv2.imwrite("Preprocessed/"+filenames_split[m]+".jpg", image)