In [56]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
from skimage.filters import threshold_otsu
import os
from PIL import Image


In [81]:
test_directory = './data/43290879-American_Samoa/'
test_header = './data/43290879-American_Samoa/samoa-header.jpg'
out_folder = 'segdata/American_Samoa/blocks/'

In [84]:
# for root, dirs, files in os.walk(test_directory):
#     for filename in files:
#         if len(dirs)==0:
#             print(root+'/'+filename)

In [44]:
def block_segmenter(directory,header,outputfolder,rotate):
    head_img = cv2.imread(header)
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if len(dirs) == 0:
                fname = root + '/' + filename
                img = cv2.imread(fname)
                crop_img = edge_clip_rotate(img,rotate)
                temp_suc,seg_img = template_match(crop_img,head_img)
                if temp_suc:
                    cv2.imwrite(outputfolder+'seg-'+filename,seg_img)
                    
                else:
                    cv2.imwrite(outputfolder+'fail-'+filename,seg_img)
                    
               
            

In [43]:
def edge_clip_rotate(image,rotate):
    gray = cv2.cvtColor(image,cv2.COLOR_RGB2GRAY)
    _,thresh = cv2.threshold(gray, np.mean(gray), 255, cv2.THRESH_BINARY_INV)
    edges = cv2.dilate(cv2.Canny(thresh,0,255), None)
    cnt = sorted(cv2.findContours(edges, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2], key = cv2.contourArea)[-1]
    mask = np.zeros(gray.shape[0:2],np.uint8)
    masked = cv2.drawContours(mask, [cnt],-1,255,-1)
    
    left = cnt[cnt[:,:,0].argmin()][0]
    right = cnt[cnt[:,:,0].argmax()][0]
    top = cnt[cnt[:,:,1].argmin()][0]
    bottom = cnt[cnt[:,:,1].argmax()][0]
    cropped_image = image[left[0]:right[0],top[1]:bottom[1]]
    
    if(rotate):
        lb = (bottom[0]-left[0])/(bottom[1]-left[1])
        rb = (right[1]-bottom[1])/(bottom[0]-right[0])
        tl = (left[1]-top[1])/(top[0]-left[0])
        tr = (right[0]-top[0])/(right[1]-top[1])


        if right[1]<left[1]: #if right_y <left_y, that is, page is shifted cw relative to straight
            angle = sum([np.arctan(1/tl),np.arctan(1/rb)])*90/np.pi
        else:
            angle = sum([np.arctan(tr),np.arctan(lb)])*-90/np.pi
            multiplier = -1

        (h,w) = cropped_image.shape[:2]
        center = (w/2,h/2)
        rotmat = cv2.getRotationMatrix2D(center,angle,1)
        croprot_image = cv2.warpAffine(cropped_image,rotmat,(w,h))
        return croprot_image
    
    else:
        return cropped_image

In [40]:
def template_match(cropped, header):
    if header.shape[1]>cropped.shape[1]:
        return False, cropped
    w,h = header.shape[::-1][1:3]
    res = cv2.matchTemplate(cropped,header,cv2.TM_CCOEFF)
    min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res)
    top_left = max_loc
    bottom_right = (top_left[0]+w,top_left[1]+h)
    segmented = cropped[bottom_right[1]:,top_left[0]:bottom_right[0],:]
    return True, segmented

In [85]:
#block_segmenter(test_directory,test_header,out_folder)

Pretty good- some duds, but quite a few came out very nicely. You might want to add a test to see if you need to do the crop/rotation part- ex: Alabama clearly doesn't need it, so putting it in there might be a notable loss by doing it. But, it's to be seen. If you just run Alabama and it works fine, why worry?

In [2]:
direc = './data'

In [37]:
# for root, dirs, files in os.walk(direc):
#     for filename in files:
#         if len(dirs) ==2:
#             print(root+'/'+filename)
#             print(root.split('-')[1])

In [45]:
def full_directory_block_seg(directory,folders_per_state,rotate):
    
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if len(dirs) == folders_per_state:
                header = (root+'/'+filename).replace('\\','/')
                state_dir = (root+'/').replace('\\','/')
                output = 'segdata/'+root.split('-')[1] +'/blocks/'
                os.mkdir('./segdata/'+root.split('-')[1])
                os.mkdir('./'+output)
                block_segmenter(state_dir,header,output,rotate)

            

In [46]:
full_directory_block_seg('./data',2,False)

11.3 GB, around 1/3 was stripped from the top-level data. Looking at the results, it seems not using the rotation method is smarter. In theory, it can help better align the pages for template matching and cleaner cutting. In practice, however, it seems that the archivists did a good job in aligning the files, to the point that the crude left/right/top/bottom methods are not as useful. To be more precise- there are in some cases papers underneath the currently scanned document, and with angle deviations < 10 degrees, it can often be the case that, for example, even in a counter-clockwise shifted page, the 'left' value and the 'bottom' value might have very similar x values and depending on some factors on placement, it greatly shift the angle, instead of just nudging it, as would be prefered.

As a further note, it may be the case that the clipping of the black outer line is unnecessary- and in some cases, counterproductive. In some cases, the algorithm finds the dividing line between Col. 14 and 15 and divides on that line- making cutting on the header impossible. In particular, future segmentation may be better served by just segmenting based on a header and footer image and ignoring the left and right segmentation- but as it is now, we already have more than enough entries to label.

As a further note- while there was one header image used per each state, depending on the view of the data, it might be possible to do multiple states with one header, or on the other hand, using one header per folder, depending on how much the formatting can change

Let's now look at how many 'failures' (in this case, only defined as cases where the header image was wider than the cropped image) there were in each state, by state, so as to better understand the data.

In [73]:
a.shape[0]

2

In [63]:
a = np.array([1,'a'])

In [72]:
np.append(a,[2,'b']).reshape([2,2]).shape

(2, 2)

In [83]:
a = np.array([])
for root, dirs, files in os.walk('./segdata'):
    if len(files)>2:
        failed = 0
        for filename in files:
            if filename.split('-')[0] == 'fail':
                failed = failed + 1
                
        a = np.append(a,[root.split('\\')[1],len(files),failed,round(failed/len(files),2)])
        
a = a.reshape([int(a.shape[0]/4),4])

In [84]:
a = pd.DataFrame(a,columns=['state','total','failed','percent_failed'])

In [86]:
a.set_index('state',inplace=True)

In [91]:
a.sort_values(by='percent_failed',ascending=False).head(10)

Unnamed: 0_level_0,total,failed,percent_failed
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Virgin_Islands,50,50,1.0
Panama_Canal_Zone,104,104,1.0
Alaska,33,33,1.0
Guam,50,50,1.0
Puerto_Rico,56,56,1.0
Colorado,32,21,0.66
Wyoming,5,2,0.4
New_Hampshire,61,20,0.33
Georgia,54,9,0.17
New_Jersey,62,10,0.16


It's worth thinking about why exactly those 5 states failed in every single case- with Guam, it seems the clipping algorithm is mainly to blame, as it cut almost every table on the aforementioned division between col 14 and 15. The others are mysterious, however. That being said, we have more than enough data as is to proceed with just we have. This will be something for further consideration further down the line.

In the next notebook, we will finally whittle all these segments down into cells that can be labeled