In [2]:
# imports & matplotlib options
%matplotlib notebook

import pandas as pd
import numpy as np
import ipywidgets as widgets
import cv2
import collections
import random
from shutil import copy
from pathlib import Path
from datetime import datetime
from collections import OrderedDict
from ipywidgets import interact, IntProgress, Label, VBox
from IPython.display import display, HTML
from matplotlib import pyplot as plt
import img_qc.img_qc as img_qc
from skimage.measure import compare_ssim as ssim
from PIL import Image

# plt.rc('figure', figsize=(30.0, 20.0))
display(HTML("<style>.container {width:85% !important;}</style>"))

print(f'\n{25 * "*_*"}\nLast run on {datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}')


*_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_*
Last run on 2019-08-28 09:54:01 PM


In [3]:
# pre-built lists to iterate over

# lowercase English month names in calendar order
months = [
    'january',
    'february',
    'march',
    'april',
    'may',
    'june',
    'july',
    'august',
    'september',
    'october',
    'november',
    'december'
]

# field names for a region-of-interest box; nothing in the visible cells reads
# this list -- presumably it labels OCR box records (TODO confirm against caller)
roi_box_list = ['text', 'left', 'top', 'width', 'height']

print(f'\n{25 * "*_*"}\nLast run on {datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}')


*_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_*
Last run on 2019-08-28 09:54:02 PM


In [4]:
# create page 1 paths list
data_dir_path = Path('/Volumes/jmoor167/data/agrtfn')
page_1_paths_list = sorted(data_dir_path.glob('*.tif'))

# delete macOS '.' index files
hidden_paths = [
    candidate for candidate in page_1_paths_list if candidate.name.startswith('.')]
for path in hidden_paths:
    path.unlink()  # delete it

# the listing is stale once anything was removed, so build it again
regenerate_paths_list = bool(hidden_paths)
if regenerate_paths_list:
    page_1_paths_list = sorted(data_dir_path.glob('*.tif'))

print(f'{len(page_1_paths_list)} images in page 1 paths list')


print(f'\n{25 * "*_*"}\nLast run on {datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}')

3451 images in page 1 paths list

*_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_*
Last run on 2019-08-28 09:54:05 PM


In [5]:
# data for creating sub-crops to find

# create OrderedDictionary for Title Crop data
# layout: title -> sub_title -> [adminDB number, (x1, y1, x2, y2) crop box]
title_data_ordered_dict = OrderedDict([
    ('Agricultural and home economics packet', OrderedDict([
        ('rgb_0', [3295, (250, 450, 1450, 1150)]),
    ])),
    ('Agricultural news', OrderedDict([
        ('rgb_0', [3446, (1150, 50, 3200, 850)]),
    ])),
    ('Farm news', OrderedDict([
        ('gray_0', [2364, (1400, 500, 2650, 875)]),
    ])),
    ('Agricultural & home economics news', OrderedDict([
        ('rgb_0', [2750, (1100, 50, 3150, 900)]),
        ('rgb_1', [2951, (250, 500, 3050, 1100)]),
    ])),
    ('Tennessee farm and home news', OrderedDict([
        ('gray_0', [1670, (1400, 150, 3150, 300)]),
        ('gray_1', [964, (1400, 150, 3175, 290)]),
        ('gray_2', [2151, (1250, 150, 3025, 325)]),
    ])),
    ('Tennessee farm news', OrderedDict([
        ('gray_0', [5, (725, 525, 2100, 750)]),
        ('gray_1', [15, (550, 600, 1950, 750)]),
        ('gray_2', [456, (1000, 550, 2400, 675)]),
        ('gray_3', [254, (800, 500, 2200, 650)]),
        ('gray_4', [44, (850, 475, 2250, 600)]),
    ])),
])

print(f'\n{25 * "*_*"}\nLast run on {datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}')


*_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_*
Last run on 2019-08-28 09:54:07 PM


In [6]:
# utility functions
def crop_image_for_processing(image, percentage=0.4, top_and_sides_padding=10):
    """Return the top portion of ``image`` with a margin trimmed off.

    Args:
        image: array-like exposing ``.shape`` (rows, columns, ...) and 2-D slicing.
        percentage: fraction of the image height to keep, measured from the top.
        top_and_sides_padding: pixels removed from the top, left and right edges.
            The same amount is added back at the bottom, so the kept height stays
            ``int(height * percentage)`` rows.

    Returns:
        The sliced image (a view when ``image`` is a numpy array).
    """
    rows, columns = image.shape[:2]
    margin = top_and_sides_padding

    left = margin
    top = margin
    right = columns - margin
    # shift the bottom edge down by what was trimmed from the top
    bottom = int(rows * percentage) + margin

    return image[top:bottom, left:right]


def get_np_crop_points(crop_box):
    """Return the axis-aligned bounding box of a set of corner points.

    Args:
        crop_box: sequence of points indexed as ``crop_box[i][0] == (x, y)``
            (the layout produced by ``np.int32(cv2.perspectiveTransform(...))``
            elsewhere in this notebook).

    Returns:
        ``(x1, y1, x2, y2)`` -- the minimum and maximum x and y over all points.
        Negative coordinates are replaced by their absolute value, as before.

    Raises:
        ValueError: if ``crop_box`` contains no points.
    """
    # Every point contributes, including coordinates equal to 0. The previous
    # truthiness test (`if x:`) silently dropped zeros, so a box touching the
    # image edge reported a too-large minimum (or failed on an empty list).
    x_points = [abs(point[0][0]) for point in crop_box]
    y_points = [abs(point[0][1]) for point in crop_box]

    # find extremes in crop box
    x1 = min(x_points)
    x2 = max(x_points)
    y1 = min(y_points)
    y2 = max(y_points)

    return x1, y1, x2, y2


def if_rgb_convert_to_gray(np_image):
    """Return ``np_image`` as a single-channel image, treating colour input as RGB.

    Arrays with two or fewer dimensions are returned untouched; anything else is
    converted with ``cv2.COLOR_RGB2GRAY``.
    """
    already_single_channel = len(np_image.shape) <= 2
    if already_single_channel:
        return np_image

    return cv2.cvtColor(np_image, cv2.COLOR_RGB2GRAY)


def if_bgr_convert_to_gray(np_image):
    """Return ``np_image`` as a single-channel image, treating colour input as BGR.

    Arrays with two or fewer dimensions are returned untouched; anything else is
    converted with ``cv2.COLOR_BGR2GRAY``.
    """
    already_single_channel = len(np_image.shape) <= 2
    if already_single_channel:
        return np_image

    return cv2.cvtColor(np_image, cv2.COLOR_BGR2GRAY)


def resize(image, width=None, height=None, ratio=None, inter=cv2.INTER_AREA):
    """Resize ``image`` to a target width or height, keeping the aspect ratio.

    Args:
        image: array exposing ``.shape`` as (rows, columns, ...).
        width: target width in pixels; takes precedence when both are given.
        height: target height in pixels, used only when ``width`` is None.
        ratio: accepted for signature compatibility; any value passed in is
            overwritten by the computed scale factor.
        inter: OpenCV interpolation flag handed to ``cv2.resize``.

    Returns:
        ``(resized_image, ratio)`` where ``ratio`` is the scale factor applied.
        NOTE: when both ``width`` and ``height`` are None the unchanged image
        is returned on its own, not as a tuple.
    """
    source_height, source_width = image.shape[:2]

    # nothing requested: hand the input straight back
    if width is None and height is None:
        return image

    if width is not None:
        # scale by the requested width
        ratio = width / float(source_width)
        target_size = (width, int(source_height * ratio))
    else:
        # scale by the requested height
        ratio = height / float(source_height)
        target_size = (int(source_width * ratio), height)

    return cv2.resize(image, target_size, interpolation=inter), ratio


def resize_ratio(image, ratio, inter=cv2.INTER_AREA):
    """Scale ``image`` by ``ratio`` along both axes and return the result.

    Args:
        image: array exposing ``.shape`` as (rows, columns, ...).
        ratio: multiplier applied to both width and height (truncated to int).
        inter: OpenCV interpolation flag handed to ``cv2.resize``.
    """
    source_height, source_width = image.shape[:2]
    target_size = (int(source_width * ratio), int(source_height * ratio))

    return cv2.resize(image, target_size, interpolation=inter)


def quick_imshow(bgr_image):
    """Display an OpenCV image with matplotlib.

    Arrays with more than two dimensions are converted from BGR to RGB before
    plotting; anything else is drawn with the 'gray' colour map.
    """
    has_channels = len(bgr_image.shape) > 2
    if has_channels:
        to_show = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
        colour_map = None
    else:
        to_show = bgr_image
        colour_map = 'gray'

    plt.imshow(to_show, cmap=colour_map)
    plt.show()

    return


print(f'\n{25 * "*_*"}\nLast run on {datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}')


*_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_*
Last run on 2019-08-28 09:54:07 PM


In [7]:
# sub-crop finding classes
class RootSIFT:
    """Wrap a SIFT-style extractor and return RootSIFT descriptors.

    RootSIFT descriptors are the element-wise square root of the L1-normalised
    SIFT descriptors (the Hellinger kernel).
    """

    def __init__(self, extractor):
        # initialize the SIFT feature extractor
        self.extractor = extractor

    def compute(self, image, keypoints, epsilon=1e-7):
        """Compute RootSIFT descriptors for ``keypoints`` in ``image``.

        Args:
            image: image passed through to the wrapped extractor.
            keypoints: keypoints already detected in ``image``. Pass ``None`` to
                have the extractor detect them itself.
            epsilon: small constant that avoids division by zero while
                L1-normalising.

        Returns:
            ``(keypoints, descriptors)``, or ``([], None)`` when there is nothing
            to describe.
        """
        if keypoints is None:
            # no keypoints supplied: detect and describe in one pass
            (keypoints, descriptors) = self.extractor.detectAndCompute(image, None)
        elif len(keypoints) == 0:
            return ([], None)
        else:
            # Describe the keypoints the caller already detected. Previously this
            # argument was ignored and detection was run a second time.
            (keypoints, descriptors) = self.extractor.compute(image, keypoints)

        # if there are no keypoints or descriptors, return an empty tuple
        if descriptors is None or len(keypoints) == 0:
            return ([], None)

        # apply the Hellinger kernel by first L1-normalizing and taking the
        # square root
        descriptors = descriptors / (descriptors.sum(axis=1, keepdims=True) + epsilon)
        descriptors = np.sqrt(descriptors)

        # return a tuple of the keypoints and descriptors
        return (keypoints, descriptors)


print(f'\n{25 * "*_*"}\nLast run on {datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}')


*_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_*
Last run on 2019-08-28 09:54:08 PM


In [8]:
# image sub-crop finding functions


def create_crop_dictionary(data_dictionary, show_images=False):
    """Build the reference title crops and their features.

    Args:
        data_dictionary: mapping ``title -> {sub_title: (adminDB, crop_box)}``
            where ``crop_box`` is ``(x1, y1, x2, y2)``. The page image is looked
            up in the module-level ``page_1_paths_list`` by a file name ending in
            the zero-padded ``adminDB`` number followed by ``_0001.tif``.
        show_images: when True, display each crop with ``quick_imshow``.

    Returns:
        ``OrderedDict`` mapping ``title -> {sub_title: (crop, keypoints,
        descriptors)}``. Sub-titles whose image is missing or unreadable are
        reported and left out.
    """
    crop_dictionary = OrderedDict()

    for title, data in data_dictionary.items():

        print(f'Processing {title} . . .')
        print(f'\t\tsub_title\tadminDB\t\tcrop_box (x1, y1, x2, y2)')

        title_crops = crop_dictionary.setdefault(title, OrderedDict())

        for sub_title, (adminDB, crop_box) in data.items():
            print(f'\t\t{sub_title}\t\t{adminDB}\t\t{crop_box}')

            # first page whose file name ends with the padded identifier
            adminDB_ending = f'{str(adminDB).zfill(6)}_0001.tif'
            image_path = next(
                (x for x in page_1_paths_list if x.name.endswith(adminDB_ending)),
                None)
            if image_path is None:  # no image match found
                print(f'No image for {title} - {sub_title}\n')
                continue

            # cv2.imread signals an unreadable file by returning None
            image_for_title_crop = cv2.imread(str(image_path))
            if image_for_title_crop is None:
                print(f'Could not read {image_path} for {title} - {sub_title}\n')
                continue

            x1, y1, x2, y2 = crop_box
            title_crop = image_for_title_crop[y1:y2, x1:x2].copy()

            keypoints, descriptors = get_keypoints_and_descriptors(title_crop)

            # keep the first entry seen for a sub_title
            if sub_title not in title_crops:
                title_crops[sub_title] = title_crop, keypoints, descriptors

            if show_images:
                quick_imshow(title_crop)

    return crop_dictionary


def get_keypoints_and_descriptors(image):
    """Detect SIFT keypoints in ``image`` and describe them with RootSIFT.

    Args:
        image: single-channel or colour image array.

    Returns:
        ``(keypoints, descriptors)`` as returned by ``RootSIFT.compute``.
    """
    # convert to grayscale if necessary
    # NOTE(review): the images in this notebook come from cv2.imread, which
    # yields BGR, while this helper converts as RGB. The swap is applied to the
    # reference crops and the searched pages alike, so it is left as is; confirm
    # before changing.
    image_gray = if_rgb_convert_to_gray(image)

    # create SIFT object
    sift = cv2.xfeatures2d.SIFT_create()
    extractor = RootSIFT(sift)

    # Only keypoints are needed from this first pass; the descriptors come from
    # the RootSIFT extractor, so do not compute a set that is thrown away.
    keypoints = sift.detect(image_gray, None)
    keypoints, descriptors = extractor.compute(image_gray, keypoints)

    return keypoints, descriptors


# https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_feature2d/py_feature_homography/py_feature_homography.html
def find_crop_rootSift(image_search_for, image_look_in, minimum_matches, distance_ratio):
    """Locate ``image_search_for`` inside ``image_look_in`` with RootSIFT + homography.

    Args:
        image_search_for: template image (anything ``np.array`` accepts).
        image_look_in: image to search (anything ``np.array`` accepts).
        minimum_matches: fewest ratio-test matches required before a homography
            is attempted.
        distance_ratio: ratio-test threshold; a match is kept when its distance
            is below ``distance_ratio`` times the second-best distance.

    Returns:
        ``(number_of_good_matches, match_drawing, crop_box, found_image)``.
        The last three are None when there are too few matches or no homography
        could be estimated.
    """
    image_search_for = np.array(image_search_for)
    image_look_in = np.array(image_look_in)

    # convert to grayscale if necessary
    image_search_for_gray = if_rgb_convert_to_gray(image_search_for)
    image_look_in_gray = if_rgb_convert_to_gray(image_look_in)

    # create SIFT object
    sift = cv2.xfeatures2d.SIFT_create()
    extractor = RootSIFT(sift)

    # detect keypoints, then describe them with RootSIFT
    keypoints = sift.detect(image_search_for_gray, None)
    keypoints_1, descriptors_1 = extractor.compute(
        image_search_for_gray, keypoints)
    keypoints = sift.detect(image_look_in_gray, None)
    keypoints_2, descriptors_2 = extractor.compute(
        image_look_in_gray, keypoints)

    # featureless images yield no descriptors; k=2 matching needs at least two
    # candidates on the searched side
    if descriptors_1 is None or descriptors_2 is None or len(descriptors_2) < 2:
        return 0, None, None, None

    FLANN_INDEX_KDTREE = 0
    index_parameters = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
    search_parameters = dict(checks=50)

    flann = cv2.FlannBasedMatcher(index_parameters, search_parameters)

    matches = flann.knnMatch(descriptors_1, descriptors_2, k=2)

    # ratio test; skip entries that did not return two neighbours
    good_matches = []
    for pair in matches:
        if len(pair) < 2:
            continue
        m, n = pair
        if m.distance < distance_ratio * n.distance:
            good_matches.append(m)
    number_of_good_matches = len(good_matches)

    # a homography needs at least 4 point correspondences
    enough_matches = (number_of_good_matches >= minimum_matches
                      and number_of_good_matches >= 4)
    if enough_matches:
        source_points = np.float32(
            [keypoints_1[m.queryIdx].pt for m in good_matches]).reshape(-1, 1, 2)
        destination_points = np.float32(
            [keypoints_2[m.trainIdx].pt for m in good_matches]).reshape(-1, 1, 2)

        matrix, _ = cv2.findHomography(
            source_points, destination_points, cv2.RANSAC, 5.0)
        if matrix is not None:

            # project the template corners into the searched image
            height, width = image_search_for_gray.shape
            points = np.float32(
                [[0, 0], [0, height-1], [width-1, height-1], [width-1, 0]]).reshape(-1, 1, 2)
            destination = cv2.perspectiveTransform(points, matrix)

            crop_box = np.int32(destination)

            # outline the located region on a copy of the searched image
            match_drawing = image_look_in.copy()
            match_drawing = cv2.polylines(
                match_drawing, [crop_box], True, (255, 0, 0), 10, cv2.LINE_AA)

            # crop found image by warping the located quadrilateral back onto
            # the template's rectangle
            perspective_matrix = cv2.getPerspectiveTransform(
                np.float32(destination), points)
            found_image = cv2.warpPerspective(
                image_look_in, perspective_matrix, (width, height))

            return number_of_good_matches, match_drawing, crop_box, found_image

    # return None for images if there weren't enough matches
    return number_of_good_matches, None, None, None


def get_descriptor_matches(descriptors_1, descriptors_2, distance_ratio):
    """Match two descriptor sets with FLANN and keep matches passing the ratio test.

    Args:
        descriptors_1: query descriptors (one row per keypoint), or None.
        descriptors_2: train descriptors (one row per keypoint), or None.
        distance_ratio: a match is kept when its distance is below
            ``distance_ratio`` times the distance of the second-best match.

    Returns:
        ``(number_of_good_matches, good_matches)``; ``(0, [])`` when either side
        has no descriptors or the train side has fewer than the two rows that
        k=2 matching needs.
    """
    # an image without features yields None descriptors; nothing to match
    if descriptors_1 is None or descriptors_2 is None:
        return 0, []
    if len(descriptors_1) == 0 or len(descriptors_2) < 2:
        return 0, []

    # match descriptors
    FLANN_INDEX_KDTREE = 0
    index_parameters = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
    search_parameters = dict(checks=50)

    flann = cv2.FlannBasedMatcher(index_parameters, search_parameters)

    matches = flann.knnMatch(descriptors_1, descriptors_2, k=2)

    # ratio test; skip entries that did not return two neighbours
    good_matches = []
    for pair in matches:
        if len(pair) < 2:
            continue
        m, n = pair
        if m.distance < distance_ratio * n.distance:
            good_matches.append(m)
    number_of_good_matches = len(good_matches)

    return number_of_good_matches, good_matches


def get_matching_crop_and_box(matches, keypoints_to_find, keypoints_look_in, image_look_for, image_look_in):
    """Estimate where the template sits in the searched image and cut it out.

    Args:
        matches: matches whose ``queryIdx`` indexes ``keypoints_to_find`` and
            whose ``trainIdx`` indexes ``keypoints_look_in``.
        keypoints_to_find: keypoints of the template image.
        keypoints_look_in: keypoints of the searched image.
        image_look_for: template image; only its height and width are used.
        image_look_in: image the matching region is warped out of.

    Returns:
        ``(found_image, crop_box)``. ``found_image`` is the located region
        warped to the template's size and ``crop_box`` holds the four projected
        corner points as int32. Both are ``False`` when no homography could be
        estimated.
    """
    found_image, crop_box = False, False

    # a homography needs at least 4 point correspondences
    if len(matches) < 4:
        return found_image, crop_box

    source_points = np.float32(
        [keypoints_to_find[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
    destination_points = np.float32(
        [keypoints_look_in[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)

    matrix, _ = cv2.findHomography(
        source_points, destination_points, cv2.RANSAC, 5.0)
    if matrix is not None:

        # project the template corners into the searched image
        height, width = image_look_for.shape[:2]
        points = np.float32(
            [[0, 0], [0, height-1], [width-1, height-1], [width-1, 0]]).reshape(-1, 1, 2)
        destination = cv2.perspectiveTransform(points, matrix)

        crop_box = np.int32(destination)

        # crop found image by warping the located quadrilateral back onto the
        # template's rectangle
        perspective_matrix = cv2.getPerspectiveTransform(
            np.float32(destination), points)
        found_image = cv2.warpPerspective(
            image_look_in, perspective_matrix, (width, height))

    return found_image, crop_box


print(f'\n{25 * "*_*"}\nLast run on {datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}')


*_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_*
Last run on 2019-08-28 09:54:08 PM


In [9]:
# create crops to find
title_crops_dict = create_crop_dictionary(
    title_data_ordered_dict, show_images=False)

print(f'\n{25 * "*_*"}\nLast run on {datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}')

Processing Agricultural and home economics packet . . .
		sub_title	adminDB		crop_box (x1, y1, x2, y2)
		rgb_0		3295		(250, 450, 1450, 1150)
Processing Agricultural news . . .
		sub_title	adminDB		crop_box (x1, y1, x2, y2)
		rgb_0		3446		(1150, 50, 3200, 850)
Processing Farm news . . .
		sub_title	adminDB		crop_box (x1, y1, x2, y2)
		gray_0		2364		(1400, 500, 2650, 875)
Processing Agricultural & home economics news . . .
		sub_title	adminDB		crop_box (x1, y1, x2, y2)
		rgb_0		2750		(1100, 50, 3150, 900)
		rgb_1		2951		(250, 500, 3050, 1100)
Processing Tennessee farm and home news . . .
		sub_title	adminDB		crop_box (x1, y1, x2, y2)
		gray_0		1670		(1400, 150, 3150, 300)
		gray_1		964		(1400, 150, 3175, 290)
		gray_2		2151		(1250, 150, 3025, 325)
Processing Tennessee farm news . . .
		sub_title	adminDB		crop_box (x1, y1, x2, y2)
		gray_0		5		(725, 525, 2100, 750)
		gray_1		15		(550, 600, 1950, 750)
		gray_2		456		(1000, 550, 2400, 675)
		gray_3		254		(800, 500, 2200, 650)
		gray_4		44		

In [10]:
# recursive getsizeof Object and all items contained in it
from sys import getsizeof


def get_size(obj, seen=None):
    """Return the total ``getsizeof`` of ``obj`` and everything it contains.

    Dict keys and values, an object's ``__dict__`` and the items of any
    iterable other than str/bytes/bytearray are followed recursively. ``seen``
    holds the ids of objects already counted so each is counted once.
    """
    total = getsizeof(obj)
    if seen is None:
        seen = set()

    marker = id(obj)
    if marker in seen:
        return 0
    # record the object before descending so self-references terminate
    seen.add(marker)

    if isinstance(obj, dict):
        for value in obj.values():
            total += get_size(value, seen)
        for key in obj.keys():
            total += get_size(key, seen)
    elif hasattr(obj, '__dict__'):
        total += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        for item in obj:
            total += get_size(item, seen)

    return total


print(
    f'title_crops_dict is {round(get_size(title_crops_dict)/1024/1024, 2)} megabytes')

title_crops_dict is 39.18 megabytes


In [11]:
class TnFarmNews:
    """One page image whose masthead title is guessed by feature matching.

    After ``guess_title`` the result is available as:

    * ``BEST_TITLE`` -- title of the best candidate, or False when none passed.
    * ``BEST_TITLE_CROP`` -- ``[x1, y1, x2, y2]`` of that candidate, or False.
    * ``BEST_SSIM_WITH_TITLE`` -- its structural similarity score (-1 initially).
    * ``MOST_MATCHES`` -- starts at ``config['minimum_matches']``; half of it is
      the match-count threshold, and it is overwritten with the match count of
      each new best candidate.
    """

    # defaults used for any key missing from a caller-supplied config
    DEFAULT_CONFIG = {
        'crop_percentage': 0.4,
        'distance_ratio': 0.6,
        'minimum_matches': 9,
        'top_and_bottom_padding': 10,
        'resize_width': 1500,
        # a located title smaller than this (in pixels) is a false positive
        'min_title_width': 900,
        'min_title_height': 50,
    }

    def __init__(self, image_path, config=False):
        """Store the image path and settings; no image is read here.

        Args:
            image_path: path of the page image.
            config: optional dict overriding entries of ``DEFAULT_CONFIG``;
                keys left out keep their default value.
        """
        self.image_path = Path(image_path)

        # set config dictionary (copied, so the class defaults are never mutated)
        self.config = dict(self.DEFAULT_CONFIG)
        if config:
            self.config.update(config)

        # filled in by preprocess_image()
        self.image = None
        self.image_cropped = None

        # set variables
        self.MOST_MATCHES = self.config['minimum_matches']
        self.BEST_SSIM_WITH_TITLE = -1  # on a scale of -1->1, 1 is a perfect match
        self.BEST_TITLE_CROP = False
        self.BEST_TITLE = False
        self.RESIZE_RATIO = False

    def preprocess_image(self, crop_percentage=None, top_and_bottom_padding=None):
        """Load the image and keep its padded top portion.

        Args:
            crop_percentage: fraction of the height to keep; falls back to the
                config value when omitted (or falsy).
            top_and_bottom_padding: margin in pixels passed to
                ``crop_image_for_processing``; falls back to the config value
                when None, so an explicit 0 is honoured.

        Returns:
            The cropped image, also stored as ``self.image_cropped``.

        Raises:
            OSError: if the image cannot be read.
        """
        if not crop_percentage:
            crop_percentage = self.config['crop_percentage']
        if top_and_bottom_padding is None:
            top_and_bottom_padding = self.config['top_and_bottom_padding']

        # load image; cv2.imread returns None instead of raising on failure
        self.image = cv2.imread(str(self.image_path))
        if self.image is None:
            raise OSError(f'Could not read image: {self.image_path}')

        # get technical metadata
        self.height, self.width = self.image.shape[:2]
        # a third dimension means colour channels (the labels were swapped before)
        if len(self.image.shape) > 2:
            self.colorspace = 'rgb'
        else:
            self.colorspace = 'gray'

        # crop image and get shape
        self.image_cropped = crop_image_for_processing(
            self.image, crop_percentage, top_and_bottom_padding)
        self.height_cropped, self.width_cropped = self.image_cropped.shape[:2]

        return self.image_cropped

    def guess_title(self, crop_dictionary, image=None, debug=False):
        """Find the reference title crop that best matches this page.

        Args:
            crop_dictionary: ``title -> {sub_title: (crop_image, keypoints,
                descriptors)}`` as built by ``create_crop_dictionary``.
            image: image to search; when None the page is loaded and cropped
                with ``preprocess_image``.
            debug: print and plot intermediate results.

        Returns:
            None. Results are stored on the ``BEST_*`` / ``MOST_MATCHES``
            attributes.
        """
        if debug:
            print(f'Guessing title of {self.image_path.name} . . .')

        # `not image` is ambiguous for numpy arrays, so test for None explicitly
        if image is None:
            image = self.preprocess_image()

        self.keypoints, self.descriptors = get_keypoints_and_descriptors(image)
        if self.descriptors is None:
            # nothing to match against, e.g. a featureless page
            if debug:
                print('\tNo features found in image')
            return

        # if width/height of title isn't over min_title_width/height pixels
        # it's not the title
        min_title_width = self.config['min_title_width']
        min_title_height = self.config['min_title_height']

        for title, data in crop_dictionary.items():

            if debug:
                print(title)

            for sub_title, sub_data in data.items():

                crop_image, crop_keypoints, crop_descriptors = sub_data
                if crop_descriptors is None:
                    # reference crop without features cannot be matched
                    continue

                number_of_matches, matches = get_descriptor_matches(
                    crop_descriptors, self.descriptors, self.config['distance_ratio'])
                if debug:
                    print(sub_title)
                    print(f'# of matches/minMatches: {number_of_matches}/{(self.MOST_MATCHES * 0.5)}')

                if number_of_matches < (self.MOST_MATCHES * 0.5):
                    continue

                # enough matches: it might be the best match
                found_image, crop_box = get_matching_crop_and_box(
                    matches, crop_keypoints, self.keypoints, crop_image, image)

                # anything but an array means no homography was found
                if not isinstance(crop_box, np.ndarray):
                    if debug:
                        print(f'\t\tFalse positive: invalid crop_box')
                    continue

                if debug:
                    print(f'crop box: {crop_box}')
                x1, y1, x2, y2 = get_np_crop_points(crop_box)

                title_width = x2 - x1
                title_height = y2 - y1

                if title_width < min_title_width:
                    if debug:
                        print(f'\t\t\tFalse positive: width < minimum')
                        print(
                            f'\t\t\t\t{x2} - {x1} = {title_width} < {min_title_width}')
                    continue
                elif title_height < min_title_height:
                    if debug:
                        print(f'\t\t\tFalse positive: height < minimum')
                        print(
                            f'\t\t\t\t{y2} - {y1} = {title_height} < {min_title_height}')
                    continue

                if debug:
                    print(x1, y1, x2, y2)
                    crop = image[int(y1):int(y2), int(x1):int(x2)]
                    print(f'cropped image')
                    quick_imshow(crop)

                # convert to grayscale for ssim
                found_gray = if_bgr_convert_to_gray(found_image)
                crop_gray = if_bgr_convert_to_gray(crop_image)

                if debug:
                    print('found_gray')
                    quick_imshow(found_gray)
                    print(f'crop_gray')
                    quick_imshow(crop_gray)

                # get the structural similarity index of the match with the
                # image for a 2nd heuristic
                try:
                    ssim_value = ssim(found_gray, crop_gray)
                except ValueError:  # image shape doesn't match, so most likely incorrect title
                    if debug:
                        print('wrong dimensions')
                    continue

                if ssim_value > self.BEST_SSIM_WITH_TITLE:
                    if debug:
                        print(f'new best ssim: {title, ssim_value}')
                    self.BEST_SSIM_WITH_TITLE = ssim_value
                    self.BEST_TITLE_CROP = [x1, y1, x2, y2]
                    self.BEST_TITLE = title
                    self.MOST_MATCHES = number_of_matches

    def ocr(self):
        """Binarize the cropped page and OCR it into ``self.line_and_word_boxes``.

        NOTE(review): ``tool`` and ``pyocr`` are not defined or imported in the
        visible part of this notebook; they must exist in the kernel for this
        method to run.
        """
        # make sure there is a cropped image to work on
        if self.image_cropped is None:
            self.preprocess_image()

        image = cv2.cvtColor(self.image_cropped, cv2.COLOR_BGR2GRAY)

        # blur image
        image = cv2.GaussianBlur(image, (3, 3), 1)

        # binarize
        self.binarized = cv2.adaptiveThreshold(
            image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 175, 21)

        # ocr
        self.line_and_word_boxes = tool.image_to_string(
            Image.fromarray(self.binarized),
            lang='eng',
            builder=pyocr.builders.LineBoxBuilder(),
        )


print(f'\n{25 * "*_*"}\nLast run on {datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}')


*_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_*
Last run on 2019-08-28 09:54:19 PM


In [17]:
# results saved by earlier runs, or an empty frame when nothing was saved yet
csv_path = data_dir_path / 'agrtfn_title.csv'
processed_df = pd.read_csv(csv_path) if csv_path.is_file() else pd.DataFrame()
processed_df

Unnamed: 0,path,title_crop,title_guess,title_matches_number,title_ssim
0,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,,,9,-1.000000
1,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,,,9,-1.000000
2,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,,,9,-1.000000
3,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,"[803, 563, 2200, 793]",Tennessee farm news,18,0.643979
4,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,,,9,-1.000000
5,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,"[1972, 100, 6164, 767]",Tennessee farm news,6,0.316811
6,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,"[452, 671, 3145, 831]",Tennessee farm news,10,0.486516
7,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,"[540, 589, 1939, 739]",Tennessee farm news,1243,1.000000
8,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,"[593, 51, 2851, 2363]",Tennessee farm news,8,0.531900
9,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,"[477, 301, 2006, 547]",Tennessee farm news,6,0.576308


In [18]:
# there are duplicate rows due to different data paths
# add image_name as a DataFrame column so this can be parsed
# (only when a path column is present: an empty frame, or a CSV written by the
# batch cell, has none, and the cell must still be safe to run)
if 'path' in processed_df.columns:
    processed_df['image_name'] = processed_df.path.str.split('/').str[-1]
processed_df.head(5)

Unnamed: 0,path,title_crop,title_guess,title_matches_number,title_ssim,image_name
0,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,,,9,-1.0,0012_004266_000001_0001.tif
1,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,,,9,-1.0,0012_004266_000002_0001.tif
2,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,,,9,-1.0,0012_004266_000006_0001.tif
3,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,"[803, 563, 2200, 793]",Tennessee farm news,18,0.643979,0012_004266_000009_0001.tif
4,/Volumes/jmoor167/data/tnfarmnews/0012_004266_...,,,9,-1.0,0012_004266_000011_0001.tif


In [19]:
# independent copy of processed_df taken at this point in the notebook
paths_df = processed_df.copy()

In [20]:
# drop the path column; errors='ignore' keeps the cell safe to re-run once the
# column is already gone
processed_df = processed_df.drop(columns=['path'], errors='ignore')
processed_df.head(3)

Unnamed: 0,title_crop,title_guess,title_matches_number,title_ssim,image_name
0,,,9,-1.0,0012_004266_000001_0001.tif
1,,,9,-1.0,0012_004266_000002_0001.tif
2,,,9,-1.0,0012_004266_000006_0001.tif


In [26]:
# batch process
images_to_process_before_saving_csv = 20


def save_processed_rows(processed_df, rows_list, csv_path):
    """Merge newly processed rows into ``processed_df`` and write the CSV.

    Args:
        processed_df: results gathered so far.
        rows_list: list of result dictionaries (one per image) to add.
        csv_path: destination of the CSV file.

    Returns:
        The merged frame, de-duplicated, sorted by ``image_name`` and with a
        fresh 0..n-1 index.
    """
    # get dataframe from processed rows
    crop_df = pd.DataFrame(rows_list)

    # add dataframes together; ignore_index gives every row a unique label.
    # Without it the new rows reuse labels 0..k-1, and using those labels as
    # positions replaced the new rows with copies of the first rows.
    merged_df = pd.concat([processed_df, crop_df], ignore_index=True)

    # drop duplicates; rows are compared as strings because title_crop holds
    # lists, which cannot be hashed
    is_duplicate = merged_df.astype(str).duplicated().values
    merged_df = merged_df.loc[~is_duplicate]

    # sort on image_name and reset the index
    merged_df = merged_df.sort_values(by='image_name').reset_index(drop=True)
    merged_df.to_csv(csv_path, index=False)

    return merged_df


rows_list = []

# uncomment to reset processed_df
# processed_df = pd.DataFrame()

# pages are processed in sorted order; swap in the commented line below to
# process them in random order instead
paths_list = page_1_paths_list
# paths_list = random.sample(page_1_paths_list, len(page_1_paths_list))

number_of_paths = len(paths_list)

# progress bar
progress_label = Label('Images to process')
progress_bar = IntProgress(min=0, max=number_of_paths)
progress_widget = VBox([progress_label, progress_bar])
display(progress_widget)


for index, image_path in enumerate(paths_list, start=1):

    # save every N images and after the last image
    is_save_point = (index % images_to_process_before_saving_csv == 0
                     or index == number_of_paths)

    # if the image is already in the dataframe skip it; the file name MUST be
    # unique anyway as it's based on the preservation identifier
    already_processed = (
        len(processed_df) > 0
        and processed_df['image_name'].str.endswith(str(image_path.name)).any())

    if not already_processed:
        label = f'Processing {image_path.name} . . . {index}/{number_of_paths}'
        progress_label.value = label

        issue = TnFarmNews(image_path)
        issue.guess_title(title_crops_dict, debug=False)

        # get input row in dictionary format
        # key = column_name
        results_dictionary = {'image_name': issue.image_path.name,
                              'title_guess': issue.BEST_TITLE,
                              'title_crop': issue.BEST_TITLE_CROP,
                              'title_matches_number': issue.MOST_MATCHES,
                              'title_ssim': issue.BEST_SSIM_WITH_TITLE
                              }

        # "nothing found" is stored as False; record it as a missing value.
        # Only False is replaced, so a legitimate 0 / 0.0 result is kept.
        results_dictionary = {
            key: (None if value is False else value)
            for key, value in results_dictionary.items()}

        rows_list.append(results_dictionary)

    progress_bar.value = index

    if is_save_point and len(rows_list) > 0:
        print(f'Saving data from {index} images to {csv_path}')
        processed_df = save_processed_rows(processed_df, rows_list, csv_path)

        # reset rows_list
        rows_list = []

VBox(children=(Label(value='Images to process'), IntProgress(value=0, max=3451)))

Saving data from 40 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 60 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 80 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 100 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 120 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 140 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 160 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 180 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 200 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 220 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 240 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 260 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 280 images to /Volumes/jmoor167/data/agrtfn/agrtfn

Saving data from 2140 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 2160 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 2180 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 2200 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 2220 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 2240 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 2260 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 2280 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 2300 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 2320 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 2340 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 2360 images to /Volumes/jmoor167/data/agrtfn/agrtfn_title.csv
Saving data from 2380 images to /Volumes/jmoor167/da

In [None]:
# batch process
rows_list = []

images_to_process_before_saving_csv = 20

# uncomment to reset processed_df
# processed_df = pd.DataFrame()

# let's process the pages in random order
# paths_list = page_1_paths_list
paths_list = random.sample(page_1_paths_list, len(page_1_paths_list))

number_of_paths = len(paths_list)

# progress bar
progress_label = Label('Images to process')
progress_bar = IntProgress(min=0, max=number_of_paths)
progress_widget = VBox([progress_label, progress_bar])
display(progress_widget)


def _append_and_save_by_path(existing_df, new_rows, destination):
    """Merge freshly processed rows into the running results and write them to CSV.

    Parameters
    ----------
    existing_df : pandas.DataFrame
        Results accumulated so far (may be empty).
    new_rows : list of dict
        Result dictionaries collected since the last save.
    destination : path-like
        CSV file that receives the merged, sorted frame.

    Returns
    -------
    tuple
        ``(merged_df, batch_df)``: the merged frame with one row per ``path``
        (first occurrence kept), sorted by ``path`` with a fresh index, and the
        frame built from ``new_rows`` alone.
    """
    batch_df = pd.DataFrame(new_rows)
    merged_df = (
        pd.concat([existing_df, batch_df])
        .drop_duplicates(subset=['path'])
        .sort_values(by='path')
        .reset_index(drop=True)
    )
    merged_df.to_csv(destination, index=False)
    return merged_df, batch_df


for index, image_path in enumerate(paths_list, start=1):

    # a save is due on every Nth path and on the last path. It is evaluated for
    # skipped paths too: otherwise rows collected before a run of already-processed
    # paths (or before a skipped final path) would never reach the CSV.
    save_is_due = (index % images_to_process_before_saving_csv == 0
                   or index == number_of_paths)

    # test for the paths that end with our filename as the filename MUST be unique
    # anyway as it's based on the preservation identifier
    already_processed = (
        len(processed_df) > 0
        and processed_df['path'].str.endswith(str(image_path.name)).any()
    )

    if not already_processed:
        label = f'Processing {image_path.name} . . . {index}/{number_of_paths}'
        progress_label.value = label

        issue = TnFarmNews(image_path)
        issue.guess_title(title_crops_dict, debug=False)

        # get input row in dictionary format
        # key = column_name
        results_dictionary = {'path': str(issue.image_path),
                              'title_guess': issue.BEST_TITLE,
                              'title_crop': issue.BEST_TITLE_CROP,
                              'title_matches_number': issue.MOST_MATCHES,
                              'title_ssim': issue.BEST_SSIM_WITH_TITLE
                              }

        # every falsy result (None, 0, '', empty container) is stored as None
        results_dictionary = {key: (value if value else None)
                              for key, value in results_dictionary.items()}

        rows_list.append(results_dictionary)

        progress_bar.value = index

    # write pending rows to the CSV; nothing to do when every path since the
    # last save was skipped
    if save_is_due and len(rows_list) > 0:

        print(f'Saving data from {index} images to {csv_path}')

        processed_df, crop_df = _append_and_save_by_path(processed_df, rows_list, csv_path)

        # reset rows_list
        rows_list = []

In [None]:
# get dataframe from processed rows
# rows_list holds only the rows collected since the batch loop last saved and reset it,
# so crop_df here is the not-yet-merged remainder (empty if everything was saved)
crop_df = pd.DataFrame(rows_list)
# bare expression: rendered as this cell's output
crop_df

In [None]:
# Fold crop_df into processed_df: append, keep the first row seen for each path,
# order by path, renumber the index, then persist the result to the CSV.
processed_df = (
    pd.concat([processed_df, crop_df])
    .drop_duplicates(subset=['path'])
    .sort_values(by='path')
    .reset_index(drop=True)
)
processed_df.to_csv(csv_path, index=False)

In [None]:
# number of rows currently held in processed_df
len(processed_df)

In [None]:
# render the accumulated results frame as this cell's output
processed_df

In [None]:
# Preview how many rows survive a minimum of 10 title matches.
# Background (from the original note): the minimum was raised to 10 while chasing the
# false positive on data/images/0012_004266_000395_0001.tif; in the end the rows with
# fewer than 8 matches were dropped from processed_df and the frame was saved to CSV.
rows_before_drop = len(processed_df)
print(f'before drop: {rows_before_drop}')

has_enough_matches = processed_df['title_matches_number'].ge(10)
no_low_matches_df = processed_df.loc[has_enough_matches].reset_index(drop=True)

rows_after_drop = len(no_low_matches_df)
print(f'after drop: {rows_after_drop}')

# uncomment to make the filtered frame the working frame
# processed_df = no_low_matches_df

In [None]:
# scratch check: confirm isinstance() reports a plain string literal as `str`
# (the review loops in the following cells use the same test on title_crop values)
string = 'string'
isinstance(string, str)

In [None]:
# Walk every row of processed_df: print its match count, path and guessed title,
# then display the stored title crop cut out of the source image.
for i in range(len(processed_df)):
    row = processed_df.iloc[i]
    path, title = row['path'], row['title_guess']
    matches, crop_box = row['title_matches_number'], row['title_crop']
    print(matches, path)
    print(title)

    # title_crop is either a real list or its string form (e.g. after a CSV round trip);
    # anything else means there is no crop to show for this row
    if isinstance(crop_box, list):
        points = crop_box
    elif isinstance(crop_box, str):
        crop_box = crop_box.replace('[', '').replace(']', '')
        points = [int(x) for x in crop_box.split(', ')]
    else:
        continue

    # the four values are used as x1, y1, x2, y2 slice bounds
    x1, y1, x2, y2 = points
    image = cv2.imread(path)
    crop = image[y1:y2, x1:x2]
    quick_imshow(crop)
    print('')

In [None]:
# Inspect the rows of processed_df with fewer than 10 title matches: dump the raw
# row values (including the type of title_crop) and display the stored crop.
low_matches_df = processed_df.loc[processed_df['title_matches_number'].lt(10)].reset_index(drop=True)
for i in range(len(low_matches_df)):
    row = low_matches_df.iloc[i]
    path, title = row['path'], row['title_guess']
    matches, crop_box = row['title_matches_number'], row['title_crop']

    print(title, matches)
    print(path)
    print(type(crop_box))
    print(crop_box)
    print('')

    # title_crop arrives either as a list or as the string form of one;
    # any other type means there is nothing to crop for this row
    if isinstance(crop_box, list):
        print(f'crop_box: {crop_box}')
        points = crop_box
    elif isinstance(crop_box, str):
        print('string')
        crop_box = crop_box.replace('[', '').replace(']', '')
        points = [int(x) for x in crop_box.split(', ')]
    else:
        continue

    # the four values are used as x1, y1, x2, y2 slice bounds
    x1, y1, x2, y2 = points
    image = cv2.imread(path)
    crop = image[y1:y2, x1:x2]
    quick_imshow(crop)

In [None]:
# Inspect the rows of `test` with fewer than 10 title matches and display each stored crop.
# NOTE(review): `test` is assigned by the pd.read_csv cell further down the notebook,
# so that cell has to run before this one.
low_matches_df = test.loc[test['title_matches_number'].lt(10)].reset_index(drop=True)
for i in range(len(low_matches_df)):
    row = low_matches_df.iloc[i]
    path, title, matches = row['path'], row['title_guess'], row['title_matches_number']

    # title_crop is parsed from its string form '[x1, y1, x2, y2]'
    crop_box = row['title_crop']
    crop_box = crop_box.replace('[', '').replace(']', '')
    points = [int(x) for x in crop_box.split(', ')]
    x1, y1, x2, y2 = points

    print(title, matches)
    # only the weakest matches get their path printed
    if matches < 8:
        print(path)

    image = cv2.imread(path)
    crop = image[y1:y2, x1:x2]
    quick_imshow(crop)

In [None]:
# Inspect the rows of `test` whose title SSIM is below 0.5 and display each stored crop.
# NOTE(review): `test` is assigned by the pd.read_csv cell further down the notebook,
# so that cell has to run before this one.
low_ssim_df = test[test['title_ssim'] < 0.5].reset_index(drop=True)
for i in range(len(low_ssim_df)):
    row = low_ssim_df.iloc[i]
    path = row['path']
    title = row['title_guess']
    matches = row['title_matches_number']
    # Deliberately NOT named `ssim`: the imports cell binds `ssim` to skimage's
    # compare_ssim, and rebinding it to a float here would break any notebook code
    # that calls ssim(...) after this cell has run.
    title_ssim = row['title_ssim']

    print(title, title_ssim)

    # title_crop is either a real list or its string form '[x1, y1, x2, y2]';
    # anything else (e.g. a missing value) means there is no crop to show
    crop_box = row['title_crop']
    if isinstance(crop_box, str):
        crop_box = crop_box.replace('[', '').replace(']', '')
        points = [int(x) for x in crop_box.split(', ')]
    elif isinstance(crop_box, list):
        points = crop_box
    else:
        continue
    x1, y1, x2, y2 = points

    image = cv2.imread(path)
    # cv2.imread signals an unreadable or missing file by returning None instead of raising
    if image is None:
        print(f'could not read {path}')
        continue
    crop = image[y1:y2, x1:x2]
    quick_imshow(crop)

In [None]:
# Load the saved title results and list the rows with fewer than 15 title matches.
test = pd.read_csv(title_csv_path)
test.loc[test['title_matches_number'].lt(15)]

In [None]:
# Show the image referenced by the row labelled 0 of crop_df, then its title crop.
# NOTE(review): the batch cells in this notebook build crop_df with an 'image_name'
# or 'path' column; confirm that an 'image_path' column actually exists here.
image_path = crop_df['image_path'][0]
image = cv2.imread(str(image_path))
# title_crop is unpacked into four values used as x1, y1, x2, y2 slice bounds
x1, y1, x2, y2 = crop_df['title_crop'][0]
image_cropped = image[y1:y2, x1:x2]
# each line is a tuple expression: draw the image, then call plt.show()
plt.imshow(image), plt.show()
plt.imshow(image_cropped), plt.show()

In [None]:
@interact
def show_images(file=page_1_paths_list):
    """Run OCR and title detection on one page-1 image and print the guessed title.

    Decorated with ``interact``, so the notebook renders a selection widget built
    from ``page_1_paths_list`` and calls this function with the chosen path.

    Parameters
    ----------
    file : pathlib.Path
        Path of the page-1 image to analyse.
    """
    # load image
    issue = TnFarmNews(file)
    issue.ocr()
    # draw_month_box(issue.line_and_word_boxes, issue.image_cropped)
    issue.guess_title(title_crops_dict, debug=True)
    # the batch cells read the guess from BEST_TITLE, so use the same attribute here
    print(issue.BEST_TITLE)

In [None]:
# this is my attempt to try and make a crop-value widget!
# NOTE(review): unfinished work in progress. The cell is not valid Python yet (see the
# notes on `initial_x2` and `crop_x1`) and uses several names it never defines.

# filename -> Path lookup for the page-1 images
page_1_paths_dictionary = {path.name: path for path in page_1_paths_list}
initial_path = page_1_paths_list[0]
initial_images_key = initial_path.name
# NOTE(review): `image` is not assigned in this cell; presumably `initial_path` was meant -- confirm
initial_image = cv2.imread(image)
# NOTE(review): assignment has no right-hand side (syntax error)
initial_x2 =

# Create widgets
# NOTE(review): `initial_path_key` is not defined; the name assigned above is `initial_images_key`
images = widgets.Dropdown(
    options=page_1_paths_dictionary, value=initial_path_key)
# NOTE(review): the `value=(100, ,` tuple is incomplete (syntax error)
crop_x1 = widgets.IntRangeSlider(value=(100, , min=0, max=initial_image.shape[1])
crop_y1=widgets.IntSlider(value=0, min=0, max=initial_image.shape[0])
crop_x2=widgets.IntSlider()

# Updates the image options based on directory value
# NOTE(review): `directory` is not defined in this cell and `os` is not among the imports
# in the imports cell; this also assigns a directory listing to a slider's `max`.
# Looks like leftover code from a directory/file-browser example -- confirm intent.
def update_crop(*args):
    crop_x1.max=os.listdir(directory.value)

# Tie the image options to directory value
directory.observe(update_crop, 'value')

# Show the images
# NOTE(review): this redefines `show_images`, which an earlier cell already defines under
# @interact. `Image` is imported from PIL in the imports cell; this call looks like it
# expects a display-image class instead -- confirm.
def show_images(fdir, file):
    display(Image(f'{fdir}/{file}'))

_=interact(show_images, fdir=directory, file=images)
def crop_image(file=page_1_paths_list, x1=(0, Image.open(file).size[0])):
    # load image
    issue=TnFarmNews(file)
    issue.ocr()
    draw_month_box(issue.line_and_word_boxes, issue.image_cropped)
    # image = Image.open(file)
    # temp_image_path = Path('_temp_image.jpg')
    # image.save(temp_image_path)
    # display(ipyImage(temp_image_path))
    issue.guess_title(debug=True)
    print(issue.best_title)