In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.ion()

In [1]:
# import and display options
# %matplotlib inline
# %matplotlib notebook
# %matplotlib qt
from collections import OrderedDict
from pathlib import Path

import cv2
import ipywidgets as widgets
import numpy as np
import pandas as pd
from dateutil.parser import parse
from IPython.display import clear_output, display, HTML
from ipywidgets import interact, IntProgress, Label, VBox, HBox
# from matplotlib import pyplot as plt

import plotly.graph_objects as go
import pyocr.builders
import pyocr
from datetime import datetime
from autocorrect import spell
from fuzzywuzzy import fuzz, process
import img_qc.img_qc as img_qc
from PIL import Image

tools = pyocr.get_available_tools()
tool = tools[0]

display(HTML("<style>.container {width:100% !important;}</style>"))

In [2]:
# Hard-coded data directory and the options offered by the CSV selection widgets.
# data_directory_path is relative, so it is resolved against the kernel's working directory.
data_directory_path = Path('data/')
# Radio-button label -> identifier used as the CSV filename prefix (see select_csv).
# The first entry is a placeholder, not a real serial title.
serial_titles_dict = {'Select a title': 'Select a title',
                      'Tennessee farm news': 'agrtfn',
                      'Tennessee farm and home science': 'agrtfhs',
                      'UT Special extension circulars': 'agrutesc'
                     }
# Metadata fields used as the CSV filename suffix; the first entry is a placeholder.
metadata_fields_list = ['Select a metadata field', 'date', 'title']

In [3]:
class MetadataField:
    """A metadata CSV loaded into a pandas DataFrame, plus its dimensions.

    Attributes:
        csv_path: ``Path`` to the CSV that was read.
        dataframe: the DataFrame returned by ``pd.read_csv``.
        number_of_rows / number_of_columns: the DataFrame's shape.
    """

    def __init__(self, csv_path):
        """Read ``csv_path`` and remember the shape of the resulting DataFrame."""
        self.csv_path = Path(csv_path)
        self.dataframe = pd.read_csv(self.csv_path)
        shape = self.dataframe.shape
        self.number_of_rows = shape[0]
        self.number_of_columns = shape[1]

In [4]:
# Widgets used to choose and load a metadata CSV:
# two radio groups (serial title, metadata field), a text box showing the derived
# path, a validity indicator, a load button, and a text box for the load result.
identifier_widget = widgets.RadioButtons(
    options=serial_titles_dict,
    description='Serial Title:',
    disabled=False,
    layout={'width': 'initial'},
    style={'description_width': 'initial'},
)
metadata_field_widget = widgets.RadioButtons(
    options=metadata_fields_list,
    description='Metadata field:',
    disabled=False,
    layout={'width': 'initial'},
    style={'description_width': 'initial'},
)

csv_path_display_widget = widgets.Text(
    description='CSV Path:',
    layout={'width': 'initial'},
)

csv_path_exists_validity_widget = widgets.Valid(
    description='Does CSV exist?',
    style={'description_width': 'initial'},
)
load_csv_button_widget = widgets.Button(
    description='Load CSV',
    style={'description_width': 'initial'},
)

load_csv_button_output_widget = widgets.Text(layout={'width': 'initial'})


def on_button_clicked(b):
    """Load the CSV named in ``csv_path_display_widget`` into the global ``metadata``.

    The file is read exactly once, through ``MetadataField``. On success the
    global ``metadata`` is replaced and a summary is written to
    ``load_csv_button_output_widget``; if the file does not exist, an error
    message is shown there instead and ``metadata`` is left untouched.

    Args:
        b: the button instance passed by ``Button.on_click`` (unused).

    Returns:
        The new ``MetadataField``, or ``None`` when the path is invalid.
    """
    global metadata
    try:
        loaded = MetadataField(csv_path_display_widget.value)
    except FileNotFoundError:
        load_csv_button_output_widget.value = 'No dataframe; CSV path invalid'
        return None
    # only publish the new object once the read has succeeded
    metadata = loaded
    load_csv_button_output_widget.value = (
        f'{metadata.csv_path.name} loaded as dataframe with '
        f'{metadata.number_of_rows} rows and {metadata.number_of_columns} columns'
    )
    return metadata

# run on_button_clicked every time the "Load CSV" button is pressed
load_csv_button_widget.on_click(on_button_clicked)

def select_csv(identifier, metadata_field):
    """Derive the CSV path for a title/field pair and publish it to the widgets.

    The file name is ``<identifier>_<metadata_field>.csv`` inside
    ``data_directory_path``. The resolved path is written to
    ``csv_path_display_widget`` and ``csv_path_exists_validity_widget`` is set
    to whether that file exists.

    Returns:
        The (unresolved) ``Path`` to the CSV.
    """
    csv_path = data_directory_path.joinpath(f'{identifier}_{metadata_field}.csv')
    resolved_text = str(csv_path.resolve())
    csv_path_display_widget.value = resolved_text
    csv_path_exists_validity_widget.value = csv_path.is_file()
    return csv_path


# Call select_csv with the current radio-button values; interactive_output only
# wires the callback to the two existing widgets passed in the dict.
csv_path_interactive_widget = widgets.interactive_output(select_csv, {'identifier': identifier_widget, 'metadata_field': metadata_field_widget})

In [5]:
# Select a title and metadata field to process.
# Layout: row 1 = the two radio groups, row 2 = derived CSV path,
# row 3 = load button + "exists?" indicator, row 4 = load result message.
row_1_widgets = HBox([identifier_widget, metadata_field_widget])

row_2_widgets = csv_path_display_widget

row_3_widgets = HBox([load_csv_button_widget, csv_path_exists_validity_widget])

row_4_widgets = load_csv_button_output_widget

my_widget = VBox([row_1_widgets, row_2_widgets, row_3_widgets, row_4_widgets])

# last expression of the cell: renders the assembled widget
my_widget

VBox(children=(HBox(children=(RadioButtons(description='Serial Title:', layout=Layout(width='initial'), option…

In [None]:
# write images_not_processed_list to text file QUICKLY with Python!
# NOTE(review): images_not_processed_list is not defined in any cell visible above,
# so this cell presumably depends on state created elsewhere — confirm before running.
# NOTE(review): output_path is an absolute, machine-specific path.
output_path = Path('/Users/jeremy/Documents/GitHub/utk_ProjectCeres/data/agrtfn_date_to_process.txt')
with open (output_path, 'w') as text_file:
    # one entry per line, no trailing newline
    text_file.write('\n'.join(images_not_processed_list))

In [7]:
# create image_names_list: the distinct values of the image_name column, in first-seen order.
# Requires `metadata` to have been set by pressing the "Load CSV" button above.
image_names_list = metadata.dataframe.image_name.unique().tolist()
# cell output: number of distinct image names
len(image_names_list)

3082

In [8]:
# Lookup tables and radio-button widgets for the interactive date picker.
# The date is entered digit by digit: month, day tens/ones, year tens/ones.
months_list = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
# month name -> month number (1-12)
months_dict = {name: number for number, name in enumerate(months_list, start=1)}
# month name -> number of days in a non-leap year
months_number_of_days_dict = dict(
    zip(months_list, [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])
)
# the ten decimal digits offered by the day/year radio groups
numbers_list = list(range(10))

month_widget = widgets.RadioButtons(description='Month:', options=months_list)

day_tens_widget = widgets.RadioButtons(description='Day 10s:', options=numbers_list[0:4])
day_ones_widget = widgets.RadioButtons(description='Day 1s:', options=numbers_list)
year_tens_widget = widgets.RadioButtons(description='Year 10s:', options=numbers_list)
year_ones_widget = widgets.RadioButtons(description='Year 1s:', options=numbers_list)

date_display_widget = widgets.Text(description='Date:')
max_days = 0
def select_date(month, day_tens, day_ones, year_tens, year_ones):
    """Build a "Month D, 19YY" string and keep the day radio buttons valid.

    The day-tens and day-ones radio groups are restricted to the digits that
    form a real day of the selected month and year, and the requested digits
    are clamped into that range before being written back to the widgets
    (e.g. April 3/5 becomes April 30, day 00 becomes day 01).

    Args:
        month: full month name, a key of ``months_number_of_days_dict``.
        day_tens, day_ones: the two digits of the day of the month.
        year_tens, year_ones: the last two digits of the year (19xx).

    Returns:
        The formatted date, which is also written to ``date_display_widget``.
    """
    import calendar  # local import: only this callback needs leap-year rules

    year = int(f'19{year_tens}{year_ones}')
    max_days = months_number_of_days_dict[month]
    # calendar.isleap applies the century rule, so 1900 is not a leap year
    if month == 'February' and calendar.isleap(year):
        max_days = 29
    max_tens, max_ones = divmod(max_days, 10)

    # clamp the requested digits so they always form a day in 1..max_days
    day_tens = min(day_tens, max_tens)
    lowest_ones = 1 if day_tens == 0 else 0          # there is no day "00"
    highest_ones = max_ones if day_tens == max_tens else 9
    day_ones = min(max(day_ones, lowest_ones), highest_ones)

    updates = (
        (day_tens_widget, numbers_list[:max_tens + 1], day_tens),
        (day_ones_widget, numbers_list[lowest_ones:highest_ones + 1], day_ones),
    )
    for widget, options, value in updates:
        # replacing the options resets the selection, so only do it when they change
        if tuple(widget.options) != tuple(options):
            widget.options = options
        widget.value = value

    day = (day_tens * 10) + day_ones
    date = f'{month} {day}, {year}'
    date_display_widget.value = date
    return date
    

    
# Call select_date with the current values of the five date radio groups;
# the dict keys are select_date's parameter names.
select_date_interactive_widget = widgets.interactive_output(select_date, {'month': month_widget,
                                                                          'day_tens': day_tens_widget,
                                                                          'day_ones': day_ones_widget,
                                                                          'year_tens': year_tens_widget,
                                                                          'year_ones': year_ones_widget}
                                                           )

# select_date_interactive_widget

# load_csv_button_widget = widgets.Button(description='Load CSV',
#                                         style={'description_width': 'initial'},
#                                        )

# load_csv_button_output_widget = widgets.Text(layout={'width': 'initial'})
    
# def on_button_clicked(b):
#     global metadata
#     # csv_path = Path(csv_path_display_widget.value)
#     try:
#         dataframe = pd.read_csv(csv_path_display_widget.value)
#     except FileNotFoundError:
#         load_csv_button_output_widget.value = 'No dataframe; CSV path invalid'
#         return
#     metadata = MetadataField(csv_path_display_widget.value)
#     number_of_rows, number_of_columns = dataframe.shape
#     load_csv_button_output_widget.value = f'{metadata.csv_path.name} loaded as dataframe with {metadata.number_of_rows} rows and {metadata.number_of_columns} columns'
#     return metadata

# load_csv_button_widget.on_click(on_button_clicked)

# def select_csv(identifier, metadata_field):
#     csv_name = f'{identifier}_{metadata_field}.csv'
#     csv_path = data_directory_path.joinpath(csv_name)
#     csv_path_display_widget.value = str(csv_path.resolve())
#     csv_path_exists_validity_widget.value = csv_path.is_file()
#     # print(f'Path to CSV: {csv_path}')
#     return csv_path


# csv_path_interactive_widget = widgets.interactive(select_csv, identifier=identifier_widget, metadata_field=metadata_field_widget)
# output_csv_path_widget = interactive(select_csv, 'identifier'=identifier_widget, 'metadata_field'=metadata_field_widget)
# csv_path_interactive_widget = widgets.interactive_output(select_csv, {'identifier': identifier_widget, 'metadata_field': metadata_field_widget})

In [None]:
# Select a Month Day Year
# Layout: row 1 = the five radio groups side by side, row 2 = the formatted date.

date_row_1_widgets = HBox([month_widget, day_tens_widget, day_ones_widget, year_tens_widget, year_ones_widget])

date_row_2_widgets = date_display_widget

date_widget = VBox([date_row_1_widgets, date_row_2_widgets])
# fixed height for the whole date picker
date_widget.layout.height = '300px'
# last expression of the cell: renders the date picker
date_widget

In [None]:
# snapshot of the date_guess column as a plain list (input to the Excel-repair cell)
date_guesses = metadata.dataframe['date_guess'].tolist()

In [None]:
# show the CSV path currently selected in the widget (the file the save cell will overwrite)
csv_path_display_widget.value

In [None]:
# fix date_guess column messed up by Excel
# Values of the form "D-Mon-YY" (8 or 9 characters) are rewritten as "Month D, 19YY".
# Empty cells (read back as float NaN) become None; everything else is kept as-is.
month_abbrev_dict = {'Jan': 'January',
                     'Feb': 'February',
                     'Mar': 'March',
                     'Apr': 'April',
                     'May': 'May',
                     'Jun': 'June',
                     'Jul': 'July',
                     'Aug': 'August',
                     'Sep': 'September',
                     'Oct': 'October',
                     'Nov': 'November',
                     'Dec': 'December'}

fixed_date_guesses = []
for date_guess in date_guesses:
    if isinstance(date_guess, float):  # then it's NaN, empty
        fixed_date_guesses.append(None)
        continue
    # only strings of the right length are candidates for the Excel format
    parts = date_guess.split('-') if len(date_guess) in (8, 9) else []
    # an unknown month abbreviation means this is not an Excel date: leave it
    # untouched instead of raising KeyError and aborting the whole loop
    if len(parts) == 3 and parts[1] in month_abbrev_dict:
        day, month, year = parts
        fixed_date_guesses.append(f'{month_abbrev_dict[month]} {day}, 19{year}')
    else:
        fixed_date_guesses.append(date_guess)
fixed_date_guesses

In [None]:
# overwrite the date_guess column in place with the repaired values (same row order)
metadata.dataframe['date_guess'] = fixed_date_guesses

In [None]:
# Save back over the selected CSV. index=False keeps the positional index out of
# the file: MetadataField reads with a plain pd.read_csv, which would otherwise
# pick it up as an extra "Unnamed: 0" column on every save/load cycle.
metadata.dataframe.to_csv(csv_path_display_widget.value, index=False)

In [10]:
# pre-built lists to iterate over
# lower-case month names, in calendar order
months = [
    'january',
    'february',
    'march',
    'april',
    'may',
    'june',
    'july',
    'august',
    'september',
    'october',
    'november',
    'december'
]

# presumably the field names of an OCR region-of-interest record — not used in the cells shown here; TODO confirm
roi_box_list = ['text', 'left', 'top', 'width', 'height']

# cell-execution marker: prints a separator line and the time this cell was run
print(f'\n{25 * "*_*"}\nLast run on {datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}')


*_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_*
Last run on 2019-08-30 07:32:25 PM


In [11]:
# create a dictionary of possible month corrections
# months_dict maps each lower-case month name to the strings that should be
# recognised as that month: the name itself, truncated forms, and forms with
# common OCR character confusions applied.

# These are common OCR mistranslations: correct letter -> characters it is misread as.
# A tuple per letter is used because a dict literal cannot hold two entries for
# the same key (a second 'a' entry would silently replace the first).
errors_dict = {'j': ('t',), 'd': ('j',), 't': ('l',), 'a': ('£', '&'), 'f': ('£',)}

months_dict = {}
for month in months:
    # the first 2 letters can be missing (i == 0 keeps the full name)
    correction_list = [month[i:] for i in range(3) if len(month[i:]) > 2]
    # the last 2 letters can be missing
    correction_list += [month[:-i] for i in range(1, 3) if len(month[:-i]) > 2]

    for error, replacements in errors_dict.items():
        for replacement in replacements:
            # variants produced by earlier confusions are included on purpose,
            # so confusions can stack (e.g. j -> t, then t -> l)
            correction_list.extend(
                [correction.replace(error, replacement)
                 for correction in correction_list
                 if error in correction]
            )

    months_dict[month] = correction_list

In [12]:
# utility functions
def crop_image_for_processing(image, percentage=0.4, top_and_sides_padding=10):
    """Return the top portion of ``image`` with a border trimmed from three sides.

    The crop keeps ``percentage`` of the image height, shifted down by
    ``top_and_sides_padding`` rows, and drops ``top_and_sides_padding`` columns
    from both the left and the right edge.

    Args:
        image: array whose first two dimensions are (height, width).
        percentage: fraction of the height to keep.
        top_and_sides_padding: pixels removed from the top, left and right.

    Returns:
        A slice (view) of ``image``.
    """
    full_height, full_width = image.shape[:2]
    pad = top_and_sides_padding

    # rows: skip the padding at the top, then extend the bottom by the same
    # amount so the crop still spans `percentage` of the height
    top = pad
    bottom = int(full_height * percentage) + pad
    # columns: trim the padding from both sides
    left = pad
    right = full_width - pad

    return image[top:bottom, left:right]


def get_np_crop_points(crop_box):
    """Return the bounding box ``(x1, y1, x2, y2)`` of a set of corner points.

    Each entry of ``crop_box`` is indexed as ``entry[0][0]`` (x) and
    ``entry[0][1]`` (y). Negative coordinates are mirrored to their absolute
    value. A coordinate of 0 is a valid point on the image edge and is
    included; only ``None`` coordinates are skipped.

    Args:
        crop_box: sequence of points, each shaped like ``[[x, y]]``.

    Returns:
        ``(min x, min y, max x, max y)``.

    Raises:
        ValueError: if no x or no y coordinate is available (empty input).
    """
    x_points = []
    y_points = []

    for point in crop_box:
        x, y = point[0][0], point[0][1]
        # compare against None rather than truthiness so that 0 is kept
        if x is not None:
            x_points.append(abs(x))
        if y is not None:
            y_points.append(abs(y))

    # find extremes in crop box
    return min(x_points), min(y_points), max(x_points), max(y_points)


def if_rgb_convert_to_gray(np_image):
    """Convert an RGB image to grayscale; return 2-D (already gray) input unchanged."""
    has_channel_axis = len(np_image.shape) > 2
    if not has_channel_axis:
        return np_image
    return cv2.cvtColor(np_image, cv2.COLOR_RGB2GRAY)


def if_bgr_convert_to_gray(np_image):
    """Convert a BGR image to grayscale; return 2-D (already gray) input unchanged."""
    has_channel_axis = len(np_image.shape) > 2
    if not has_channel_axis:
        return np_image
    return cv2.cvtColor(np_image, cv2.COLOR_BGR2GRAY)


def resize(image, width=None, height=None, ratio=None, inter=cv2.INTER_AREA):
    """Resize ``image`` to a target width or height, keeping the aspect ratio.

    If ``width`` is given it wins and ``height`` is ignored; otherwise the
    image is scaled to ``height``. The incoming ``ratio`` argument is never
    read; it is overwritten with the computed scale factor.

    Args:
        image: array whose first two dimensions are (height, width).
        width: target width in pixels, or None.
        height: target height in pixels, or None.
        ratio: ignored on input.
        inter: OpenCV interpolation flag passed to ``cv2.resize``.

    Returns:
        ``(resized_image, ratio)`` when a target dimension was given.
        NOTE: when both ``width`` and ``height`` are None the original image is
        returned on its own, not as a tuple.
    """
    source_height, source_width = image.shape[:2]

    # nothing requested: hand back the input untouched
    if width is None and height is None:
        return image

    if width is not None:
        # scale by the requested width
        ratio = width / float(source_width)
        target_size = (width, int(source_height * ratio))
    else:
        # scale by the requested height
        ratio = height / float(source_height)
        target_size = (int(source_width * ratio), height)

    return cv2.resize(image, target_size, interpolation=inter), ratio


def resize_ratio(image, ratio, inter=cv2.INTER_AREA):
    """Scale ``image`` by ``ratio`` in both dimensions and return the result.

    The target size is ``(int(width * ratio), int(height * ratio))``; ``inter``
    is the OpenCV interpolation flag passed to ``cv2.resize``.
    """
    source_height, source_width = image.shape[:2]
    target_size = (int(source_width * ratio), int(source_height * ratio))
    return cv2.resize(image, target_size, interpolation=inter)


def quick_imshow(bgr_image):
    """Show an OpenCV image with matplotlib.

    Images with a channel axis are converted from BGR to RGB first; 2-D images
    are drawn with the 'gray' colormap. Returns None.
    """
    is_color = len(bgr_image.shape) > 2
    if is_color:
        to_show = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
        colormap = None
    else:
        to_show = bgr_image
        colormap = 'gray'

    plt.imshow(to_show, cmap=colormap)
    plt.show()


def search(values, searchFor):
    """Return the first key of ``values`` having an entry that contains ``searchFor``.

    ``values`` maps keys to iterables of strings; ``searchFor`` matches an
    entry when it is a substring of it. Keys are tried in iteration order.
    Returns None when nothing matches.
    """
    for key in values:
        if any(searchFor in candidate for candidate in values[key]):
            return key
    return None


# cell-execution marker: prints a separator line and the time this cell was run
print(f'\n{25 * "*_*"}\nLast run on {datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}')


*_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_*
Last run on 2019-08-30 07:32:26 PM


In [13]:
class PageOne:
    """The first page of a scanned issue, with helpers to crop, binarize, OCR and
    title-match it.

    Attributes set by ``__init__``:
        image_path: ``Path`` to the image file.
        config: dict with the keys 'crop_percentage', 'distance_ratio',
            'minimum_matches', 'top_and_sides_padding' and 'resize_width'.
        MOST_MATCHES, BEST_SSIM_WITH_TITLE, BEST_TITLE_CROP, BEST_TITLE,
        RESIZE_RATIO: running "best so far" state updated by ``guess_title``.
    """

    def __init__(self, image_path, config=False):
        """Store the image path and the configuration (defaults when ``config`` is falsy)."""

        self.image_path = Path(image_path)

        # set config dictionary
        if config:
            self.config = config
        else:
            self.config = {'crop_percentage': 0.4,
                           'distance_ratio': 0.6,
                           'minimum_matches': 5,
                           'top_and_sides_padding': 10,
                           'resize_width': 1600}

        # set variables
        self.MOST_MATCHES = self.config['minimum_matches']
        self.BEST_SSIM_WITH_TITLE = -1  # on a scale of -1->1, 1 is a perfect match
        self.BEST_TITLE_CROP = False
        self.BEST_TITLE = False
        self.RESIZE_RATIO = False

    def preprocess_image(self, crop_percentage=None, top_and_sides_padding=None):
        """Load the image from disk and crop it to the region used for processing.

        Args:
            crop_percentage: fraction of the height to keep; falls back to the
                configured value when not given.
            top_and_sides_padding: pixels trimmed from the top and sides; falls
                back to the configured value when None/False. An explicit 0 is
                honoured.

        Returns:
            The cropped image (also stored as ``self.image_cropped``).

        Raises:
            FileNotFoundError: if OpenCV could not read ``self.image_path``.
        """

        if not crop_percentage:
            crop_percentage = self.config['crop_percentage']
        # identity checks instead of truthiness so that a padding of 0 is usable
        if top_and_sides_padding is None or top_and_sides_padding is False:
            top_and_sides_padding = self.config['top_and_sides_padding']

        # load image; cv2.imread signals failure by returning None, not by raising
        self.image = cv2.imread(str(self.image_path))
        if self.image is None:
            raise FileNotFoundError(f'Could not read image: {self.image_path}')

        # get technical metadata
        self.height, self.width = self.image.shape[:2]
        # a third (channel) axis means colour, two axes mean grayscale
        if len(self.image.shape) > 2:
            self.colorspace = 'rgb'
        else:
            self.colorspace = 'gray'

        # crop image and get shape
        self.image_cropped = crop_image_for_processing(
            self.image, crop_percentage, top_and_sides_padding)
        self.height_cropped, self.width_cropped = self.image_cropped.shape[:2]

        return self.image_cropped

    def guess_title(self, crop_dictionary, image=None, debug=None):
        """Find the reference title crop that best matches this page.

        Every reference crop is matched against the page by feature
        descriptors; candidates with enough matches and a large enough
        matched area are then scored by structural similarity. The best
        candidate is recorded in ``BEST_TITLE``, ``BEST_TITLE_CROP``,
        ``BEST_SSIM_WITH_TITLE`` and ``MOST_MATCHES``.

        Args:
            crop_dictionary: ``{title: {sub_title: (crop_image, crop_keypoints,
                crop_descriptors)}}``.
            image: page image to search; the preprocessed crop of
                ``self.image_path`` is used when not given.
            debug: when truthy, print and display intermediate results.

        Returns:
            None; results are stored on the instance.
        """

        print(f'Guessing title of {self.image_path.name} . . .')

        # `if not image` would raise for a numpy array, so test identity instead
        if image is None or image is False:
            image = self.preprocess_image()

        self.keypoints, self.descriptors = get_keypoints_and_descriptors(image)

        for title, sub_crops in crop_dictionary.items():

            if debug:
                print(title)

            for sub_title, sub_data in sub_crops.items():

                crop_image, crop_keypoints, crop_descriptors = sub_data

                number_of_matches, matches = get_descriptor_matches(
                    crop_descriptors, self.descriptors, self.config['distance_ratio'])
                if debug:
                    print(sub_title)
                    print(
                        f'# of matches/minMatches: {number_of_matches}/{(self.MOST_MATCHES * 0.5)}')

                # then it might be the best match
                if number_of_matches >= (self.MOST_MATCHES * 0.5):

                    found_image, crop_box = get_matching_crop_and_box(
                        matches, crop_keypoints, self.keypoints, crop_image, image)

                    # NOTE(review): a valid crop_box is detected by the ValueError numpy
                    # raises on `not array`; the shape of crop_box depends on
                    # get_matching_crop_and_box, which is defined outside this cell.
                    try:
                        if not crop_box:  # then we didn't get a valid crop_box
                            if debug:
                                print(f'\t\tFalse positive: invalid crop_box')
                            continue
                    except ValueError:  # valid numpy array throws a ValueError when testing existence

                        if debug:
                            print(f'crop box: {crop_box}')
                        x1, y1, x2, y2 = get_np_crop_points(crop_box)

                        # if width/height of title isn't over min_title_width/height pixels
                        # it's not the title
                        min_title_width = 800
                        min_title_height = 40
                        title_width = x2 - x1
                        title_height = y2 - y1

                        if title_width < min_title_width:
                            if debug:
                                print(f'\t\t\tFalse positive: width < minimum')
                                print(
                                    f'\t\t\t\t{x2} - {x1} = {title_width} < {min_title_width}')
                            continue
                        elif title_height < min_title_height:
                            if debug:
                                print(f'\t\t\tFalse positive: height < minimum')
                                print(
                                    f'\t\t\t\t{y2} - {y1} = {title_height} < {min_title_height}')
                            continue

                        if debug:
                            print(x1, y1, x2, y2)
                            crop = self.image_cropped[int(
                                y1):int(y2), int(x1):int(x2)]
                            print(f'cropped image')
                            quick_imshow(crop)

                        # convert to grayscale for ssim
                        found_gray = if_bgr_convert_to_gray(found_image)
                        crop_gray = if_bgr_convert_to_gray(crop_image)

                        # get the structural similarity index of the match with the image for a 2nd heuristic
                        # NOTE(review): `ssim` is not imported in the cells shown here — confirm it is in scope.
                        try:
                            if debug:
                                print('found_gray')
                                quick_imshow(found_gray)
                                print(f'crop_gray')
                                quick_imshow(crop_gray)
                            ssim_value = ssim(found_gray, crop_gray)
                            if ssim_value > self.BEST_SSIM_WITH_TITLE:
                                print(f'new best ssim: {title, ssim_value}')
                                self.BEST_SSIM_WITH_TITLE = ssim_value
                                points = [x1, y1, x2, y2]
                                self.BEST_TITLE_CROP = points
                                self.BEST_TITLE = title
                                self.MOST_MATCHES = number_of_matches

                        except ValueError:  # image shape doesn't match, so most likely incorrect title
                            print('wrong dimensions')
                            continue
        return

    def binarize(self, image=None, debug=None):
        """Return a black/white version of ``image`` (stored as ``self.binarized``).

        The image is converted to gray, lightly blurred and thresholded with an
        adaptive mean threshold. When ``image`` is not a numpy array the
        preprocessed crop of ``self.image_path`` is used.
        """

        if not isinstance(image, np.ndarray):
            image = self.preprocess_image()

        # convert to gray and blur
        image = if_bgr_convert_to_gray(image)
        image = cv2.GaussianBlur(image, (3, 3), 1)

        # binarize
        self.binarized = cv2.adaptiveThreshold(
            image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 175, 21)

        return self.binarized

    def ocr(self, image=None, debug=None):
        """OCR ``image`` and return ``(line_and_word_boxes, text)``.

        When ``image`` is not a numpy array the binarized page is used. The OCR
        tool is run twice: once with a line/word box builder and once with a
        plain text builder.
        """

        if not isinstance(image, np.ndarray):
            image = self.binarize()

        pil_image = Image.fromarray(image)

        # create line and word boxes
        line_and_word_boxes = tool.image_to_string(
            pil_image,
            lang='eng',
            builder=pyocr.builders.LineBoxBuilder(),
        )
        # create text
        text = tool.image_to_string(
            pil_image,
            lang='eng',
            builder=pyocr.builders.TextBuilder(),
        )

        return line_and_word_boxes, text

    def ocr_thrice(self, image=None, debug=None):
        """OCR the whole image, its right half and its left half.

        Results are stored as ``ocr_all`` / ``ocr_right`` / ``ocr_left`` with
        matching ``*_lines`` and ``*_text`` attributes.

        Returns:
            The ``(lines, text)`` result for the whole image.
        """

        if not isinstance(image, np.ndarray):
            image = self.binarize()

        height, width = image.shape[:2]
        middle = int(width / 2)

        # ocr everything
        self.ocr_all = self.ocr(image)
        self.ocr_all_lines = self.ocr_all[0]
        self.ocr_all_text = self.ocr_all[1]

        # crop and ocr right half (both halves are cut from the full image)
        right_half = image[0:height, middle:width]
        self.ocr_right = self.ocr(right_half)
        self.ocr_right_lines = self.ocr_right[0]
        self.ocr_right_text = self.ocr_right[1]

        # crop and ocr left half
        left_half = image[0:height, 0:middle]
        self.ocr_left = self.ocr(left_half)
        self.ocr_left_lines = self.ocr_left[0]
        self.ocr_left_text = self.ocr_left[1]

        # return ocr_all
        return self.ocr_all

    def get_date_crop_dictionary(self):
        """Collect this image's date guesses and crop boxes from the loaded metadata.

        Rows of the global ``metadata.dataframe`` whose 'image_name' equals this
        image's file name are gathered into ``{date_guess: date_crop_box}``
        (a repeated date guess keeps the last box), passed through
        ``sanitize_crop_box`` and stored as ``self.crop_box_dictionary``.

        Returns:
            The sanitized dictionary.
        """

        # get a dataframe for all rows with image_name
        image_name_dataframe = metadata.dataframe[metadata.dataframe['image_name'] == self.image_path.name]

        # create dictionary of date guesses and crop boxes
        crop_box_dict = OrderedDict()
        for date_guess_id in image_name_dataframe['date_guess_id']:
            # first row carrying this guess id
            id_rows = image_name_dataframe.loc[image_name_dataframe['date_guess_id'] == date_guess_id]
            crop_box = id_rows['date_crop_box'].tolist()[0]
            date_guess = id_rows['date_guess'].tolist()[0]
            crop_box_dict[date_guess] = crop_box

        self.crop_box_dictionary = sanitize_crop_box(crop_box_dict)

        return self.crop_box_dictionary

# cell-execution marker: prints a separator line and the time this cell was run
print(f'\n{25 * "*_*"}\nLast run on {datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}')


*_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_*
Last run on 2019-08-30 07:32:27 PM


In [14]:
class OcrResults:
    """Wraps OCR line/word boxes and searches them for month names.

    Attributes:
        filter_chars: the characters kept when normalising OCR text
            (a-z, 0-9, '£', '&' and space; letters are compared lower-cased).
        line_and_word_boxes: iterable of line boxes, each exposing ``content``
            (the line text) and ``word_boxes`` (boxes with ``content`` and
            ``position``).
    """

    def __init__(self, line_and_word_boxes):
        self.filter_chars = 'abcdefghijklmnopqrstuvwxyz1234567890£& '
        self.line_and_word_boxes = line_and_word_boxes

    def _filter_text(self, text):
        """Drop every character of ``text`` that is not in ``filter_chars``."""
        return ''.join(char for char in text if char.lower() in self.filter_chars)

    def get_month(self, debug=False):
        """Yield every month found in the OCR result.

        Each word longer than two characters is looked up in ``months_dict``
        (directly, then after ``spell`` autocorrection). For every word box on
        the line whose filtered content equals the word, a tuple is yielded.

        Args:
            debug: when truthy, print the matched word and its neighbours.

        Yields:
            ``(month, word, [x1, y1, x2, y2])`` per match, or a single ``None``
            when no month was found at all.
        """

        self.months = OrderedDict()
        number_of_months = 0

        # iterate over lines in the OCR result
        for line in self.line_and_word_boxes:
            # keep only a-z, 0-9, '£', '&' and spaces, then split into words
            text = self._filter_text(line.content)

            for word in text.split(' '):

                # only process word results over 2 chars in length
                if len(word) <= 2:
                    continue

                # try to match the word as lower-case with the months dictionary
                month = search(months_dict, word.lower())

                if month is None:  # try autocorrecting the word and searching again
                    fixed_word = spell(word)
                    if debug:
                        print(f'fixed_word: {fixed_word}')
                    month = search(months_dict, fixed_word.lower())

                if month is None:  # continue on to the next word
                    continue

                # locate the word box(es) the word came from
                for index, box in enumerate(line.word_boxes):
                    # filter the box text the same way as the line text, otherwise
                    # a box such as "January," can never equal the word "January"
                    if self._filter_text(box.content) != word:
                        continue
                    if debug:
                        print(f'index: {index}')
                        if index > 0:
                            word_before = line.word_boxes[index-1].content
                            print(f'word before: {word_before}')
                        print(f'word: {box.content}')
                        # the match may be the last word on the line
                        if index + 1 < len(line.word_boxes):
                            word_after = line.word_boxes[index+1].content
                            print(f'word after: {word_after}')
                    (x1, y1), (x2, y2) = box.position[:2]
                    number_of_months += 1
                    yield month, word, [x1, y1, x2, y2]

        # nothing matched anywhere: signal that with a single None
        if number_of_months == 0:
            yield None

# cell-execution marker: prints a separator line and the time this cell was run
print(f'\n{25 * "*_*"}\nLast run on {datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}')


*_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_**_*
Last run on 2019-08-30 07:32:27 PM


In [None]:
import matplotlib as mpl
from io import BytesIO
from IPython.display import Image as ipyImage

def cv2jupyter(image):
    """Display a 2- or 3-d numpy array as an image.

    2-d arrays are treated as grayscale and encoded as PNG with a gray
    colormap. 3-d arrays are treated as OpenCV BGR images, converted to RGB
    and encoded as JPEG.

    Args:
        image: numpy array with 2 or 3 dimensions.

    Returns:
        An ``IPython.display.Image`` holding the encoded bytes.

    Raises:
        ValueError: if ``image`` does not have 2 or 3 dimensions.
    """
    # check the dimensionality first: the BGR->RGB conversion only applies to
    # colour images and must not be attempted on 2-d (grayscale) input
    if image.ndim == 2:
        image_format, cmap = 'png', mpl.cm.gray
    elif image.ndim == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_format, cmap = 'jpg', None
    else:
        raise ValueError("Only 2- or 3-d arrays can be displayed as images.")
    # Don't let matplotlib autoscale the color range so we can control overall luminosity
    vmax = 255 if image.dtype == 'uint8' else 1.0
    with BytesIO() as buffer:
        mpl.image.imsave(buffer, image, format=image_format, cmap=cmap, vmin=0, vmax=vmax)
        out = buffer.getvalue()
    return ipyImage(out)

In [None]:
# Widget demo: a float text box and a float slider shown together ...
text = widgets.FloatText()
slider = widgets.FloatSlider()
display(text,slider)
 
# ... with their 'value' traits linked via jslink
mylink = widgets.jslink((text, 'value'), (slider, 'value'))

In [67]:
# display plot.ly image

def display_image(image_bgr, figure_height=600):
    """Build a plotly FigureWidget showing an OpenCV image at a fixed height.

    Args:
        image_bgr: numpy image array in OpenCV's BGR channel order.
        figure_height: height of the figure in pixels. The width is derived
            from it so that the image keeps its aspect ratio. Defaults to 600,
            the value that was previously hard-coded.

    Returns:
        plotly.graph_objects.FigureWidget with the image placed as a layout
        image below an invisible scatter trace. The figure is returned, not
        shown; the caller decides how to display it.
    """
    # Create figure
    fig = go.FigureWidget()

    # plotly takes a PIL image as the layout-image source, so convert BGR -> RGB -> PIL
    image_source = Image.fromarray(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))

    # Figure geometry: fixed height, width scaled to preserve the aspect ratio
    source_width, source_height = image_source.size
    resize_ratio = figure_height / source_height
    figure_width = int(source_width * resize_ratio)

    # Add invisible scatter trace.
    # This trace is added to help the autoresize logic work.
    fig.add_trace(
        go.Scatter(
            x=[0, figure_width],
            y=[0, figure_height],
            mode="markers",
            marker_opacity=0
        )
    )

    # Configure axes
    fig.update_xaxes(
        visible=False,
        range=[0, figure_width]
    )

    fig.update_yaxes(
        visible=False,
        range=[0, figure_height],
        # the scaleanchor attribute ensures that the aspect ratio stays constant
        scaleanchor="x"
    )

    # Add image, stretched over the whole axis range
    fig.update_layout(
        images=[go.layout.Image(
            x=0,
            sizex=figure_width,
            y=figure_height,
            sizey=figure_height,
            xref="x",
            yref="y",
            opacity=1.0,
            layer="below",
            sizing="stretch",
            source=image_source)]
    )

    # Configure other layout: exact pixel size and no margins around the image
    fig.update_layout(
        width=figure_width,
        height=figure_height,
        margin={"l": 0, "r": 0, "t": 0, "b": 0},
    )

    return fig

In [152]:
# images currently on external hard drive
# NOTE(review): absolute, machine-specific path -- this cell only works where that volume is mounted.
image_data_directory_path = Path('/Volumes/jmoor167/data/agrtfn/')

# image_name widget
# Dropdown over every entry of image_names_list, initially set to the first entry.
# NOTE(review): image_names_list is not assigned in this cell; the only visible assignment is in the
# "process dates/images" cell further down, so this relies on kernel state -- confirm run order.
# Indexing [0] raises IndexError when the list is empty.
image_name_widget = widgets.Dropdown(layout={'width': 'initial'},
                                    style={'description_width': 'initial'},
                                    value = image_names_list[0],
                                    options=image_names_list,
                                    description='Image name:',
                                    disabled=False
                                    )

def sanitize_crop_box(crop_box_dictionary):
    """Convert crop boxes stored as strings into lists of integers.

    Args:
        crop_box_dictionary: mapping of identifier -> crop box, where a crop
            box is a string such as "(x1, y1, x2, y2)". Values without a
            `strip` method (e.g. the float NaN of a missing cell) are
            accepted and mapped to None.

    Returns:
        OrderedDict with the same keys in the same order; each value is a
        list of ints, or None where the input value was not a string.

    Raises:
        ValueError: if a coordinate cannot be converted by int().
    """
    sanitized_dict = OrderedDict()

    for data in crop_box_dictionary:

        crop_box = crop_box_dictionary[data]

        try:  # stripping off parentheses
            stripped_crop_box = crop_box.strip('()')
        except AttributeError:  # not a string (NaN float): there is no crop box
            sanitized_dict[data] = None
            continue

        # Treat commas as separators and split on any run of whitespace, so that
        # "(1,2,3,4)" and "(1,  2, 3, 4)" parse the same way as "(1, 2, 3, 4)".
        sanitized_dict[data] = [int(point) for point in stripped_crop_box.replace(',', ' ').split()]

    return sanitized_dict
  

def resize_crop_box_dictionary(crop_box_dictionary, resize_ratio):
    """Scale every coordinate of every crop box by `resize_ratio`.

    Args:
        crop_box_dictionary: mapping of identifier -> iterable of numeric
            coordinates.
        resize_ratio: factor each coordinate is multiplied by before being
            truncated with int().

    Returns:
        OrderedDict with the same keys in the same order and lists of ints
        as values.

    Raises:
        TypeError: if the dictionary itself, or any of its crop boxes, is
            not iterable (for example None).
    """
    return OrderedDict(
        (key, [int(coordinate * resize_ratio) for coordinate in crop_box_dictionary[key]])
        for key in crop_box_dictionary
    )


def draw_text(image, crop_box_dictionary, crop=False):
    """Draw every crop box, with its label, on its own copy of `image`.

    For each entry a red rectangle is drawn around the crop box, a filled
    black strip 45 px high is drawn directly above it, and the entry's key is
    written in white inside that strip.

    Args:
        image: numpy image array as used by OpenCV; it is copied for every
            entry and never modified.
        crop_box_dictionary: mapping of label -> [x1, y1, x2, y2] in pixel
            coordinates of `image`.
        crop: unused; kept so existing callers keep working.

    Returns:
        OrderedDict of label -> [drawing, drawing_crop]. `drawing` is the full
        annotated copy; `drawing_crop` is the part of it covering the box and
        its label strip plus a 25 px margin.
    """
    drawings_dictionary = OrderedDict()
    font = cv2.FONT_HERSHEY_SIMPLEX

    for data in crop_box_dictionary:

        x1, y1, x2, y2 = crop_box_dictionary[data]

        # draw this box on the ocr_all image
        drawing = cv2.rectangle(image.copy(), (x1, y1), (x2, y2), (0, 0, 255), 8)
        # draw a black rectangle above this to add text to
        drawing = cv2.rectangle(drawing, (x1, y1-45), (x2, y1), (0, 0, 0), -1)  # negative thickness to fill rectangle

        cv2.putText(drawing, str(data), (x1+17, (y1-7)), font, 1, (255,255,255), 2, cv2.LINE_AA)

        # Clamp the start of the crop at 0: a negative start index would be
        # interpreted by numpy as counting from the far edge and yield an
        # empty or wrong crop for boxes close to the top/left border.
        x1_crop = max(x1 - 25, 0)
        y1_crop = max(y1 - 45 - 25, 0)
        x2_crop = x2 + 25
        y2_crop = y2 + 25

        drawing_crop = drawing[y1_crop:y2_crop, x1_crop:x2_crop]

        # add drawing with text to dictionary
        drawings_dictionary[data] = [drawing, drawing_crop]

    return drawings_dictionary


def display_image_dictionary(image, image_dictionary):
    """Plot the images of `image_dictionary` side by side, titled by their keys.

    Args:
        image: not used by the body; kept for the existing call signature.
        image_dictionary: mapping of label -> BGR image array.

    Returns:
        None. A matplotlib figure with one column per entry is created and drawn.
    """
    labels_list = list(image_dictionary)
    images_list = [image_dictionary[label] for label in labels_list]
    number_of_axes = len(labels_list)

    fig, axes = plt.subplots(nrows=1, ncols=number_of_axes, figsize=(8,4))

    # a single column yields one bare Axes instead of an array of them
    axes_sequence = [axes] if number_of_axes == 1 else axes

    for axis, label, image_bgr in zip(axes_sequence, labels_list, images_list):
        # matplotlib expects RGB, OpenCV stores BGR
        axis.imshow(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
        axis.set_title(str(label))

    fig.canvas.draw()

    return
    

# Standalone Output widget.
# NOTE(review): nothing in the visible part of this notebook writes to or displays it -- possibly a leftover.
image_display_widget = widgets.Output()


def select_image(image_name):
    """Load an image, mark its first date guess on the top half and display it.

    The image is read from `image_data_directory_path`, resized to the width
    configured on the PageOne object and cut down to its top half. The first
    date guess is drawn on that half with `draw_text` and shown through
    `display_image`. When there are no usable date guesses the plain top half
    is shown instead.

    Args:
        image_name: file name of the image, relative to
            `image_data_directory_path`.

    Returns:
        The first key of the drawn date-guess dictionary, or None when no
        date guess is available.
    """
    image_path = image_data_directory_path.joinpath(image_name)

    page_1 = PageOne(image_path)

    image = cv2.imread(str(page_1.image_path))

    page_1.get_date_crop_dictionary()  # self-sanitizing, loads as list of integers

    resized_image, resize_ratio = resize(image, width=page_1.config['resize_width'])

    # only the top half of the page is shown
    resized_height, resized_width = resized_image.shape[:2]
    cropped_image = resized_image[0:int(resized_height*0.5), 0:resized_width]

    try:
        resized_crop_boxes = resize_crop_box_dictionary(page_1.crop_box_dictionary, resize_ratio)
    except TypeError:  # missing dictionary or a missing (None) crop box
        resized_crop_boxes = None

    # An empty dictionary is handled like a missing one; indexing the first
    # guess below would otherwise fail.
    if not resized_crop_boxes:
        print('No Date Guesses Found')
        display(display_image(cropped_image))
        return

    drawings_dictionary = draw_text(cropped_image, resized_crop_boxes)

    # each value is [full drawing, cropped drawing]; show the full drawing of the first guess
    first_guess = next(iter(drawings_dictionary))
    full_fig = display_image(drawings_dictionary[first_guess][0])

    display(full_fig)
    return first_guess
    
    
    
#     number_of_guesses = len(guesses_list)
#     clear_output(wait=True)
#     # fig, axes = plt.subplots(1, number_of_guesses, figsize=(number_of_guesses*5, 3))
    
#     if number_of_guesses == 1:
#         # image_for_jupyter = cv2jupyter(images_list[0])
#         image_rgb = cv2.cvtColor(images_list[0], cv2.COLOR_BGR2RGB)
#         # axes.imshow(image_rgb)
#         # axes.set_title(str(guesses_list[0]))
#     else:
#         for column, guess, image_bgr in zip(iter(range(number_of_guesses)), guesses_list, images_list):
#             # print(label)
#             # image_for_jupyter = cv2jupyter(plot_image)
#             image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
#             # print(column)
#             # print(guess)
#             # print(image_bgr)
# #             axes[column].imshow(image_rgb)
# #             axes[column].set_title(str(guess))
                               
# #     plt.show()
    
    
    
    # return fig.canvas


# csv_path_interactive_widget = widgets.interactive(select_csv, identifier=identifier_widget, metadata_field=metadata_field_widget)
# output_csv_path_widget = interactive(select_csv, 'identifier'=identifier_widget, 'metadata_field'=metadata_field_widget)

# image_interactive_widget = widgets.interactive_output(select_image, {'image_name': image_name_widget})
# Wrap select_image in an interactive widget driven by the image-name dropdown.
# set_date() later reads the function's return value from this widget's .result attribute.
image_interactive_widget = widgets.interactive(select_image, image_name=image_name_widget)
# fixed height, the same 600 px that display_image uses for its figures
image_interactive_widget.layout.height = '600px'
image_interactive_widget

# test_widget = VBox([image_name_widget, image_interactive_widget])
# # test_widget.layout.height = '600px'
# # test_widget.layout.width = '1200px'
# test_widget


interactive(children=(Dropdown(description='Image name:', layout=Layout(width='initial'), options=('0012_00426…

In [61]:
# NEXT STEP: when loading an image, get only the contour around the image area.
# The output of the date selection widget should be used to create the text displayed in the black box above the image.

In [161]:
# create interactive date selection widget
# Month names in calendar order; the lookup tables below are derived from this list.
months_list = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

# month name -> month number (1-12)
months_dict = {month_name: month_number
               for month_number, month_name in enumerate(months_list, start=1)}

# month name -> number of days (February as in a non-leap year)
months_number_of_days_dict = dict(zip(months_list,
                                      (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)))

# the decimal digits, used as options of the day/year radio buttons
numbers_list = list(range(10))

# Text fields that set_date() fills with the raw date guess and its parsed form.
raw_date_text_widget = widgets.Text(description='Raw text:')
parsed_date_text_widget = widgets.Text(description='Parsed text:')
# One radio group per date component. The day is split into a tens digit (0-3 initially)
# and a ones digit, the year into the last two digits; select_date() rewrites the options
# of the two day groups depending on the chosen month.
month_widget = widgets.RadioButtons(options=months_list,
                                   description='Month:')
day_tens_widget = widgets.RadioButtons(options=numbers_list[0:4],
                                      description='Day 10s:')
day_ones_widget = widgets.RadioButtons(options=numbers_list,
                                      description='Day 1s:')
year_tens_widget = widgets.RadioButtons(options=numbers_list,
                                       description='Year 10s:')
year_ones_widget = widgets.RadioButtons(options=numbers_list,
                                      description='Year 1s:')
# select_date() writes the assembled date string into this field.
date_display_widget = widgets.Text(description='Date:')
# Navigation buttons; their click handlers are registered further down in this cell.
# NOTE(review): the visible handlers only change the selected image and call set_date();
# nothing is saved or discarded despite the button labels.
button_previous_image = widgets.Button(style={'description_width': 'initial'},
                                   description='Discard & Go Back',
                                  )
button_next_image = widgets.Button(style={'description_width': 'initial'},
                                   description='Save & Load Next',
                                  )

max_days=0  # module-level name kept as-is; select_date only uses its own local max_days
def select_date(month, day_tens, day_ones, year_tens, year_ones):
    """Limit the day radio buttons to valid days and display the chosen date.

    The options of `day_tens_widget` and `day_ones_widget` are rewritten so
    that only days existing in the chosen month and year can be selected. A
    day digit that is no longer valid is moved to the nearest valid digit
    instead of being assigned to a widget that does not offer it.

    Args:
        month: month name; must be a key of `months_number_of_days_dict`.
        day_tens: tens digit of the day of month.
        day_ones: ones digit of the day of month.
        year_tens: third digit of the year (the century is fixed to 19).
        year_ones: last digit of the year.

    Returns:
        The date formatted as 'Month D, YYYY'. The same string is written to
        `date_display_widget`.
    """
    import calendar

    year = int(f'19{year_tens}{year_ones}')

    max_days = months_number_of_days_dict[month]
    if month == 'February' and calendar.isleap(year):  # isleap also gets 1900 right
        max_days += 1
    last_tens, last_ones = divmod(max_days, 10)

    # Tens digit: 0 .. tens digit of the last day of the month.
    # The value is re-assigned after the options because changing the options
    # of a selection widget may reset its current selection.
    day_tens = min(day_tens, last_tens)
    day_tens_widget.options = numbers_list[:last_tens + 1]
    day_tens_widget.value = day_tens

    # Ones digit: there is no day 0, and in the last decade of the month the
    # digit must not run past the last day. All other decades offer 0-9.
    day_ones_start = 1 if day_tens == 0 else 0
    day_ones_end = last_ones if day_tens == last_tens else 9
    day_ones = max(day_ones_start, min(day_ones, day_ones_end))
    day_ones_widget.options = numbers_list[day_ones_start:day_ones_end + 1]
    day_ones_widget.value = day_ones

    day = (day_tens * 10) + day_ones
    date = f'{month} {day}, {year}'
    date_display_widget.value = date
    return date
    
# Call select_date with the current values of the five radio groups whenever one of them changes.
# NOTE(review): the widget returned here is not part of the date_widget layout assembled below.
select_date_interactive_widget = widgets.interactive_output(select_date, {'month': month_widget,
                                                                          'day_tens': day_tens_widget,
                                                                          'day_ones': day_ones_widget,
                                                                          'year_tens': year_tens_widget,
                                                                          'year_ones': year_ones_widget}
                                                           )

def set_date():
    """Copy the date guess of the current image into the date widgets.

    Reads the value last returned by `select_image` from
    `image_interactive_widget.result`, shows it as raw text, parses it with
    dateutil and, when parsing succeeds, sets the month, day and year radio
    buttons accordingly. When there is no guess, or it cannot be parsed, the
    parsed-text field shows 'No date found' and the radio buttons are left
    untouched.

    Returns:
        None.
    """
    date = image_interactive_widget.result

    # a float here is the NaN of a missing guess; None means nothing was returned
    if isinstance(date, float) or date is None:
        raw_date_text_widget.value = 'NaN'
        parsed_date_text_widget.value = 'No date found'
        return

    raw_date_text_widget.value = str(date)

    try:
        parsed_date = parse(date)
        month, day, year = parsed_date.strftime("%B %d %Y").split(' ')
    except (ValueError, OverflowError, TypeError):
        # dateutil raises ValueError for unparseable text, OverflowError for
        # oversized numbers and TypeError for non-string input
        parsed_date_text_widget.value = 'No date found'
        return

    parsed_date_text_widget.value = f'{month} {day}, {year}'
    month_widget.value = month

    day_tens, day_ones = divmod(parsed_date.day, 10)

    # set day_tens value to 1 so we get full option for day_ones
    day_tens_widget.value = 1
    day_ones_widget.value = day_ones
    day_tens_widget.value = day_tens

    # only the last two digits of the year are selectable
    year_tens, year_ones = divmod(parsed_date.year % 100, 10)
    year_tens_widget.value = year_tens
    year_ones_widget.value = year_ones

def on_button_previous_image_clicked(b):
    """Select the image before the current one and refresh the date widgets.

    Args:
        b: the clicked button, as passed by ipywidgets; not used.

    Returns:
        The image name selected after the step back.
    """
    position = image_names_list.index(image_name_widget.value)

    # stepping back from the first image wraps around to the last one
    image_name_widget.value = image_names_list[(position - 1) % len(image_names_list)]

    set_date()

    return image_name_widget.value

button_previous_image.on_click(on_button_previous_image_clicked)

def on_button_next_image_clicked(b):
    """Select the image after the current one and refresh the date widgets.

    Args:
        b: the clicked button, as passed by ipywidgets; not used.

    Returns:
        The image name selected after the step forward.
    """
    position = image_names_list.index(image_name_widget.value)

    # stepping forward from the last image wraps around to the first one
    image_name_widget.value = image_names_list[(position + 1) % len(image_names_list)]

    set_date()

    return image_name_widget.value

button_next_image.on_click(on_button_next_image_clicked)

# Select a Month Day Year

# Row 1: raw and parsed date guess.
date_row_1_widgets = HBox([raw_date_text_widget, parsed_date_text_widget])

# Row 2: the assembled date and the two navigation buttons.
date_row_2_widgets = HBox([date_display_widget, button_previous_image, button_next_image])

# Row 3: the radio groups for month, day digits and year digits.
date_row_3_widgets = HBox([month_widget, day_tens_widget, day_ones_widget, year_tens_widget, year_ones_widget])

# Row 4: the image-name dropdown itself (not wrapped in a box).
date_row_4_widgets = image_name_widget

# Stack the four rows vertically.
date_widget = VBox([date_row_1_widgets, date_row_2_widgets, date_row_3_widgets, date_row_4_widgets])

date_widget.layout.height = '350px'
# bare expression: displays the assembled widget as this cell's output
date_widget

VBox(children=(HBox(children=(Text(value='', description='Raw text:'), Text(value='', description='Parsed text…

In [57]:
# Inspect the value most recently returned by the function wrapped in image_interactive_widget.
image_interactive_widget.result

'February 2, 1942'

In [None]:
# images currently on external hard drive -- OLD
# NOTE(review): absolute, machine-specific path -- this cell only works where that volume is mounted.
image_data_directory_path = Path('/Volumes/jmoor167/data/agrtfn/')

# image_name widget
# Dropdown over every entry of image_names_list; no initial value is passed here.
# NOTE(review): image_names_list is not assigned in this cell; the only visible assignment is in the
# "process dates/images" cell further down, so this relies on kernel state -- confirm run order.
image_name_widget = widgets.Dropdown(layout={'width': 'initial'},
                                    style={'description_width': 'initial'},
                                    options=image_names_list,
                                    description='Image name:',
                                    disabled=False
                                    )

def sanitize_crop_box(crop_box_dictionary):
    """Convert crop boxes stored as strings into lists of integers.

    Args:
        crop_box_dictionary: mapping of identifier -> crop box, where a crop
            box is a string such as "(x1, y1, x2, y2)". Values without a
            `strip` method (e.g. the float NaN of a missing cell) are
            accepted and mapped to None.

    Returns:
        OrderedDict with the same keys in the same order; each value is a
        list of ints, or None where the input value was not a string.

    Raises:
        ValueError: if a coordinate cannot be converted by int().
    """
    sanitized_dict = OrderedDict()

    for data in crop_box_dictionary:

        crop_box = crop_box_dictionary[data]

        try:  # stripping off parentheses
            stripped_crop_box = crop_box.strip('()')
        except AttributeError:  # not a string (NaN float): there is no crop box
            sanitized_dict[data] = None
            continue

        # Treat commas as separators and split on any run of whitespace, so that
        # "(1,2,3,4)" and "(1,  2, 3, 4)" parse the same way as "(1, 2, 3, 4)".
        sanitized_dict[data] = [int(point) for point in stripped_crop_box.replace(',', ' ').split()]

    return sanitized_dict
  

def resize_crop_box_dictionary(crop_box_dictionary, resize_ratio):
    """Scale every coordinate of every crop box by `resize_ratio`.

    Args:
        crop_box_dictionary: mapping of identifier -> iterable of numeric
            coordinates.
        resize_ratio: factor each coordinate is multiplied by before being
            truncated with int().

    Returns:
        OrderedDict with the same keys in the same order and lists of ints
        as values.

    Raises:
        TypeError: if the dictionary itself, or any of its crop boxes, is
            not iterable (for example None).
    """
    return OrderedDict(
        (key, [int(coordinate * resize_ratio) for coordinate in crop_box_dictionary[key]])
        for key in crop_box_dictionary
    )


def draw_text_and_crop(image, crop_box_dictionary):
    """Draw every crop box with its label and return the cropped drawings.

    For each entry a red rectangle is drawn around the crop box on a copy of
    `image`, a filled black strip 45 px high is drawn directly above it, and
    the entry's key is written in white inside that strip. The drawing is
    then cropped to the box and its label strip plus a 25 px margin.

    Args:
        image: numpy image array as used by OpenCV; it is copied for every
            entry and never modified.
        crop_box_dictionary: mapping of label -> [x1, y1, x2, y2] in pixel
            coordinates of `image`.

    Returns:
        OrderedDict of label -> cropped, annotated image.
    """
    drawings_dictionary = OrderedDict()
    font = cv2.FONT_HERSHEY_SIMPLEX

    for data in crop_box_dictionary:

        x1, y1, x2, y2 = crop_box_dictionary[data]

        # draw this box on the ocr_all image
        drawing = cv2.rectangle(image.copy(), (x1, y1), (x2, y2), (0, 0, 255), 8)
        # draw a black rectangle above this to add text to
        drawing = cv2.rectangle(drawing, (x1, y1-45), (x2, y1), (0, 0, 0), -1)  # negative thickness to fill rectangle

        cv2.putText(drawing, str(data), (x1+17, (y1-7)), font, 1, (255,255,255), 2, cv2.LINE_AA)

        # Clamp the start of the crop at 0: a negative start index would be
        # interpreted by numpy as counting from the far edge and yield an
        # empty or wrong crop for boxes close to the top/left border.
        x1_crop = max(x1 - 25, 0)
        y1_crop = max(y1 - 45 - 25, 0)
        x2_crop = x2 + 25
        y2_crop = y2 + 25

        # add drawing with text to dictionary
        drawings_dictionary[data] = drawing[y1_crop:y2_crop, x1_crop:x2_crop]

    return drawings_dictionary


def display_image_dictionary(image, image_dictionary):
    """Plot the images of `image_dictionary` side by side, titled by their keys.

    Args:
        image: not used by the body; kept for the existing call signature.
        image_dictionary: mapping of label -> BGR image array.

    Returns:
        None. A matplotlib figure with one column per entry is created and drawn.
    """
    labels_list = list(image_dictionary)
    images_list = [image_dictionary[label] for label in labels_list]
    number_of_axes = len(labels_list)

    fig, axes = plt.subplots(nrows=1, ncols=number_of_axes, figsize=(8,4))

    # a single column yields one bare Axes instead of an array of them
    axes_sequence = [axes] if number_of_axes == 1 else axes

    for axis, label, image_bgr in zip(axes_sequence, labels_list, images_list):
        # matplotlib expects RGB, OpenCV stores BGR
        axis.imshow(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
        axis.set_title(str(label))

    fig.canvas.draw()

    return
    

# Standalone Output widget.
# NOTE(review): nothing in the visible part of this notebook writes to or displays it -- possibly a leftover.
image_display_widget = widgets.Output()


def select_image(image_name):
    """Load an image and plot the cropped drawing of each of its date guesses.

    The image is read from `image_data_directory_path` and resized to the
    width configured on the PageOne object. Every date guess is drawn and
    cropped with `draw_text_and_crop`, and the crops are plotted side by side
    with matplotlib, each titled with its guess.

    Args:
        image_name: file name of the image, relative to
            `image_data_directory_path`.

    Returns:
        The canvas of the matplotlib figure, or None when there are no date
        guesses to plot.
    """
    image_path = image_data_directory_path.joinpath(image_name)

    page_1 = PageOne(image_path)

    image = cv2.imread(str(page_1.image_path))

    page_1.get_date_crop_dictionary()  # self-sanitizing, loads as list of integers

    resized_image, resize_ratio = resize(image, width=page_1.config['resize_width'])

    try:
        resized_crop_boxes = resize_crop_box_dictionary(page_1.crop_box_dictionary, resize_ratio)
    except TypeError:  # missing dictionary or a missing (None) crop box
        resized_crop_boxes = None

    # An empty dictionary is handled like a missing one: plt.subplots cannot
    # build a grid with zero columns.
    if not resized_crop_boxes:
        print('No Date Guesses Found')
        return

    drawings_dictionary = draw_text_and_crop(resized_image, resized_crop_boxes)

    guesses_list = list(drawings_dictionary)
    number_of_guesses = len(guesses_list)

    clear_output(wait=True)
    fig, axes = plt.subplots(1, number_of_guesses, figsize=(number_of_guesses*5, 3))

    # a single column yields one bare Axes instead of an array of them
    axes_sequence = [axes] if number_of_guesses == 1 else axes

    for axis, guess in zip(axes_sequence, guesses_list):
        # matplotlib expects RGB, OpenCV stores BGR
        axis.imshow(cv2.cvtColor(drawings_dictionary[guess], cv2.COLOR_BGR2RGB))
        axis.set_title(str(guess))

    plt.show()

    return fig.canvas


# csv_path_interactive_widget = widgets.interactive(select_csv, identifier=identifier_widget, metadata_field=metadata_field_widget)
# output_csv_path_widget = interactive(select_csv, 'identifier'=identifier_widget, 'metadata_field'=metadata_field_widget)
# Wrap select_image in an interactive widget driven by the image-name dropdown.
image_interactive_widget = widgets.interactive(select_image, image_name=image_name_widget)
# the last child of the interactive container is used as its output area
output = image_interactive_widget.children[-1]

# Show the dropdown and the output area stacked vertically.
test_widget = VBox([image_name_widget, output])
test_widget.layout.height = '200px'
# test_widget.layout.width = '1200px'
display(test_widget)

In [None]:
# Stdlib imports
from io import BytesIO

# Third-party libraries
from IPython.display import Image as ipyImage
from ipywidgets import interact, interactive, fixed
import matplotlib as mpl
import numpy as np

def arr2img(arr):
    """Encode a 2- or 3-d numpy array as an IPython image for inline display.

    Args:
        arr: 2-d array (saved as PNG with a gray colormap) or 3-d array
            (saved as JPEG).

    Returns:
        IPython.display.Image wrapping the encoded bytes.

    Raises:
        ValueError: if `arr` is neither 2- nor 3-dimensional.
    """
    if arr.ndim == 2:
        format, cmap = 'png', mpl.cm.gray
    elif arr.ndim == 3:
        format, cmap = 'jpg', None
    else:
        raise ValueError("Only 2- or 3-d arrays can be displayed as images.")
    # Don't let matplotlib autoscale the color range so we can control overall luminosity
    vmax = 255 if arr.dtype == 'uint8' else 1.0
    with BytesIO() as buffer:
        mpl.image.imsave(buffer, arr, format=format, cmap=cmap, vmin=0, vmax=vmax)
        out = buffer.getvalue()
    # IPython's Image is imported as ipyImage; the bare name Image is bound to
    # the PIL module in this notebook and cannot be called.
    return ipyImage(out)

In [None]:
# Wrap the image-name dropdown in a VBox and display it as this cell's output.
image_widget = VBox([image_name_widget])
image_widget

In [None]:
# process dates/images

# images currently on external hard drive
# NOTE(review): absolute, machine-specific path -- this cell only works where that volume is mounted.
image_data_directory_path = Path('/Volumes/jmoor167/data/agrtfn/')

# get unique list of image names to process
image_names_list = metadata.dataframe['image_name'].unique().tolist()

resize_width = 800

for image_name in image_names_list:

    # get a dataframe for all rows with image_name
    image_name_dataframe = metadata.dataframe[metadata.dataframe['image_name'] == image_name]

    # create dictionary of date guesses and crop boxes, keyed by date_guess_id
    crop_box_dict = OrderedDict()
    for date_guess_id in image_name_dataframe['date_guess_id']:
        guess_rows = image_name_dataframe.loc[image_name_dataframe['date_guess_id'] == date_guess_id]
        crop_box = guess_rows['date_crop_box'].tolist()[0]
        date_guess = guess_rows['date_guess'].tolist()[0]
        # key by the guess id so that every guess of the image keeps its own entry
        crop_box_dict[date_guess_id] = [date_guess, crop_box]

    # load image as TnFarmNews page_1
    image_path = image_data_directory_path.joinpath(image_name)
    page_1 =  PageOne(image_path)

    image = page_1.preprocess_image(crop_percentage=1, top_and_sides_padding=0)

    # resize image
    resized_image, resize_ratio = resize(image, width=resize_width)

    # create dictionary of resized image with date guess rectangles drawn on
    resized_with_dates_dict = OrderedDict()
    for date_guess_id in crop_box_dict:
        date_guess, crop_box = crop_box_dict[date_guess_id]

        # crop box is a string in form of : "(x1, y1, x2, y2)"
        try:  # stripping off parentheses; commas become separators
            sanitized_crop_box = crop_box.strip('()').replace(',', ' ')
        except AttributeError:  # float instead of string because it's NaN
            # there's not a real guess so (mark it a negative?)
            continue  # on to the next date_guess_id

        # split the crop box on whitespace and cast each string as an int
        sanitized_crop_box = [int(point) for point in sanitized_crop_box.split()]

        # the crop box refers to the original image, so scale it to the resized one
        resized_x1, resized_y1, resized_x2, resized_y2 = [int(point * resize_ratio) for point in sanitized_crop_box]
        resized_crop_box = resized_x1, resized_y1, resized_x2, resized_y2

        # draw this box on the resized image
        resized_drawing = cv2.rectangle(resized_image.copy(), (resized_x1, resized_y1), (resized_x2, resized_y2), (0, 0, 255), 8)
        # draw a black rectangle above this to add text to
        resized_drawing = cv2.rectangle(resized_drawing, (resized_x1, resized_y1-100), (resized_x2, resized_y1), (0, 0, 0), -1)  # negative thickness to fill rectangle

        font = cv2.FONT_HERSHEY_SIMPLEX
        print(f'date_guess before fail: {date_guess}')
        # a missing guess arrives as None or as a NaN float; only text can be drawn
        if isinstance(date_guess, str):
            date_writing = date_guess
        else:
            date_writing = 'Date Guess Fail'
        cv2.putText(resized_drawing, date_writing, (resized_x1+25, (resized_y1-25)), font, 2, (255,255,255), 3, cv2.LINE_AA)

        resized_with_dates_dict[date_guess_id] = resized_drawing

        quick_imshow(resized_drawing)
    
    

In [None]:
# Re-display the date selection widget assembled in an earlier cell.
date_widget