### This is intended to be an end-to-end pipeline

### Checklist for deployment

- [ ] Copy the `nets` and `utils` packages into the main directory (the code below imports them as `nets` and `utils`).
- [ ] Compile `bbox` in the `utils` folder.
- [ ] Remember to run `pip install future`.

## Dewarp Experiment
To use `page_dewarp("/path/to/image.jpg")`, first run `pip install future`; the dewarp code imports the `builtins` and `past` modules that this package provides.


In [44]:
import re

import pandas as pd
import pytesseract
from pytesseract import Output
from tqdm import tqdm
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [46]:
import nltk
from nltk.corpus import stopwords
# Download the NLTK resources used further down: the stop-word lists read via
# `stopwords.words()` and the 'punkt' data (presumably required by
# `word_tokenize` -- confirm against the NLTK docs).
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cedric/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/cedric/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
from __future__ import division
from __future__ import print_function
from builtins import zip
from builtins import str
from builtins import range
from builtins import object
from past.utils import old_div
import os
import sys
import datetime
import cv2
from PIL import Image
import numpy as np
import scipy.optimize

In [88]:


# for some reason pylint complains about cv2 members being undefined :(
# pylint: disable=E1101

# ---------------------------------------------------------------------------
# Tunable settings for the page-dewarp stage.
#
# "reduced px" means pixels of the downscaled working image (`small`, produced
# by resize_to_screen), not of the full-resolution input.
# ---------------------------------------------------------------------------

PAGE_MARGIN_X = 40       # reduced px to ignore near L/R edge
PAGE_MARGIN_Y = 20       # reduced px to ignore near T/B edge

OUTPUT_ZOOM = 1.0        # how much to zoom output relative to *original* image
OUTPUT_DPI = 300         # just affects stated DPI of PNG, not appearance
REMAP_DECIMATE = 16      # downscaling factor for remapping image

ADAPTIVE_WINSZ = 55      # window size for adaptive threshold in reduced px

TEXT_MIN_WIDTH = 15      # min reduced px width of detected text contour
TEXT_MIN_HEIGHT = 2      # min reduced px height of detected text contour
TEXT_MIN_ASPECT = 1.5    # filter out text contours below this w/h ratio
TEXT_MAX_THICKNESS = 10  # max reduced px thickness of detected text contour

EDGE_MAX_OVERLAP = 1.0   # max reduced px horiz. overlap of contours in span
EDGE_MAX_LENGTH = 100.0  # max reduced px length of edge connecting contours
EDGE_ANGLE_COST = 10.0   # cost of angles in edges (tradeoff vs. length)
EDGE_MAX_ANGLE = 7.5     # maximum change in angle allowed between contours

# Layout of the flat parameter vector built by get_default_params:
# [rvec (3), tvec (3), cubic slopes (2), one y per span, all span x coords].
RVEC_IDX = slice(0, 3)   # index of rvec in params vector
TVEC_IDX = slice(3, 6)   # index of tvec in params vector
CUBIC_IDX = slice(6, 8)  # index of cubic slopes in params vector

SPAN_MIN_WIDTH = 30      # minimum reduced px width for span
SPAN_PX_PER_STEP = 20    # reduced px spacing for sampling along spans
FOCAL_LENGTH = 1.2       # normalized focal length of camera

DEBUG_LEVEL = 0          # 0=none, 1=some, 2=lots, 3=all
DEBUG_OUTPUT = 'file'    # file, screen, both

WINDOW_NAME = 'Dewarp'   # Window name for visualization

# nice color palette for visualizing contours, etc.
# (24 entries; the visualize_* helpers index it modulo its length)
CCOLORS = [
    (255, 0, 0),
    (255, 63, 0),
    (255, 127, 0),
    (255, 191, 0),
    (255, 255, 0),
    (191, 255, 0),
    (127, 255, 0),
    (63, 255, 0),
    (0, 255, 0),
    (0, 255, 63),
    (0, 255, 127),
    (0, 255, 191),
    (0, 255, 255),
    (0, 191, 255),
    (0, 127, 255),
    (0, 63, 255),
    (0, 0, 255),
    (63, 0, 255),
    (127, 0, 255),
    (191, 0, 255),
    (255, 0, 255),
    (255, 0, 191),
    (255, 0, 127),
    (255, 0, 63),
]

# default intrinsic parameter matrix
# (focal length FOCAL_LENGTH on both axes, principal point at the origin;
# passed to cv2.solvePnP and cv2.projectPoints together with zero distortion)
K = np.array([
    [FOCAL_LENGTH, 0, 0],
    [0, FOCAL_LENGTH, 0],
    [0, 0, 1]], dtype=np.float32)


def debug_show(name, step, text, display):
    """Emit a debug image to a PNG file and/or an on-screen window.

    Where the image goes is controlled by DEBUG_OUTPUT: anything other than
    'screen' writes a file, anything other than 'file' opens a window.

    name    -- prefix of the output file name
    step    -- step number embedded in the file name
    text    -- caption; used in the file name (spaces become underscores)
               and drawn onto the on-screen copy
    display -- image array to write / show
    """
    to_file = DEBUG_OUTPUT != 'screen'
    to_screen = DEBUG_OUTPUT != 'file'

    if to_file:
        caption_slug = text.replace(' ', '_')
        pieces = [name, '_debug_', str(step), '_', caption_slug, '.png']
        cv2.imwrite(''.join(pieces), display)

    if not to_screen:
        return

    canvas = display.copy()
    anchor = (16, canvas.shape[0] - 16)

    # thick black pass first, then a thin white pass on top, so the caption
    # stays readable on any background
    for color, thickness in (((0, 0, 0), 3), ((255, 255, 255), 1)):
        cv2.putText(canvas, text, anchor,
                    cv2.FONT_HERSHEY_SIMPLEX, 1.0,
                    color, thickness, cv2.LINE_AA)

    cv2.imshow(WINDOW_NAME, canvas)

    # block until waitKey reports a key press (non-negative return value)
    while True:
        if cv2.waitKey(5) >= 0:
            break


def round_nearest_multiple(i, factor):
    """Truncate *i* to an int and round it up to a multiple of *factor*.

    A value that is already a multiple of *factor* is returned unchanged.
    """
    value = int(i)
    remainder = value % factor
    return value + factor - remainder if remainder else value


def pix2norm(shape, pts):
    """Map pixel coordinates to normalized, image-centred coordinates.

    shape -- image shape; only the first two entries (height, width) are used
    pts   -- array of (x, y) points, broadcastable against shape (1, 1, 2)

    The image centre maps to (0, 0) and the longer image side spans [-1, 1].
    """
    rows, cols = shape[:2]
    centre = np.array([cols, rows], dtype=pts.dtype).reshape((-1, 1, 2))*0.5
    scale = 2.0/(max(rows, cols))
    return (pts - centre) * scale


def norm2pix(shape, pts, as_integer):
    """Map normalized coordinates (see pix2norm) back to pixel coordinates.

    shape      -- image shape; only (height, width) are used
    pts        -- array of normalized (x, y) points
    as_integer -- when true, add 0.5 and cast to int; otherwise return
                  the floating-point result
    """
    rows, cols = shape[:2]
    centre = np.array([0.5*cols, 0.5*rows],
                      dtype=pts.dtype).reshape((-1, 1, 2))
    pixels = pts * (max(rows, cols)*0.5) + centre

    if not as_integer:
        return pixels

    return (pixels + 0.5).astype(int)


def fltp(point):
    """Flatten *point* into a tuple of built-in ints (truncating toward zero).

    The result is used as a pixel coordinate for cv2 drawing calls.  The
    values are converted to plain Python ``int`` rather than NumPy integer
    scalars because some OpenCV builds refuse NumPy scalars inside point
    tuples.
    """
    return tuple(np.asarray(point).astype(int).ravel().tolist())


def draw_correspondences(img, dstpoints, projpts):
    """Return a copy of *img* showing target and projected keypoints.

    Both point sets are given in normalized coordinates and converted to
    integer pixels first.  Projected points are drawn as filled circles in
    (255, 0, 0), target points in (0, 0, 255), and each corresponding pair is
    joined by a white line.
    """
    canvas = img.copy()
    target_px = norm2pix(img.shape, dstpoints, True)
    projected_px = norm2pix(img.shape, projpts, True)

    # projected points first, targets second, so targets end up on top
    layers = ((projected_px, (255, 0, 0)), (target_px, (0, 0, 255)))
    for layer_pts, layer_color in layers:
        for pt in layer_pts:
            cv2.circle(canvas, fltp(pt), 3, layer_color, -1, cv2.LINE_AA)

    for proj_pt, target_pt in zip(projected_px, target_px):
        cv2.line(canvas, fltp(proj_pt), fltp(target_pt),
                 (255, 255, 255), 1, cv2.LINE_AA)

    return canvas


def get_default_params(corners, ycoords, xcoords):
    """Build the initial parameter vector for the dewarp optimization.

    corners -- four page corners in normalized image coordinates
    ycoords -- array with one y coordinate per span
    xcoords -- sequence of arrays, the x coordinates sampled along each span

    Returns (rough_dims, span_counts, params): the (width, height) estimated
    from the corner distances, the number of samples in each span, and the
    flat vector [rvec, tvec, cubic slopes, ycoords, xcoords...].
    """
    # page extent from the distances corner0->corner1 and corner0->last corner
    page_width = np.linalg.norm(corners[1] - corners[0])
    page_height = np.linalg.norm(corners[-1] - corners[0])

    # flat page in 3D object space, in the same order as `corners`
    flat_page = np.array([
        [0, 0, 0],
        [page_width, 0, 0],
        [page_width, page_height, 0],
        [0, page_height, 0]])

    # pose estimate from the four 2D-to-3D correspondences (no distortion)
    _, rvec, tvec = cv2.solvePnP(flat_page, corners, K, np.zeros(5))

    # the initial cubic is flat: both end slopes are zero
    flat_slopes = [0.0, 0.0]

    leading = (np.array(rvec).flatten(),
               np.array(tvec).flatten(),
               np.array(flat_slopes).flatten(),
               ycoords.flatten())
    params = np.hstack(leading + tuple(xcoords))

    span_counts = [len(xc) for xc in xcoords]
    rough_dims = (page_width, page_height)

    return rough_dims, span_counts, params


def project_xy(xy_coords, pvec):
    """Project page-plane (x, y) points into the image.

    xy_coords -- array reshaped to (-1, 2) page coordinates
    pvec      -- parameter vector; the RVEC_IDX, TVEC_IDX and CUBIC_IDX
                 slices are read

    The z coordinate of each point is a cubic in x with

        f(0) = 0, f'(0) = alpha
        f(1) = 0, f'(1) = beta

    where (alpha, beta) are the two cubic slopes stored in *pvec*.
    Returns the output of cv2.projectPoints for the resulting 3D points.
    """
    alpha, beta = tuple(pvec[CUBIC_IDX])

    # coefficients in np.polyval order (highest power first)
    cubic = np.array([
        alpha + beta,
        -2*alpha - beta,
        alpha,
        0])

    flat_xy = xy_coords.reshape((-1, 2))
    heights = np.polyval(cubic, flat_xy[:, 0]).reshape((-1, 1))
    object_points = np.hstack((flat_xy, heights))

    projected, _ = cv2.projectPoints(object_points,
                                     pvec[RVEC_IDX],
                                     pvec[TVEC_IDX],
                                     K, np.zeros(5))

    return projected


def project_keypoints(pvec, keypoint_index):
    """Project every keypoint described by *keypoint_index* into the image.

    keypoint_index is an (N, 2) integer array of positions inside *pvec*
    (see make_keypoint_index); indexing *pvec* with it gathers one (x, y)
    pair per row.  Row 0 is forced to (0, 0).
    """
    gathered = pvec[keypoint_index]  # integer-array indexing yields a copy
    gathered[0, :] = 0

    return project_xy(gathered, pvec)


def resize_to_screen(src, maxw=1280, maxh=700, copy=False):
    """Shrink *src* by an integer factor so it fits in maxw x maxh.

    The factor is the ceiling of the larger of width/maxw and height/maxh.
    When it is greater than 1 the image is resized with INTER_AREA.  Otherwise
    *src* itself is returned, or a copy of it when *copy* is true.
    """
    rows, cols = src.shape[:2]

    shrink = int(np.ceil(max(float(cols)/maxw, float(rows)/maxh)))

    if shrink > 1.0:
        factor = 1.0/shrink
        return cv2.resize(src, (0, 0), None, factor, factor, cv2.INTER_AREA)

    if copy:
        return src.copy()

    return src


def box(width, height):
    """Return a height-by-width uint8 array of ones (a rectangular kernel)."""
    kernel_shape = (height, width)
    return np.ones(kernel_shape, dtype=np.uint8)


def get_page_extents(small):
    """Return the page mask and page outline for the working image.

    The page is taken to be the image minus a fixed margin of PAGE_MARGIN_X /
    PAGE_MARGIN_Y pixels on each side.

    Returns (page, outline): a uint8 mask that is 255 inside the page
    rectangle and 0 elsewhere, and a 4x2 array with the rectangle's corners.
    """
    rows, cols = small.shape[:2]

    left, top = PAGE_MARGIN_X, PAGE_MARGIN_Y
    right, bottom = cols-PAGE_MARGIN_X, rows-PAGE_MARGIN_Y

    page = np.zeros((rows, cols), dtype=np.uint8)
    cv2.rectangle(page, (left, top), (right, bottom), (255, 255, 255), -1)

    outline = np.array([
        [left, top],
        [left, bottom],
        [right, bottom],
        [right, top]])

    return page, outline


def get_mask(name, small, pagemask, masktype):
    """Build a binary mask of candidate text (or line) pixels.

    name     -- file-name prefix, used only for debug output
    small    -- working image (3-channel)
    pagemask -- page mask; the result is the element-wise minimum with it
    masktype -- 'text' selects the text pipeline, anything else the
                alternative (line) pipeline

    Both pipelines apply an inverted adaptive mean threshold and then
    morphology: 'text' dilates with a 9x1 box and erodes with a 1x3 box; the
    other mode erodes three times with a 3x1 box and dilates with an 8x2 box.
    """
    def trace(step, label, image):
        # intermediate masks are only dumped at the most verbose debug level
        if DEBUG_LEVEL >= 3:
            debug_show(name, step, label, image)

    gray = cv2.cvtColor(small, cv2.COLOR_RGB2GRAY)
    text_mode = (masktype == 'text')

    # the two modes differ in the constant passed to the threshold: 25 vs 7
    threshold_c = 25 if text_mode else 7
    mask = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                 cv2.THRESH_BINARY_INV,
                                 ADAPTIVE_WINSZ, threshold_c)

    if text_mode:
        trace(0.1, 'thresholded', mask)

        mask = cv2.dilate(mask, box(9, 1))
        trace(0.2, 'dilated', mask)

        mask = cv2.erode(mask, box(1, 3))
        trace(0.3, 'eroded', mask)
    else:
        trace(0.4, 'thresholded', mask)

        mask = cv2.erode(mask, box(3, 1), iterations=3)
        trace(0.5, 'eroded', mask)

        mask = cv2.dilate(mask, box(8, 2))
        trace(0.6, 'dilated', mask)

    return np.minimum(mask, pagemask)


def interval_measure_overlap(int_a, int_b):
    """Return the signed overlap length of two (lo, hi) intervals.

    The result is negative when the intervals are disjoint; its magnitude is
    then the size of the gap between them.
    """
    shared_lo = max(int_a[0], int_b[0])
    shared_hi = min(int_a[1], int_b[1])
    return shared_hi - shared_lo


def angle_dist(angle_b, angle_a):
    """Return the absolute difference of two angles in radians.

    The raw difference is first wrapped into [-pi, pi] by adding or
    subtracting full turns.
    """
    delta = angle_b - angle_a
    full_turn = 2*np.pi

    while delta > np.pi:
        delta -= full_turn

    while delta < -np.pi:
        delta += full_turn

    return np.abs(delta)


def blob_mean_and_tangent(contour):
    """Return the centroid and principal direction of a contour.

    Both are derived from the contour's image moments: the centroid from the
    first-order moments, the tangent as the first left singular vector of the
    second-order central moment matrix (each divided by the area).

    A contour with zero area yields ([0, 0], [0, 0]).
    """
    m = cv2.moments(contour)
    area = m['m00']

    if area == 0:
        return np.array([0,0]), np.array([0,0])

    centroid = np.array([old_div(m['m10'], area),
                         old_div(m['m01'], area)])

    second_moments = old_div(np.array([
        [m['mu20'], m['mu11']],
        [m['mu11'], m['mu02']]
    ]), area)

    _, left_vectors, _ = cv2.SVDecomp(second_moments)
    principal_axis = left_vectors[:, 0].flatten().copy()

    return centroid, principal_axis


class ContourInfo(object):
    """Geometry of one detected contour plus its links inside a text span.

    Attributes set by the constructor:
      contour, rect, mask -- stored exactly as passed in
      center, tangent     -- result of blob_mean_and_tangent(contour)
      angle               -- arctan2 of the tangent vector
      local_xrng          -- (min, max) of the contour points projected
                             onto the tangent, relative to the center
      point0, point1      -- center + tangent * min / max of that range
      pred, succ          -- neighbouring ContourInfo in a span, or None
    """

    def __init__(self, contour, rect, mask):
        self.contour, self.rect, self.mask = contour, rect, mask

        self.center, self.tangent = blob_mean_and_tangent(contour)
        self.angle = np.arctan2(self.tangent[1], self.tangent[0])

        # extent of the contour along its own tangent direction
        projections = [self.proj_x(pt) for pt in contour]
        self.local_xrng = (min(projections), max(projections))

        near, far = self.local_xrng
        self.point0 = self.center + self.tangent * near
        self.point1 = self.center + self.tangent * far

        # span links, filled in later by assemble_spans
        self.pred = None
        self.succ = None

    def proj_x(self, point):
        """Project *point* onto the tangent, measured from the center."""
        offset = point.flatten()-self.center
        return np.dot(self.tangent, offset)

    def local_overlap(self, other):
        """Overlap between this contour and *other* along this tangent."""
        other_range = (self.proj_x(other.point0), self.proj_x(other.point1))
        return interval_measure_overlap(self.local_xrng, other_range)


def generate_candidate_edge(cinfo_a, cinfo_b):
    """Score the link between two contours, or reject it.

    Returns None when the gap, the overlap or the angle change exceeds its
    EDGE_MAX_* limit; otherwise (score, left_cinfo, right_cinfo), where a
    lower score is better.
    """
    # we want a left of b (so a's successor will be b and b's predecessor
    # will be a): swap if a's left endpoint is right of b's right endpoint
    if cinfo_a.point0[0] > cinfo_b.point1[0]:
        cinfo_a, cinfo_b = cinfo_b, cinfo_a

    # direction from a's center to b's center
    centre_delta = cinfo_b.center - cinfo_a.center
    link_angle = np.arctan2(centre_delta[1], centre_delta[0])

    # worst disagreement between either contour's angle and the link, in deg
    worst_rad = max(angle_dist(cinfo_a.angle, link_angle),
                    angle_dist(cinfo_b.angle, link_angle))
    delta_angle = old_div(worst_rad * 180, np.pi)

    # we want the largest overlap in x to be small
    x_overlap = max(cinfo_a.local_overlap(cinfo_b),
                    cinfo_b.local_overlap(cinfo_a))

    # gap between a's right endpoint and b's left endpoint
    dist = np.linalg.norm(cinfo_b.point0 - cinfo_a.point1)

    too_far = dist > EDGE_MAX_LENGTH
    too_overlapped = x_overlap > EDGE_MAX_OVERLAP
    too_bent = delta_angle > EDGE_MAX_ANGLE

    if too_far or too_overlapped or too_bent:
        return None

    return (dist + delta_angle*EDGE_ANGLE_COST, cinfo_a, cinfo_b)


def make_tight_mask(contour, xmin, ymin, width, height):
    """Rasterize *contour* into a mask the size of its bounding box.

    The contour is shifted by (-xmin, -ymin) and drawn filled with value 1
    into a fresh height-by-width uint8 array, which is returned.
    """
    origin = np.array((xmin, ymin)).reshape((-1, 1, 2))
    shifted = contour - origin

    canvas = np.zeros((height, width), dtype=np.uint8)
    cv2.drawContours(canvas, [shifted], 0, (1, 1, 1), -1)

    return canvas


def get_contours(name, small, pagemask, masktype):
    """Detect contours that look like pieces of text lines.

    name     -- file-name prefix, used only for debug output
    small    -- working image
    pagemask -- page mask passed through to get_mask
    masktype -- mask pipeline selector passed through to get_mask

    Returns a list of ContourInfo, one per external contour of the mask that
    is wide enough, tall enough, elongated enough and thin enough (see the
    TEXT_* constants).
    """
    mask = get_mask(name, small, pagemask, masktype)

    # cv2.findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
    # (image, contours, hierarchy) in 3.x; the contour list is always the
    # second-from-last element, so index it instead of unpacking.
    contours = cv2.findContours(mask, cv2.RETR_EXTERNAL,
                                cv2.CHAIN_APPROX_NONE)[-2]

    contours_out = []

    for contour in contours:

        rect = cv2.boundingRect(contour)
        xmin, ymin, width, height = rect

        # reject blobs that are too small or not elongated horizontally
        if (width < TEXT_MIN_WIDTH or
                height < TEXT_MIN_HEIGHT or
                width < TEXT_MIN_ASPECT*height):
            continue

        tight_mask = make_tight_mask(contour, xmin, ymin, width, height)

        # reject blobs whose tallest column of filled pixels is too thick
        if tight_mask.sum(axis=0).max() > TEXT_MAX_THICKNESS:
            continue

        contours_out.append(ContourInfo(contour, rect, tight_mask))

    if DEBUG_LEVEL >= 2:
        visualize_contours(name, small, contours_out)

    return contours_out


def assemble_spans(name, small, pagemask, cinfo_list):
    """Chain contours into left-to-right spans (candidate text lines).

    name, small, pagemask -- used only for debug visualization
    cinfo_list            -- ContourInfo objects; their pred/succ links are
                             set by this function

    Returns a list of spans, each a list of ContourInfo ordered from first to
    last.  Spans whose summed contour width is not greater than
    SPAN_MIN_WIDTH are dropped.
    """
    # process contours from top to bottom (by bounding-rect y)
    cinfo_list = sorted(cinfo_list, key=lambda cinfo: cinfo.rect[1])

    # generate all candidate edges, each (score, left_cinfo, right_cinfo)
    candidate_edges = []

    for i, cinfo_i in enumerate(cinfo_list):
        for j in range(i):
            edge = generate_candidate_edge(cinfo_i, cinfo_list[j])
            if edge is not None:
                candidate_edges.append(edge)

    # Sort by score only (lower is better).  Sorting the raw tuples would
    # fall back to comparing ContourInfo objects when two scores tie, which
    # raises TypeError on Python 3.
    candidate_edges.sort(key=lambda edge: edge[0])

    # greedily accept edges whose endpoints are both still unassigned
    for _, cinfo_a, cinfo_b in candidate_edges:
        if cinfo_a.succ is None and cinfo_b.pred is None:
            cinfo_a.succ = cinfo_b
            cinfo_b.pred = cinfo_a

    spans = []

    # ids of contours already placed in a span; replaces repeated
    # list.remove() calls while visiting contours in the same order
    consumed = set()

    for first in cinfo_list:

        if id(first) in consumed:
            continue

        # walk back to the head of the chain containing this contour
        cinfo = first
        while cinfo.pred:
            cinfo = cinfo.pred

        cur_span = []
        width = 0.0

        # follow successors to the end of the chain
        while cinfo:
            consumed.add(id(cinfo))
            cur_span.append(cinfo)
            width += cinfo.local_xrng[1] - cinfo.local_xrng[0]
            cinfo = cinfo.succ

        # keep only spans that are long enough
        if width > SPAN_MIN_WIDTH:
            spans.append(cur_span)

    if DEBUG_LEVEL >= 2:
        visualize_spans(name, small, pagemask, spans)

    return spans


def sample_spans(shape, spans):
    """Sample points along the vertical centre of every span.

    shape -- shape of the working image, used for normalization
    spans -- list of spans (lists of ContourInfo)

    For each contour the mask-weighted mean row of every column is computed,
    and every SPAN_PX_PER_STEP-th column is sampled.  Returns one float32
    array of shape (-1, 1, 2) per span, in normalized coordinates.
    """
    step = SPAN_PX_PER_STEP
    span_points = []

    for span in spans:

        samples = []

        for cinfo in span:

            # mean row index of the filled pixels in each mask column
            row_index = np.arange(cinfo.mask.shape[0]).reshape((-1, 1))
            weighted = (row_index * cinfo.mask).sum(axis=0)
            column_means = old_div(weighted, cinfo.mask.sum(axis=0))

            xmin, ymin = cinfo.rect[:2]

            # centre the sampling grid within the contour's width
            first = old_div(((len(column_means)-1) % step), 2)

            for x in range(first, len(column_means), step):
                samples.append((x+xmin, column_means[x]+ymin))

        pixel_points = np.array(samples,
                                dtype=np.float32).reshape((-1, 1, 2))

        span_points.append(pix2norm(shape, pixel_points))

    return span_points


def keypoints_from_samples(name, small, pagemask, page_outline,
                           span_points):
    """Derive page corners and per-span page coordinates from span samples.

    name, small  -- used only for debug visualization
    pagemask     -- its shape is used to normalize the page outline
    page_outline -- page outline in pixel coordinates
    span_points  -- per-span sample arrays in normalized coordinates

    Returns (corners, ycoords, xcoords): the four corners of the page
    rectangle aligned with the dominant text direction, an array with the
    mean y offset of each span, and a list with the x offsets of each span's
    samples.  Offsets are measured from the rectangle's minimum corner.
    """
    # length-weighted average of each span's principal direction
    direction_sum = np.array([[0.0, 0.0]])
    weight_sum = 0

    for points in span_points:

        _, direction = cv2.PCACompute(points.reshape((-1, 2)),
                                      None, maxComponents=1)

        span_length = np.linalg.norm(points[-1] - points[0])

        direction_sum += direction * span_length
        weight_sum += span_length

    x_dir = old_div(direction_sum, weight_sum).flatten()

    # make the text direction point toward +x
    if x_dir[0] < 0:
        x_dir = -x_dir

    y_dir = np.array([-x_dir[1], x_dir[0]])

    # page outline expressed in the (x_dir, y_dir) frame
    hull = cv2.convexHull(page_outline)
    hull = pix2norm(pagemask.shape, hull.reshape((-1, 1, 2)))
    hull = hull.reshape((-1, 2))

    hull_x = np.dot(hull, x_dir)
    hull_y = np.dot(hull, y_dir)

    px0, px1 = hull_x.min(), hull_x.max()
    py0, py1 = hull_y.min(), hull_y.max()

    # corners of the bounding rectangle in that frame
    corner_rows = (px0 * x_dir + py0 * y_dir,
                   px1 * x_dir + py0 * y_dir,
                   px1 * x_dir + py1 * y_dir,
                   px0 * x_dir + py1 * y_dir)

    corners = np.vstack(corner_rows).reshape((-1, 1, 2))

    ycoords = []
    xcoords = []

    for points in span_points:
        flat = points.reshape((-1, 2))
        along_x = np.dot(flat, x_dir)
        along_y = np.dot(flat, y_dir)
        ycoords.append(along_y.mean() - py0)
        xcoords.append(along_x - px0)

    if DEBUG_LEVEL >= 2:
        visualize_span_points(name, small, span_points, corners)

    return corners, np.array(ycoords), xcoords


def visualize_contours(name, small, cinfo_list):
    """Debug view: tint each contour and mark its center and principal axis.

    The result is handed to debug_show as step 1, 'contours'.
    """
    palette_size = len(CCOLORS)
    regions = np.zeros_like(small)

    # paint every contour, filled, in a palette color
    for index, cinfo in enumerate(cinfo_list):
        cv2.drawContours(regions, [cinfo.contour], 0,
                         CCOLORS[index % palette_size], -1)

    painted = (regions.max(axis=2) != 0)

    # blend image and overlay half-and-half where something was painted
    display = small.copy()
    display[painted] = (old_div(display[painted],2)) + (old_div(regions[painted],2))

    for cinfo in cinfo_list:
        cv2.circle(display, fltp(cinfo.center), 3,
                   (255, 255, 255), 1, cv2.LINE_AA)

        cv2.line(display, fltp(cinfo.point0), fltp(cinfo.point1),
                 (255, 255, 255), 1, cv2.LINE_AA)

    debug_show(name, 1, 'contours', display)


def visualize_spans(name, small, pagemask, spans):
    """Debug view: tint every span in its own color and dim the margin.

    The result is handed to debug_show as step 2, 'spans'.
    """
    palette_size = len(CCOLORS)
    regions = np.zeros_like(small)

    # stride of 3 through the palette so adjacent spans get distinct colors
    for index, span in enumerate(spans):
        span_contours = [cinfo.contour for cinfo in span]
        cv2.drawContours(regions, span_contours, -1,
                         CCOLORS[index*3 % palette_size], -1)

    painted = (regions.max(axis=2) != 0)

    display = small.copy()
    display[painted] = (old_div(display[painted],2)) + (old_div(regions[painted],2))

    # darken everything outside the page mask
    display[pagemask == 0] //= 4

    debug_show(name, 2, 'spans', display)


def visualize_span_points(name, small, span_points, corners):
    """Debug view: draw span samples, their fitted axes and the page outline.

    The result is handed to debug_show as step 3, 'span points'.
    """
    display = small.copy()
    palette_size = len(CCOLORS)

    for index, points in enumerate(span_points):

        pixel_points = norm2pix(small.shape, points, False)
        flat_points = pixel_points.reshape((-1, 2))

        mean, axis = cv2.PCACompute(flat_points, None, maxComponents=1)

        # extent of the samples along the principal axis, relative to mean
        along = np.dot(flat_points, axis.reshape((2, 1)))
        mean_along = np.dot(mean.flatten(), axis.flatten())

        axis_start = mean + axis * (along.min()-mean_along)
        axis_end = mean + axis * (along.max()-mean_along)

        for pt in pixel_points:
            cv2.circle(display, fltp(pt), 3,
                       CCOLORS[index % palette_size], -1, cv2.LINE_AA)

        cv2.line(display, fltp(axis_start), fltp(axis_end),
                 (255, 255, 255), 1, cv2.LINE_AA)

    outline = norm2pix(small.shape, corners, True)
    cv2.polylines(display, [outline], True, (255, 255, 255))

    debug_show(name, 3, 'span points', display)


def imgsize(img):
    """Return the image size formatted as 'WIDTHxHEIGHT'."""
    rows, cols = img.shape[:2]
    dims = (cols, rows)
    return '{}x{}'.format(*dims)


def make_keypoint_index(span_counts):
    """Build the index array that maps keypoints to parameter-vector slots.

    span_counts -- number of sampled points in each span

    Returns an (npts + 1, 2) integer array.  Row 0 is all zeros.  For every
    other row, column 0 holds the index of that point's x parameter
    (8 + nspans + point number) and column 1 the index of its span's y
    parameter (8 + span number).
    """
    nspans = len(span_counts)
    npts = sum(span_counts)

    keypoint_index = np.zeros((npts+1, 2), dtype=int)

    # x parameters follow the 8 fixed parameters and the nspans y parameters
    keypoint_index[1:, 0] = np.arange(npts) + 8 + nspans

    # every point of span i shares the y parameter at index 8 + i
    first_row = 1
    for span_number, count in enumerate(span_counts):
        last_row = first_row + count
        keypoint_index[first_row:last_row, 1] = 8+span_number
        first_row = last_row

    return keypoint_index


def optimize_params(name, small, dstpoints, span_counts, params):
    """Refine the parameter vector by minimizing the reprojection error.

    name, small -- used only for debug visualization
    dstpoints   -- target keypoints in normalized image coordinates
    span_counts -- number of sampled points per span
    params      -- initial parameter vector

    Runs scipy's Powell minimizer on the sum of squared differences between
    *dstpoints* and the projected keypoints, prints progress and timing, and
    returns the optimized vector.
    """
    keypoint_index = make_keypoint_index(span_counts)

    def objective(pvec):
        # squared reprojection error summed over all keypoints
        projected = project_keypoints(pvec, keypoint_index)
        return np.sum((dstpoints - projected)**2)

    def show_keypoints(step, label, pvec):
        # before/after visualization, only when debugging is enabled
        if DEBUG_LEVEL >= 1:
            projected = project_keypoints(pvec, keypoint_index)
            overlay = draw_correspondences(small, dstpoints, projected)
            debug_show(name, step, label, overlay)

    print('  initial objective is', objective(params))
    show_keypoints(4, 'keypoints before', params)

    print('  optimizing', len(params), 'parameters...')
    began = datetime.datetime.now()
    result = scipy.optimize.minimize(objective, params,
                                     method='Powell')
    finished = datetime.datetime.now()
    print('  optimization took', round((finished-began).total_seconds(), 2), 'sec.')
    print('  final objective is', result.fun)

    params = result.x
    show_keypoints(5, 'keypoints after', params)

    return params


def get_page_dims(corners, rough_dims, params):
    """Refine the page dimensions using the optimized projection.

    Starting from *rough_dims*, searches (Powell) for the page-plane point
    whose projection lands on corners[2], and returns that point as the
    (width, height) of the page.
    """
    target_corner = corners[2].flatten()

    def objective(candidate):
        # squared distance between projected candidate and the target corner
        projected = project_xy(candidate, params)
        return np.sum((target_corner - projected.flatten())**2)

    result = scipy.optimize.minimize(objective, np.array(rough_dims),
                                     method='Powell')
    dims = result.x

    print('  got page dims', dims[0], 'x', dims[1])

    return dims


def remap_image(name, img, small, page_dims, params, output_path):
    """Dewarp *img*, threshold it and save it as a 1-bit PNG.

    name        -- base name; the file is written as '<name>_thresh.png'
    img         -- full-resolution input image
    small       -- working image, used only to size the debug view
    page_dims   -- (width, height) of the page in page coordinates
    params      -- optimized parameter vector
    output_path -- output directory, created if it does not exist

    Returns the path of the written file.
    """
    # output size, rounded up to multiples of REMAP_DECIMATE
    height = 0.5 * page_dims[1] * OUTPUT_ZOOM * img.shape[0]
    height = round_nearest_multiple(height, REMAP_DECIMATE)

    width = round_nearest_multiple(old_div(height * page_dims[0], page_dims[1]),
                                   REMAP_DECIMATE)

    print('  output will be {}x{}'.format(width, height))

    # the projection is evaluated on a coarse grid, then upsampled
    coarse_h = old_div(height, REMAP_DECIMATE)
    coarse_w = old_div(width, REMAP_DECIMATE)

    grid_x, grid_y = np.meshgrid(np.linspace(0, page_dims[0], coarse_w),
                                 np.linspace(0, page_dims[1], coarse_h))

    page_xy = np.hstack((grid_x.flatten().reshape((-1, 1)),
                         grid_y.flatten().reshape((-1, 1))))
    page_xy = page_xy.astype(np.float32)

    projected = norm2pix(img.shape, project_xy(page_xy, params), False)

    map_x = projected[:, 0, 0].reshape(grid_x.shape)
    map_y = projected[:, 0, 1].reshape(grid_y.shape)

    map_x = cv2.resize(map_x, (width, height),
                       interpolation=cv2.INTER_CUBIC)
    map_y = cv2.resize(map_y, (width, height),
                       interpolation=cv2.INTER_CUBIC)

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    remapped = cv2.remap(gray, map_x, map_y,
                         cv2.INTER_CUBIC,
                         None, cv2.BORDER_REPLICATE)

    # small 7-px window with constant 4 (an ADAPTIVE_WINSZ / 25 variant was
    # tried previously)
    thresh = cv2.adaptiveThreshold(remapped, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 7, 4)

    bilevel = Image.fromarray(thresh).convert('1')

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    output_threshfile = os.path.join(output_path, name + '_thresh.png')
    bilevel.save(output_threshfile, dpi=(OUTPUT_DPI, OUTPUT_DPI))

    if DEBUG_LEVEL >= 1:
        # scale the result to the working image's height for display
        view_h = small.shape[0]
        view_w = int(round(view_h * float(thresh.shape[1])/thresh.shape[0]))
        preview = cv2.resize(thresh, (view_w, view_h),
                             interpolation=cv2.INTER_AREA)
        debug_show(name, 6, 'output', preview)

    return output_threshfile


def page_dewarp(imgfile, output_path="threshes"):
    """Dewarp a photographed page and write a thresholded PNG.

    imgfile     -- path of the input image
    output_path -- directory for the output file (created if missing)

    Returns the path of the written '<name>_thresh.png', or None when no text
    span could be detected in the image.

    Raises IOError when the image cannot be read.
    """
    if DEBUG_LEVEL > 0 and DEBUG_OUTPUT != 'file':
        cv2.namedWindow(WINDOW_NAME)

    img = cv2.imread(imgfile)

    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the file name rather than later with an AttributeError
    if img is None:
        raise IOError('could not read image: {}'.format(imgfile))

    small = resize_to_screen(img)
    basename = os.path.basename(imgfile)
    name, _ = os.path.splitext(basename)

    print('loaded', basename, 'with size', imgsize(img), end=' ')
    print('and resized to', imgsize(small))

    if DEBUG_LEVEL >= 3:
        debug_show(name, 0.0, 'original', small)

    pagemask, page_outline = get_page_extents(small)

    cinfo_list = get_contours(name, small, pagemask, 'text')
    spans = assemble_spans(name, small, pagemask, cinfo_list)

    # too few text spans: retry with the line detector and keep whichever
    # attempt produced more spans
    if len(spans) < 3:
        print('  detecting lines because only', len(spans), 'text spans')
        cinfo_list = get_contours(name, small, pagemask, 'line')
        spans2 = assemble_spans(name, small, pagemask, cinfo_list)
        if len(spans2) > len(spans):
            spans = spans2

    if len(spans) < 1:
        print('skipping', name, 'because only', len(spans), 'spans')
        return None

    span_points = sample_spans(small.shape, spans)

    print('  got', len(spans), 'spans', end=' ')
    print('with', sum([len(pts) for pts in span_points]), 'points.')

    corners, ycoords, xcoords = keypoints_from_samples(name, small,
                                                       pagemask,
                                                       page_outline,
                                                       span_points)

    rough_dims, span_counts, params = get_default_params(corners,
                                                         ycoords, xcoords)

    # optimization targets: the first page corner followed by all span samples
    dstpoints = np.vstack((corners[0].reshape((1, 1, 2)),) +
                          tuple(span_points))

    params = optimize_params(name, small,
                             dstpoints,
                             span_counts, params)

    page_dims = get_page_dims(corners, rough_dims, params)

    return remap_image(name, img, small, page_dims, params, output_path)

In [90]:
# Smoke test: dewarp one sample image and print the returned output path
# together with its type.
img_file = "Sample_images/not_bad_at_coding.jpg"
outfile = page_dewarp(img_file)
print(outfile, type(outfile))

loaded not_bad_at_coding.jpg with size 1280x794 and resized to 640x397
  got 12 spans with 147 points.
  initial objective is 0.000983501056451414
  optimizing 167 parameters...
  optimization took 6.84 sec.
  final objective is 0.00025956530201118067
  got page dims 1.7380212152528396 x 1.1293530256691877
  output will be 704x448
threshes/not_bad_at_coding_thresh.png <class 'str'>


## Integrated CTPN

In [3]:
import os
import shutil
import sys
import time

import cv2
import numpy as np
import tensorflow as tf

# Put the current working directory on the import path so the project
# packages `nets` and `utils` imported below can be found.
sys.path.append(os.getcwd())

from nets import model_train as model
from utils.rpn_msr.proposal_layer import proposal_layer
from utils.text_connector.detectors import TextDetector
# Module-level detector instance.  NOTE(review): ctpn() builds its own local
# TextDetector, so this one looks unused in the visible code -- confirm.
textdetector = TextDetector(DETECT_MODE='O')

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [4]:
# Module-level settings for the CTPN stage.
test_data_path = 'text-detection-ctpn/data/demo/'  # not read by the visible code
output_path = 'text-detection-ctpn/data/res/'  # where ctpn() writes its image and .txt output
gpu = '0'  # value ctpn() assigns to CUDA_VISIBLE_DEVICES
checkpoint_path = 'text-detection-ctpn/checkpoints_mlt/'  # directory ctpn() restores the checkpoint from

In [15]:
def resize_image(img):
    """Resize *img* for the CTPN network.

    The image is scaled so that its shorter side becomes 600 px, unless that
    would push the longer side past 1200 px, in which case the longer side
    becomes 1200 px instead.  Both dimensions are then rounded up to a
    multiple of 16 (presumably the network's stride -- confirm).

    Returns (resized_image, (height_ratio, width_ratio)), where each ratio is
    new size / original size.
    """
    img_size = img.shape
    im_size_min = np.min(img_size[0:2])
    im_size_max = np.max(img_size[0:2])

    im_scale = float(600) / float(im_size_min)
    if np.round(im_scale * im_size_max) > 1200:
        im_scale = float(1200) / float(im_size_max)
    new_h = int(img_size[0] * im_scale)
    new_w = int(img_size[1] * im_scale)

    # Round up to the next multiple of 16.  The test must use the remainder
    # (%), not the quotient (//): with // a size that is already a multiple
    # of 16 was needlessly padded by another 16.
    if new_h % 16:
        new_h = (new_h // 16 + 1) * 16
    if new_w % 16:
        new_w = (new_w // 16 + 1) * 16

    re_im = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return re_im, (new_h / img_size[0], new_w / img_size[1])


def ctpn(imgfile):
    """Run CTPN text detection on one image and save the results.

    imgfile -- path of the input image

    Builds the model in the default graph, restores the latest checkpoint
    found in `checkpoint_path`, detects text boxes and writes two files into
    `output_path`: a copy of the image with the boxes drawn on it, and a
    '.txt' file with one 'x1,y1,...,x4,y4,score' line per box.

    Returns (output_img_file, txt_file).  When detection fails the traceback
    is printed and any path not yet produced is returned as an empty string.

    Raises IOError when *imgfile* is blank.
    """
    # local import: the module is not imported at file level
    import traceback

    if imgfile.strip() == "":
        raise IOError("imgfile must not be empty")
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # INFO and WARNING messages are not printed

    output_img_file, txt_file = "", ""

    try:
        with tf.get_default_graph().as_default():
            input_image = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_image')
            input_im_info = tf.placeholder(tf.float32, shape=[None, 3], name='input_im_info')

            global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

            bbox_pred, cls_pred, cls_prob = model.model(input_image)

            variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
            saver = tf.train.Saver(variable_averages.variables_to_restore())

            with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
                ckpt_state = tf.train.get_checkpoint_state(checkpoint_path)
                model_path = os.path.join(checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
                print('Restore from {}'.format(model_path))
                saver.restore(sess, model_path)

                start = time.time()

                # cv2.imread returns None on failure instead of raising
                im = cv2.imread(imgfile)
                if im is None:
                    message = "Error reading image {}!".format(imgfile)
                    print(message)
                    raise IOError(message)
                im = im[:, :, ::-1]  # reverse the channel order

                img, (rh, rw) = resize_image(im)
                h, w, c = img.shape
                im_info = np.array([h, w, c]).reshape([1, 3])
                bbox_pred_val, cls_prob_val = sess.run([bbox_pred, cls_prob],
                                                       feed_dict={input_image: [img],
                                                                  input_im_info: im_info})

                textsegs, _ = proposal_layer(cls_prob_val, bbox_pred_val, im_info)
                scores = textsegs[:, 0]
                textsegs = textsegs[:, 1:5]

                textdetector = TextDetector(DETECT_MODE='O')
                boxes = textdetector.detect(textsegs, scores[:, np.newaxis], img.shape[:2])
                # the np.int alias no longer exists in recent NumPy; the
                # builtin int is the same dtype
                boxes = np.array(boxes, dtype=int)

                cost_time = (time.time() - start)
                print("cost time: {:.2f}s".format(cost_time))

                for box in boxes:
                    cv2.polylines(img, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
                                  thickness=2)

                # Undo the network resize.  fx scales the horizontal axis and
                # fy the vertical one, so they pair with the width and height
                # ratios respectively (the two were swapped before).
                img = cv2.resize(img, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)
                output_img_file = os.path.join(output_path, os.path.basename(imgfile))
                cv2.imwrite(output_img_file, img[:, :, ::-1])

                txt_file = os.path.join(output_path, os.path.splitext(os.path.basename(imgfile))[0]) + ".txt"
                with open(txt_file, "w") as f:
                    for i, box in enumerate(boxes):
                        line = ",".join(str(box[k]) for k in range(8))
                        line += "," + str(scores[i]) + "\r\n"
                        f.write(line)
    except Exception:
        # best effort: report the failure and return whatever was produced
        traceback.print_exc()
    finally:
        # always leave a clean default graph for the next call
        tf.reset_default_graph()

    return output_img_file, txt_file

In [16]:
tf.__version__

'1.15.0'

In [39]:
# Smoke test: run CTPN on one thresholded image; `fi` receives the annotated
# image path and `ftxt` the path of the box list.
fi, ftxt = ctpn("text-detection-ctpn/data/demo/hadalabo_thresh.png")

Restore from text-detection-ctpn/checkpoints_mlt/ctpn_50000.ckpt
INFO:tensorflow:Restoring parameters from text-detection-ctpn/checkpoints_mlt/ctpn_50000.ckpt
cost time: 2.40s


## OCR Post-processing
This part covers everything after the dewarp and CTPN steps; it is taken from Hien's work.

In [1]:
def get_bounding_box(txt, pad=3):
    """
    Read detected text boxes from a text file and pad each of them.

    Every non-blank line is expected to hold comma-separated integer
    coordinates followed by one trailing field (the detection score written
    by the detection step), which is discarded.

    @param txt: path of the annotation file to read
    @param pad: number of pixels added to / subtracted from each coordinate
                (default 3)
    @return: list of padded boxes (each a list of ints), sorted by their
             second coordinate
    """
    # Coordinates at these positions are decreased by `pad`, all others are
    # increased.
    # NOTE(review): this grows the box only if the eight values are
    # x0,y0,...,x3,y3 with corners ordered top-left, top-right, bottom-right,
    # bottom-left -- confirm against the detector output.
    decreased = (0, 1, 3, 6)

    new_boxes = []
    with open(txt, "r") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Blank lines carry no box; skipping them also means a file
                # without a trailing newline keeps its last box.
                continue
            coords = line.split(',')[:-1]  # drop the trailing score field
            new_boxes.append([
                int(value) - pad if idx in decreased else int(value) + pad
                for idx, value in enumerate(coords)
            ])

    new_boxes.sort(key=lambda padded: padded[1])

    return new_boxes

In [8]:
def clean_string(string):
    """
    Clean raw OCR text before it is split into ingredient names.

    @param string: raw text produced by the OCR step
    @return: the cleaned text with stopwords removed and tokens joined by
             single spaces
    """
    text = string.replace('INACTIVE INGREDIENTS:', '')
    text = text.replace('ACTIVE INGREDIENTS:', '')

    # Keep the segment between the first and the second colon.  When no colon
    # is left (e.g. the labels removed above were the only ones) fall back to
    # the whole text instead of raising IndexError.
    # NOTE(review): anything after a second colon is discarded -- confirm
    # this is intended.
    parts = text.split(':')
    text = parts[1] if len(parts) > 1 else parts[0]

    # Drop pipe, asterisk, underscore, quote and ampersand characters.
    text = re.sub("[|*_'\"&]", "", text)
    text = re.sub(",, ", ", ", text)
    # NOTE(review): this pattern matches a literal backslash followed by one
    # or more capital "S" characters, exactly as the original literal did;
    # backslash + non-whitespace was possibly intended -- confirm.
    text = re.sub(r'\\S+', " ", text)
    text = re.sub(r'\.', " ", text)

    # Build the stopword set once instead of re-reading the corpus for every
    # token (no language is passed, so the full NLTK list is used).
    stop_words = set(stopwords.words())
    kept_tokens = [w for w in word_tokenize(text) if w not in stop_words]

    return ' '.join(kept_tokens).strip()

def string_to_list(text):
    """
    Split an ingredient string into a list of normalised names.

    Pipe, asterisk, underscore and quote characters are removed first; the
    remainder is split on every comma or full stop and each piece is passed
    through remove_water().

    @param text: ingredient text to split
    @return: list with one entry per comma/full-stop separated piece
    """
    junk_chars = re.compile(r"[\|\*\_'\"]")
    without_junk = junk_chars.sub("", text)
    pieces = re.split("[,.]", without_junk)

    return list(map(remove_water, pieces))

In [7]:
def remove_water(string):
    """
    Upper-case an ingredient name and map the known water aliases to 'WATER'.

    Surrounding whitespace is removed before the alias comparison, so that a
    piece such as ' Aqua' (left over from splitting on ', ') is recognised.

    @param string: ingredient name
    @return: 'WATER' for a known alias, otherwise the upper-cased, stripped
             name
    """
    water = ['WATER (AQUA)', 'AQUA', 'EAU', 'AQUA/WATER/EAU', 'AQUA / WATER / EAU',
             'PURIFIED WATER', 'DISTILLED WATER', 'D.I. WATER', 'AQUA (WATER)', 'AQUA (PURIFIED)']
    text = string.upper().strip()
    if text in water:
        text = 'WATER'

    return text

In [36]:
def crop_line(img_path, box):
    """
    Cut one detected text line out of an image and straighten it.

    @param img_path: path of the image, read with cv2.imread
    @param box: sequence whose first eight items are the corner coordinates
                x0, y0, x1, y1, x2, y2, x3, y3 of the text line
    @return: the warped crop; it is transposed and flipped when the
             rectangle angle is below -45
    @raise IOError: if the image cannot be read
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None; fail here with a
        # clear message instead of somewhere inside resize_image.
        raise IOError("could not read image: {}".format(img_path))
    # resize_image also returns a pair of scale factors, unused here.
    img, _ = resize_image(img)

    corners = np.array([
        [[box[0], box[1]]],
        [[box[2], box[3]]],
        [[box[4], box[5]]],
        [[box[6], box[7]]],
    ])
    rect = cv2.minAreaRect(corners)

    # the order of the box points: bottom left, top left, top right,
    # bottom right
    # np.intp is the type the removed np.int0 alias referred to, so the
    # resulting dtype is unchanged.
    rect_pts = cv2.boxPoints(rect).astype(np.intp)

    # NOTE(review): this draws the rectangle outline onto the image that is
    # cropped below, so the outline can end up inside the crop -- confirm it
    # is intended and not leftover debugging.
    cv2.drawContours(img, [rect_pts], 0, (0, 0, 255), 2)

    # get width and height of the detected rectangle
    width = int(rect[1][0])
    height = int(rect[1][1])
    angle = rect[2]

    src_pts = rect_pts.astype("float32")
    # coordinate of the points in box points after the rectangle has been
    # straightened
    # NOTE(review): the target is 2 px taller than the output size passed to
    # warpPerspective below -- confirm this is intended.
    dst_pts = np.array([[0, height + 2],
                        [0, 0],
                        [width, 0],
                        [width, height + 2]], dtype="float32")

    # the perspective transformation matrix
    M = cv2.getPerspectiveTransform(src_pts, dst_pts)

    # directly warp the rotated rectangle to get the straightened rectangle
    warped = cv2.warpPerspective(img, M, (width, height))

    # NOTE(review): this test presumes minAreaRect can report angles below
    # -45; the angle convention may differ between OpenCV releases -- verify
    # against the installed version.
    if angle < -45:
        warped = np.transpose(warped, (1, 0, 2))
        warped = warped[::-1]

    return warped

In [37]:
def ocr(img, oem=3, psm=6):
    """
    Run Tesseract on an image and return the recognised text.

    @param img: The image to be OCR'd
    @param oem: value for Tesseract's --oem option (default 3)
    @param psm: value for Tesseract's --psm option (default 6)
    @return: the recognised text, or "" when OCR fails
    """
    config = '-l eng --oem {oem} --psm {psm}'.format(oem=oem, psm=psm)

    try:
        return pytesseract.image_to_string(img, config=config)
    except Exception:
        # Best effort: a failed line yields no text.  Only Exception is
        # caught so that Ctrl-C still interrupts the run, and the traceback
        # is printed so that failures are visible.
        import traceback
        traceback.print_exc()

        return ""

In [81]:
# get grayscale image
def get_grayscale(image):
    """Convert a BGR image to grayscale with cv2.cvtColor."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray

# noise removal
def remove_noise(image):
    """Blur the image with a 5x5 Gaussian kernel (sigma argument 0)."""
    kernel_size = (5, 5)
    blurred = cv2.GaussianBlur(image, kernel_size, 0)
    return blurred

# thresholding
def thresholding(image):
    """Apply a mean adaptive binary threshold (max 255, block size 7, constant 4)."""
    max_value = 255
    block_size = 7
    constant = 4
    return cv2.adaptiveThreshold(image, max_value, cv2.ADAPTIVE_THRESH_MEAN_C,
                                 cv2.THRESH_BINARY, block_size, constant)

def preprocess_for_ocr(img, enhance=1):
    """
    Prepare an image for OCR: optional contrast boost, then grayscale,
    blur and adaptive threshold.

    @param img: image to which the pre-processing steps are applied
    @param enhance: contrast factor; the contrast step only runs when this
                    is greater than 1
    @return: the thresholded image converted back with COLOR_GRAY2BGR
    """
    if enhance > 1:
        # The contrast step works on a PIL image, so convert there and back.
        pil_img = Image.fromarray(img)
        boosted = ImageEnhance.Contrast(pil_img).enhance(enhance)
        img = np.asarray(boosted)

    binary = thresholding(remove_noise(get_grayscale(img)))

    return cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)

In [80]:
# Fuzzydict

from fuzzywuzzy import fuzz 

class FuzzyDict(dict):
    "Provides a dictionary that performs fuzzy lookup"
    def __init__(self, items = None, cutoff = .6):
        """Construct a new FuzzyDict instance

        items is a dictionary to copy items from (optional)
        cutoff is the match ratio below which matches should not be considered
        cutoff needs to be a float between 0 and 1 (where zero is no match
        and 1 is a perfect match)"""
        super(FuzzyDict, self).__init__()

        if items:
            self.update(items)
        self.cutoff = cutoff

    # Short wrappers around the plain dict behaviour.  They are real methods
    # rather than lambdas stored on the instance: the instance __dict__ then
    # only holds plain data, so the object can be pickled, and a copy never
    # keeps a reference to the instance it was copied from.
    def _dict_contains(self, key):
        "Exact (non-fuzzy) membership test"
        return super(FuzzyDict, self).__contains__(key)

    def _dict_getitem(self, key):
        "Exact (non-fuzzy) item lookup"
        return super(FuzzyDict, self).__getitem__(key)

    def _search(self, lookfor, stop_on_first = False):
        """Find the key that best matches lookfor

        Returns a tuple (matched, key, value, ratio) where matched tells
        whether ratio reached the cutoff.  An exact key is returned
        immediately with ratio 1.

        if stop_on_first is True then the method returns as soon
        as it finds the first key whose ratio reaches the cutoff
        """

        # if the item is in the dictionary then just return it
        if self._dict_contains(lookfor):
            return True, lookfor, self._dict_getitem(lookfor), 1

        # test each key in the dictionary
        best_ratio = 0
        best_match = None
        best_key = None
        for key in self:

            # if the current key is not a string
            # then we just skip it
            if not isinstance(key, str):
                continue

            # we get an error here if the item to look for is not a
            # string - if it cannot be fuzzy matched and we are here
            # then it is definitely not in the dictionary
            try:
                # calculate the match value, scaled by 1/100 so that it is
                # comparable with cutoff
                ratio = fuzz.ratio(lookfor, key) / 100.0
            except TypeError:
                break

            # if this is the best ratio so far - save it and the value
            if ratio > best_ratio:
                best_ratio = ratio
                best_key = key
                best_match = self._dict_getitem(key)

            if stop_on_first and ratio >= self.cutoff:
                break

        return (
            best_ratio >= self.cutoff,
            best_key,
            best_match,
            best_ratio)


    def __contains__(self, item):
        "Overrides dict __contains__ to use fuzzy matching"
        return bool(self._search(item, True)[0])

    def __getitem__(self, lookfor):
        "Overrides dict __getitem__ to use fuzzy matching"
        matched, key, item, ratio = self._search(lookfor)

        if not matched:
            raise KeyError(
                "'%s'. closest match: '%s' with ratio %.3f"%
                    (str(lookfor), str(key), ratio))

        return item

In [53]:
def fuzzy_match_ingredients(ing_list, fuzdict):
    """
    Map every distinct ingredient to its entry in fuzdict.

    The upper-cased ingredient is looked up once; a KeyError means no entry
    was found.  With a FuzzyDict every membership test or lookup scans all
    keys, so a single lookup avoids doing that scan twice per ingredient.

    @param ing_list: iterable of ingredient names
    @param fuzdict: mapping to look the upper-cased names up in
    @return: dict from each ingredient (as given) to the value found, or to
             'unknown' when the lookup fails
    """
    match_dict = {}
    for ing in tqdm(ing_list):
        if ing in match_dict:
            continue
        upper_ing = ing.upper()
        try:
            match_dict[ing] = fuzdict[upper_ing]
        except KeyError:
            match_dict[ing] = 'unknown'

    return match_dict

In [52]:
def create_dict_english(df_inci, df_cosing):
    """
    Build per-ingredient lookup dictionaries from the two data frames.

    The 'ingredient_name' of every row is split on '/', and each part, with
    surrounding whitespace removed, becomes a key.  CosIng names are stripped
    the same way as INCI names, so that 'A/ B' yields the key 'B' and not
    ' B'.  When a name occurs in several rows the last row wins.

    @param df_inci: frame with the columns ingredient_name, rating,
                    irritancy, comedogenicity, functions, quick_facts and
                    description
    @param df_cosing: frame with the columns ingredient_name, description
                      and functions
    @return: tuple (rating_inci, irritancy_inci, comedogenicity_inci,
             function_inci, qfacts_inci, desc_inci, desc_cosing,
             function_cosing)
    """
    rating_inci = {}
    irritancy_inci = {}
    comedogenicity_inci = {}
    function_inci = {}
    qfacts_inci = {}
    desc_inci = {}

    desc_cosing = {}
    function_cosing = {}

    for _, row in tqdm(df_inci.iterrows()):
        for name in row['ingredient_name'].split('/'):
            chem_name = name.strip()
            rating_inci[chem_name] = row['rating']
            irritancy_inci[chem_name] = row['irritancy']
            comedogenicity_inci[chem_name] = row['comedogenicity']
            function_inci[chem_name] = row['functions']
            qfacts_inci[chem_name] = row['quick_facts']
            desc_inci[chem_name] = row['description']

    for _, row in tqdm(df_cosing.iterrows()):
        for name in row['ingredient_name'].split('/'):
            chem_name = name.strip()
            desc_cosing[chem_name] = row['description']
            function_cosing[chem_name] = row['functions']

    return (rating_inci, irritancy_inci, comedogenicity_inci, function_inci,
            qfacts_inci, desc_inci, desc_cosing, function_cosing)

In [12]:
def lookup_all_english(ingredient_list, match_dict_inci, match_dict_cosing,
               df_inci, df_cosing, option=''):
    """
    Collect the English information for every matched ingredient.

    The lookup tables are read from the eng_*.pickle files in the current
    working directory.  An ingredient without an INCI match falls back to
    its CosIng match; ingredients matched in neither are left out.

    @param ingredient_list: ingredient names to look up
    @param match_dict_inci: ingredient -> matched INCI name or 'unknown'
    @param match_dict_cosing: ingredient -> matched CosIng name or 'unknown'
    @param df_inci: unused, kept for interface compatibility
    @param df_cosing: unused, kept for interface compatibility
    @param option: 'ingredient', 'rating', 'irritancy', 'comedogenicity',
                   'functions', 'quickfacts' or 'description' to get only
                   that column; any other value returns all columns
    @return: pandas DataFrame with one row per matched ingredient
    """
    import pickle  # local import keeps this cell self-contained

    def load(path):
        # NOTE: pickle.load can execute arbitrary code; only load files
        # that were produced by this project.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    rating_inci = load('eng_rating_inci.pickle')
    irritancy_inci = load('eng_irritancy_inci.pickle')
    comedogenicity_inci = load('eng_comedogenicity_inci.pickle')
    function_inci = load('eng_function_inci.pickle')
    qfacts_inci = load('eng_qfacts_inci.pickle')
    desc_inci = load('eng_desc_inci.pickle')
    desc_cosing = load('eng_desc_cosing.pickle')
    function_cosing = load('eng_function_cosing.pickle')

    columns = ['Ingredient_name', 'Functions', 'Rating', 'Irritancy',
               'Comedogenicity', 'Quick_facts', 'Description']
    option_columns = {
        'ingredient': 'Ingredient_name',
        'rating': 'Rating',
        'irritancy': 'Irritancy',
        'comedogenicity': 'Comedogenicity',
        'functions': 'Functions',
        'quickfacts': 'Quick_facts',
        'description': 'Description',
    }

    rows = []

    for item in tqdm(ingredient_list):

        value = match_dict_inci.get(item, 'unknown')
        if value == 'unknown':
            # No INCI match: only CosIng functions/description are available.
            key = match_dict_cosing.get(item, 'unknown')
            rating = 'No rating'
            irritancy = np.nan
            comedogenicity = np.nan
            functions = function_cosing.get(key, [])
            quickfacts = np.nan
            description = desc_cosing.get(key, [])
        else:
            key = value
            rating = rating_inci.get(key, 'No rating')
            irritancy = irritancy_inci.get(key, np.nan)
            comedogenicity = comedogenicity_inci.get(key, np.nan)
            functions = function_inci.get(key, [])
            quickfacts = qfacts_inci.get(key, [])
            description = desc_inci.get(key, [])

        if key != 'unknown':
            rows.append([key, functions, rating, irritancy, comedogenicity,
                         quickfacts, description])

    # Always build the full table and narrow it afterwards: a flat list of
    # single values does not fit the seven column names.
    df_res = pd.DataFrame(rows, columns=columns)
    if option in option_columns:
        df_res = df_res[[option_columns[option]]]

    return df_res

In [None]:
def lookup_all_vietnamese(ingredient_list, match_dict_cmd, match_dict_cosing,
               df_cmd, df_cosing, option=''):
    """
    Collect the Vietnamese information for every matched ingredient.

    The lookup tables are read from the vie_*.pickle and eng_*_cosing.pickle
    files in the current working directory.  An ingredient without a match
    in match_dict_cmd falls back to its CosIng match; ingredients matched in
    neither are left out.

    @param ingredient_list: ingredient names to look up
    @param match_dict_cmd: ingredient -> matched name or 'unknown'
    @param match_dict_cosing: ingredient -> matched CosIng name or 'unknown'
    @param df_cmd: unused, kept for interface compatibility
    @param df_cosing: unused, kept for interface compatibility
    @param option: 'ingredient', 'rating_score', 'functions' or
                   'description' to get only that column; any other value
                   returns all columns
    @return: pandas DataFrame with one row per matched ingredient
    """
    import pickle  # local import keeps this cell self-contained

    def load(path):
        # NOTE: pickle.load can execute arbitrary code; only load files
        # that were produced by this project.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    ratingscore_cmd = load('vie_ratingscore_cmd.pickle')
    function_cmd = load('vie_function_cmd.pickle')
    desc_cmd = load('vie_desc_cmd.pickle')

    desc_cosing = load('eng_desc_cosing.pickle')
    function_cosing = load('eng_function_cosing.pickle')

    columns = ['Ingredient_name', 'Rating_score', 'Functions', 'Description']
    option_columns = {
        'ingredient': 'Ingredient_name',
        'rating_score': 'Rating_score',
        'functions': 'Functions',
        'description': 'Description',
    }

    rows = []

    for item in tqdm(ingredient_list):

        value = match_dict_cmd.get(item, 'unknown')

        if value == 'unknown':
            # No match in the Vietnamese data: fall back to CosIng.
            key = match_dict_cosing.get(item, 'unknown')
            rating_score = 'Chưa đánh giá'
            functions = function_cosing.get(key, [])
            description = desc_cosing.get(key, [])
        else:
            key = value
            rating_score = ratingscore_cmd.get(key, np.nan)
            functions = function_cmd.get(key, [])
            description = desc_cmd.get(key, [])

        if key != 'unknown':
            rows.append([key, rating_score, functions, description])

    # Always build the full table and narrow it afterwards: a flat list of
    # single values does not fit the four column names.
    df_res = pd.DataFrame(rows, columns=columns)
    if option in option_columns:
        df_res = df_res[[option_columns[option]]]

    return df_res

In [13]:
def ocr_everything(img_path, boundingtxt_file, inci_path, cmd_path, cosing_path, language, debug=False):
    """
    OCR every detected text line of an image and look the ingredients up.

    @param img_path: path of the image to OCR
    @param boundingtxt_file: path of the text file with the detected boxes
    @param inci_path: CSV read when language is not 'Vietnamese'
    @param cmd_path: CSV read when language is 'Vietnamese'
    @param cosing_path: CosIng CSV, always read
    @param language: 'Vietnamese' selects the Vietnamese lookup, any other
                     value the English one
    @param debug: print intermediate results when True
    @return: tuple (df_res, model_input): the lookup result frame and a
             one-element list holding the list of CosIng match values
    """
    boxes = get_bounding_box(boundingtxt_file)

    # doing OCR: crop_line reads the image itself, one crop per box
    recognised = []
    for box in boxes:
        cropped = crop_line(img_path, box)
        # strip() removes all surrounding whitespace, including newlines
        # and form feeds
        recognised.append(str(ocr(cropped).strip()))
    # every piece is preceded by a single space, as before
    text = ''.join(' ' + piece for piece in recognised)

    if debug:
        print(text)

    # Cleaning result from OCR
    text_result = clean_string(text)
    ing_list = string_to_list(text_result)

    if debug:
        print("-----")
        print(text_result)

    # Loading ingredient dataframe
    df_cosing = pd.read_csv(cosing_path)
    cosing_dict = {name.strip(): name.strip() for name in df_cosing['ingredient_name']}
    fd_cosing = FuzzyDict(cosing_dict, cutoff = .6)
    match_dict_cosing = fuzzy_match_ingredients(ing_list, fd_cosing)

    # Input for later models: KNN and randomforest
    model_input = [list(match_dict_cosing.values())]

    # fd main: language-specific database, matched with a stricter cutoff
    if language == 'Vietnamese':
        df_cmd = pd.read_csv(cmd_path)
        cmd_dict = {name.strip(): name.strip() for name in df_cmd['ingredient_name']}
        fd_cmd = FuzzyDict(cmd_dict, cutoff = .7)
        match_dict_fuzzy = fuzzy_match_ingredients(ing_list, fd_cmd)

    else:
        df_inci = pd.read_csv(inci_path)
        inci_dict = {name.strip(): name.strip() for name in df_inci['ingredient_name']}
        fd_inci = FuzzyDict(inci_dict, cutoff = .7)
        match_dict_fuzzy = fuzzy_match_ingredients(ing_list, fd_inci)

    if debug:
        print(match_dict_fuzzy)
        print(list(match_dict_fuzzy.values()))
        print("length match_dict_fuzzy", len(match_dict_fuzzy))
        print("length match_dict_extra", len(match_dict_cosing))

    # Analyzing ingredient
    if language == 'Vietnamese':
        df_res = lookup_all_vietnamese(ing_list, match_dict_fuzzy, match_dict_cosing, df_cmd, df_cosing)

    else:
        df_res = lookup_all_english(ing_list, match_dict_fuzzy, match_dict_cosing, df_inci, df_cosing)

    return df_res, model_input

In [59]:
# ocr_everything takes (img_path, boundingtxt_file, inci_path, cmd_path,
# cosing_path, language, debug) and returns (df_res, model_input).  The
# English lookup does not read the cmd_path database, so None is passed.
resdf, model_input = ocr_everything(fi, ftxt,
                                    'Database/INCI/ingredient_inci_1570.csv',
                                    None,
                                    'Database/ingredient_cosing_37309.csv',
                                    'English',
                                    debug=True)

 “co LOW ifritation. Free of fragrances, Ween and mineral oil Yirections: Wet your palma and squeeza a smal amount « ito palm. Lather with water and massaga gently onto fice. Rinse thoroughly with water, ingredients: Water, Laurie Acid. Glycerin, Stearic Acid, Tocamidopropyl Belaine/ Water, Potassium Hydroxide, Butylene Glyool, Paimitie Acid, Acriates Copolymer / Water, Glycol Distearate, Polyquaternium-7, Sodium Laureth Sulfate, Cocamida DEA, Potassium Cocov’ Glycinate f Water, Stearic Acid / Disteardimonium Hectorite, Disodium EDTA, Mathyllsothiazolinone / lodopropyny! Butyicarbamate / Water, Glycerin / Water / Epiloblum Fleischet Eeract / Citic Acid Sodium Hyaluronate D Mported by-_| Dimport oleh: “eK Pharma Pte Lid Rohto-Mantholatum (Mf) Sdn Shd jg OL North Way #0101 (54691-L) Sa%i9 AGA ZN. ‘Unt 9-1, Level 9, Wisma 7 “S48 nina    69200 Kuala Lump Net te eae Ub8nBed by Rol lutica! Co, Jae? if [ie PTvErrnisce WAG manitachc| Ratu (China) 


  0%|          | 0/27 [00:00<?, ?it/s]

-----
“ co LOW ifritation . Free fragrances , Ween mineral oil Yirections : Wet palma squeeza smal amount « ito palm . Lather water massaga gently onto fice . Rinse thoroughly water , ingredients : Water , Laurie Acid . Glycerin , Stearic Acid , Tocamidopropyl Belaine/ Water , Potassium Hydroxide , Butylene Glyool , Paimitie Acid , Acriates Copolymer / Water , Glycol Distearate , Polyquaternium-7 , Sodium Laureth Sulfate , Cocamida DEA , Potassium Cocov ’ Glycinate f Water , Stearic Acid / Disteardimonium Hectorite , Disodium EDTA , Mathyllsothiazolinone / lodopropyny ! Butyicarbamate / Water , Glycerin / Water / Epiloblum Fleischet Eeract / Citic Acid Sodium Hyaluronate D Mported by- Dimport : “ eK Pharma Pte Lid Rohto-Mantholatum ( Mf ) Sdn Shd jg OL North Way # 0101 ( 54691-L ) Sa % i9 AGA ZN . ‘ Unt 9-1 , Level 9 , Wisma 7 “ S48 nina 69200 Kuala Lump Net eae Ub8nBed Rol lutica ! Co , Jae ? [ ie PTvErrnisce WAG manitachc Ratu ( China )


100%|██████████| 27/27 [00:00<00:00, 171.70it/s]
  9%|▉         | 1/11 [00:00<00:01,  8.49it/s]

{'“ co low ifritation': 'unknown', 'free fragrances': 'FRAGRANCE', 'ween mineral oil yirections : wet palma squeeza smal amount « ito palm': 'unknown', 'lather water massaga gently onto fice': 'unknown', 'rinse thoroughly water': 'unknown', 'ingredients : water': 'unknown', 'laurie acid': 'LAURIC ACID', 'glycerin': 'GLYCERIN', 'stearic acid': 'STEARIC ACID', 'tocamidopropyl belaine/ water': 'COCAMIDOPROPYL BETAINE', 'potassium hydroxide': 'POTASSIUM HYDROXIDE', 'butylene glyool': 'BUTYLENE GLYCOL', 'paimitie acid': 'PALMITIC ACID', 'acriates copolymer / water': 'ACRYLATES COPOLYMER', 'glycol distearate': 'GLYCOL DISTEARATE', 'polyquaternium-7': 'POLYQUATERNIUM-37', 'sodium laureth sulfate': 'SODIUM LAURETH SULFATE', 'cocamida dea': 'COCAMIDE DEA', 'potassium cocov ’ glycinate f water': 'POTASSIUM COCOYL GLYCINATE', 'stearic acid / disteardimonium hectorite': 'DISTEARDIMONIUM HECTORITE', 'disodium edta': 'DISODIUM EDTA', 'mathyllsothiazolinone / lodopropyny ! butyicarbamate / water': 'u

100%|██████████| 11/11 [00:01<00:00,  5.73it/s]
1531it [00:00, 7805.48it/s]

length match_dict_fuzzy 27
length match_dict_extra 11


1570it [00:00, 7509.74it/s]
37309it [00:03, 11041.70it/s]
100%|██████████| 27/27 [00:00<00:00, 59322.27it/s]


In [60]:
# Display the result of the ocr_everything call above.
resdf

Unnamed: 0,Ingredient_name,Functions,Rating,Irritancy,Comedogenicity,Quick_facts,Description
0,unknown,[],unknown,unknown,unknown,unknown,[]
1,FRAGRANCE,{'perfuming': '/ingredient-functions/perfuming'},icky,,,,['Exactly what it sounds: nice smelling stuff ...
2,unknown,[],unknown,unknown,unknown,unknown,[]
3,unknown,[],unknown,unknown,unknown,unknown,[]
4,unknown,[],unknown,unknown,unknown,unknown,[]
5,ALPINIA URAIENSIS LEAF WATER,"FRAGRANCE, HUMECTANT",unknown,unknown,unknown,unknown,Alpinia Uraiensis Leaf Water is the aqueous so...
6,LAURIC ACID,{'anti-acne': '/ingredient-functions/anti-acne...,goodie,1,4,,"[""A 12 carbon length fatty acid that can be fo..."
7,GLYCERIN,{'skin-identical ingredient': '/ingredient-fun...,superstar,0,0,['A natural moisturizer that’s also in our ski...,['Glycerin doesn’t sound very glamorous but it...
8,STEARIC ACID,{'emollient': '/ingredient-functions/emollient...,No rating,0,2020-02-03 00:00:00,,['A common multi-tasker fatty acid. It makes y...
9,COCAMIDOPROPYL BETAINE,{'surfactant/cleansing': '/ingredient-function...,No rating,,,,['Super common ingredient in all kinds of clea...


In [61]:
def end_to_end(imgfile, inci_path, cosing_path, debug=False, cmd_path=None, language='English'):
    """
    Full pipeline: dewarp the page, detect text lines, then OCR and look up.

    @param imgfile: path of the input image
    @param inci_path: CSV used for the English lookup
    @param cosing_path: CosIng CSV
    @param debug: print intermediate results when True
    @param cmd_path: CSV used when language is 'Vietnamese' (default None)
    @param language: passed to ocr_everything; 'Vietnamese' selects the
                     Vietnamese lookup, anything else the English one
    @return: whatever ocr_everything returns
    """
    # presumably page_dewarp returns the path of the dewarped image -- it is
    # passed on as an image path below; verify.
    dewarped_img = page_dewarp(imgfile)
    _, ctpn_txt = ctpn(dewarped_img)
    # ocr_everything expects cmd_path before cosing_path and requires a
    # language; debug is passed by keyword so it cannot shift position.
    return ocr_everything(dewarped_img, ctpn_txt, inci_path, cmd_path,
                          cosing_path, language, debug=debug)


In [77]:
def e2e_no_dewarp(imgfile, inci_path, cosing_path, debug=False, cmd_path=None, language='English'):
    """
    Pipeline without dewarping: text lines are detected on a pre-processed
    copy of the image, then the original image is OCR'd with those boxes.

    The pre-processed copy is written to the "preprocessed" folder under the
    same file name as the input.

    @param imgfile: path of the input image
    @param inci_path: CSV used for the English lookup
    @param cosing_path: CosIng CSV
    @param debug: print intermediate results when True
    @param cmd_path: CSV used when language is 'Vietnamese' (default None)
    @param language: passed to ocr_everything; 'Vietnamese' selects the
                     Vietnamese lookup, anything else the English one
    @return: whatever ocr_everything returns
    """
    img = cv2.imread(imgfile)
    processed_img = preprocess_for_ocr(img)

    basename = os.path.basename(imgfile)
    output_folder = "preprocessed"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    output_pp = os.path.join(output_folder, basename)
    cv2.imwrite(output_pp, processed_img)
    _, ctpn_txt = ctpn(output_pp)
    # ocr_everything expects cmd_path before cosing_path and requires a
    # language; debug is passed by keyword so it cannot shift position.
    return ocr_everything(imgfile, ctpn_txt, inci_path, cmd_path,
                          cosing_path, language, debug=debug)

In [74]:
# Run the full pipeline (with dewarping) on a sample image, printing the
# intermediate results.
test_res = end_to_end("Sample_images/not_bad_at_coding.jpg", 
                      'Database/INCI/ingredient_inci_1570.csv', 
                      'Database/ingredient_cosing_37309.csv', 
                      debug=True)

loaded not_bad_at_coding.jpg with size 1280x794 and resized to 640x397
  got 12 spans with 147 points.
  initial objective is 0.000983501056451414
  optimizing 167 parameters...
  optimization took 7.03 sec.
  final objective is 0.00025956530201118067
  got page dims 1.7380212152528396 x 1.1293530256691877
  output will be 704x448
Restore from text-detection-ctpn/checkpoints_mlt/ctpn_50000.ckpt
INFO:tensorflow:Restoring parameters from text-detection-ctpn/checkpoints_mlt/ctpn_50000.ckpt
cost time: 2.66s
 AVIS UU UIING, UU PU OTT, PUTT TTT a ment. Nourrie e adoucle, votre peou retrouve son éciot naturel Tes dernatologiquenen. Ke pas appliquer sur i¢ visage, 4QLTRA DOUX S'ENGAGE POUR VOUS, Bean ct Wa MUM AU CC MY Wasa 0 ie Ultra Doux s’ engage 4 raspecier une chase de qualite al de bienveilkance. NOUS AVONS frovoilie CCTVEMeN! How vous ¢ Offtis des formules bonnes pour votre peau et de plus en plus respectueuses de lo planéta, an associan! notomment dé ia stveérine vegetole of Gu beurre 

  0%|          | 0/29 [00:00<?, ?it/s]

-----
AVIS UU UIING , UU PU OTT , PUTT TTT ment . Nourrie adoucle , peou retrouve éciot naturel Tes dernatologiquenen . Ke appliquer i¢ visage , 4QLTRA DOUX SENGAGE POUR VOUS , Bean ct Wa MUM AU CC MY Wasa 0 ie Ultra Doux ’ engage 4 raspecier chase qualite bienveilkance . NOUS AVONS frovoilie CCTVEMeN ! How ¢ Offtis formules bonnes peau plus plus respectueuses planéta , associan ! notomment dé stveérine vegetole Gu beurre karilé issu dun comme : équitable , INPRITUEL DE SOIN ADOUCISSANT Déecouvrez Gussi Note Douche Soin nourrissante ef assoulissanie , fe Lott Coco 0d Nok Mocodomid , peau nadie foul couceur . CTS 5 ~ INGREDIENTS : AIA ! WATER , GLYCERIN , PARUFEINUM LIDUIDUM / MURAL OF GUTYROOPO IMUM PAID UTTER ( SHEA SUTTER CETEARYL ALCOHOL DIMETHICOME CARRY YL GLICK CARMAN CARROREA 0 10709 / IRTL aor CCERA EMIT EXTRACT COCOM IT FRIIT XTRAS GIYCHEM VIP AD , ( OCIS Wi STEAMATE , MACAD # MiR TERNIFOLIA SEED CHL , MYSTIC ACID , PALMITIC AGED , Z28RO01 FEG-i00 STEARATE . “ HEMONY ETHANOL P

100%|██████████| 29/29 [00:00<00:00, 124.55it/s]
  5%|▍         | 1/21 [00:00<00:02,  8.08it/s]

{'avis uu uiing': 'unknown', 'uu pu ott': 'unknown', 'putt ttt ment': 'unknown', 'nourrie adoucle': 'unknown', 'peou retrouve éciot naturel tes dernatologiquenen': 'unknown', 'ke appliquer i¢ visage': 'unknown', '4qltra doux sengage pour vous': 'unknown', 'bean ct wa mum au cc my wasa 0 ie ultra doux ’ engage 4 raspecier chase qualite bienveilkance': 'unknown', 'nous avons frovoilie cctvemen ! how ¢ offtis formules bonnes peau plus plus respectueuses planéta': 'unknown', 'associan ! notomment dé stveérine vegetole gu beurre karilé issu dun comme : équitable': 'unknown', 'inprituel de soin adoucissant déecouvrez gussi note douche soin nourrissante ef assoulissanie': 'unknown', 'fe lott coco 0d nok mocodomid': 'unknown', 'peau nadie foul couceur': 'unknown', 'cts 5 ~ ingredients : aia ! water': 'unknown', 'glycerin': 'GLYCERIN', 'parufeinum liduidum / mural of gutyroopo imum paid utter ( shea sutter cetearyl alcohol dimethicome carry yl glick carman carrorea 0 10709 / irtl aor ccera emit

100%|██████████| 21/21 [00:03<00:00,  6.17it/s]
692it [00:00, 6909.50it/s]

length match_dict_fuzzy 29
length match_dict_extra 21


1570it [00:00, 6784.75it/s]
37309it [00:03, 9831.50it/s] 
100%|██████████| 29/29 [00:00<00:00, 75549.58it/s]


In [75]:
# Display what end_to_end returned for the sample image.
test_res

Unnamed: 0,Ingredient_name,Functions,Rating,Irritancy,Comedogenicity,Quick_facts,Description
0,GLYCERIN,{'skin-identical ingredient': '/ingredient-fun...,superstar,0,0,['A natural moisturizer that’s also in our ski...,['Glycerin doesn’t sound very glamorous but it...
1,ISOSTEARATE,"HUMECTANT, SKIN CONDITIONING",unknown,unknown,unknown,unknown,"Fatty acids, C18-unsatd., dimers, hydrogenated..."
2,MACADAMIA TERNIFOLIA SEED OIL,{'emollient': '/ingredient-functions/emollient'},goodie,,,,"[""The golden yellow oil coming from the Macada..."
3,MYRISTIC ACID,{'surfactant/cleansing': '/ingredient-function...,No rating,0,3,,"[""A 14 carbon length fatty acid that can be na..."
4,PALMITIC ACID,{'skin-identical ingredient': '/ingredient-fun...,No rating,0,2,,"[""A fatty acid that can be found naturally in ..."
5,PEG-100 STEARATE,{'surfactant/cleansing': '/ingredient-function...,No rating,0,0,,['A very common water-loving surfactant and em...
6,POTASSIUM SORBATE,"FRAGRANCE, PRESERVATIVE",unknown,unknown,unknown,unknown,"Potassium (E,E)-hexa-2,4-dienoate"
7,PRUNUS AMYGDALUS DULCIS OIL,{'emollient': '/ingredient-functions/emollient'},goodie,0,2020-01-03 00:00:00,,['The emollient plant oil that comes from almo...
8,SWEET ALMOND OIL,"EMULSION STABILISING, HAIR CONDITIONING, HUMEC...",unknown,unknown,unknown,unknown,Pseudozyma Epicola/Soybean Flour/Apricot Kerne...
9,STEARIC ACID,{'emollient': '/ingredient-functions/emollient...,No rating,0,2020-02-03 00:00:00,,['A common multi-tasker fatty acid. It makes y...


In [82]:
# Run the pipeline without dewarping on the same sample image, printing the
# intermediate results.
test_res = e2e_no_dewarp("Sample_images/not_bad_at_coding.jpg", 
                      'Database/INCI/ingredient_inci_1570.csv', 
                      'Database/ingredient_cosing_37309.csv', 
                      debug=True)

Restore from text-detection-ctpn/checkpoints_mlt/ctpn_50000.ckpt
INFO:tensorflow:Restoring parameters from text-detection-ctpn/checkpoints_mlt/ctpn_50000.ckpt
cost time: 2.84s
 fexture UNTO TONGUTHS, UU PUTTUITT QUGrTTRaTy, | A Ww ment. Nourrie et adoucie, votre peau retrouve son éciat nature! Testé dermatologiquement. Ne pas appliquer sur le visage, | ULTRA DOUX S’ENGAGE POUR VOUS, POUR BATIR UN AVENIR MEILLEUR Ultra Doux s‘engage 4 respecter une charte de qualité et ¢: gt Oe... bienveillance. Nous avons travaillé activement pour yo. -  offrir des formules bonnes pour votre peau et de plus «- « plus respectueuses de la planéte, en associant notamme —_ rs do in otyeérine véegétale et du beurre de karité issu ¢ 0 Pow'®®” comm: —: equitable. — wo Uwe Découvrez aussi notre Douche Soin nourrissante et assouplissante, qui assoc \e Lait de Coco 4 la Noix de Macadamia, pour une peau nourrie tout en douceur ny Vi
ol 967365 5 - INGREDIENTS ; AQUA / WATER, GLYCERIN. PARAFFINUM LIQUIDUM / MINERAL

  0%|          | 0/31 [00:00<?, ?it/s]

-----
fexture UNTO TONGUTHS , UU PUTTUITT QUGrTTRaTy , A Ww ment . Nourrie adoucie , peau retrouve éciat nature ! Testé dermatologiquement . Ne appliquer visage , ULTRA DOUX S ’ ENGAGE POUR VOUS , POUR BATIR UN AVENIR MEILLEUR Ultra Doux ‘ engage 4 respecter charte qualité ¢ : gt Oe ... bienveillance . Nous travaillé activement . - offrir formules bonnes peau plus « - « plus respectueuses planéte , associant notamme — rs otyeérine véegétale beurre karité issu ¢ 0 Pow®® ” comm : — : equitable . — Uwe Découvrez aussi Douche Soin nourrissante assouplissante , assoc Lait Coco 4 Noix Macadamia , peau nourrie tout douceur ny Vi ol 967365 5 - INGREDIENTS ; AQUA / WATER , GLYCERIN . PARAFFINUM LIQUIDUM / MINERAL OIL BUTYROSPERMUM P STEARATE , MACADAMIA TERNIFOLIA SEED OIL , MYRISTIC ACID , PALMITIC ACID , 28R001 PEG-100 STEARATE . PMENOXYETHANOL POTASSIUM SORBATE PRUNUS AMYGDALUS AMTHAN , DULCIS Ol . / SWEET ALMOND ON SODIUM HYDROXIDE . STEARIC + ACID , KF GUM . PAREUM FRAGRANCE ( FL . 8213547

100%|██████████| 31/31 [00:00<00:00, 160.26it/s]
  4%|▍         | 1/23 [00:00<00:02,  7.56it/s]

{'fexture unto tonguths': 'unknown', 'uu puttuitt qugrttraty': 'unknown', 'a ww ment': 'unknown', 'nourrie adoucie': 'unknown', 'peau retrouve éciat nature ! testé dermatologiquement': 'unknown', 'ne appliquer visage': 'unknown', 'ultra doux s ’ engage pour vous': 'unknown', 'pour batir un avenir meilleur ultra doux ‘ engage 4 respecter charte qualité ¢ : gt oe': 'unknown', '': 'unknown', 'bienveillance': 'unknown', 'nous travaillé activement': 'unknown', '- offrir formules bonnes peau plus « - « plus respectueuses planéte': 'unknown', 'associant notamme — rs otyeérine véegétale beurre karité issu ¢ 0 pow®® ” comm : — : equitable': 'unknown', '— uwe découvrez aussi douche soin nourrissante assouplissante': 'unknown', 'assoc lait coco 4 noix macadamia': 'unknown', 'peau nourrie tout douceur ny vi ol 967365 5 - ingredients ; aqua / water': 'unknown', 'glycerin': 'GLYCERIN', 'paraffinum liquidum / mineral oil butyrospermum p stearate': 'unknown', 'macadamia ternifolia seed oil': 'MACADAMI

100%|██████████| 23/23 [00:03<00:00,  6.89it/s]
1570it [00:00, 8042.75it/s]
0it [00:00, ?it/s]

length match_dict_fuzzy 30
length match_dict_extra 23


37309it [00:03, 10176.13it/s]
100%|██████████| 31/31 [00:00<00:00, 77858.34it/s]


In [83]:
# Display what e2e_no_dewarp returned for the sample image.
test_res

Unnamed: 0,Ingredient_name,Functions,Rating,Irritancy,Comedogenicity,Quick_facts,Description
0,LAC FERMENT,SKIN CONDITIONING,unknown,unknown,unknown,unknown,Lactobacillus/Lac Ferment is a product obtaine...
1,GLYCERIN,{'skin-identical ingredient': '/ingredient-fun...,superstar,0,0,['A natural moisturizer that’s also in our ski...,['Glycerin doesn’t sound very glamorous but it...
2,MACADAMIA TERNIFOLIA SEED OIL,{'emollient': '/ingredient-functions/emollient'},goodie,,,,"[""The golden yellow oil coming from the Macada..."
3,MYRISTIC ACID,{'surfactant/cleansing': '/ingredient-function...,No rating,0,3,,"[""A 14 carbon length fatty acid that can be na..."
4,PALMITIC ACID,{'skin-identical ingredient': '/ingredient-fun...,No rating,0,2,,"[""A fatty acid that can be found naturally in ..."
5,PEG-100 STEARATE,{'surfactant/cleansing': '/ingredient-function...,No rating,0,0,,['A very common water-loving surfactant and em...
6,EDULIS,,unknown,unknown,unknown,unknown,Boletus Aereus/Aestivalis/Edulis/Pinicola Extr...
7,AMMONIUM HYDROXIDE,"BUFFERING, DENATURANT",unknown,unknown,unknown,unknown,Ammonium hydroxyde
8,STEARIC ACID,{'emollient': '/ingredient-functions/emollient...,No rating,0,2020-02-03 00:00:00,,['A common multi-tasker fatty acid. It makes y...
9,PARFUM/ FRAGRANCE,[],unknown,unknown,unknown,[],[]
