In [1]:
import re
import cv2
import numpy as np
import pandas as pd
import shutil
import os
import random
import pytesseract
from pytesseract import Output
from matplotlib import pyplot as plt
# from google.colab.patches import cv2_imshow
try:
    from PIL import Image, ImageEnhance
except ImportError:
    import Image

    import re

import operator
import itertools
import gc
import pickle

from difflib import SequenceMatcher
from ast import literal_eval
from collections import Counter
from tqdm import tqdm

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
# get grayscale image
def get_grayscale(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# noise removal
def remove_noise(image):
    return cv2.GaussianBlur(image, (5,5), 0)

# thresholding
def thresholding(image):
    return cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 7, 4)
                            
# return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

# dilation: enhance the bright area
def dilate(image):
  kernel = np.ones((5,5), np.uint8)
  return cv2.dilate(image, kernel, iterations=1)

# erosion: enhance the dark area
def erode(image):
  kernel = np.ones((5,5), np.unint8)
  return cv2.erode(image, kernel, iterations=1)

# opening: erosion follow by a dilation
def opening(image):
  kernel = np.ones((5,5), np.uint8)
  return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

# closing: Dilation followed by Erosion. Removing black holes inside the object 
def closing(image):
  kernel = np.ones((5,5), np.uint8)
  return cv2.morphologyEx(opening(image), cv2.MORPH_CLOSE, kernel)

# canny
def canny(image):
      return cv2.Canny(image, 100, 200)

# deskew image
def deskew(image):
    coords = np.column_stack(np.where(image>0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

# template matching
def match_template(image, template):
    return cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)

In [3]:
def preprocess_for_ocr(img, enhance=1):
    """
    @param img: image to which the pre-processing steps being applied
    """
    if enhance > 1:
        img = Image.fromarray(img)

        contrast = ImageEnhance.Contrast(img)

        img = contrast.enhance(enhance)

        img = np.asarray(img)
    
    
    gray = get_grayscale(img)
    blur = remove_noise(gray)
    res = thresholding(blur)

    img = cv2.cvtColor(res, cv2.COLOR_GRAY2BGR)

    return img

In [4]:
def get_bounding_box(txt):
    annotation = txt
    with open(annotation, "r") as file1:
        bounding_boxes = file1.read()
        
    bounding_boxes = bounding_boxes.split('\n')[:-1]
    boxes = [i.split(',')[:-1] for i in bounding_boxes]

    new_boxes = []
    for box in boxes:
        new_box = []
        for i, each in enumerate(box):
            num = int(each)
            if i in [0, 1, 3, 6]:
                num -= 3
            else: 
                num += 3
            new_box.append(num)
        new_boxes.append(new_box)
    new_boxes.sort(key=lambda x: x[1])
    
    return new_boxes

In [5]:
def crop_line(img_path, box):
    img = cv2.imread(img_path)
    img, (rh, rw) = resize_image(img)
    # points for test.jpg
    cnt = np.array([
            [[box[0], box[1]]],
            [[box[2], box[3]]],
            [[box[4], box[5]]],
            [[box[6], box[7]]]
        ])
    # print("shape of cnt: {}".format(cnt.shape))
    rect = cv2.minAreaRect(cnt)
#     print("rect: {}".format(rect))

    # the order of the box points: bottom left, top left, top right,
    # bottom right
    box = cv2.boxPoints(rect)
    box = np.int0(box)

    # print("bounding box: {}".format(box))
    cv2.drawContours(img, [box], 0, (0, 0, 255), 2)

    # get width and height of the detected rectangle
    width = int(rect[1][0])
    height = int(rect[1][1])
    angle = rect[2]

    src_pts = box.astype("float32")
    # coordinate of the points in box points after the rectangle has been
    # straightened
    dst_pts = np.array([[0, height+2],
                        [0, 0],
                        [width, 0],
                        [width, height+2]], dtype="float32")

    # the perspective transformation matrix
    M = cv2.getPerspectiveTransform(src_pts, dst_pts)

    # directly warp the rotated rectangle to get the straightened rectangle
    warped = cv2.warpPerspective(img, M, (width, height))

    # cv2.imwrite("crop_img.jpg", warped)
    
    # cv2.waitKey(0)
    if angle < -45:
      warped = np.transpose(warped,(1,0,2))
      warped = warped[::-1]

#     cv2.imshow('croped', warped)
#     cv2.waitKey(0)
#     cv2.destroyAllWindows()


    return warped

In [8]:
def resize_image(img):
    img_size = img.shape
    im_size_min = np.min(img_size[0:2])
    im_size_max = np.max(img_size[0:2])

    im_scale = float(600) / float(im_size_min)
    if np.round(im_scale * im_size_max) > 1200:
        im_scale = float(1200) / float(im_size_max)
    new_h = int(img_size[0] * im_scale)
    new_w = int(img_size[1] * im_scale)

    new_h = new_h if new_h // 16 == 0 else (new_h // 16 + 1) * 16
    new_w = new_w if new_w // 16 == 0 else (new_w // 16 + 1) * 16

    re_im = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return re_im, (new_h / img_size[0], new_w / img_size[1])

def crop_rect(img, rect):
    # get the parameter of the small rectangle
    center, size, angle = rect[0], rect[1], rect[2]
    center, size = tuple(map(int, center)), tuple(map(int, size))

    # get row and col num in img
    height, width = img.shape[0], img.shape[1]

    # calculate the rotation matrix
    M = cv2.getRotationMatrix2D(center, angle, 1)
    # rotate the original image
    img_rot = cv2.warpAffine(img, M, (width, height))

    # now rotated rectangle becomes vertical, and we crop it
    img_crop = cv2.getRectSubPix(img_rot, size, center)

    return img_crop, img_rot

In [9]:
def ocr(img, oem=3, psm=6):
    """
    @param img: The image to be OCR'd
    @param oem: for specifying the type of Tesseract engine( default=1 for LSTM OCR Engine)
    """
    config = ('-l eng --oem {oem} --psm {psm}'.format(oem=oem,psm=psm))
    # config = ('-l eng --tessdata-dir "/usr/share/tesseract-ocr/tessdata" --oem {oem} -- psm {psm}'.format(oem=oem,psm=psm))

    try:
#         img = Image.fromarray(img)
        text = pytesseract.image_to_string(img, config=config)

        return text
    except:
        
        return ""

In [10]:
def ocr_metrics(predicts, ground_truth, norm_accentuation=False, norm_punctuation=False):
    """Calculate Character Error Rate (CER), Word Error Rate (WER) and Sequence Error Rate (SER)"""
    
    import string
    import unicodedata
    import editdistance

    if len(predicts) == 0 or len(ground_truth) == 0:
        return (1, 1, 1)

    cer, wer, ser = [], [], []

    for (pd, gt) in zip(predicts, ground_truth):

        if norm_accentuation:
            pd = unicodedata.normalize("NFKD", pd).encode("ASCII", "ignore").decode("ASCII")
            gt = unicodedata.normalize("NFKD", gt).encode("ASCII", "ignore").decode("ASCII")

        if norm_punctuation:
            pd = pd.translate(str.maketrans("", "", string.punctuation))
            gt = gt.translate(str.maketrans("", "", string.punctuation))

        pd_cer, gt_cer = list(pd.lower()), list(gt.lower())
        dist = editdistance.eval(pd_cer, gt_cer)
        cer.append(dist / (max(len(pd_cer), len(gt_cer))))

        pd_wer, gt_wer = pd.lower().split(), gt.lower().split()
        dist = editdistance.eval(pd_wer, gt_wer)
        wer.append(dist / (max(len(pd_wer), len(gt_wer))))

        pd_ser, gt_ser = [pd], [gt]
        dist = editdistance.eval(pd_ser, gt_ser)
        ser.append(dist / (max(len(pd_ser), len(gt_ser))))

    cer_f = sum(cer) / len(cer)
    wer_f = sum(wer) / len(wer)
    ser_f = sum(ser) / len(ser)
    
    evaluate = (cer_f, wer_f, ser_f)
    
    e_corpus = "\n".join([
    "Metrics:",
    "Character Error Rate: {}".format(evaluate[0]),
    "Word Error Rate:      {}".format(evaluate[1]),
    "Sequence Error Rate:  {}".format(evaluate[2]),
    ])

    return print(e_corpus)

In [134]:
def clean_string(string):
#     string = string + ' ' + str(string.strip('\n').strip('\x0c').strip())
    text = string.replace('INACTIVE INGREDIENTS:', '') # added
    text = text.replace('ACTIVE INGREDIENTS:', '') # added
    text = text.split(':')[1]
    text = text.strip()

    return text

def string_to_list(text):
    pattern = "[\|\*\_\'\{}]".format('"')
    text = re.sub(pattern, "", text)  
    split = [remove_water(x) for x in text.split(',')]
    
    return split

In [170]:
def remove_water(string):
    water = ['WATER (AQUA)', 'AQUA', 'EAU', 'AQUA/WATER/EAU', 'AQUA / WATER / EAU', 
             'PURIFIED WATER', 'DISTILLED WATER', 'D.I. WATER', 'AQUA (WATER)', 'AQUA (PURIFIED)']
    text = string.upper()
    if text in water:
        text = 'WATER'
    text = text.strip('  ')
    
    return text

In [112]:
def clean_string_name(string):
    text = string.replace(' Size: 4 oz. * USDA Certified Organic Ingredient ** None remains after saponifying oils into soap and glycerin', '')
    pattern = "[\|\*\_\'\{}&]".format('"')
    regex1 = re.compile('\[.*]')
    regex2 = re.compile('\[.*}')
    text = re.sub(pattern, "", text)
    text = re.sub(regex1, "", text)
    text = re.sub(regex2, "", text)
    text = text.strip('\x0c')
    text = text.strip(' ')
    text = text.upper()
    text = text.replace('INACTIVE INGREDIENTS:', '')
    text = text.replace('ACTIVE INGREDIENTS:', '')

    return text

In [13]:
def find_matching_ingredient(my_ingredients, fd, thresh=0.25): # rating_dict, category_dict, 
    ''' my_ingredients: list of product's ingredients
        Loop thru each ingredient in the ingredient list of the products
        then check if that ingredient appears in our ingredient list
        Calculate match_metric using SequenceMatcher and return the highest score and the best match
        Compare the match_metric with thresh > append to match_dict
    '''
    match_dict = {}
    for ingredient in tqdm(my_ingredients):
        if ingredient in match_dict.keys():
            continue
#         match_metric = {key: SequenceMatcher(None, key, ingredient).ratio() for key in rating_dict.keys()}
        match_metric = {key: SequenceMatcher(None, key, ingredient).ratio() for key in fd}
        best_match, best_metric = max(match_metric.items(), key=operator.itemgetter(1)) # What is operator.itemgetter(1)
        if best_metric > thresh:
            match_dict[ingredient] = best_match
        else:
            match_dict[ingredient] = 'unknown'
    return match_dict

In [150]:
def create_dict_english(df_inci, df_cosing):
    rating_inci = {}
    irritancy_inci = {}
    comedogenicity_inci = {}
    function_inci = {}
    qfacts_inci = {}
    desc_inci = {}
    
    desc_cosing = {}
    function_cosing = {}
    
    for idx, row in tqdm(df_inci.iterrows()):
        for name in row['ingredient_name'].split('/'):
            chem_name = name.strip()
            rating_inci[chem_name] = row['rating']
            irritancy_inci[chem_name] = row['irritancy']
            comedogenicity_inci[chem_name] = row['comedogenicity']
            function_inci[chem_name] = row['functions']
            qfacts_inci[chem_name] = row['quick_facts']
            desc_inci[chem_name] = row['description']
            
    for idx, row in tqdm(df_cosing.iterrows()):
        for name in row['ingredient_name'].split('/'):
            desc_cosing[name] = row['description']
            function_cosing[name] = row['functions']    
    
    return rating_inci, irritancy_inci, comedogenicity_inci, function_inci, qfacts_inci, desc_inci, desc_cosing, function_cosing

In [151]:
rating_inci, irritancy_inci, comedogenicity_inci, function_inci, qfacts_inci, desc_inci, desc_cosing, function_cosing = create_dict_english(df_inci, df_cosing)

1570it [00:00, 3647.06it/s]
37309it [00:08, 4371.06it/s]


In [153]:
irritancy_inci

{'OUBAKU EKISU': nan,
 'TYPE OF CLAY': '0',
 '18:2 CIS-9': nan,
 '3-O-ETHYL ASCORBATE': nan,
 'ALA': nan,
 'ATIP': nan,
 'ACETYL GLUTAMYL HEPTAPEPTIDE-1': nan,
 'ACETYL HEXAPEPTIDE-3': nan,
 'ACID RED 92 PHLOXINE': '0.0',
 'AL2O3': nan,
 'ASCORBYL ISOTETRAPALMITATE': nan,
 'ASCORBYL TETRAISOPALMITATE': nan,
 'ASTRAGALI RADIX': nan,
 'BEMOTRIZINOL': nan,
 'BISDISULIZOLE DISODIUM': nan,
 'BISOCTRIZOLE': nan,
 'CHILEAN WINEBERRY OIL': nan,
 'CHINA CLAY': '0.0',
 'CHOPI': nan,
 'CITRUS SINENSIS OIL': nan,
 'COCOCIN': nan,
 'COCONUT LIQUID ENDOSPERM': nan,
 'COLLOIDAL OATMEAL': nan,
 'COPPER PEPTIDE': nan,
 'CURCUMIN': nan,
 'D&C RED 33': '2.0',
 'DHHB': nan,
 'EAC': nan,
 'EGF': nan,
 'EPO': '2.0',
 'EDELWEISS STEM CELL EXTRACT': nan,
 'ELIX-IR': nan,
 'EUSOLEX 6007': nan,
 'FGF1': nan,
 'FGF2': nan,
 'FERRIC AMMONIUM FERROCYANIDE': nan,
 'FILIPENDULA ULMARIA EXTRACT': nan,
 'FLAXSEED OIL': nan,
 'FORM OF RETINOIDS': nan,
 'FORM OF VITAMIN F': nan,
 'FORMERLY PALMITOYL PENTAPEPTIDE-3': nan

In [168]:
def lookup_all(ingredient_list, match_dict, match_dict_extra,
               df_inci, df_cosing, option=''):

    rating_inci, irritancy_inci, comedogenicity_inci, function_inci, qfacts_inci, desc_inci, desc_cosing, function_cosing = create_dict_english(df_inci, df_cosing)
    
    res = []
    
    for item in tqdm(ingredient_list):
        
        value = match_dict[item]
        key = match_dict.get(item, 'unknown')
        if value == 'unknown':
            key = match_dict_extra.get(item, 'unknown')
            rating = 'unknown'
            irritancy = 'unknown'
            comedigenicity = 'unknown'
            functions = function_cosing.get(key, [])
            quickfacts = 'unknown'
            description = desc_cosing.get(key, [])
        else:
            rating = rating_inci.get(key, 'unknown')
            irritancy = irritancy_inci.get(key, 'unknown')
            comedogenicity = comedogenicity_inci.get(key, 'unknown')
            functions = function_inci.get(key, [])
            quickfacts = qfacts_inci.get(key, [])
            description = desc_inci.get(key, [])
            
        if option == 'ingredient':
            res.append(key)
        elif option == 'rating':
            res.append(rating)
        elif option == 'irritancy':
            res.append(irritancy)
        elif option == 'comedogenicity':
            res.append(comedogenicity)
        elif option == 'functions':
            res.append(functions)
        elif option == 'quickfacts':
            res.append(quickfacts)
        elif option == 'description':
            res.append(description)
        else:
            res.extend([[key, functions, rating, irritancy, comedogenicity, quickfacts, description]])
            
    df_res = pd.DataFrame(res, columns=['Ingredient_name', 'Functions', 'Rating', 'Irritancy',
                                        'Comedogenicity', 'Quick_facts', 'Description'])
    
    return df_res

# **Function for fuzzy dict**

In [16]:
# Fuzzydict

from fuzzywuzzy import fuzz 

class FuzzyDict(dict):
    "Provides a dictionary that performs fuzzy lookup"
    def __init__(self, items = None, cutoff = .6):
        """Construct a new FuzzyDict instance

        items is an dictionary to copy items from (optional)
        cutoff is the match ratio below which mathes should not be considered
        cutoff needs to be a float between 0 and 1 (where zero is no match
        and 1 is a perfect match)"""
        super(FuzzyDict, self).__init__()

        if items:
            self.update(items)
        self.cutoff =  cutoff

        # short wrapper around some super (dict) methods
        self._dict_contains = lambda key: \
            super(FuzzyDict,self).__contains__(key)

        self._dict_getitem = lambda key: \
            super(FuzzyDict,self).__getitem__(key)

    def _search(self, lookfor, stop_on_first = False):
        """Returns the value whose key best matches lookfor

        if stop_on_first is True then the method returns as soon
        as it finds the first item
        """

        # if the item is in the dictionary then just return it
        if self._dict_contains(lookfor):
            return True, lookfor, self._dict_getitem(lookfor), 1

        # set up the fuzzy matching tool
        #ratio_calc = difflib.SequenceMatcher()
        #ratio_calc.set_seq1(lookfor)

        # test each key in the dictionary
        best_ratio = 0
        best_match = None
        best_key = None
        for key in self:

            # if the current key is not a string
            # then we just skip it
            if not isinstance(key, str):
                continue

            # we get an error here if the item to look for is not a
            # string - if it cannot be fuzzy matched and we are here
            # this it is defintely not in the dictionary
            try:
            # calculate the match value
                ratio = fuzz.ratio(lookfor, key)/100
            except TypeError:
                break

            # if this is the best ratio so far - save it and the value
            if ratio > best_ratio:
                best_ratio = ratio
                best_key = key
                best_match = self._dict_getitem(key)

            if stop_on_first and ratio >= self.cutoff:
                break

        return (
            best_ratio >= self.cutoff,
            best_key,
            best_match,
            best_ratio)


    def __contains__(self, item):
        "Overides Dictionary __contains__ to use fuzzy matching"
        if self._search(item, True)[0]:
            return True
        else:
            return False

    def __getitem__(self, lookfor):
        "Overides Dictionary __getitem__ to use fuzzy matching"
        matched, key, item, ratio = self._search(lookfor)

        if not matched:
            raise KeyError(
                "'%s'. closest match: '%s' with ratio %.3f"%
                    (str(lookfor), str(key), ratio))

        return item

In [17]:
def fuzzy_match_ingredients(ing_list, fuzdict):
    match_dict = {}
    for ing in tqdm(ing_list):
        if ing in match_dict.keys():
            continue
        upper_ing = ing.upper()
        if fuzdict.__contains__(upper_ing):
            match_dict[ing] = fuzdict[upper_ing]
        else:
            match_dict[ing] = 'unknown'
    
    return match_dict


# **Testing: Englist**

In [93]:
# df_ingredient = pd.read_csv(ingredient_df_path)
df_inci = pd.read_csv('../Database/INCI/ingredient_inci_1570.csv')
df_cosing = pd.read_csv('../Database/ingredient_cosing_37309.csv')
df_paula = pd.read_csv('../Database/PAULA/ingredient_paula_1833_new.csv')

In [90]:
# df_inci.rename(columns={'Ingredient_name': 'ingredient_name',
#                        'Rating': 'rating',
#                         'Irritancy': 'irritancy',
#                         'Comedogenicity': 'comedogenicity',
#                         'Rating_score': 'rating_score',
#                        'Functions': 'functions',
#                        'Quick_facts': 'quick_facts',
#                        'Details': 'description',
#                        'Link': 'link'}, inplace=True)
# df_inci['ingredient_name'] = df_inci['ingredient_name'].apply(lambda x: x.upper())

In [None]:
# # df_paula['rating'] = df_paula['rating'].apply(lambda x: 'Good' if x == 'GOOD' else x)
# df_paula['rating'] = df_paula['rating'].map({'Best': 'superstar',
#                                             'Good': 'goodie',
#                                              'GOOD': 'goodie',
#                                             'Poor': 'icky',
#                                             'Average': 'average'
#                                             })
# df_paula['rating_score'] = df_paula['rating'].map({'icky': 0,
#                                                  'average': 1,
#                                                  'goodie': 2,
#                                                  'superstar': 3})
# df_paula.rename(columns={'name': 'ingredient_name',
#                         'category': 'functions',
#                         }, inplace=True)
# df_paula['ingredient_name'] = df_paula['ingredient_name'].apply(lambda x: x.upper())

In [137]:
boxes = get_bounding_box('../text-detection-ctpn/data/res/sample5.txt')
img_path = '../Sample_images/sample5.JPG'
# img = cv2.imread('../Sample_images/sample5.JPG')
# img_ocr = preprocess_for_ocr(img)
# cv2.imshow('sample', img_ocr)
# cv2.waitKey(0)
# cv2.destroyAllWindows()

text = ''
for box in boxes:
    croped = crop_line(img_path, box)
    string = ocr(croped)
    text = text + ' ' + str(string.strip('\n').strip('\x0c').strip())

text_result = clean_string(text)
ing_list = string_to_list(text_result)


In [138]:
print(text_result)

AQUA (WATER), CAPRYLIC/CAPRIC. TRIGLYCERIDE, CETYL ALCOHOL, PROPANEDIOL, STEARYL ALCOHOL, GLYCERIN, SODIUM HYALURONATE, ARGININE, ASPARTIC ACID, GLYCINE, ALANINE, SERINE, VALINE, ISOLEUCINE, PROLINE, THREONINE, HISTIDINE, PHENYLALANINE, GLUCOSE, MALTOSE, FRUCTOSE, TREHALOSE, SODIUM PCA. PCA. SODIUM LACTATE, UREA, ALLANTOIN, LINOLEIC ACID, OLEIC ACID, PHYTOSTERYL CANOLA GLYCERIDES, PALMITIC ACID, STEARIC ACID, LECITHIN, TRIOLEIN, TOCOPHEROL, CARBOMER, ISOCETETH-20, POLYSORBATE 60, SODIUM CHLORIDE, CITRIC ACID, TRISODIUM ETHYLENEDIAMINE DISUCCINATE, PENTYLENE GLYCOL. TRIETHANOLAMINE, SODIUM HYDROXIDE, PHENOXYETHANOL, CHLORPHENESIN. eee ere


In [139]:
print(ing_list)

['WATER', 'CAPRYLIC/CAPRIC. TRIGLYCERIDE', 'CETYL ALCOHOL', 'PROPANEDIOL', 'STEARYL ALCOHOL', 'GLYCERIN', 'SODIUM HYALURONATE', 'ARGININE', 'ASPARTIC ACID', 'GLYCINE', 'ALANINE', 'SERINE', 'VALINE', 'ISOLEUCINE', 'PROLINE', 'THREONINE', 'HISTIDINE', 'PHENYLALANINE', 'GLUCOSE', 'MALTOSE', 'FRUCTOSE', 'TREHALOSE', 'SODIUM PCA. PCA. SODIUM LACTATE', 'UREA', 'ALLANTOIN', 'LINOLEIC ACID', 'OLEIC ACID', 'PHYTOSTERYL CANOLA GLYCERIDES', 'PALMITIC ACID', 'STEARIC ACID', 'LECITHIN', 'TRIOLEIN', 'TOCOPHEROL', 'CARBOMER', 'ISOCETETH-20', 'POLYSORBATE 60', 'SODIUM CHLORIDE', 'CITRIC ACID', 'TRISODIUM ETHYLENEDIAMINE DISUCCINATE', 'PENTYLENE GLYCOL. TRIETHANOLAMINE', 'SODIUM HYDROXIDE', 'PHENOXYETHANOL', 'CHLORPHENESIN. EEE ERE']


In [131]:
cosing_dict = {name.strip(): name.strip() for name in df_cosing['ingredient_name']}
fd_cosing = FuzzyDict(cosing_dict, cutoff = .6)
print(len(fd_cosing))

paula_dict = {name.strip(): name.strip() for name in df_paula['ingredient_name']}
fd_paula = FuzzyDict(paula_dict, cutoff = .6)
print(len(fd_paula))

inci_dict = {name.strip(): name.strip() for name in df_inci['ingredient_name']}
fd_inci = FuzzyDict(inci_dict, cutoff = .7)
print(len(fd_inci))

29908
1833
1528


In [140]:
match_dict_fuzzy = fuzzy_match_ingredients(ing_list, fd_inci)

100%|██████████| 43/43 [00:00<00:00, 314.86it/s]


In [141]:
match_dict_fuzzy

{'WATER': 'WATER',
 'CAPRYLIC/CAPRIC. TRIGLYCERIDE': 'CAPRYLIC/ CAPRIC TRIGLYCERIDE',
 'CETYL ALCOHOL': 'CETYL ALCOHOL',
 'PROPANEDIOL': 'PROPANEDIOL',
 'STEARYL ALCOHOL': 'STEARYL ALCOHOL',
 'GLYCERIN': 'GLYCERIN',
 'SODIUM HYALURONATE': 'SODIUM HYALURONATE',
 'ARGININE': 'ARGININE',
 'ASPARTIC ACID': 'ASPARTIC ACID',
 'GLYCINE': 'GLYCINE',
 'ALANINE': 'ALANINE',
 'SERINE': 'SERINE',
 'VALINE': 'VALINE',
 'ISOLEUCINE': 'ISOLEUCINE',
 'PROLINE': 'PROLINE',
 'THREONINE': 'THREONINE',
 'HISTIDINE': 'HISTIDINE',
 'PHENYLALANINE': 'PHENYLALANINE',
 'GLUCOSE': 'GLUCOSE',
 'MALTOSE': 'MANNOSE',
 'FRUCTOSE': 'FRUCTOSE',
 'TREHALOSE': 'TREHALOSE',
 'SODIUM PCA. PCA. SODIUM LACTATE': 'unknown',
 'UREA': 'UREA',
 'ALLANTOIN': 'ALLANTOIN',
 'LINOLEIC ACID': 'LINOLEIC ACID',
 'OLEIC ACID': 'OLEIC ACID',
 'PHYTOSTERYL CANOLA GLYCERIDES': 'unknown',
 'PALMITIC ACID': 'PALMITIC ACID',
 'STEARIC ACID': 'STEARIC ACID',
 'LECITHIN': 'LECITHIN',
 'TRIOLEIN': 'TRETINOIN',
 'TOCOPHEROL': 'TOCOPHEROL',
 'CA

In [142]:
match_dict_fuzzy.values
missing_ing = []
for key, value in match_dict_fuzzy.items():
    if value == 'unknown':
        missing_ing.append(key)
missing_ing

match_dict_extra = fuzzy_match_ingredients(missing_ing, fd_cosing)
match_dict_extra

100%|██████████| 2/2 [00:00<00:00,  9.14it/s]


{'SODIUM PCA. PCA. SODIUM LACTATE': 'SODIUM ALUMINUM LACTATE',
 'PHYTOSTERYL CANOLA GLYCERIDES': 'PHYTOSTERYL CANOLA GLYCERIDES'}

# **End to end**

In [157]:
def before_ctpn(img_path):
    img = cv2.imread(img_path)
    img_ctpn = preprocess_for_ocr(img, enhance=2)
    
    return img_ctpn

In [160]:
def after_ctpn_english(img_path, txt_path, inci_path, cosing_path):
    # Get annotations of bounding boxes
    boxes = get_bounding_box(txt_path)
    
#     # Preprocess image for OCR:
#     img = cv2.imread(img_path)
#     #img_ocr = preprocess_for_ocr(img)
#     img_ocr = img
    
    # doing OCR
    text = ''
    for box in boxes:
        croped = crop_line(img_path, box)
        string = ocr(croped)
        text = text + ' ' + str(string.strip('\n').strip('\x0c').strip())
    
    print(text)
    
    # Cleaning result from OCR
    text_result = clean_string(text)
    ing_list = string_to_list(text_result)
    
    print("-----")
    print(text_result)
    
    # Loading ingredient dataframe
    df_inci = pd.read_csv(inci_path) # '../Database/CALLMEDUY/ingredient_vietnamese_3818.csv'
    df_cosing = pd.read_csv(cosing_path) #'../Database/ingredient_cosing_37309.csv'
    
#     rating_inci, irritancy_inci, comedogenicity_inci, function_inci, qfacts_inci, desc_inci, desc_cosing, function_cosing = create_dict_english(df_inci, df_cosing)
    
    # fd_cmd
    inci_dict = {name.strip(): name.strip() for name in df_inci['ingredient_name']}
    fd_inci = FuzzyDict(inci_dict, cutoff = .7)
    
    # fd_cosing
    cosing_dict = {name.strip(): name.strip() for name in df_cosing['ingredient_name']}
    fd_cosing = FuzzyDict(cosing_dict, cutoff = .6)
    
    # Compare product ingredient list and database
    # match_dict = find_matching_ingredient(ing_list, rating, 0.55)
    match_dict_fuzzy = fuzzy_match_ingredients(ing_list, fd_inci)

    missing_ing = []
    for key, value in match_dict_fuzzy.items():
        if value == 'unknown':
            missing_ing.append(key)

    match_dict_extra = fuzzy_match_ingredients(missing_ing, fd_cosing)
    print(len(match_dict_fuzzy))
    print(len(match_dict_extra))
    
    # Analyzing ingredient
    df_res = lookup_all(ing_list, match_dict_fuzzy, match_dict_extra, df_inci, df_cosing)
    
    return df_res

In [169]:
df_res = after_ctpn_english('../Sample_images/sample5.JPG',
                               '../text-detection-ctpn/data/res/sample5.txt', 
                               '../Database/INCI/ingredient_inci_1570.csv',
                               '../Database/ingredient_cosing_37309.csv')

  0%|          | 0/43 [00:00<?, ?it/s]

 Ingredients / Ingrédients: AQUA (WATER), CAPRYLIC/CAPRIC. TRIGLYCERIDE, CETYL ALCOHOL, PROPANEDIOL, STEARYL ALCOHOL, GLYCERIN, SODIUM HYALURONATE, ARGININE, ASPARTIC ACID, GLYCINE, ALANINE, SERINE, VALINE, ISOLEUCINE, PROLINE, THREONINE, HISTIDINE, PHENYLALANINE, GLUCOSE, MALTOSE, FRUCTOSE, TREHALOSE, SODIUM PCA. PCA. SODIUM LACTATE, UREA, ALLANTOIN, LINOLEIC ACID, OLEIC ACID, PHYTOSTERYL CANOLA GLYCERIDES, PALMITIC ACID, STEARIC ACID, LECITHIN, TRIOLEIN, TOCOPHEROL, CARBOMER, ISOCETETH-20, POLYSORBATE 60, SODIUM CHLORIDE, CITRIC ACID, TRISODIUM ETHYLENEDIAMINE DISUCCINATE, PENTYLENE GLYCOL. TRIETHANOLAMINE, SODIUM HYDROXIDE, PHENOXYETHANOL, CHLORPHENESIN. eee ere
-----
AQUA (WATER), CAPRYLIC/CAPRIC. TRIGLYCERIDE, CETYL ALCOHOL, PROPANEDIOL, STEARYL ALCOHOL, GLYCERIN, SODIUM HYALURONATE, ARGININE, ASPARTIC ACID, GLYCINE, ALANINE, SERINE, VALINE, ISOLEUCINE, PROLINE, THREONINE, HISTIDINE, PHENYLALANINE, GLUCOSE, MALTOSE, FRUCTOSE, TREHALOSE, SODIUM PCA. PCA. SODIUM LACTATE, UREA, ALLAN

100%|██████████| 43/43 [00:00<00:00, 302.55it/s]
100%|██████████| 2/2 [00:00<00:00,  5.15it/s]
261it [00:00, 2609.14it/s]

43
2


1570it [00:00, 3223.57it/s]
37309it [00:06, 5734.61it/s]
100%|██████████| 43/43 [00:00<00:00, 76812.21it/s]


In [165]:
df_res

Unnamed: 0,Ingredient_name,Functions,Rating,Irritancy,Comedogenicity,Quick_facts,Description
0,WATER,{'solvent': '/ingredient-functions/solvent'},No rating,,,,"['Good old water, aka H2O. The most common ski..."
1,CAPRYLIC/ CAPRIC TRIGLYCERIDE,[],unknown,unknown,unknown,[],[]
2,CETYL ALCOHOL,{'emollient': '/ingredient-functions/emollient...,No rating,2,2,,"['A so-called\xa0fatty (the good, non-drying k..."
3,PROPANEDIOL,"{'solvent': '/ingredient-functions/solvent', '...",No rating,,,,"[""Propanediol is a natural alternative for the..."
4,STEARYL ALCOHOL,{'emollient': '/ingredient-functions/emollient...,No rating,2,2,,"['A handy multi-tasker,\xa0white to light yell..."
5,GLYCERIN,{'skin-identical ingredient': '/ingredient-fun...,superstar,0,0,['A natural moisturizer that’s also in our ski...,['Glycerin doesn’t sound very glamorous but it...
6,SODIUM HYALURONATE,{'skin-identical ingredient': '/ingredient-fun...,goodie,0,0,,"[""It’s the - sodium form - cousin of the famou..."
7,ARGININE,{'skin-identical ingredient': '/ingredient-fun...,goodie,,,,"[""A semi-essential (infants cannot synthesize ..."
8,ASPARTIC ACID,{'skin-identical ingredient': '/ingredient-fun...,goodie,,,,['A non-essential\xa0amino acid \xa0(important...
9,GLYCINE,{'skin-identical ingredient': '/ingredient-fun...,goodie,,,,['A non-essential amino acid (the building blo...


In [223]:
ing_list = df_res.iloc[:, 0].to_list()
ing_list

['WATER',
 'CAPRYLIC/ CAPRIC TRIGLYCERIDE',
 'CETYL ALCOHOL',
 'PROPANEDIOL',
 'STEARYL ALCOHOL',
 'GLYCERIN',
 'SODIUM HYALURONATE',
 'ARGININE',
 'ASPARTIC ACID',
 'GLYCINE',
 'ALANINE',
 'SERINE',
 'VALINE',
 'ISOLEUCINE',
 'PROLINE',
 'THREONINE',
 'HISTIDINE',
 'PHENYLALANINE',
 'GLUCOSE',
 'MANNOSE',
 'FRUCTOSE',
 'TREHALOSE',
 'SODIUM ALUMINUM LACTATE',
 'UREA',
 'ALLANTOIN',
 'LINOLEIC ACID',
 'OLEIC ACID',
 'PHYTOSTERYL CANOLA GLYCERIDES',
 'PALMITIC ACID',
 'STEARIC ACID',
 'LECITHIN',
 'TRETINOIN',
 'TOCOPHEROL',
 'CARBOMER',
 'ISOCETETH-20',
 'POLYSORBATE 60',
 'SODIUM CHLORIDE',
 'CITRIC ACID',
 'TRISODIUM ETHYLENEDIAMINE DISUCCINATE',
 'NEOPENTYL GLYCOL DIHEPTANOATE',
 'SODIUM HYDROXIDE',
 'PHENOXYETHANOL',
 'CHLORPHENESIN']

In [225]:
with open('../Model_product_evaluator/ingredient_idx_1000.pickle', 'rb') as handle:
    ingredient_idx = pickle.load(handle)

In [226]:
match_dict_token = fuzzy_match_ingredients(ing_list, fd_cosing)
token = [[value for value in match_dict_token.values()]]

100%|██████████| 43/43 [00:00<00:00, 69.23it/s]


In [230]:
toke = IngredientTokenizer()
a = toke.transform(token)

In [219]:
from sklearn.base import BaseEstimator, TransformerMixin

class IngredientTokenizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X): # X: nested list
        M = len(X)
        N = 1000
        A = np.zeros((M, N), dtype=np.uint8)
        i= 0
        for ing_list in X:
            x = np.zeros(N, dtype=np.uint8)
            for ingredient in ing_list:
                # Get the index for each ingredient
                if ingredient in ingredient_idx.keys():
                    idx = ingredient_idx[ingredient]
                    x[idx] = 1
                else:
                    pass
            
            A[i, :] = x
            i += 1
        return A