In [1]:
import re
import cv2
import numpy as np
import pandas as pd
import shutil
import os
import random
import pytesseract
from pytesseract import Output
from matplotlib import pyplot as plt
# from google.colab.patches import cv2_imshow
try:
    from PIL import Image, ImageEnhance
except ImportError:
    import Image

    import re

import operator
import itertools
import gc
import pickle

from difflib import SequenceMatcher
from ast import literal_eval
from collections import Counter
from tqdm import tqdm

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbookpro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/macbookpro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# get grayscale image
def get_grayscale(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# noise removal
def remove_noise(image):
    return cv2.GaussianBlur(image, (5,5), 0)

# thresholding
def thresholding(image):
    return cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 7, 4)
                            
# return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

# dilation: enhance the bright area
def dilate(image):
  kernel = np.ones((5,5), np.uint8)
  return cv2.dilate(image, kernel, iterations=1)

# erosion: enhance the dark area
def erode(image):
  kernel = np.ones((5,5), np.unint8)
  return cv2.erode(image, kernel, iterations=1)

# opening: erosion follow by a dilation
def opening(image):
  kernel = np.ones((5,5), np.uint8)
  return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

# closing: Dilation followed by Erosion. Removing black holes inside the object 
def closing(image):
  kernel = np.ones((5,5), np.uint8)
  return cv2.morphologyEx(opening(image), cv2.MORPH_CLOSE, kernel)

# canny
def canny(image):
      return cv2.Canny(image, 100, 200)

# deskew image
def deskew(image):
    coords = np.column_stack(np.where(image>0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

# template matching
def match_template(image, template):
    return cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)

In [4]:
def preprocess_for_ocr(img, enhance=1):
    """
    @param img: image to which the pre-processing steps being applied
    """
    if enhance > 1:
        img = Image.fromarray(img)

        contrast = ImageEnhance.Contrast(img)

        img = contrast.enhance(enhance)

        img = np.asarray(img)
    
    
    gray = get_grayscale(img)
    blur = remove_noise(gray)
    res = thresholding(blur)

    img = cv2.cvtColor(res, cv2.COLOR_GRAY2BGR)

    return img

In [5]:
def get_bounding_box(txt):
    annotation = txt
    with open(annotation, "r") as file1:
        bounding_boxes = file1.read()
        
    bounding_boxes = bounding_boxes.split('\n')[:-1]
    boxes = [i.split(',')[:-1] for i in bounding_boxes]

    new_boxes = []
    for box in boxes:
        new_box = []
        for i, each in enumerate(box):
            num = int(each)
            if i in [0, 1, 3, 6]:
                num -= 3
            else: 
                num += 3
            new_box.append(num)
        new_boxes.append(new_box)
    new_boxes.sort(key=lambda x: x[1])
    
    return new_boxes

In [6]:
def crop_line(img_path, box):
    img = cv2.imread(img_path)
    img, (rh, rw) = resize_image(img)
    # points for test.jpg
    cnt = np.array([
            [[box[0], box[1]]],
            [[box[2], box[3]]],
            [[box[4], box[5]]],
            [[box[6], box[7]]]
        ])
    # print("shape of cnt: {}".format(cnt.shape))
    rect = cv2.minAreaRect(cnt)
#     print("rect: {}".format(rect))

    # the order of the box points: bottom left, top left, top right,
    # bottom right
    box = cv2.boxPoints(rect)
    box = np.int0(box)

    # print("bounding box: {}".format(box))
    cv2.drawContours(img, [box], 0, (0, 0, 255), 2)

    # get width and height of the detected rectangle
    width = int(rect[1][0])
    height = int(rect[1][1])
    angle = rect[2]

    src_pts = box.astype("float32")
    # coordinate of the points in box points after the rectangle has been
    # straightened
    dst_pts = np.array([[0, height+2],
                        [0, 0],
                        [width, 0],
                        [width, height+2]], dtype="float32")

    # the perspective transformation matrix
    M = cv2.getPerspectiveTransform(src_pts, dst_pts)

    # directly warp the rotated rectangle to get the straightened rectangle
    warped = cv2.warpPerspective(img, M, (width, height))

    # cv2.imwrite("crop_img.jpg", warped)
    
    # cv2.waitKey(0)
    if angle < -45:
      warped = np.transpose(warped,(1,0,2))
      warped = warped[::-1]

#     cv2.imshow('croped', warped)
#     cv2.waitKey(0)
#     cv2.destroyAllWindows()


    return warped

In [7]:
def crop_each_line(img_path, rect):
    img = cv2.imread(img_path)
    img, (rh, rw) = resize_image(img)
    cnt = np.array([
            [[rect[0], rect[1]]],
            [[rect[2], rect[3]]],
            [[rect[4],rect[5]]],
            [[rect[6], rect[7]]]
        ])
    # find the exact rectangle enclosing the text area
    # rect is a tuple consisting of 3 elements: the first element is the center
    # of the rectangle, the second element is the width, height, and the
    # third element is the detected rotation angle.
    # Example output: ((227.5, 187.50003051757812),
    # (94.57575225830078, 417.98736572265625), -36.982906341552734)
    rect = cv2.minAreaRect(cnt)
    print("rect: {}".format(rect))

    box = cv2.boxPoints(rect)
    box = np.int0(box)

    # print("bounding box: {}".format(box))
    cv2.drawContours(img, [box], 0, (0, 0, 255), 1)

    # img_crop will the cropped rectangle, img_rot is the rotated image
    img_crop, img_rot = crop_rect(img, rect)
    cv2.imwrite("cropped_img.jpg", img_crop)
    cv2.imshow('croped', img_crop)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    return img_crop

def resize_image(img):
    img_size = img.shape
    im_size_min = np.min(img_size[0:2])
    im_size_max = np.max(img_size[0:2])

    im_scale = float(600) / float(im_size_min)
    if np.round(im_scale * im_size_max) > 1200:
        im_scale = float(1200) / float(im_size_max)
    new_h = int(img_size[0] * im_scale)
    new_w = int(img_size[1] * im_scale)

    new_h = new_h if new_h // 16 == 0 else (new_h // 16 + 1) * 16
    new_w = new_w if new_w // 16 == 0 else (new_w // 16 + 1) * 16

    re_im = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return re_im, (new_h / img_size[0], new_w / img_size[1])

def crop_rect(img, rect):
    # get the parameter of the small rectangle
    center, size, angle = rect[0], rect[1], rect[2]
    center, size = tuple(map(int, center)), tuple(map(int, size))

    # get row and col num in img
    height, width = img.shape[0], img.shape[1]

    # calculate the rotation matrix
    M = cv2.getRotationMatrix2D(center, angle, 1)
    # rotate the original image
    img_rot = cv2.warpAffine(img, M, (width, height))

    # now rotated rectangle becomes vertical, and we crop it
    img_crop = cv2.getRectSubPix(img_rot, size, center)

    return img_crop, img_rot

In [8]:
def ocr(img, oem=3, psm=6):
    """
    @param img: The image to be OCR'd
    @param oem: for specifying the type of Tesseract engine( default=1 for LSTM OCR Engine)
    """
    config = ('-l eng --oem {oem} --psm {psm}'.format(oem=oem,psm=psm))
    # config = ('-l eng --tessdata-dir "/usr/share/tesseract-ocr/tessdata" --oem {oem} -- psm {psm}'.format(oem=oem,psm=psm))

    try:
#         img = Image.fromarray(img)
        text = pytesseract.image_to_string(img, config=config)

        return text
    except:
        
        return ""

In [9]:
def ocr_metrics(predicts, ground_truth, norm_accentuation=False, norm_punctuation=False):
    """Calculate Character Error Rate (CER), Word Error Rate (WER) and Sequence Error Rate (SER)"""
    
    import string
    import unicodedata
    import editdistance

    if len(predicts) == 0 or len(ground_truth) == 0:
        return (1, 1, 1)

    cer, wer, ser = [], [], []

    for (pd, gt) in zip(predicts, ground_truth):

        if norm_accentuation:
            pd = unicodedata.normalize("NFKD", pd).encode("ASCII", "ignore").decode("ASCII")
            gt = unicodedata.normalize("NFKD", gt).encode("ASCII", "ignore").decode("ASCII")

        if norm_punctuation:
            pd = pd.translate(str.maketrans("", "", string.punctuation))
            gt = gt.translate(str.maketrans("", "", string.punctuation))

        pd_cer, gt_cer = list(pd.lower()), list(gt.lower())
        dist = editdistance.eval(pd_cer, gt_cer)
        cer.append(dist / (max(len(pd_cer), len(gt_cer))))

        pd_wer, gt_wer = pd.lower().split(), gt.lower().split()
        dist = editdistance.eval(pd_wer, gt_wer)
        wer.append(dist / (max(len(pd_wer), len(gt_wer))))

        pd_ser, gt_ser = [pd], [gt]
        dist = editdistance.eval(pd_ser, gt_ser)
        ser.append(dist / (max(len(pd_ser), len(gt_ser))))

    cer_f = sum(cer) / len(cer)
    wer_f = sum(wer) / len(wer)
    ser_f = sum(ser) / len(ser)
    
    evaluate = (cer_f, wer_f, ser_f)
    
    e_corpus = "\n".join([
    "Metrics:",
    "Character Error Rate: {}".format(evaluate[0]),
    "Word Error Rate:      {}".format(evaluate[1]),
    "Sequence Error Rate:  {}".format(evaluate[2]),
    ])

    return print(e_corpus)

In [10]:
def clean_string(string):
    text = string.replace('INACTIVE INGREDIENTS:', '') # added
    text = text.replace('ACTIVE INGREDIENTS:', '') # added
    text = text.split(':')[1]
    
    pattern = "[\|\*\_\'\{}&]".format('"')
    regex = re.compile('\\\S+')
    
    text = re.sub(pattern, "", text)
    text = re.sub(",, ", ", ", text)
    text = re.sub(regex, " ", text)
    text = re.sub('\.', " ", text)
    text_tokens = word_tokenize(text)
    text_wo_sw = [w for w in text_tokens if not w in stopwords.words()]
    text = ' '.join(text_wo_sw)
    text = text.strip()

    return text

def string_to_list(text):
    pattern = "[\|\*\_\'\{}]".format('"')
    text = re.sub(pattern, "", text)
    split = [remove_water(x) for x in re.split("[,.]", text)]
    
    return split

In [11]:
def remove_water(string):
    water = ['WATER (AQUA)', 'AQUA', 'EAU', 'AQUA/WATER/EAU', 'AQUA / WATER / EAU', 
             'PURIFIED WATER', 'DISTILLED WATER', 'D.I. WATER', 'AQUA (WATER)', 'AQUA (PURIFIED)']
    text = string.upper()
    if text in water:
        text = 'WATER'
    text = text.strip('  ')
    
    return text

In [12]:
def clean_string_name(string):
    text = string.replace(' Size: 4 oz. * USDA Certified Organic Ingredient ** None remains after saponifying oils into soap and glycerin', '')
    pattern = "[\|\*\_\'\{}&]".format('"')
    regex1 = re.compile('\[.*]')
    regex2 = re.compile('\[.*}')
    text = re.sub(pattern, "", text)
    text = re.sub(regex1, "", text)
    text = re.sub(regex2, "", text)
    text = text.strip('\x0c')
    text = text.strip(' ')
    text = text.upper()
    text = text.replace('INACTIVE INGREDIENTS:', '')
    text = text.replace('ACTIVE INGREDIENTS:', '')

    return text

In [13]:
def check_for_label(text, words):
    text = text.lower()
    for i in range(len(text)):
        if any(text[i:].startswith(word) for word in words):
            return True
    return False

# def clean_string(string):
#     pattern = "[\|\*\_\'\{}]".format('"')
#     text = re.sub(pattern, "", string)
#     text = re.sub(" I ", " / ", text)
#     text = re.sub("^I ", "", text)
#     text = re.sub("Omg", "0mg", text)
#     text = re.sub("Og", "0g", text)
#     text = re.sub('(?<=\d) (?=\w)', '', text)
#     text = change_to_g(text)
#     text = text.strip()
#     return text

In [14]:
def create_dict(ingredient_df, col_name):
    '''Generating rating_dict and category_dict
        ingredient_df: dataframe of the whole ingredient database
        col_name: name of column I want to generate (rating/category)
    '''
    col_dict = {name: row[col_name] for (idx, row) in ingredient_df.iterrows() for name in row['ingredient_name'].split('/')}

    return col_dict

In [15]:
def find_matching_ingredient(my_ingredients, fd, thresh=0.25): # rating_dict, category_dict, 
    ''' my_ingredients: list of product's ingredients
        Loop thru each ingredient in the ingredient list of the products
        then check if that ingredient appears in our ingredient list
        Calculate match_metric using SequenceMatcher and return the highest score and the best match
        Compare the match_metric with thresh > append to match_dict
    '''
    match_dict = {}
    for ingredient in tqdm(my_ingredients):
        if ingredient in match_dict.keys():
            continue
#         match_metric = {key: SequenceMatcher(None, key, ingredient).ratio() for key in rating_dict.keys()}
        match_metric = {key: SequenceMatcher(None, key, ingredient).ratio() for key in fd}
        best_match, best_metric = max(match_metric.items(), key=operator.itemgetter(1)) # What is operator.itemgetter(1)
        if best_metric > thresh:
            match_dict[ingredient] = best_match
        else:
            match_dict[ingredient] = 'unknown'
    return match_dict

In [16]:
def lookup_cosing(ingredient_list, match_dict, rating_dict, category_dict, option=''):
    res = []
    for item in ingredient_list:
        key = match_dict.get(item, 'unknown')
        
        rating = ingredient_rating_dict.get(key, 'unknown')
        category = ingredient_category_dict.get(key, [])

    if option == 'ingredient':
        res.append(key)
    elif option == 'rating':
        res.append(rating)
    elif option == 'category':
        res.append(category)
    else:
        res.extend([key, rating, category])

    return res

In [17]:
def create_dict_vietnamese(df_cmd, df_cosing):
    desc_cmd = {}
    ratingscore_cmd = {}
    function_cmd = {}
    desc_cosing = {}
    function_cosing = {}
    
    for idx, row in tqdm(df_cmd.iterrows()):
        for name in row['ingredient_name'].split('/'):
            chem_name = name.strip()
            desc_cmd[chem_name] = row['description']
            ratingscore_cmd[chem_name] = row['rating_score']
            function_cmd[chem_name] = row['functions']
            
    for idx, row in tqdm(df_cosing.iterrows()):
        for name in row['ingredient_name'].split('/'):
            chem_name = name.strip()
            desc_cosing[chem_name] = row['description']
            function_cosing[chem_name] = row['functions']    
    
    return desc_cmd, ratingscore_cmd, function_cmd, desc_cosing, function_cosing

In [22]:
desc_cmd, ratingscore_cmd, function_cmd, desc_cosing, function_cosing = create_dict_vietnamese(df_cmd, df_cosing)

3818it [00:00, 5117.69it/s]
37309it [00:06, 5718.88it/s]


In [23]:
len(function_cosing)

29908

In [24]:
import pickle

# a = {'hello': 'world'}

with open('vie_ratingscore_cmd.pickle', 'wb') as handle:
    pickle.dump(ratingscore_cmd, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('vie_function_cmd.pickle', 'wb') as handle:
    pickle.dump(function_cmd, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('vie_desc_cmd.pickle', 'wb') as handle:
    pickle.dump(desc_cmd, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [18]:
# # Create dict for cosing
# desc_cosing = create_dict(df_cosing, 'description')
# function_cosing = create_dict(df_cosing, 'functions')

# # Create dict for cmddf_
# desc_cmd = create_dict(df_cmd, 'description')
# ratingscore_cmd = create_dict(df_cmd, 'rating_score')
# function_cmd = create_dict(df_cmd, 'functions')

In [25]:
def lookup_all_vietnamese(ingredient_list, match_dict_cmd, match_dict_cosing,
               df_cmd, df_cosing, option=''):
    
    with open('vie_ratingscore_cmd.pickle', 'rb') as handle:
        ratingscore_cmd = pickle.load(handle)
    with open('vie_function_cmd.pickle', 'rb') as handle:
        function_cmd = pickle.load(handle)
    with open('vie_desc_cmd.pickle', 'rb') as handle:
        desc_cmd = pickle.load(handle)    
    
    with open('eng_desc_cosing.pickle', 'rb') as handle:
        desc_cosing = pickle.load(handle)
    with open('eng_function_cosing.pickle', 'rb') as handle:
        function_cosing = pickle.load(handle)
    
    res = []
    
    for item in tqdm(ingredient_list):
        
        value = match_dict_cmd[item]

        if value == 'unknown':
            key = match_dict_cosing.get(item, 'unknown')
            rating_score = 'Chưa đánh giá'
            functions = function_cosing.get(key, [])
            description = desc_cosing.get(key, [])
        else:
            key = match_dict_cmd.get(item, 'unknown')
            rating_score = ratingscore_cmd.get(key, np.nan)
            functions = function_cmd.get(key, [])
            description = desc_cmd.get(key, [])
            
        if key != 'unknown':             
            if option == 'ingredient':
                res.append(key)
            elif option == 'rating_score':
                res.append(rating_score)
            elif option == 'functions':
                res.append(functions)
            elif option == 'description':
                res.append(description)
            else:
                res.extend([[key, rating_score, functions, description]])
            
    df_res = pd.DataFrame(res, columns=['Ingredient_name', 'Rating_score', 'Functions', 'Description'])
    
    return df_res

In [57]:
df_cosing.head()

Unnamed: 0,cosing_ref_no,ingredient_name,description,restriction,functions
0,94753.0,DISODIUM TETRAMETHYLHEXADECENYLCYSTEINE FORMYL...,Disodium Tetramethylhexadecenylcysteine Formyl...,,SKIN PROTECTING
1,96229.0,ASTROCARYUM VULGARE SEED BUTTER,Astrocaryum Vulgare Seed Butter is the fat obt...,,"SKIN CONDITIONING, SKIN CONDITIONING - EMOLLIENT"
2,89177.0,BARLEY SH-POLYPEPTIDE-17,Barley sh-Polypeptide-17 is a single chain rec...,,"HAIR CONDITIONING, SKIN CONDITIONING"
3,98580.0,DAUCUS CAROTA SATIVA LEAF EXTRACT,Daucus Carota Sativa (Carrot) Leaf Extract is ...,,SKIN CONDITIONING - MISCELLANEOUS
4,89078.0,GOSSYPIUM HIRSUTUM SEED EXTRACT,Gossypium Hirsutum Seed Extract is the extrac...,,SKIN CONDITIONING


# **Function for fuzzy dict**

In [99]:
# Fuzzydict

from fuzzywuzzy import fuzz 

class FuzzyDict(dict):
    "Provides a dictionary that performs fuzzy lookup"
    def __init__(self, items = None, cutoff = .6):
        """Construct a new FuzzyDict instance

        items is an dictionary to copy items from (optional)
        cutoff is the match ratio below which mathes should not be considered
        cutoff needs to be a float between 0 and 1 (where zero is no match
        and 1 is a perfect match)"""
        super(FuzzyDict, self).__init__()

        if items:
            self.update(items)
        self.cutoff =  cutoff

        # short wrapper around some super (dict) methods
        self._dict_contains = lambda key: \
            super(FuzzyDict,self).__contains__(key)

        self._dict_getitem = lambda key: \
            super(FuzzyDict,self).__getitem__(key)

    def _search(self, lookfor, stop_on_first = False):
        """Returns the value whose key best matches lookfor

        if stop_on_first is True then the method returns as soon
        as it finds the first item
        """

        # if the item is in the dictionary then just return it
        if self._dict_contains(lookfor):
            return True, lookfor, self._dict_getitem(lookfor), 1

        # set up the fuzzy matching tool
        #ratio_calc = difflib.SequenceMatcher()
        #ratio_calc.set_seq1(lookfor)

        # test each key in the dictionary
        best_ratio = 0
        best_match = None
        best_key = None
        for key in self:

            # if the current key is not a string
            # then we just skip it
            if not isinstance(key, str):
                continue

            # we get an error here if the item to look for is not a
            # string - if it cannot be fuzzy matched and we are here
            # this it is defintely not in the dictionary
            try:
            # calculate the match value
                ratio = fuzz.ratio(lookfor, key)/100
            except TypeError:
                break

            # if this is the best ratio so far - save it and the value
            if ratio > best_ratio:
                best_ratio = ratio
                best_key = key
                best_match = self._dict_getitem(key)

            if stop_on_first and ratio >= self.cutoff:
                break

        return (
            best_ratio >= self.cutoff,
            best_key,
            best_match,
            best_ratio)


    def __contains__(self, item):
        "Overides Dictionary __contains__ to use fuzzy matching"
        if self._search(item, True)[0]:
            return True
        else:
            return False

    def __getitem__(self, lookfor):
        "Overides Dictionary __getitem__ to use fuzzy matching"
        matched, key, item, ratio = self._search(lookfor)

        if not matched:
            raise KeyError(
                "'%s'. closest match: '%s' with ratio %.3f"%
                    (str(lookfor), str(key), ratio))

        return item

In [100]:
def fuzzy_match_ingredients(ing_list, fuzdict):
    match_dict = {}
    for ing in tqdm(ing_list):
        if ing in match_dict.keys():
            continue
        upper_ing = ing.upper()
        if fuzdict.__contains__(upper_ing):
            match_dict[ing] = fuzdict[upper_ing]
        else:
            match_dict[ing] = 'unknown'
    
    return match_dict


# **Testing: Vietnamese**

In [60]:
df_cmd['rating_score'].value_counts()

4.0    1882
5.0     892
1.0     594
3.0     347
2.0     102
Name: rating_score, dtype: int64

In [19]:
# df_ingredient = pd.read_csv(ingredient_df_path)
df_cmd = pd.read_csv('../Database/CALLMEDUY/ingredient_vietnamese_3818.csv')
df_cosing = pd.read_csv('../Database/ingredient_cosing_37309.csv')

In [21]:
df_cosing

Unnamed: 0,cosing_ref_no,ingredient_name,description,restriction,functions
0,94753.0,DISODIUM TETRAMETHYLHEXADECENYLCYSTEINE FORMYL...,Disodium Tetramethylhexadecenylcysteine Formyl...,,SKIN PROTECTING
1,96229.0,ASTROCARYUM VULGARE SEED BUTTER,Astrocaryum Vulgare Seed Butter is the fat obt...,,"SKIN CONDITIONING, SKIN CONDITIONING - EMOLLIENT"
2,89177.0,BARLEY SH-POLYPEPTIDE-17,Barley sh-Polypeptide-17 is a single chain rec...,,"HAIR CONDITIONING, SKIN CONDITIONING"
3,98580.0,DAUCUS CAROTA SATIVA LEAF EXTRACT,Daucus Carota Sativa (Carrot) Leaf Extract is ...,,SKIN CONDITIONING - MISCELLANEOUS
4,89078.0,GOSSYPIUM HIRSUTUM SEED EXTRACT,Gossypium Hirsutum Seed Extract is the extrac...,,SKIN CONDITIONING
...,...,...,...,...,...
37304,90014.0,PHELLODENDRON AMURENSE BARK,This is a filtrate of the product obtained by ...,,"HUMECTANT, SKIN CONDITIONING, SKIN PROTECTING"
37305,90014.0,LONICERA JAPONICA FLOWER,This is a filtrate of the product obtained by ...,,"HUMECTANT, SKIN CONDITIONING, SKIN PROTECTING"
37306,90014.0,CHAENOMELES SINENSIS FRUIT,This is a filtrate of the product obtained by ...,,"HUMECTANT, SKIN CONDITIONING, SKIN PROTECTING"
37307,90014.0,CAMELLIA SINENSIS LEAF,This is a filtrate of the product obtained by ...,,"HUMECTANT, SKIN CONDITIONING, SKIN PROTECTING"


In [61]:
# dfi3 = pd.read_csv('Database/ingredient_inci_1570.csv')
cosing_dict = {name.strip(): name.strip() for name in df_cosing['ingredient_name']}
# print(ingnames_dict)
fd_cosing = FuzzyDict(cosing_dict, cutoff = .6)
print(len(fd_cosing))

29908


In [62]:
# dfi3 = pd.read_csv('Database/ingredient_inci_1570.csv')
cmd_dict = {name.strip(): name.strip() for name in df_cmd['ingredient_name']}
# print(ingnames_dict)
fd_cmd = FuzzyDict(cmd_dict, cutoff = .7)
print(len(fd_cmd))

3818


In [78]:
boxes = get_bounding_box('../text-detection-ctpn/data/res/sample5.txt')
img_path = '../Sample_images/sample5.JPG'
# img = cv2.imread('../Sample_images/sample5.JPG')
# img_ocr = preprocess_for_ocr(img)
# cv2.imshow('sample', img_ocr)
# cv2.waitKey(0)
# cv2.destroyAllWindows()

text = ''
for box in boxes:
    croped = crop_line(img_path, box)
    string = ocr(croped)
    text = text + ' ' + str(string.strip('\n').strip('\x0c').strip('.').strip()

text_result = clean_string(text)
ing_list = string_to_list(text_result)



SyntaxError: invalid syntax (<ipython-input-78-599882ce6d98>, line 15)

In [125]:
match_dict_fuzzy = fuzzy_match_ingredients(ing_list, fd_cmd)

100%|██████████| 43/43 [00:00<00:00, 267.19it/s]


In [168]:
match_dict_fuzzy

{'AQUA (WATER)': 'AQUA (WATER)',
 'CAPRYLIC/CAPRIC. TRIGLYCERIDE': 'CAPRYLIC/CAPRIC TRIGLYCERIDE',
 'CETYL ALCOHOL': 'CETYL ALCOHOL',
 'PROPANEDIOL': 'PROPANEDIOL',
 'STEARYL ALCOHOL': 'STEARYL ALCOHOL',
 'GLYCERIN': 'GLYCERIN',
 'SODIUM HYALURONATE': 'SODIUM HYALURONATE',
 'ARGININE': 'ARGININE',
 'ASPARTIC ACID': 'ASPARTIC ACID',
 'GLYCINE': 'GLYCINE',
 'ALANINE': 'ALANINE',
 'SERINE': 'SERINE',
 'VALINE': 'VALINE',
 'ISOLEUCINE': 'ISOLEUCINE',
 'PROLINE': 'PROLINE',
 'THREONINE': 'THREONINE',
 'HISTIDINE': 'HISTIDINE',
 'PHENYLALANINE': 'PHENYLALANINE',
 'GLUCOSE': 'GLUCOSE',
 'MALTOSE': 'MALTOSE',
 'FRUCTOSE': 'FRUCTOSE',
 'TREHALOSE': 'TREHALOSE',
 'SODIUM PCA. PCA. SODIUM LACTATE': 'unknown',
 'UREA': 'UREA',
 'ALLANTOIN': 'ALLANTOIN',
 'LINOLEIC ACID': 'LINOLEIC ACID',
 'OLEIC ACID': 'OLEIC ACID',
 'PHYTOSTERYL CANOLA GLYCERIDES': 'PHYTOSTERYL CANOLA GLYCERIDES',
 'PALMITIC ACID': 'PALMITIC ACID',
 'STEARIC ACID': 'STEARIC ACID',
 'LECITHIN': 'LECITHIN',
 'TRIOLEIN': 'TRIOLEIN',

In [141]:
match_dict_fuzzy.values
missing_ing = []
for key, value in match_dict_fuzzy.items():
    if value == 'unknown':
        missing_ing.append(key)
missing_ing

match_dict_extra = fuzzy_match_ingredients(missing_ing, fd_cosing)
match_dict_extra

100%|██████████| 3/3 [00:00<00:00,  4.21it/s]


{'SODIUM PCA. PCA. SODIUM LACTATE': 'SODIUM ALUMINUM LACTATE',
 'PENTYLENE GLYCOL. TRIETHANOLAMINE': 'DIETHYLENE GLYCOL DIETHYLHEXANOATE',
 'CHLORPHENESIN. EEE ERE': 'CHLORPHENESIN'}

In [81]:
df_res = lookup_all(ing_list, match_dict_fuzzy, match_dict_extra, df_cmd, df_cosing)

NameError: name 'ing_list' is not defined

In [219]:
df_res

Unnamed: 0,Ingredient_name,Rating_score,Functions,Description
0,AQUA (WATER),4,['Hỗn hợp'],"Là thành phần mỹ phẩm được sử dụng phổ biến, n..."
1,CAPRYLIC/CAPRIC TRIGLYCERIDE,unknown,[],[]
2,CETYL ALCOHOL,4,"['Chất dưỡng ẩm', 'Chất nhũ hóa/làm dầy kết cấ...",Cetearyl Alcohol và các loại cồn béo khác giữ ...
3,PROPANEDIOL,5,['Chưa phân loại'],"Propanediol là một chất tự nhiên, được dùng th..."
4,STEARYL ALCOHOL,4,"['Chất dưỡng ẩm', 'Chất nhũ hóa/làm dầy kết cấ...",Cồn béo được sử dụng làm chất dưỡng ẩm và giữ ...
5,GLYCERIN,5,"['Chất dưỡng da', 'Chất hồi phục da']","Glycerin là một chất dưỡng da và phục hồi da, ..."
6,SODIUM HYALURONATE,5,"['Chất chống oxy hóa', 'Chất dưỡng da', 'Chất ...",Sodium Hyaluronate là dạng muối của axit hyalu...
7,ARGININE,4,['Chất chống oxy hóa'],Chất này có khả năng chống oxy hóa.
8,ASPARTIC ACID,4,"['Chất chống oxy hóa', 'Chất dưỡng da']",Nghiên cứu chỉ ra rằng amino axit và dẫn xuất ...
9,GLYCINE,4,['Chất dưỡng da'],Thành phần cơ bản của tất cả các protein trong...


In [165]:
# # Create dict for cosing
# desc_cosing = create_dict(df_cosing, 'description')
# # restriction_cosing = create_dict(df_cosing, 'restriction')
# function_cosing = create_dict(df_cosing, 'functions')

# # Create dict for cmddf_
# desc_cmd = create_dict(df_cmd, 'description')
# ratingscore_cmd = create_dict(df_cmd, 'rating_score')
# function_cmd = create_dict(df_cmd, 'functions')

# **End-to-end**

In [63]:
def before_ctpn(img_path):
    img = cv2.imread(img_path)
    img_ctpn = preprocess_for_ocr(img, enhance=2)
    
    return img_ctpn

In [85]:
def after_ctpn_vietnamese(img_path, txt_path, cmd_path, cosing_path):
    # Get annotations of bounding boxes
    boxes = get_bounding_box(txt_path)
    
#     # Preprocess image for OCR:
#     img = cv2.imread(img_path)
#     #img_ocr = preprocess_for_ocr(img)
#     img_ocr = img
    
    # doing OCR
    text = ''
    for box in boxes:
        croped = crop_line(img_path, box)
        string = ocr(croped)
        text = text + ' ' + str(string.strip('\n').strip('\x0c').strip())
    
    print(text)
    
    # Cleaning result from OCR
    text_result = clean_string(text)
    ing_list = string_to_list(text_result)
    
    print("-----")
    print(text_result)
    
    # Loading ingredient dataframe
    df_cmd = pd.read_csv(cmd_path) # '../Database/CALLMEDUY/ingredient_vietnamese_3818.csv'
    df_cosing = pd.read_csv(cosing_path) #'../Database/ingredient_cosing_37309.csv'
    
#     desc_cmd, ratingscore_cmd, function_cmd, desc_cosing, function_cosing = create_dict_vietnamese(df_cmd, df_cosing)
    
    
    # fd_cmd
    cmd_dict = {name.strip(): name.strip() for name in df_cmd['ingredient_name']}
    fd_cmd = FuzzyDict(cmd_dict, cutoff = .7)
    
    # fd_cosing
    cosing_dict = {name.strip(): name.strip() for name in df_cosing['ingredient_name']}
    fd_cosing = FuzzyDict(cosing_dict, cutoff = .6)
    
    # Compare product ingredient list and database
    # match_dict = find_matching_ingredient(ing_list, rating, 0.55)
    match_dict_cmd = fuzzy_match_ingredients(ing_list, fd_cmd)
    match_dict_cosing = fuzzy_match_ingredients(ing_list, fd_cosing)
    
    # Prepare input for later models Random forest and KNN
    model_input = [[name for name in match_dict_cosing.values()]]
    
    df_res = lookup_all_vietnamese(ing_list, match_dict_cmd, match_dict_cosing, df_cmd, df_cosing)
    
    
    # Analyzing ingredient
#     df_res = lookup_all(ing_list, match_dict_fuzzy, match_dict_extra, df_cmd, df_cosing)
    
    return df_res

In [101]:
df_res = after_ctpn_vietnamese('../Sample_images/sample5.JPG',
                               '../text-detection-ctpn/data/res/sample5.txt', 
                               '../Database/CALLMEDUY/ingredient_vietnamese_3818.csv',
                               '../Database/ingredient_cosing_37309.csv')

 Ingredients / Ingrédients: AQUA (WATER), CAPRYLIC/CAPRIC. TRIGLYCERIDE, CETYL ALCOHOL, PROPANEDIOL, STEARYL ALCOHOL, GLYCERIN, SODIUM HYALURONATE, ARGININE, ASPARTIC ACID, GLYCINE, ALANINE, SERINE, VALINE, ISOLEUCINE, PROLINE, THREONINE, HISTIDINE, PHENYLALANINE, GLUCOSE, MALTOSE, FRUCTOSE, TREHALOSE, SODIUM PCA. PCA. SODIUM LACTATE, UREA, ALLANTOIN, LINOLEIC ACID, OLEIC ACID, PHYTOSTERYL CANOLA GLYCERIDES, PALMITIC ACID, STEARIC ACID, LECITHIN, TRIOLEIN, TOCOPHEROL, CARBOMER, ISOCETETH-20, POLYSORBATE 60, SODIUM CHLORIDE, CITRIC ACID, TRISODIUM ETHYLENEDIAMINE DISUCCINATE, PENTYLENE GLYCOL. TRIETHANOLAMINE, SODIUM HYDROXIDE, PHENOXYETHANOL, CHLORPHENESIN. eee ere
-----
AQUA ( WATER ) , CAPRYLIC/CAPRIC TRIGLYCERIDE , CETYL ALCOHOL , PROPANEDIOL , STEARYL ALCOHOL , GLYCERIN , SODIUM HYALURONATE , ARGININE , ASPARTIC ACID , GLYCINE , ALANINE , SERINE , VALINE , ISOLEUCINE , PROLINE , THREONINE , HISTIDINE , PHENYLALANINE , GLUCOSE , MALTOSE , FRUCTOSE , TREHALOSE , SODIUM PCA PCA SODIUM

100%|██████████| 43/43 [00:00<00:00, 206.63it/s]
100%|██████████| 43/43 [00:01<00:00, 35.90it/s]
100%|██████████| 43/43 [00:00<00:00, 142460.56it/s]


In [102]:
df_res

Unnamed: 0,Ingredient_name,Rating_score,Functions,Description
0,AQUA (WATER),4,['Hỗn hợp'],"Là thành phần mỹ phẩm được sử dụng phổ biến, n..."
1,CAPRYLIC/CAPRIC TRIGLYCERIDE,,[],[]
2,CETYL ALCOHOL,4,"['Chất dưỡng ẩm', 'Chất nhũ hóa/làm dầy kết cấ...",Cetearyl Alcohol và các loại cồn béo khác giữ ...
3,PROPANEDIOL,5,['Chưa phân loại'],"Propanediol là một chất tự nhiên, được dùng th..."
4,STEARYL ALCOHOL,4,"['Chất dưỡng ẩm', 'Chất nhũ hóa/làm dầy kết cấ...",Cồn béo được sử dụng làm chất dưỡng ẩm và giữ ...
5,GLYCERIN,5,"['Chất dưỡng da', 'Chất hồi phục da']","Glycerin là một chất dưỡng da và phục hồi da, ..."
6,SODIUM HYALURONATE,5,"['Chất chống oxy hóa', 'Chất dưỡng da', 'Chất ...",Sodium Hyaluronate là dạng muối của axit hyalu...
7,ARGININE,4,['Chất chống oxy hóa'],Chất này có khả năng chống oxy hóa.
8,ASPARTIC ACID,4,"['Chất chống oxy hóa', 'Chất dưỡng da']",Nghiên cứu chỉ ra rằng amino axit và dẫn xuất ...
9,GLYCINE,4,['Chất dưỡng da'],Thành phần cơ bản của tất cả các protein trong...


In [255]:
df_res

Unnamed: 0,Ingredient_name,Rating_score,Functions,Description
0,WATER,4,['Hỗn hợp'],"Là thành phần mỹ phẩm được sử dụng phổ biến, n..."
1,CAPRYLIC/CAPRIC TRIGLYCERIDE,unknown,[],[]
2,CETYL ALCOHOL,4,"['Chất dưỡng ẩm', 'Chất nhũ hóa/làm dầy kết cấ...",Cetearyl Alcohol và các loại cồn béo khác giữ ...
3,PROPANEDIOL,5,['Chưa phân loại'],"Propanediol là một chất tự nhiên, được dùng th..."
4,STEARYL ALCOHOL,4,"['Chất dưỡng ẩm', 'Chất nhũ hóa/làm dầy kết cấ...",Cồn béo được sử dụng làm chất dưỡng ẩm và giữ ...
5,GLYCERIN,5,"['Chất dưỡng da', 'Chất hồi phục da']","Glycerin là một chất dưỡng da và phục hồi da, ..."
6,SODIUM HYALURONATE,5,"['Chất chống oxy hóa', 'Chất dưỡng da', 'Chất ...",Sodium Hyaluronate là dạng muối của axit hyalu...
7,ARGININE,4,['Chất chống oxy hóa'],Chất này có khả năng chống oxy hóa.
8,ASPARTIC ACID,4,"['Chất chống oxy hóa', 'Chất dưỡng da']",Nghiên cứu chỉ ra rằng amino axit và dẫn xuất ...
9,GLYCINE,4,['Chất dưỡng da'],Thành phần cơ bản của tất cả các protein trong...
