In [1]:
import re
import cv2
import numpy as np
import pandas as pd
import shutil
import os
import random
import pytesseract
from pytesseract import Output
from matplotlib import pyplot as plt
# from google.colab.patches import cv2_imshow
try:
    from PIL import Image, ImageEnhance
except ImportError:
    import Image

    import re

import operator
import itertools
import gc
import pickle

from difflib import SequenceMatcher
from ast import literal_eval
from collections import Counter
from tqdm import tqdm

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [4]:
sample = cv2.imread('Sample_images/sample5.JPG')
cv2.imshow('sample', sample)
cv2.waitKey(0)
cv2.destroyAllWindows()
cv2.waitKey(0)

-1

In [5]:
# get grayscale image
def get_grayscale(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# noise removal
def remove_noise(image):
    return cv2.GaussianBlur(image, (5,5), 0)

# thresholding
def thresholding(image):
    return cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 7, 4)
                            
# return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

# dilation: enhance the bright area
def dilate(image):
  kernel = np.ones((5,5), np.uint8)
  return cv2.dilate(image, kernel, iterations=1)

# erosion: enhance the dark area
def erode(image):
  kernel = np.ones((5,5), np.unint8)
  return cv2.erode(image, kernel, iterations=1)

# opening: erosion follow by a dilation
def opening(image):
  kernel = np.ones((5,5), np.uint8)
  return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

# closing: Dilation followed by Erosion. Removing black holes inside the object 
def closing(image):
  kernel = np.ones((5,5), np.uint8)
  return cv2.morphologyEx(opening(image), cv2.MORPH_CLOSE, kernel)

# canny
def canny(image):
      return cv2.Canny(image, 100, 200)

# deskew image
def deskew(image):
    coords = np.column_stack(np.where(image>0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

# template matching
def match_template(image, template):
    return cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)

In [6]:
def preprocess_for_ocr(img, enhance=1):
    """
    @param img: image to which the pre-processing steps being applied
    """
    if enhance > 1:
        img = Image.fromarray(img)

        contrast = ImageEnhance.Contrast(img)

        img = contrast.enhance(enhance)

        img = np.asarray(img)
    
    
    gray = get_grayscale(img)
    blur = remove_noise(gray)
    res = thresholding(blur)

    img = cv2.cvtColor(res, cv2.COLOR_GRAY2BGR)

    return img

In [7]:
def get_bounding_box(txt):
    annotation = txt
    with open(annotation, "r") as file1:
        bounding_boxes = file1.read()
        
    bounding_boxes = bounding_boxes.split('\n')[:-1]
    boxes = [i.split(',')[:-1] for i in bounding_boxes]

    new_boxes = []
    for box in boxes:
        new_box = []
        for i, each in enumerate(box):
            num = int(each)
            if i in [0, 1, 3, 6]:
                num -= 3
            else: 
                num += 3
            new_box.append(num)
        new_boxes.append(new_box)
    new_boxes.sort(key=lambda x: x[1])
    
    return new_boxes

In [8]:
def crop_line(img_path, box):
    img = cv2.imread(img_path)
    img, (rh, rw) = resize_image(img)
    # points for test.jpg
    cnt = np.array([
            [[box[0], box[1]]],
            [[box[2], box[3]]],
            [[box[4], box[5]]],
            [[box[6], box[7]]]
        ])
    # print("shape of cnt: {}".format(cnt.shape))
    rect = cv2.minAreaRect(cnt)
#     print("rect: {}".format(rect))

    # the order of the box points: bottom left, top left, top right,
    # bottom right
    box = cv2.boxPoints(rect)
    box = np.int0(box)

    # print("bounding box: {}".format(box))
    cv2.drawContours(img, [box], 0, (0, 0, 255), 2)

    # get width and height of the detected rectangle
    width = int(rect[1][0])
    height = int(rect[1][1])
    angle = rect[2]

    src_pts = box.astype("float32")
    # coordinate of the points in box points after the rectangle has been
    # straightened
    dst_pts = np.array([[0, height+2],
                        [0, 0],
                        [width, 0],
                        [width, height+2]], dtype="float32")

    # the perspective transformation matrix
    M = cv2.getPerspectiveTransform(src_pts, dst_pts)

    # directly warp the rotated rectangle to get the straightened rectangle
    warped = cv2.warpPerspective(img, M, (width, height))

    # cv2.imwrite("crop_img.jpg", warped)
    
    # cv2.waitKey(0)
    if angle < -45:
      warped = np.transpose(warped,(1,0,2))
      warped = warped[::-1]

#     cv2.imshow('croped', warped)
#     cv2.waitKey(0)
#     cv2.destroyAllWindows()


    return warped

In [9]:
# I dont use this one anymore. Change to the function above 'crop_line()'
def crop_each_line(img_path, rect):
    img = cv2.imread(img_path)
    img, (rh, rw) = resize_image(img)
    cnt = np.array([
            [[rect[0], rect[1]]],
            [[rect[2], rect[3]]],
            [[rect[4],rect[5]]],
            [[rect[6], rect[7]]]
        ])
    # find the exact rectangle enclosing the text area
    # rect is a tuple consisting of 3 elements: the first element is the center
    # of the rectangle, the second element is the width, height, and the
    # third element is the detected rotation angle.
    # Example output: ((227.5, 187.50003051757812),
    # (94.57575225830078, 417.98736572265625), -36.982906341552734)
    rect = cv2.minAreaRect(cnt)
    print("rect: {}".format(rect))

    box = cv2.boxPoints(rect)
    box = np.int0(box)

    # print("bounding box: {}".format(box))
    cv2.drawContours(img, [box], 0, (0, 0, 255), 1)

    # img_crop will the cropped rectangle, img_rot is the rotated image
    img_crop, img_rot = crop_rect(img, rect)
    cv2.imwrite("cropped_img.jpg", img)
    cv2.imshow('croped', img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    return img_crop

def resize_image(img):
    img_size = img.shape
    im_size_min = np.min(img_size[0:2])
    im_size_max = np.max(img_size[0:2])

    im_scale = float(600) / float(im_size_min)
    if np.round(im_scale * im_size_max) > 1200:
        im_scale = float(1200) / float(im_size_max)
    new_h = int(img_size[0] * im_scale)
    new_w = int(img_size[1] * im_scale)

    new_h = new_h if new_h // 16 == 0 else (new_h // 16 + 1) * 16
    new_w = new_w if new_w // 16 == 0 else (new_w // 16 + 1) * 16

    re_im = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return re_im, (new_h / img_size[0], new_w / img_size[1])

def crop_rect(img, rect):
    # get the parameter of the small rectangle
    center, size, angle = rect[0], rect[1], rect[2]
    center, size = tuple(map(int, center)), tuple(map(int, size))

    # get row and col num in img
    height, width = img.shape[0], img.shape[1]

    # calculate the rotation matrix
    M = cv2.getRotationMatrix2D(center, angle, 1)
    # rotate the original image
    img_rot = cv2.warpAffine(img, M, (width, height))

    # now rotated rectangle becomes vertical, and we crop it
    img_crop = cv2.getRectSubPix(img_rot, size, center)

    return img_crop, img_rot

In [10]:
def ocr(img, oem=3, psm=6):
    """
    @param img: The image to be OCR'd
    @param oem: for specifying the type of Tesseract engine( default=1 for LSTM OCR Engine)
    """
    config = ('-l eng --oem {oem} --psm {psm}'.format(oem=oem,psm=psm))
    # config = ('-l eng --tessdata-dir "/usr/share/tesseract-ocr/tessdata" --oem {oem} -- psm {psm}'.format(oem=oem,psm=psm))

    try:
#         img = Image.fromarray(img)
        text = pytesseract.image_to_string(img, config=config)

        return text
    except:
        
        return ""

In [11]:
def ocr_metrics(predicts, ground_truth, norm_accentuation=False, norm_punctuation=False):
    """Calculate Character Error Rate (CER), Word Error Rate (WER) and Sequence Error Rate (SER)"""
    
    import string
    import unicodedata
    import editdistance

    if len(predicts) == 0 or len(ground_truth) == 0:
        return (1, 1, 1)

    cer, wer, ser = [], [], []

    for (pd, gt) in zip(predicts, ground_truth):

        if norm_accentuation:
            pd = unicodedata.normalize("NFKD", pd).encode("ASCII", "ignore").decode("ASCII")
            gt = unicodedata.normalize("NFKD", gt).encode("ASCII", "ignore").decode("ASCII")

        if norm_punctuation:
            pd = pd.translate(str.maketrans("", "", string.punctuation))
            gt = gt.translate(str.maketrans("", "", string.punctuation))

        pd_cer, gt_cer = list(pd.lower()), list(gt.lower())
        dist = editdistance.eval(pd_cer, gt_cer)
        cer.append(dist / (max(len(pd_cer), len(gt_cer))))

        pd_wer, gt_wer = pd.lower().split(), gt.lower().split()
        dist = editdistance.eval(pd_wer, gt_wer)
        wer.append(dist / (max(len(pd_wer), len(gt_wer))))

        pd_ser, gt_ser = [pd], [gt]
        dist = editdistance.eval(pd_ser, gt_ser)
        ser.append(dist / (max(len(pd_ser), len(gt_ser))))

    cer_f = sum(cer) / len(cer)
    wer_f = sum(wer) / len(wer)
    ser_f = sum(ser) / len(ser)
    
    evaluate = (cer_f, wer_f, ser_f)
    
    e_corpus = "\n".join([
    "Metrics:",
    "Character Error Rate: {}".format(evaluate[0]),
    "Word Error Rate:      {}".format(evaluate[1]),
    "Sequence Error Rate:  {}".format(evaluate[2]),
    ])

    return print(e_corpus)

In [154]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cedric/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/cedric/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [152]:
def clean_string(string):
    pattern = "[\|\*\_\'\{}&]".format('"')
    regex = re.compile('\\\S*')
    text = re.sub(pattern, "", string)
    text = re.sub(",, ", ", ", text)
    text = re.sub(regex, " ", text)
    text_tokens = word_tokenize(text)
    text_wo_sw = [w for w in text_tokens if not w in stopwords.words()]
    text = ' '.join(text_wo_sw)
    #text = text.split(':')[1] # Take out everything after a colon
#     text = text.split('.')[0]
    text = text.strip()
#     split = [x.strip(' ').lower() for x in text.split(',')]

    return text

def string_to_list(text):
    pattern = "[\|\*\_\'\{}]".format('"')
    text = re.sub(pattern, "", text)
    text = text.replace('Aqua/Water/Eau', 'Aqua')
#     split = [x.strip(' ').lower() for x in text.split([\\.,]\\s)split('.')]
    split = [x.strip(' ').lower().replace('aqua/water/eau', 'aqua') for x in re.split("[,.]", text)]
    return split

In [13]:
# def check_for_label(text, words):
#     text = text.lower()
#     for i in range(len(text)):
#         if any(text[i:].startswith(word) for word in words):
#             return True
#     return False

# def clean_string(string):
#     pattern = "[\|\*\_\'\{}]".format('"')
#     text = re.sub(pattern, "", string)
#     text = re.sub(" I ", " / ", text)
#     text = re.sub("^I ", "", text)
#     text = re.sub("Omg", "0mg", text)
#     text = re.sub("Og", "0g", text)
#     text = re.sub('(?<=\d) (?=\w)', '', text)
#     text = change_to_g(text)
#     text = text.strip()
#     return text

In [14]:
def create_dict(ingredient_df, col_name):
    '''Generating rating_dict and category_dict
        ingredient_df: dataframe of the whole ingredient database
        col_name: name of column I want to generate (rating/category)
    '''
    col_dict = {name.strip(): row[col_name] for (idx, row) in ingredient_df.iterrows() for name in row['Ingredient_name'].split('/')} # .split('/')

    return col_dict

In [96]:
def find_matching_ingredient(my_ingredients, rating_dict, thresh=0.25):
    ''' my_ingredients: list of product's ingredients
        Loop thru each ingredient in the ingredient list of the products
        then check if that ingredient appears in our ingredient list
        Calculate match_metric using SequenceMatcher and return the highest score and the best match
        Compare the match_metric with thresh > append to match_dict
    '''
    match_dict = {}
    for ingredient in tqdm(my_ingredients):
        if ingredient in match_dict.keys():
            continue
        match_metric = {key: SequenceMatcher(None, key, ingredient).ratio() for key in rating_dict.keys()}
        best_match, best_metric = max(match_metric.items(), key=operator.itemgetter(1)) # What is operator.itemgetter(1)
        if best_metric > thresh:
            match_dict[ingredient] = best_match
        else:
            match_dict[ingredient] = 'unknown'
    return match_dict

In [97]:
def lookup(ingredient_list, match_dict, rating_dict, rating_score_dict, irritancy_dict, 
           comedogenicity_dict, functions_dict, quick_facts_dict, details_dict, option=''):
    res = []
    for item in ingredient_list:
        key = match_dict.get(item, item)
        rating = rating_dict.get(key, 'unknown')
        rating_score = rating_score_dict.get(key, -1)
        irritancy = irritancy_dict.get(key, 'unknown')
        comedogenicity = comedogenicity_dict.get(key, 'unknown')
        functions = functions_dict.get(key, [])
        quick_facts = quick_facts_dict.get(key, [])
        details = details_dict.get(key, [])

        if option == 'ingredient':
            res.append(key)
        elif option == 'rating':
            res.append(rating)
        elif option == 'rating_score':
            res.append(rating_score)   
        elif option == 'irritancy':
            res.append(irritancy)
        elif option == 'comedogenicity':
            res.append(comedogenicity) 
        elif option == 'functions':
            res.append(functions) 
        elif option == 'quick_facts':
            res.append(quick_facts)          
        elif option == 'details':
            res.append(details)
        else:
            res.extend([[key, rating, rating_score, irritancy, comedogenicity, functions, quick_facts, details]])

    return res

## **Testing: English**

In [73]:
img_file = 'text-detection-ctpn/data/demo/from_hien.JPG'

In [87]:
sample = cv2.imread(img_file)
cv2.imshow('testing', sample)
cv2.waitKey(0)
cv2.destroyAllWindows()
cv2.waitKey(0)

-1

In [88]:
img = preprocess_for_ocr(sample, enhance=2)
cv2.imshow('testing', img)
cv2.waitKey(0)
cv2.destroyAllWindows()
cv2.waitKey(0)

-1

In [90]:
# Get bounding boxes which is result from CTPN
bounding_box_txt = 'text-detection-ctpn/data/res/from_hien.txt'
boxes = get_bounding_box(bounding_box_txt)

In [91]:
boxes

[[77, 6, 995, 21, 995, 63, 76, 48],
 [77, 45, 899, 60, 899, 100, 76, 85],
 [77, 85, 1107, 102, 1107, 142, 76, 125],
 [77, 124, 1123, 141, 1123, 181, 76, 164],
 [77, 162, 739, 174, 739, 215, 76, 203],
 [77, 202, 563, 210, 563, 249, 76, 241],
 [77, 239, 611, 250, 611, 291, 76, 280],
 [828, 273, 1027, 270, 1027, 342, 829, 344],
 [77, 279, 691, 289, 691, 330, 76, 320],
 [77, 316, 691, 329, 691, 370, 76, 357],
 [77, 354, 547, 366, 547, 408, 76, 396],
 [77, 392, 547, 403, 547, 444, 76, 434],
 [77, 431, 675, 446, 675, 488, 76, 473],
 [1012, 438, 1091, 429, 1099, 504, 1021, 513],
 [77, 469, 643, 483, 643, 525, 76, 511]]

In [92]:
# Preprocess again before feeding into tesseract
img_ocr = preprocess_for_ocr(img)
cv2.imshow('testing', img_ocr)
cv2.waitKey(0)
cv2.destroyAllWindows()
cv2.waitKey(0)

-1

In [93]:
# Crop by bouding box > tesseract > result
text = ''
for box in boxes:
    croped = crop_line(img_file, box)
    string = ocr(croped)
    text = text + ', ' + str(string.strip('\n').strip('\x0c').strip())
    

In [94]:
text

', 20215024V01 - INGREDIENTS: AQUA/WATER/EAU, GLYCERIN,, PARAFFINUM LIQUIDUM/MINERAL OIL/HUILE MINERALE,, TRIETHANOLAMINE. SALICYLIC ACID, GLYCERYL STEARATE, AMMONIUM, LACTATE, PEG-100 STEARATE, CETEARYL ALCOHOL, CETYL ALCOHOL, ZEA, MAYS OIL/CORN OIL, CERAMIDE NP, CERAMIDE, AP, CERAMIDE EOP, CARBOMER, BEHENTRIMONIUM METHOSULFATE,, eae\nPas Oe, DIMETHICONE, METHYLPARABEN, SODIUM, LAUROYL LACTYLATE, CHOLECALCIFEROL,, CHOLESTEROL, DISODIUM EDTA,, PROPYLPARABEN. HYDROLYZED, HYALURONIC ACID. PHYTOSPHINGOSINE,, iu, XANTHAN GUM [CODE F.I.L. 0233015/1}'

In [31]:
string = clean_string(text)
ing_list = string_to_list(string)
ing_list

['',
 'aqua (water)',
 'caprylic/capric',
 'triglyceride',
 'cetyl alcohol',
 'propanediol',
 '',
 'stearyl',
 '',
 'alcohol',
 'glycerin',
 'sodium',
 'hyaluronate',
 'arginine',
 'aspartic acid',
 'glycine',
 'alanine',
 'serine valine',
 'isoleucine',
 'proline',
 'threonine',
 'histidine',
 'phenylalanine',
 'glucose',
 'maltose',
 'fructose',
 'trehalose',
 'sodium pca',
 'pca',
 'sodium',
 'lactate',
 'urea',
 'allantoin',
 'linoleic acid',
 'oleic acid',
 'phytosteryl canola',
 'glycerides',
 'palmitic acid',
 'stearic acid',
 'lecithin',
 'triolein',
 'tocopherol',
 'carbomer',
 'isoceteth-20',
 'polysorbate 60',
 'sodium chloride',
 'citric acid',
 'risodium ethylenediamine',
 'disuccinate',
 'pentylene',
 'glycol',
 'triethanolamine',
 'sodium hydroxide',
 'phenoxyethanol',
 '',
 'chlorphenesin',
 '']

In [32]:
# Import ingredient list
df_ingredient = pd.read_csv('Database/ingredient_inci_1570.csv')

In [33]:
df_ingredient.head()

Unnamed: 0,Ingredient_name,Link,Rating,Rating_score,Irritancy,Comedogenicity,Functions,Quick_facts,Details
0,Oubaku Ekisu,https://incidecoder.com/ingredients/phellodend...,goodie,2,,,"{'soothing': '/ingredient-functions/soothing',...",,['A\xa0traditional East Asian medicine that ha...
1,Type of clay,https://incidecoder.com/ingredients/solum-full...,goodie,2,,,{'viscosity controlling': '/ingredient-functio...,,['Fuller\'s Earth describes types of clay\xa0t...
2,18:2 cis-9,https://incidecoder.com/ingredients/linoleic-acid,goodie,2,,,{'skin-identical ingredient': '/ingredient-fun...,,"['The famous\xa0omega-6 fatty acid,\xa0the mot..."
3,3-O-Ethyl Ascorbate,https://incidecoder.com/ingredients/ethyl-asco...,goodie,2,,,{'antioxidant': '/ingredient-functions/antioxi...,,"[""A very stable and promising form of the skin..."
4,ALA,https://incidecoder.com/ingredients/linolenic-...,goodie,2,,,{'skin-identical ingredient': '/ingredient-fun...,,"['The famous omega-3 fatty acid,\xa0the mother..."


In [34]:
rating = create_dict(df_ingredient, 'Rating')
rating_score = create_dict(df_ingredient, 'Rating_score')
irritancy = create_dict(df_ingredient, 'Irritancy')
comedogenicity = create_dict(df_ingredient, 'Comedogenicity')
functions = create_dict(df_ingredient, 'Functions')
quick_facts = create_dict(df_ingredient, 'Quick_facts')
details = create_dict(df_ingredient, 'Details')

In [98]:
match_dict = find_matching_ingredient(ing_list, rating, thresh=0.55)

100%|██████████| 57/57 [00:02<00:00, 19.98it/s]


In [99]:
match_dict

{'': 'unknown',
 'aqua (water)': 'unknown',
 'caprylic/capric': 'Caprylic',
 'triglyceride': 'Stearic Triglyceride',
 'cetyl alcohol': 'Cetyl Alcohol',
 'propanediol': 'Propanediol',
 'stearyl': 'Phytosteryl',
 'alcohol': 'Alcohol',
 'glycerin': 'Diglycerin',
 'sodium': 'Disodium EDTA',
 'hyaluronate': 'Sodium Hyaluronate',
 'arginine': 'Arginine',
 'aspartic acid': 'Aspartic Acid',
 'glycine': 'Glycine',
 'alanine': 'Alanine',
 'serine valine': 'Argireline',
 'isoleucine': 'Isoleucine',
 'proline': 'Proline',
 'threonine': 'Threonine',
 'histidine': 'Histidine',
 'phenylalanine': 'Phenylalanine',
 'glucose': 'Glucose',
 'maltose': 'Trehalose',
 'fructose': 'Fructose',
 'trehalose': 'Trehalose',
 'sodium pca': 'Sodium Acrylate',
 'pca': 'Pca',
 'lactate': 'Tea-Lactate',
 'urea': 'Urea',
 'allantoin': 'Allantoin',
 'linoleic acid': 'Linoleic Acid',
 'oleic acid': 'Oleic Acid',
 'phytosteryl canola': 'Phytosteryl',
 'glycerides': 'Diglycerin',
 'palmitic acid': 'Palmitic Acid',
 'stearic

In [100]:
ing_list_f = filter(lambda x: x != "", ing_list)
ing_list2 = [x for x in ing_list_f]
ing_list2

['aqua (water)',
 'caprylic/capric',
 'triglyceride',
 'cetyl alcohol',
 'propanediol',
 'stearyl',
 'alcohol',
 'glycerin',
 'sodium',
 'hyaluronate',
 'arginine',
 'aspartic acid',
 'glycine',
 'alanine',
 'serine valine',
 'isoleucine',
 'proline',
 'threonine',
 'histidine',
 'phenylalanine',
 'glucose',
 'maltose',
 'fructose',
 'trehalose',
 'sodium pca',
 'pca',
 'sodium',
 'lactate',
 'urea',
 'allantoin',
 'linoleic acid',
 'oleic acid',
 'phytosteryl canola',
 'glycerides',
 'palmitic acid',
 'stearic acid',
 'lecithin',
 'triolein',
 'tocopherol',
 'carbomer',
 'isoceteth-20',
 'polysorbate 60',
 'sodium chloride',
 'citric acid',
 'risodium ethylenediamine',
 'disuccinate',
 'pentylene',
 'glycol',
 'triethanolamine',
 'sodium hydroxide',
 'phenoxyethanol',
 'chlorphenesin']

In [101]:
ing_analysize = lookup(ing_list2, match_dict, rating, rating_score, irritancy, comedogenicity, functions, quick_facts, details)

In [102]:
ing_analysize

[['unknown', 'unknown', -1, 'unknown', 'unknown', [], [], []],
 ['Caprylic',
  'No rating',
  -1,
  nan,
  nan,
  "{'emollient': '/ingredient-functions/emollient'}",
  nan,
  '["A vegetable origin emollient that has a similar consistency to lard (solid at room temperature) but melts rapidly upon contact with the skin. \\xa0It\'s claimed to have great skin compatibility, penetrates easily, does not feel tacky or heavy on the skin and does not leave a greasy shine."]'],
 ['Stearic Triglyceride',
  'No rating',
  -1,
  nan,
  nan,
  "{'emollient': '/ingredient-functions/emollient'}",
  nan,
  '["A vegetable origin emollient that has a similar consistency to lard (solid at room temperature) but melts rapidly upon contact with the skin. \\xa0It\'s claimed to have great skin compatibility, penetrates easily, does not feel tacky or heavy on the skin and does not leave a greasy shine."]'],
 ['Cetyl Alcohol',
  'No rating',
  -1,
  '2',
  '2',
  "{'emollient': '/ingredient-functions/emollient',

In [70]:
ingredient_analysing = pd.DataFrame(ing_analysize,columns=['Ingredient_name', 'Rating', 'Rating_score', 'Irritancy',
       'Comedogenicity', 'Functions', 'Quick_facts', 'Details'])

In [71]:
df_ingredient.columns.values

array(['Ingredient_name', 'Link', 'Rating', 'Rating_score', 'Irritancy',
       'Comedogenicity', 'Functions', 'Quick_facts', 'Details'],
      dtype=object)

In [72]:
ingredient_analysing

Unnamed: 0,Ingredient_name,Rating,Rating_score,Irritancy,Comedogenicity,Functions,Quick_facts,Details
0,Squalane,goodie,2,0,1,{'skin-identical ingredient': '/ingredient-fun...,,"[""It seems to us that squalane is in fashion a..."
1,Caprylic,No rating,-1,,,{'emollient': '/ingredient-functions/emollient'},,"[""A vegetable origin emollient that has a simi..."
2,Stearic Triglyceride,No rating,-1,,,{'emollient': '/ingredient-functions/emollient'},,"[""A vegetable origin emollient that has a simi..."
3,Cetyl Alcohol,No rating,-1,2,2,{'emollient': '/ingredient-functions/emollient...,,"['A so-called\xa0fatty (the good, non-drying k..."
4,Propanediol,No rating,-1,,,"{'solvent': '/ingredient-functions/solvent', '...",,"[""Propanediol is a natural alternative for the..."
5,Phytosteryl,goodie,2,,,{'emollient': '/ingredient-functions/emollient'},,['An amino-acid (glutamic acid) derived molecu...
6,Alcohol,icky,0,,,{'antimicrobial/antibacterial': '/ingredient-f...,,"[""Simply alcohol refers to ethanol and it's a ..."
7,Diglycerin,goodie,2,,,{'skin-identical ingredient': '/ingredient-fun...,,"[""The big brother of glycerin. It's also a nat..."
8,Disodium EDTA,No rating,-1,,,{'chelating': '/ingredient-functions/chelating...,,['Super common little helper ingredient that\x...
9,Sodium Hyaluronate,goodie,2,0,0,{'skin-identical ingredient': '/ingredient-fun...,,"[""It’s the - sodium form - cousin of the famou..."


## Ced's Modifications

In [103]:
# Fuzzydict

import difflib

class FuzzyDict(dict):
    "Provides a dictionary that performs fuzzy lookup"
    def __init__(self, items = None, cutoff = .6):
        """Construct a new FuzzyDict instance

        items is an dictionary to copy items from (optional)
        cutoff is the match ratio below which mathes should not be considered
        cutoff needs to be a float between 0 and 1 (where zero is no match
        and 1 is a perfect match)"""
        super(FuzzyDict, self).__init__()

        if items:
            self.update(items)
        self.cutoff =  cutoff

        # short wrapper around some super (dict) methods
        self._dict_contains = lambda key: \
            super(FuzzyDict,self).__contains__(key)

        self._dict_getitem = lambda key: \
            super(FuzzyDict,self).__getitem__(key)

    def _search(self, lookfor, stop_on_first = False):
        """Returns the value whose key best matches lookfor

        if stop_on_first is True then the method returns as soon
        as it finds the first item
        """

        # if the item is in the dictionary then just return it
        if self._dict_contains(lookfor):
            return True, lookfor, self._dict_getitem(lookfor), 1

        # set up the fuzzy matching tool
        ratio_calc = difflib.SequenceMatcher()
        ratio_calc.set_seq1(lookfor)

        # test each key in the dictionary
        best_ratio = 0
        best_match = None
        best_key = None
        for key in self:

            # if the current key is not a string
            # then we just skip it
            try:
                # set up the SequenceMatcher with other text
                ratio_calc.set_seq2(key)
            except TypeError:
                continue

            # we get an error here if the item to look for is not a
            # string - if it cannot be fuzzy matched and we are here
            # this it is defintely not in the dictionary
            try:
            # calculate the match value
                ratio = ratio_calc.ratio()
            except TypeError:
                break

            # if this is the best ratio so far - save it and the value
            if ratio > best_ratio:
                best_ratio = ratio
                best_key = key
                best_match = self._dict_getitem(key)

            if stop_on_first and ratio >= self.cutoff:
                break

        return (
            best_ratio >= self.cutoff,
            best_key,
            best_match,
            best_ratio)


    def __contains__(self, item):
        "Overides Dictionary __contains__ to use fuzzy matching"
        if self._search(item, True)[0]:
            return True
        else:
            return False

    def __getitem__(self, lookfor):
        "Overides Dictionary __getitem__ to use fuzzy matching"
        matched, key, item, ratio = self._search(lookfor)

        if not matched:
            raise KeyError(
                "'%s'. closest match: '%s' with ratio %.3f"%
                    (str(lookfor), str(key), ratio))

        return item

In [134]:
def fuzzy_match_ingredients(ing_list, fuzdict):
    match_dict = {}
    for ing in tqdm(ing_list):
        if ing in match_dict.keys():
            continue
        upper_ing = ing.upper()
        if fuzdict.__contains__(upper_ing):
            match_dict[ing] = fuzdict[upper_ing]
    
    return match_dict

In [122]:
dfcosing = pd.read_csv('Database/ingredient_cosing_37309.csv')
dfcosing.head()

Unnamed: 0.1,Unnamed: 0,COSING Ref No,INCI name,Chem/IUPAC Name / Description,Restriction,Function
0,0,94753.0,DISODIUM TETRAMETHYLHEXADECENYLCYSTEINE FORMYL...,Disodium Tetramethylhexadecenylcysteine Formyl...,,SKIN PROTECTING
1,1,96229.0,ASTROCARYUM VULGARE SEED BUTTER,Astrocaryum Vulgare Seed Butter is the fat obt...,,"SKIN CONDITIONING, SKIN CONDITIONING - EMOLLIENT"
2,2,89177.0,BARLEY SH-POLYPEPTIDE-17,Barley sh-Polypeptide-17 is a single chain rec...,,"HAIR CONDITIONING, SKIN CONDITIONING"
3,3,98580.0,DAUCUS CAROTA SATIVA LEAF EXTRACT,Daucus Carota Sativa (Carrot) Leaf Extract is ...,,SKIN CONDITIONING - MISCELLANEOUS
4,4,89078.0,GOSSYPIUM HIRSUTUM SEED EXTRACT,Gossypium Hirsutum Seed Extract is the extrac...,,SKIN CONDITIONING


In [125]:
list(dfcosing.columns)

['Unnamed: 0',
 'COSING Ref No',
 'INCI name',
 'Chem/IUPAC Name / Description',
 'Restriction',
 'Function']

In [126]:
# dfi3 = pd.read_csv('Database/ingredient_inci_1570.csv')
ingnames_dict = {name.strip(): name.strip() for name in dfcosing['INCI name']}
# print(ingnames_dict)
fd = FuzzyDict(ingnames_dict, cutoff = .55)
print(len(fd))

29908


In [133]:
fd['GLICOGEN']

'GLYCOGEN'

In [135]:
def cosing_lookup(ing_list, match_dict, desc, restrictions, functions):
    res = []
    for item in ing_list:
        key = match_dict.get(item, "")
        if key == "":
            continue
        description = desc.get(key, "unknown")
        restrictn = restrictions.get(key, "unknown")
        functn = functions.get(key, "unknown")
        res.extend([[key, description, restrictn, functn]])
    
    return res

In [142]:
def create_cosing_dict(ingredient_df, col_name):
    col_dict = {name.strip(): row[col_name] for (idx, row) in ingredient_df.iterrows() for name in row['INCI name'].split('/')} # .split('/')

    return col_dict
    

## **End-to-end: using INCI in English**

In [42]:
def before_ctpn(img_path):
    img = cv2.imread(img_path)
    img_ctpn = preprocess_for_ocr(img, enhance=2)
    
    return img_ctpn

In [171]:
def after_ctpn(img_path, txt_path, ingredient_df_path):
    # Get annotations of bounding boxes
    boxes = get_bounding_box(txt_path)
    
    # Preprocess image for OCR:
    img = cv2.imread(img_path)
    #img_ocr = preprocess_for_ocr(img)
    img_ocr = img
    
    # doing OCR
    text = ''
    for box in boxes:
        croped = crop_line(img_path, box)
        string = ocr(croped)
        text = text + ' ' + str(string.strip('\n').strip('\x0c').strip())
    
    print(text)
    
    # Cleaning result from OCR
    text_result = clean_string(text)
    ing_list = string_to_list(text_result)
    
    print("-----")
    print(text_result)
    
    # Loading ingredient dataframe
    df_ingredient = pd.read_csv(ingredient_df_path)
    
    # Create separated dict for constructing result df later
    #rating = create_dict(df_ingredient, 'Rating')
    #rating_score = create_dict(df_ingredient, 'Rating_score')
    #irritancy = create_dict(df_ingredient, 'Irritancy')
    #comedogenicity = create_dict(df_ingredient, 'Comedogenicity')
    #functions = create_dict(df_ingredient, 'Functions')
    #quick_facts = create_dict(df_ingredient, 'Quick_facts')
    #details = create_dict(df_ingredient, 'Details')
    
    desc = create_cosing_dict(df_ingredient, 'Chem/IUPAC Name / Description')
    restrictions = create_cosing_dict(df_ingredient, 'Restriction')
    functions = create_cosing_dict(df_ingredient, 'Function')
    
    
    ingnames_dict = {name.strip(): name.strip() for name in df_ingredient['INCI name']}
    fd = FuzzyDict(ingnames_dict, cutoff = .75)
    
    # Compare product ingredient list and database
    # match_dict = find_matching_ingredient(ing_list, rating, 0.55)
    match_dict = fuzzy_match_ingredients(ing_list, fd)
    print(match_dict)
    print(list(match_dict.values()))
    
    # Analyzing ingredient
    #ing_analysize = lookup(ing_list, match_dict, rating, rating_score, irritancy, comedogenicity, functions, quick_facts, details)
    ing_analysize = cosing_lookup(ing_list, match_dict, desc, restrictions, functions)
    
    # Put everything into a dataframe
    df_res = pd.DataFrame(ing_analysize, columns=['Ingredient_name', 'Description', 'Restrictions', 'Functions'])
    
    return df_res

In [172]:
fi = 'text-detection-ctpn/data/demo/IMG_0578_thresh.png'
ftxt = 'text-detection-ctpn/data/res/IMG_0578_thresh.txt'
res = after_ctpn(fi, ftxt, 'Database/ingredient_cosing_37309.csv')

 SMUOUWIUL COOL NV SNTHOL+  with over 50 ye» expertise in antidandruf 1.S0logy, Aas innovative share. 30 w:th Magnetic Lifting Foas, Eliminate dandrufi™, itch**, oil & reduces scalp maiodor while leaving pourishers on sccip and hair, UMC ee eit: sei etek cuales \ ‘Tha lather that is like skin-care product evfectively nourishes scalp. | GOR eh olresonalasiinciel oni llt teams | chus SETH bitch due TOE 10 CON Orr “no visible flakes, with regular use “comparing vs previous head & shoulders formula P&G calculation based on Nielsen sales information July 2017 - June 2013 intly massage onto scalp, Lather and rinse Directic ons; Shake before use. Wer hair Gently ma happens, rinse thoroughly. Repeat if desirod. Caution: Avoid contact with eyes. If this} thoroughly with water. Ingredients: WATER. SODIUM LAURETH SULFATE, DIMETHICONE, | SODIUM XYLENESULFONATE. ZINC CARBONATE GLYCOL DISTEARATE SODIUM LAURYL | SULFATE, SODIUM CHLORIDE, ZINC PYRITHIONE COCAMIDOPROPYL BETAINE COCAMIDE | MEA FRAGR FNT

100%|██████████| 56/56 [01:32<00:00,  1.66s/it]

{'sodium laureth sulfate': 'SODIUM LAURETH SULFATE', 'dimethicone': 'DIMETHICONE', 'sodium xylenesulfonate': 'SODIUM XYLENESULFONATE', 'sodium chloride': 'SODIUM CHLORIDE', 'guar kydroxypropylirimonium chloride sodium bance': 'GUAR HYDROXYPROPYLTRIMONIUM CHLORIDE', 'vie benzoate': 'SILVER BENZOATE', 'stearyl alcohol': 'STEARYL ALCOHOL', 'magnesium carsonate hydroxide': 'MAGNESIUM CARBONATE HYDROXIDE', 'methylchloro ! sothiazolinone': 'METHYLCHLOROISOTHIAZOLINONE', 'methylisothiazolinone': 'METHYLISOTHIAZOLINONE', '( 142090': 'CI 42090', 'c ! 17200': 'CI 17200'}
['SODIUM LAURETH SULFATE', 'DIMETHICONE', 'SODIUM XYLENESULFONATE', 'SODIUM CHLORIDE', 'GUAR HYDROXYPROPYLTRIMONIUM CHLORIDE', 'SILVER BENZOATE', 'STEARYL ALCOHOL', 'MAGNESIUM CARBONATE HYDROXIDE', 'METHYLCHLOROISOTHIAZOLINONE', 'METHYLISOTHIAZOLINONE', 'CI 42090', 'CI 17200']





In [159]:
res

Unnamed: 0,Ingredient_name,Description,Restrictions,Functions
0,SODIUM LAURETH SULFATE,sodium 2-(2-dodecyloxyethoxy)ethyl sulphate,,"CLEANSING, FOAMING, SURFACTANT - CLEANSING, SU..."
1,DIMETHICONE,Divinyldimethicone/Dimethicone/Silsesquioxane ...,,"ANTICAKING, OPACIFYING, VISCOSITY CONTROLLING"
2,SODIUM XYLENESULFONATE,Sodium xylenesulphonate,,SURFACTANT - HYDROTROPE
3,PROPYLENE GLYCOL DISTEARATE,Propylene distearate,,"OPACIFYING, PEARLESCENT, SKIN CONDITIONING - E..."
4,SODIUM LAURYL SULFATE,Sodium dodecyl sulphate,,"CLEANSING, DENATURANT, FOAMING, SURFACTANT - C..."
5,SODIUM PYRITHIONE,1-Oxo-2-Pyridinethiol Sodium Salt,II/369,PRESERVATIVE
6,COCAMIDOPROPYL BETAINE,"1-Propanaminium, 3-amino-N-(carboxymethyl)-N,N...",,"ANTISTATIC, CLEANSING, HAIR CONDITIONING, SURF..."
7,GUAR HYDROXYPROPYLTRIMONIUM CHLORIDE,"Guar gum, 2-hydroxy-3-(trimethylammonio)propyl...",,"ANTISTATIC, FILM FORMING, SKIN CONDITIONING, V..."
8,SODIUM ANISATE,Sodium anisate,,"ANTIMICROBIAL, FLAVOURING"
9,BENZOATE,"2,2-Dihydroxymethylpropane-1,3-diol, tetraeste...",,"HAIR CONDITIONING, SKIN CONDITIONING - EMOLLIENT"
