In [1]:
import re
import cv2
import numpy as np
import pandas as pd
import shutil
import os
import random
import pytesseract
from pytesseract import Output
from matplotlib import pyplot as plt
# from google.colab.patches import cv2_imshow
try:
    from PIL import Image, ImageEnhance
except ImportError:
    import Image

    import re

import operator
import itertools
import gc
import pickle

from difflib import SequenceMatcher
from ast import literal_eval
from collections import Counter
from tqdm import tqdm

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbookpro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/macbookpro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# get grayscale image
def get_grayscale(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# noise removal
def remove_noise(image):
    return cv2.GaussianBlur(image, (5,5), 0)

# thresholding
def thresholding(image):
    return cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 7, 4)
                            
# return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

# dilation: enhance the bright area
def dilate(image):
  kernel = np.ones((5,5), np.uint8)
  return cv2.dilate(image, kernel, iterations=1)

# erosion: enhance the dark area
def erode(image):
  kernel = np.ones((5,5), np.unint8)
  return cv2.erode(image, kernel, iterations=1)

# opening: erosion follow by a dilation
def opening(image):
  kernel = np.ones((5,5), np.uint8)
  return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

# closing: Dilation followed by Erosion. Removing black holes inside the object 
def closing(image):
  kernel = np.ones((5,5), np.uint8)
  return cv2.morphologyEx(opening(image), cv2.MORPH_CLOSE, kernel)

# canny
def canny(image):
      return cv2.Canny(image, 100, 200)

# deskew image
def deskew(image):
    coords = np.column_stack(np.where(image>0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

# template matching
def match_template(image, template):
    return cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)

In [4]:
def preprocess_for_ocr(img, enhance=1):
    """
    @param img: image to which the pre-processing steps being applied
    """
    if enhance > 1:
        img = Image.fromarray(img)

        contrast = ImageEnhance.Contrast(img)

        img = contrast.enhance(enhance)

        img = np.asarray(img)
    
    
    gray = get_grayscale(img)
    blur = remove_noise(gray)
    res = thresholding(blur)

    img = cv2.cvtColor(res, cv2.COLOR_GRAY2BGR)

    return img

In [5]:
def get_bounding_box(txt):
    annotation = txt
    with open(annotation, "r") as file1:
        bounding_boxes = file1.read()
        
    bounding_boxes = bounding_boxes.split('\n')[:-1]
    boxes = [i.split(',')[:-1] for i in bounding_boxes]

    new_boxes = []
    for box in boxes:
        new_box = []
        for i, each in enumerate(box):
            num = int(each)
            if i in [0, 1, 3, 6]:
                num -= 3
            else: 
                num += 3
            new_box.append(num)
        new_boxes.append(new_box)
    new_boxes.sort(key=lambda x: x[1])
    
    return new_boxes

In [6]:
def crop_line(img_path, box):
    img = cv2.imread(img_path)
    img, (rh, rw) = resize_image(img)
    # points for test.jpg
    cnt = np.array([
            [[box[0], box[1]]],
            [[box[2], box[3]]],
            [[box[4], box[5]]],
            [[box[6], box[7]]]
        ])
    # print("shape of cnt: {}".format(cnt.shape))
    rect = cv2.minAreaRect(cnt)
#     print("rect: {}".format(rect))

    # the order of the box points: bottom left, top left, top right,
    # bottom right
    box = cv2.boxPoints(rect)
    box = np.int0(box)

    # print("bounding box: {}".format(box))
    cv2.drawContours(img, [box], 0, (0, 0, 255), 2)

    # get width and height of the detected rectangle
    width = int(rect[1][0])
    height = int(rect[1][1])
    angle = rect[2]

    src_pts = box.astype("float32")
    # coordinate of the points in box points after the rectangle has been
    # straightened
    dst_pts = np.array([[0, height+2],
                        [0, 0],
                        [width, 0],
                        [width, height+2]], dtype="float32")

    # the perspective transformation matrix
    M = cv2.getPerspectiveTransform(src_pts, dst_pts)

    # directly warp the rotated rectangle to get the straightened rectangle
    warped = cv2.warpPerspective(img, M, (width, height))

    # cv2.imwrite("crop_img.jpg", warped)
    
    # cv2.waitKey(0)
    if angle < -45:
      warped = np.transpose(warped,(1,0,2))
      warped = warped[::-1]

#     cv2.imshow('croped', warped)
#     cv2.waitKey(0)
#     cv2.destroyAllWindows()


    return warped

In [7]:
def resize_image(img):
    img_size = img.shape
    im_size_min = np.min(img_size[0:2])
    im_size_max = np.max(img_size[0:2])

    im_scale = float(600) / float(im_size_min)
    if np.round(im_scale * im_size_max) > 1200:
        im_scale = float(1200) / float(im_size_max)
    new_h = int(img_size[0] * im_scale)
    new_w = int(img_size[1] * im_scale)

    new_h = new_h if new_h // 16 == 0 else (new_h // 16 + 1) * 16
    new_w = new_w if new_w // 16 == 0 else (new_w // 16 + 1) * 16

    re_im = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return re_im, (new_h / img_size[0], new_w / img_size[1])

def crop_rect(img, rect):
    # get the parameter of the small rectangle
    center, size, angle = rect[0], rect[1], rect[2]
    center, size = tuple(map(int, center)), tuple(map(int, size))

    # get row and col num in img
    height, width = img.shape[0], img.shape[1]

    # calculate the rotation matrix
    M = cv2.getRotationMatrix2D(center, angle, 1)
    # rotate the original image
    img_rot = cv2.warpAffine(img, M, (width, height))

    # now rotated rectangle becomes vertical, and we crop it
    img_crop = cv2.getRectSubPix(img_rot, size, center)

    return img_crop, img_rot

In [8]:
def ocr(img, oem=3, psm=6):
    """
    @param img: The image to be OCR'd
    @param oem: for specifying the type of Tesseract engine( default=1 for LSTM OCR Engine)
    """
    config = ('-l eng --oem {oem} --psm {psm}'.format(oem=oem,psm=psm))
    # config = ('-l eng --tessdata-dir "/usr/share/tesseract-ocr/tessdata" --oem {oem} -- psm {psm}'.format(oem=oem,psm=psm))

    try:
#         img = Image.fromarray(img)
        text = pytesseract.image_to_string(img, config=config)

        return text
    except:
        
        return ""

In [9]:
def ocr_metrics(predicts, ground_truth, norm_accentuation=False, norm_punctuation=False):
    """Calculate Character Error Rate (CER), Word Error Rate (WER) and Sequence Error Rate (SER)"""
    
    import string
    import unicodedata
    import editdistance

    if len(predicts) == 0 or len(ground_truth) == 0:
        return (1, 1, 1)

    cer, wer, ser = [], [], []

    for (pd, gt) in zip(predicts, ground_truth):

        if norm_accentuation:
            pd = unicodedata.normalize("NFKD", pd).encode("ASCII", "ignore").decode("ASCII")
            gt = unicodedata.normalize("NFKD", gt).encode("ASCII", "ignore").decode("ASCII")

        if norm_punctuation:
            pd = pd.translate(str.maketrans("", "", string.punctuation))
            gt = gt.translate(str.maketrans("", "", string.punctuation))

        pd_cer, gt_cer = list(pd.lower()), list(gt.lower())
        dist = editdistance.eval(pd_cer, gt_cer)
        cer.append(dist / (max(len(pd_cer), len(gt_cer))))

        pd_wer, gt_wer = pd.lower().split(), gt.lower().split()
        dist = editdistance.eval(pd_wer, gt_wer)
        wer.append(dist / (max(len(pd_wer), len(gt_wer))))

        pd_ser, gt_ser = [pd], [gt]
        dist = editdistance.eval(pd_ser, gt_ser)
        ser.append(dist / (max(len(pd_ser), len(gt_ser))))

    cer_f = sum(cer) / len(cer)
    wer_f = sum(wer) / len(wer)
    ser_f = sum(ser) / len(ser)
    
    evaluate = (cer_f, wer_f, ser_f)
    
    e_corpus = "\n".join([
    "Metrics:",
    "Character Error Rate: {}".format(evaluate[0]),
    "Word Error Rate:      {}".format(evaluate[1]),
    "Sequence Error Rate:  {}".format(evaluate[2]),
    ])

    return print(e_corpus)

In [10]:
# def clean_string(string):
# #     string = string + ' ' + str(string.strip('\n').strip('\x0c').strip())
#     text = string.replace('INACTIVE INGREDIENTS:', '') # added
#     text = text.replace('ACTIVE INGREDIENTS:', '') # added
#     text = text.split(':')[1]
#     text = text.strip()

#     return text

# def string_to_list(text):
#     pattern = "[\|\*\_\'\{}]".format('"')
#     text = re.sub(pattern, "", text)  
#     split = [remove_water(x) for x in text.split(',')]
    
#     return split

In [11]:
def clean_string(string):
    text = string.replace('INACTIVE INGREDIENTS:', '') # added
    text = text.replace('ACTIVE INGREDIENTS:', '') # added
    text = text.split(':')[1]
    
    pattern = "[\|\*\_\'\{}&]".format('"')
    regex = re.compile('\\\S+')
    
    text = re.sub(pattern, "", text)
    text = re.sub(",, ", ", ", text)
    text = re.sub(regex, " ", text)
    text = re.sub('\.', " ", text)
    text_tokens = word_tokenize(text)
    text_wo_sw = [w for w in text_tokens if not w in stopwords.words()]
    text = ' '.join(text_wo_sw)
    text = text.strip()

    return text

def string_to_list(text):
    pattern = "[\|\*\_\'\{}]".format('"')
    text = re.sub(pattern, "", text)
    split = [remove_water(x) for x in re.split("[,.]", text)]
    
    return split

In [12]:
def remove_water(string):
    water = ['WATER (AQUA)', 'AQUA', 'EAU', 'AQUA/WATER/EAU', 'AQUA / WATER / EAU', 
             'PURIFIED WATER', 'DISTILLED WATER', 'D.I. WATER', 'AQUA (WATER)', 'AQUA (PURIFIED)']
    text = string.upper()
    if text in water:
        text = 'WATER'
    text = text.strip('  ')
    
    return text

In [13]:
def clean_string_name(string):
    text = string.replace(' Size: 4 oz. * USDA Certified Organic Ingredient ** None remains after saponifying oils into soap and glycerin', '')
    pattern = "[\|\*\_\'\{}&]".format('"')
    regex1 = re.compile('\[.*]')
    regex2 = re.compile('\[.*}')
    text = re.sub(pattern, "", text)
    text = re.sub(regex1, "", text)
    text = re.sub(regex2, "", text)
    text = text.strip('\x0c')
    text = text.strip(' ')
    text = text.upper()
    text = text.replace('INACTIVE INGREDIENTS:', '')
    text = text.replace('ACTIVE INGREDIENTS:', '')

    return text

In [14]:
def find_matching_ingredient(my_ingredients, fd, thresh=0.25): # rating_dict, category_dict, 
    ''' my_ingredients: list of product's ingredients
        Loop thru each ingredient in the ingredient list of the products
        then check if that ingredient appears in our ingredient list
        Calculate match_metric using SequenceMatcher and return the highest score and the best match
        Compare the match_metric with thresh > append to match_dict
    '''
    match_dict = {}
    for ingredient in tqdm(my_ingredients):
        if ingredient in match_dict.keys():
            continue
#         match_metric = {key: SequenceMatcher(None, key, ingredient).ratio() for key in rating_dict.keys()}
        match_metric = {key: SequenceMatcher(None, key, ingredient).ratio() for key in fd}
        best_match, best_metric = max(match_metric.items(), key=operator.itemgetter(1)) # What is operator.itemgetter(1)
        if best_metric > thresh:
            match_dict[ingredient] = best_match
        else:
            match_dict[ingredient] = 'unknown'
    return match_dict

In [15]:
def create_dict_english(df_inci, df_cosing):
    rating_inci = {}
    irritancy_inci = {}
    comedogenicity_inci = {}
    function_inci = {}
    qfacts_inci = {}
    desc_inci = {}
    
    desc_cosing = {}
    function_cosing = {}
    
    for idx, row in tqdm(df_inci.iterrows()):
        for name in row['ingredient_name'].split('/'):
            chem_name = name.strip()
            rating_inci[chem_name] = row['rating']
            irritancy_inci[chem_name] = row['irritancy']
            comedogenicity_inci[chem_name] = row['comedogenicity']
            function_inci[chem_name] = row['functions']
            qfacts_inci[chem_name] = row['quick_facts']
            desc_inci[chem_name] = row['description']
            
    for idx, row in tqdm(df_cosing.iterrows()):
        for name in row['ingredient_name'].split('/'):
            desc_cosing[name] = row['description']
            function_cosing[name] = row['functions']    
    
    return rating_inci, irritancy_inci, comedogenicity_inci, function_inci, qfacts_inci, desc_inci, desc_cosing, function_cosing

In [16]:
df_inci = pd.read_csv('../Database/INCI/ingredient_inci_1570.csv') # '../Database/CALLMEDUY/ingredient_vietnamese_3818.csv'

In [18]:
df_inci

Unnamed: 0,ingredient_name,link,rating,rating_score,irritancy,comedogenicity,functions,quick_facts,description
0,OUBAKU EKISU,https://incidecoder.com/ingredients/phellodend...,GOODIE,2,,,"SOOTHING, ANTI-ACNE",,A traditional East Asian medicine that has a c...
1,TYPE OF CLAY,https://incidecoder.com/ingredients/solum-full...,GOODIE,2,,,"VISCOSITY CONTROLLING, ABSORBENT/MATTIFIER",,Fuller Earth describes types of clay that have...
2,18:2 CIS-9,https://incidecoder.com/ingredients/linoleic-acid,GOODIE,2,,,"SKIN-IDENTICAL INGREDIENT, EMOLLIENT, SURFACTA...",,"The famous omega-6 fatty acid, the mother of a..."
3,3-O-ETHYL ASCORBATE,https://incidecoder.com/ingredients/ethyl-asco...,GOODIE,2,,,"ANTIOXIDANT, SKIN BRIGHTENING",,A very stable and promising form of the skinca...
4,ALA,https://incidecoder.com/ingredients/linolenic-...,GOODIE,2,,,"SKIN-IDENTICAL INGREDIENT, EMOLLIENT, SURFACTA...",,"The famous omega-3 fatty acid, the mother of a..."
...,...,...,...,...,...,...,...,...,...
1565,MINIHA,https://incidecoder.com/ingredients/hydrolyzed...,NO RATING,-1,,,MOISTURIZER/HUMECTANT,,"It a super small, chemically chopped up versio..."
1566,TEA OIL CAMELLIA,https://incidecoder.com/ingredients/camellia-o...,GOODIE,2,,,"ANTIOXIDANT, SOOTHING",,Camellia Oleifera is a type of green tea plant...
1567,VITAMIN B3,https://incidecoder.com/ingredients/niacinamide,SUPERSTAR,3,,,"CELL-COMMUNICATING INGREDIENT, SKIN BRIGHTENIN...",['A multi-functional skincare superstar with s...,"Niacinamide, or as us normal people call it vi..."
1568,SS-WHITE,https://incidecoder.com/ingredients/oligopepti...,GOODIE,2,,,SKIN BRIGHTENING,,A skin-brightening peptide that is claimed to ...


In [17]:
df_cosing = pd.read_csv('../Database/ingredient_cosing_37309.csv') #'../Database/ingredient_cosing_37309.csv'

In [19]:
df_cosing

Unnamed: 0,cosing_ref_no,ingredient_name,description,restriction,functions
0,94753.0,DISODIUM TETRAMETHYLHEXADECENYLCYSTEINE FORMYL...,Disodium Tetramethylhexadecenylcysteine Formyl...,,SKIN PROTECTING
1,96229.0,ASTROCARYUM VULGARE SEED BUTTER,Astrocaryum Vulgare Seed Butter is the fat obt...,,"SKIN CONDITIONING, SKIN CONDITIONING - EMOLLIENT"
2,89177.0,BARLEY SH-POLYPEPTIDE-17,Barley sh-Polypeptide-17 is a single chain rec...,,"HAIR CONDITIONING, SKIN CONDITIONING"
3,98580.0,DAUCUS CAROTA SATIVA LEAF EXTRACT,Daucus Carota Sativa (Carrot) Leaf Extract is ...,,SKIN CONDITIONING - MISCELLANEOUS
4,89078.0,GOSSYPIUM HIRSUTUM SEED EXTRACT,Gossypium Hirsutum Seed Extract is the extrac...,,SKIN CONDITIONING
...,...,...,...,...,...
37304,90014.0,PHELLODENDRON AMURENSE BARK,This is a filtrate of the product obtained by ...,,"HUMECTANT, SKIN CONDITIONING, SKIN PROTECTING"
37305,90014.0,LONICERA JAPONICA FLOWER,This is a filtrate of the product obtained by ...,,"HUMECTANT, SKIN CONDITIONING, SKIN PROTECTING"
37306,90014.0,CHAENOMELES SINENSIS FRUIT,This is a filtrate of the product obtained by ...,,"HUMECTANT, SKIN CONDITIONING, SKIN PROTECTING"
37307,90014.0,CAMELLIA SINENSIS LEAF,This is a filtrate of the product obtained by ...,,"HUMECTANT, SKIN CONDITIONING, SKIN PROTECTING"


In [21]:
rating_inci, irritancy_inci, comedogenicity_inci, function_inci, qfacts_inci, desc_inci, desc_cosing, function_cosing = create_dict_english(df_inci, df_cosing)

1570it [00:00, 3783.07it/s]
37309it [00:05, 6407.60it/s]


In [22]:
rating_inci

{'OUBAKU EKISU': 'GOODIE',
 'TYPE OF CLAY': 'GOODIE',
 '18:2 CIS-9': 'GOODIE',
 '3-O-ETHYL ASCORBATE': 'GOODIE',
 'ALA': 'GOODIE',
 'ATIP': 'GOODIE',
 'ACETYL GLUTAMYL HEPTAPEPTIDE-1': 'GOODIE',
 'ACETYL HEXAPEPTIDE-3': 'GOODIE',
 'ACID RED 92 PHLOXINE': 'NO RATING',
 'AL2O3': 'NO RATING',
 'ASCORBYL ISOTETRAPALMITATE': 'GOODIE',
 'ASCORBYL TETRAISOPALMITATE': 'GOODIE',
 'ASTRAGALI RADIX': 'GOODIE',
 'BEMOTRIZINOL': 'GOODIE',
 'BISDISULIZOLE DISODIUM': 'GOODIE',
 'BISOCTRIZOLE': 'GOODIE',
 'CHILEAN WINEBERRY OIL': 'GOODIE',
 'CHINA CLAY': 'GOODIE',
 'CHOPI': 'NO RATING',
 'CITRUS SINENSIS OIL': 'ICKY',
 'COCOCIN': 'GOODIE',
 'COCONUT LIQUID ENDOSPERM': 'GOODIE',
 'COLLOIDAL OATMEAL': 'GOODIE',
 'COPPER PEPTIDE': 'GOODIE',
 'CURCUMIN': 'GOODIE',
 'D&C RED 33': 'NO RATING',
 'DHHB': 'GOODIE',
 'EAC': 'GOODIE',
 'EGF': 'NO RATING',
 'EPO': 'GOODIE',
 'EDELWEISS STEM CELL EXTRACT': 'GOODIE',
 'ELIX-IR': 'GOODIE',
 'EUSOLEX 6007': 'ICKY',
 'FGF1': 'NO RATING',
 'FGF2': 'NO RATING',
 'FERRIC

In [23]:
import pickle

# a = {'hello': 'world'}

with open('eng_rating_inci.pickle', 'wb') as handle:
    pickle.dump(rating_inci, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('eng_irritancy_inci.pickle', 'wb') as handle:
    pickle.dump(irritancy_inci, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('eng_comedogenicity_inci.pickle', 'wb') as handle:
    pickle.dump(comedogenicity_inci, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('eng_function_inci.pickle', 'wb') as handle:
    pickle.dump(function_inci, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('eng_qfacts_inci.pickle', 'wb') as handle:
    pickle.dump(qfacts_inci, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('eng_desc_inci.pickle', 'wb') as handle:
    pickle.dump(desc_inci, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('eng_desc_cosing.pickle', 'wb') as handle:
    pickle.dump(desc_cosing, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('eng_function_cosing.pickle', 'wb') as handle:
    pickle.dump(function_cosing, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [73]:
with open('eng_rating_inci.pickle', 'rb') as handle:
    rating_inci = pickle.load(handle)

In [24]:
def lookup_all_english(ingredient_list, match_dict_inci, match_dict_cosing,
               df_inci, df_cosing, option=''):

    with open('eng_rating_inci.pickle', 'rb') as handle:
        rating_inci = pickle.load(handle)
    with open('eng_irritancy_inci.pickle', 'rb') as handle:
        irritancy_inci = pickle.load(handle)
    with open('eng_comedogenicity_inci.pickle', 'rb') as handle:
        comedogenicity_inci = pickle.load(handle)
    with open('eng_function_inci.pickle', 'rb') as handle:
        function_inci = pickle.load(handle)
    with open('eng_qfacts_inci.pickle', 'rb') as handle:
        qfacts_inci = pickle.load(handle)
    with open('eng_desc_inci.pickle', 'rb') as handle:
        desc_inci = pickle.load(handle)
    with open('eng_desc_cosing.pickle', 'rb') as handle:
        desc_cosing = pickle.load(handle)
    with open('eng_function_cosing.pickle', 'rb') as handle:
        function_cosing = pickle.load(handle)
        
    res = []
    
    for item in tqdm(ingredient_list):
        
        value = match_dict_inci[item]
        if value == 'unknown':
            key = match_dict_cosing.get(item, 'unknown')
            rating = 'No rating'
            irritancy = np.nan
            comedogenicity = np.nan
            functions = function_cosing.get(key, [])
            quickfacts = np.nan
            description = desc_cosing.get(key, [])        
                
        else:
            key = match_dict_inci.get(item, 'unknown')
            rating = rating_inci.get(key, 'No rating')
            irritancy = irritancy_inci.get(key, np.nan)
            comedogenicity = comedogenicity_inci.get(key, np.nan)
            functions = function_inci.get(key, [])
            quickfacts = qfacts_inci.get(key, [])
            description = desc_inci.get(key, [])
            
        if key != 'unknown':    
            if option == 'ingredient':
                res.append(key)
            elif option == 'rating':
                res.append(rating)
            elif option == 'irritancy':
                res.append(irritancy)
            elif option == 'comedogenicity':
                res.append(comedogenicity)
            elif option == 'functions':
                res.append(functions)
            elif option == 'quickfacts':
                res.append(quickfacts)
            elif option == 'description':
                res.append(description)
            else:
                res.extend([[key, functions, rating, irritancy, comedogenicity, quickfacts, description]])
            
    df_res = pd.DataFrame(res, columns=['Ingredient_name', 'Functions', 'Rating', 'Irritancy',
                                        'Comedogenicity', 'Quick_facts', 'Description'])
    
    return df_res

In [25]:
def lookup_all_vietnamese(ingredient_list, match_dict_cmd, match_dict_cosing,
               df_cmd, df_cosing, option=''):
    
    with open('vie_ratingscore_cmd.pickle', 'rb') as handle:
        ratingscore_cmd = pickle.load(handle)
    with open('vie_function_cmd.pickle', 'rb') as handle:
        function_cmd = pickle.load(handle)
    with open('vie_desc_cmd.pickle', 'rb') as handle:
        desc_cmd = pickle.load(handle)    
    
    with open('eng_desc_cosing.pickle', 'rb') as handle:
        desc_cosing = pickle.load(handle)
    with open('eng_function_cosing.pickle', 'rb') as handle:
        function_cosing = pickle.load(handle)
    
    res = []
    
    for item in tqdm(ingredient_list):
        
        value = match_dict_cmd[item]

        if value == 'unknown':
            key = match_dict_cosing.get(item, 'unknown')
            rating_score = 'Chưa đánh giá'
            functions = function_cosing.get(key, [])
            description = desc_cosing.get(key, [])
        else:
            key = match_dict_cmd.get(item, 'unknown')
            rating_score = ratingscore_cmd.get(key, np.nan)
            functions = function_cmd.get(key, [])
            description = desc_cmd.get(key, [])
            
        if key != 'unknown':             
            if option == 'ingredient':
                res.append(key)
            elif option == 'rating_score':
                res.append(rating_score)
            elif option == 'functions':
                res.append(functions)
            elif option == 'description':
                res.append(description)
            else:
                res.extend([[key, rating_score, functions, description]])
            
    df_res = pd.DataFrame(res, columns=['Ingredient_name', 'Rating_score', 'Functions', 'Description'])
    
    return df_res

In [26]:
# def lookup_all(ingredient_list, match_dict_inci, match_dict_cosing,
#                df_inci, df_cosing, option=''):

#     with open('eng_rating_inci.pickle', 'rb') as handle:
#         rating_inci = pickle.load(handle)
#     with open('eng_irritancy_inci.pickle', 'rb') as handle:
#         irritancy_inci = pickle.load(handle)
#     with open('eng_comedogenicity_inci.pickle', 'rb') as handle:
#         comedogenicity_inci = pickle.load(handle)
#     with open('eng_function_inci.pickle', 'rb') as handle:
#         function_inci = pickle.load(handle)
#     with open('eng_qfacts_inci.pickle', 'rb') as handle:
#         qfacts_inci = pickle.load(handle)
#     with open('eng_desc_inci.pickle', 'rb') as handle:
#         desc_inci = pickle.load(handle)
#     with open('eng_desc_cosing.pickle', 'rb') as handle:
#         desc_cosing = pickle.load(handle)
#     with open('eng_function_cosing.pickle', 'rb') as handle:
#         function_cosing = pickle.load(handle)
        
#     res = []
    
#     for item in tqdm(ingredient_list):
        
#         value = match_dict_inci[item]
#         if value == 'unknown':
#             key = match_dict_cosing.get(item, 'unknown')
#             rating = 'No rating'
#             irritancy = np.nan
#             comedogenicity = np.nan
#             functions = function_cosing.get(key, [])
#             quickfacts = np.nan
#             description = desc_cosing.get(key, [])        
                
#         else:
#             key = match_dict_inci.get(item, 'unknown')
#             rating = rating_inci.get(key, 'No rating')
#             irritancy = irritancy_inci.get(key, np.nan)
#             comedogenicity = comedogenicity_inci.get(key, np.nan)
#             functions = function_inci.get(key, [])
#             quickfacts = qfacts_inci.get(key, [])
#             description = desc_inci.get(key, [])
            
#         if key != 'unknown':    
#             if option == 'ingredient':
#                 res.append(key)
#             elif option == 'rating':
#                 res.append(rating)
#             elif option == 'irritancy':
#                 res.append(irritancy)
#             elif option == 'comedogenicity':
#                 res.append(comedogenicity)
#             elif option == 'functions':
#                 res.append(functions)
#             elif option == 'quickfacts':
#                 res.append(quickfacts)
#             elif option == 'description':
#                 res.append(description)
#             else:
#                 res.extend([[key, functions, rating, irritancy, comedogenicity, quickfacts, description]])
            
#     df_res = pd.DataFrame(res, columns=['Ingredient_name', 'Functions', 'Rating', 'Irritancy',
#                                         'Comedogenicity', 'Quick_facts', 'Description'])
    
#     return df_res

# **Function for fuzzy dict**

In [27]:
# Fuzzy Dict

class FuzzyDict(dict):
    "Provides a dictionary that performs fuzzy lookup"
    def __init__(self, items = None, cutoff = .6):
        """Construct a new FuzzyDict instance

        items is an dictionary to copy items from (optional)
        cutoff is the match ratio below which mathes should not be considered
        cutoff needs to be a float between 0 and 1 (where zero is no match
        and 1 is a perfect match)"""
        super(FuzzyDict, self).__init__()

        if items:
            self.update(items)
        self.cutoff =  cutoff

        # short wrapper around some super (dict) methods
        self._dict_contains = lambda key: \
            super(FuzzyDict,self).__contains__(key)

        self._dict_getitem = lambda key: \
            super(FuzzyDict,self).__getitem__(key)

    def _search(self, lookfor, stop_on_first = False):
        """Returns the value whose key best matches lookfor

        if stop_on_first is True then the method returns as soon
        as it finds the first item
        """

        # if the item is in the dictionary then just return it
        if self._dict_contains(lookfor):
            return True, lookfor, self._dict_getitem(lookfor), 1

        # set up the fuzzy matching tool
        ratio_calc = SequenceMatcher()
        ratio_calc.set_seq1(lookfor)

        # test each key in the dictionary
        best_ratio = 0
        best_match = None
        best_key = None
        for key in self:

            # if the current key is not a string
            # then we just skip it
            try:
                # set up the SequenceMatcher with other text
                ratio_calc.set_seq2(key)
            except TypeError:
                continue

            # we get an error here if the item to look for is not a
            # string - if it cannot be fuzzy matched and we are here
            # this it is defintely not in the dictionary
            try:
            # calculate the match value
                ratio = ratio_calc.ratio()
            except TypeError:
                break

            # if this is the best ratio so far - save it and the value
            if ratio > best_ratio:
                best_ratio = ratio
                best_key = key
                best_match = self._dict_getitem(key)

            if stop_on_first and ratio >= self.cutoff:
                break

        return (
            best_ratio >= self.cutoff,
            best_key,
            best_match,
            best_ratio)


    def __contains__(self, item):
        "Overides Dictionary __contains__ to use fuzzy matching"
        if self._search(item, True)[0]:
            return True
        else:
            return False

    def __getitem__(self, lookfor):
        "Overides Dictionary __getitem__ to use fuzzy matching"
        matched, key, item, ratio = self._search(lookfor)

        if not matched:
            raise KeyError(
                "'%s'. closest match: '%s' with ratio %.3f"%
                    (str(lookfor), str(key), ratio))

        return item

In [28]:
# Fuzzydict

from fuzzywuzzy import fuzz 

class FuzzyDict(dict):
    "Provides a dictionary that performs fuzzy lookup"
    def __init__(self, items = None, cutoff = .6):
        """Construct a new FuzzyDict instance

        items is an dictionary to copy items from (optional)
        cutoff is the match ratio below which mathes should not be considered
        cutoff needs to be a float between 0 and 1 (where zero is no match
        and 1 is a perfect match)"""
        super(FuzzyDict, self).__init__()

        if items:
            self.update(items)
        self.cutoff =  cutoff

        # short wrapper around some super (dict) methods
        self._dict_contains = lambda key: \
            super(FuzzyDict,self).__contains__(key)

        self._dict_getitem = lambda key: \
            super(FuzzyDict,self).__getitem__(key)

    def _search(self, lookfor, stop_on_first = False):
        """Returns the value whose key best matches lookfor

        if stop_on_first is True then the method returns as soon
        as it finds the first item
        """

        # if the item is in the dictionary then just return it
        if self._dict_contains(lookfor):
            return True, lookfor, self._dict_getitem(lookfor), 1

        # set up the fuzzy matching tool
        #ratio_calc = difflib.SequenceMatcher()
        #ratio_calc.set_seq1(lookfor)

        # test each key in the dictionary
        best_ratio = 0
        best_match = None
        best_key = None
        for key in self:

            # if the current key is not a string
            # then we just skip it
            if not isinstance(key, str):
                continue

            # we get an error here if the item to look for is not a
            # string - if it cannot be fuzzy matched and we are here
            # this it is defintely not in the dictionary
            try:
            # calculate the match value
                ratio = fuzz.ratio(lookfor, key)/100
            except TypeError:
                break

            # if this is the best ratio so far - save it and the value
            if ratio > best_ratio:
                best_ratio = ratio
                best_key = key
                best_match = self._dict_getitem(key)

            if stop_on_first and ratio >= self.cutoff:
                break

        return (
            best_ratio >= self.cutoff,
            best_key,
            best_match,
            best_ratio)


    def __contains__(self, item):
        "Overides Dictionary __contains__ to use fuzzy matching"
        if self._search(item, True)[0]:
            return True
        else:
            return False

    def __getitem__(self, lookfor):
        "Overides Dictionary __getitem__ to use fuzzy matching"
        matched, key, item, ratio = self._search(lookfor)

        if not matched:
            raise KeyError(
                "'%s'. closest match: '%s' with ratio %.3f"%
                    (str(lookfor), str(key), ratio))

        return item

In [29]:
def fuzzy_match_ingredients(ing_list, fuzdict):
    match_dict = {}
    for ing in tqdm(ing_list):
        if ing in match_dict.keys():
            continue
        upper_ing = ing.upper()
        if fuzdict.__contains__(upper_ing):
            match_dict[ing] = fuzdict[upper_ing]
        else:
            match_dict[ing] = 'unknown'
    
    return match_dict


# **Testing: Englist**

In [30]:
# df_ingredient = pd.read_csv(ingredient_df_path)
df_inci = pd.read_csv('../Database/INCI/ingredient_inci_1570.csv')
df_cosing = pd.read_csv('../Database/ingredient_cosing_37309.csv')
df_paula = pd.read_csv('../Database/PAULA/ingredient_paula_1833_new.csv')

In [31]:
# df_inci.rename(columns={'Ingredient_name': 'ingredient_name',
#                        'Rating': 'rating',
#                         'Irritancy': 'irritancy',
#                         'Comedogenicity': 'comedogenicity',
#                         'Rating_score': 'rating_score',
#                        'Functions': 'functions',
#                        'Quick_facts': 'quick_facts',
#                        'Details': 'description',
#                        'Link': 'link'}, inplace=True)
# df_inci['ingredient_name'] = df_inci['ingredient_name'].apply(lambda x: x.upper())

In [32]:
# # df_paula['rating'] = df_paula['rating'].apply(lambda x: 'Good' if x == 'GOOD' else x)
# df_paula['rating'] = df_paula['rating'].map({'Best': 'superstar',
#                                             'Good': 'goodie',
#                                              'GOOD': 'goodie',
#                                             'Poor': 'icky',
#                                             'Average': 'average'
#                                             })
# df_paula['rating_score'] = df_paula['rating'].map({'icky': 0,
#                                                  'average': 1,
#                                                  'goodie': 2,
#                                                  'superstar': 3})
# df_paula.rename(columns={'name': 'ingredient_name',
#                         'category': 'functions',
#                         }, inplace=True)
# df_paula['ingredient_name'] = df_paula['ingredient_name'].apply(lambda x: x.upper())

In [33]:
boxes = get_bounding_box('../text-detection-ctpn/data/res/sample5.txt')
img_path = '../Sample_images/sample5.JPG'
# img = cv2.imread('../Sample_images/sample5.JPG')
# img_ocr = preprocess_for_ocr(img)
# cv2.imshow('sample', img_ocr)
# cv2.waitKey(0)
# cv2.destroyAllWindows()

text = ''
for box in boxes:
    croped = crop_line(img_path, box)
    string = ocr(croped)
    text = text + ' ' + str(string.strip('\n').strip('\x0c').strip())

text_result = clean_string(text)
ing_list = string_to_list(text_result)


In [34]:
cosing_dict = {name.strip(): name.strip() for name in df_cosing['ingredient_name']}
fd_cosing = FuzzyDict(cosing_dict, cutoff = .6)
print(len(fd_cosing))

paula_dict = {name.strip(): name.strip() for name in df_paula['ingredient_name']}
fd_paula = FuzzyDict(paula_dict, cutoff = .6)
print(len(fd_paula))

inci_dict = {name.strip(): name.strip() for name in df_inci['ingredient_name']}
fd_inci = FuzzyDict(inci_dict, cutoff = .7)
print(len(fd_inci))

29908
1833
1528


# **End to end**

In [35]:
def before_ctpn(img_path):
    img = cv2.imread(img_path)
    img_ctpn = preprocess_for_ocr(img, enhance=2)
    
    return img_ctpn

In [36]:
def ocr_everything(img_path, boundingtxt_file, inci_path, cmd_path, cosing_path, language):
    
    boxes = get_bounding_box(boundingtxt_file)
    
    # Preprocess image for OCR:
    img = cv2.imread(img_path)
    
    # doing OCR
    text = ''
    for box in boxes:
        cropped = crop_line(img_path, box)
        string = ocr(cropped)
        text = text + ' ' + str(string.strip('\n').strip('\x0c').strip())
    
#     if debug:
#         print(text)
    
    # Cleaning result from OCR
    text_result = clean_string(text)
    ing_list = string_to_list(text_result)
    
#     if debug:
#         print("-----")
#         print(text_result)
        
    # Loading ingredient dataframe
    
    df_cosing = pd.read_csv(cosing_path) #'../Database/ingredient_cosing_37309.csv'
    # fd_cosing
    cosing_dict = {name.strip(): name.strip() for name in df_cosing['ingredient_name']}
    fd_cosing = FuzzyDict(cosing_dict, cutoff = .6)
    match_dict_cosing = fuzzy_match_ingredients(ing_list, fd_cosing)
    print('len fd cosing:', len(fd_cosing))
    # Input for later models: KNN and randomforest
    model_input = [[name for name in match_dict_cosing.values()]]
    
    # fd main
    if language == 'Vietnamese':
        df_cmd = pd.read_csv(cmd_path) # Vietnamese database
        cmd_dict = {name.strip(): name.strip() for name in df_cmd['ingredient_name']}
        fd_cmd = FuzzyDict(cmd_dict, cutoff = .7)
        match_dict_fuzzy = fuzzy_match_ingredients(ing_list, fd_cmd)
        print('len fd cmd:', len(fd_cmd))
    else:
        df_inci = pd.read_csv(inci_path) # '../Database/CALLMEDUY/ingredient_vietnamese_3818.csv'
        inci_dict = {name.strip(): name.strip() for name in df_inci['ingredient_name']}
        fd_inci = FuzzyDict(inci_dict, cutoff = .7)
        match_dict_fuzzy = fuzzy_match_ingredients(ing_list, fd_inci)
        print('len fd inci:', len(fd_inci))
    
    # Compare product ingredient list and database
    # match_dict = find_matching_ingredient(ing_list, rating, 0.55)
    
    if debug:
        print(match_dict_fuzzy)
        print(list(match_dict_fuzzy.values()))

    if debug:
        print("length match_dict_fuzzy", len(match_dict_fuzzy))
        print("length match_dict_extra", len(match_dict_cosing))
    
    # Analyzing ingredient
    if language == 'Vietnamese':
        df_res = lookup_all_vietnamese(ing_list, match_dict_fuzzy, match_dict_cosing, df_cmd, df_cosing)
    
    else:
        df_res = lookup_all_english(ing_list, match_dict_fuzzy, match_dict_cosing, df_inci, df_cosing)
        
    return df_res, model_input

In [37]:
def after_ctpn_english(img_path, txt_path, inci_path, cosing_path):
    # Get annotations of bounding boxes
    boxes = get_bounding_box(txt_path)
    
#     # Preprocess image for OCR:
#     img = cv2.imread(img_path)
#     #img_ocr = preprocess_for_ocr(img)
#     img_ocr = img
    
    # doing OCR
    text = ''
    for box in boxes:
        croped = crop_line(img_path, box)
        string = ocr(croped)
        text = text + ' ' + str(string.strip('\n').strip('\x0c').strip())
    
    print(text)
    
    # Cleaning result from OCR
    text_result = clean_string(text)
    ing_list = string_to_list(text_result)
    
#     print("-----")
#     print(text_result)
    
    # Loading ingredient dataframe
    df_inci = pd.read_csv(inci_path) # '../Database/CALLMEDUY/ingredient_vietnamese_3818.csv'
    df_cosing = pd.read_csv(cosing_path) #'../Database/ingredient_cosing_37309.csv'
    
#     rating_inci, irritancy_inci, comedogenicity_inci, function_inci, qfacts_inci, desc_inci, desc_cosing, function_cosing = create_dict_english(df_inci, df_cosing)
    
    # fd_cmd
    inci_dict = {name.strip(): name.strip() for name in df_inci['ingredient_name']}
    fd_inci = FuzzyDict(inci_dict, cutoff = .7)
    
    # fd_cosing
    cosing_dict = {name.strip(): name.strip() for name in df_cosing['ingredient_name']}
    fd_cosing = FuzzyDict(cosing_dict, cutoff = .6)
    
    # Compare product ingredient list and database
    match_dict_inci = fuzzy_match_ingredients(ing_list, fd_inci)
    match_dict_cosing = fuzzy_match_ingredients(ing_list, fd_cosing)
    
    model_input = [[name for name in match_dict_cosing.values()]]

    
    # Analyzing ingredient
    df_res = lookup_all(ing_list, match_dict_fuzzy, match_dict_cosing, df_inci, df_cosing)
    
    return df_res, model_input

In [38]:
df_res_test, model_input_test = ocr_everything('../Sample_images/sample5.JPG',
                                                '../text-detection-ctpn/data/res/sample5.txt', 
                                                '../Database/INCI/ingredient_inci_1570.csv',
                                               '../Database/CALLMEDUY/ingredient_vietnamese_3818.csv',
                                                '../Database/ingredient_cosing_37309.csv',
                                              language='English')


100%|██████████| 43/43 [00:01<00:00, 40.91it/s]
100%|██████████| 43/43 [00:00<00:00, 431.27it/s]


len fd cosing: 29908
len fd inci: 1528


NameError: name 'debug' is not defined

In [81]:
with open('../Model_product_evaluator/ingredient_idx.pickle', 'rb') as handle:
    ingredient_idx = pickle.load(handle)

with open('../Model_product_evaluator/ingredient_idx_1000.pickle', 'rb') as handle:
    ingredient_idx_1000 = pickle.load(handle)

In [82]:
# Load random forest model for prediction
with open('../Model_product_evaluator/rf_200.pkl', 'rb') as model:
    reload_rf = pickle.load(model)

In [83]:
# Load KNN and dataset for product recommendation
with open('../Model_product_evaluator/recommendation.pkl', 'rb') as model:
    reload_knn = pickle.load(model)

df_recommendation = pd.read_csv('../Model_product_evaluator/recommendation_pool.csv')

In [87]:
def transform(X): # X: list
    M = len(X)
    N = len(ingredient_idx)
    A = np.zeros((M, N), dtype=np.uint8)
    i= 0
    for ing_list in X:
        x = np.zeros(N, dtype=np.uint8)
        for ingredient in ing_list:
            # Get the index for each ingredient
            if ingredient in ingredient_idx.keys():
                idx = ingredient_idx[ingredient]
                x[idx] = 1
            else:
                pass

        A[i, :] = x
        i += 1
    input_rf = A[:, :1000]
    input_knn = A
    
    return input_rf, input_knn

In [94]:
def evaluate_recommend(img_path, txt_path, inci_path, cmd_path, cosing_path, ingredient_idx, ingredient_idx_1000,
                      language):
    
    df_res, model_input = ocr_everything(img_path, txt_path, inci_path, cmd_path, cosing_path, language)
    input_rf, input_knn = transform(model_input)
    
    # evaluate score
    prediction = reload_rf.predict(input_rf)
    print(prediction)
    
    # recommendation
    distances, indices = reload_knn.kneighbors(input_knn)
    
    location = indices.tolist()[0]
    df_recommended = df_recommendation.iloc[location, :]
    
    return prediction, df_res, df_recommended

In [99]:
prediction, df_res, df_recommended = evaluate_recommend('../Sample_images/sample5.JPG',
                                                       '../text-detection-ctpn/data/res/sample5.txt', 
                                                       '../Database/INCI/ingredient_inci_1570.csv',
                                                    '../Database/CALLMEDUY/ingredient_vietnamese_3818.csv',
                                                       '../Database/ingredient_cosing_37309.csv',
                                                        ingredient_idx,
                                                        ingredient_idx_1000,
                                                       language='Vietnamese')
                                                        

100%|██████████| 43/43 [00:01<00:00, 37.17it/s]
100%|██████████| 43/43 [00:00<00:00, 269.13it/s]

len fd cosing: 29908



100%|██████████| 43/43 [00:00<00:00, 118420.93it/s]


len fd cmd: 3818
[4]


Unnamed: 0,Ingredient_name,Rating_score,Functions,Description
0,AQUA (WATER),4,['Hỗn hợp'],"Là thành phần mỹ phẩm được sử dụng phổ biến, n..."
1,CAPRYLIC/CAPRIC TRIGLYCERIDE,,[],[]
2,CETYL ALCOHOL,4,"['Chất dưỡng ẩm', 'Chất nhũ hóa/làm dầy kết cấ...",Cetearyl Alcohol và các loại cồn béo khác giữ ...
3,PROPANEDIOL,5,['Chưa phân loại'],"Propanediol là một chất tự nhiên, được dùng th..."
4,STEARYL ALCOHOL,4,"['Chất dưỡng ẩm', 'Chất nhũ hóa/làm dầy kết cấ...",Cồn béo được sử dụng làm chất dưỡng ẩm và giữ ...
5,GLYCERIN,5,"['Chất dưỡng da', 'Chất hồi phục da']","Glycerin là một chất dưỡng da và phục hồi da, ..."
6,SODIUM HYALURONATE,5,"['Chất chống oxy hóa', 'Chất dưỡng da', 'Chất ...",Sodium Hyaluronate là dạng muối của axit hyalu...
7,ARGININE,4,['Chất chống oxy hóa'],Chất này có khả năng chống oxy hóa.
8,ASPARTIC ACID,4,"['Chất chống oxy hóa', 'Chất dưỡng da']",Nghiên cứu chỉ ra rằng amino axit và dẫn xuất ...
9,GLYCINE,4,['Chất dưỡng da'],Thành phần cơ bản của tất cả các protein trong...


In [183]:
df_recommended

Unnamed: 0,cat_name,subcat_name,product_url,product_brand,product_name,product_score,product_img,ingredient_list,new_ing_list,new_ing_list_fixed,new_product_score
1755,Skin,Facial Moisturizer/Treatment,https://www.ewg.org/skindeep/products/879430-T...,The Ordinary,Natural Moisturizing Factors + HA,3,https://static.ewg.org/skindeep_images/8794/87...,"Aqua (Water), Caprylic/Capric Triglyceride, Ce...","['CAVIAR WATER', 'CAPRIC TRIGLYCERIDE', 'CETYL...","WATER, CAPRIC TRIGLYCERIDE, CETYL ALCOHOL, PRO...",4
14058,Skin,Facial Cleanser,https://www.ewg.org/skindeep/products/922934-P...,Pacifica,Cosmic Hemp Balancing Face Wash,4,https://static.ewg.org/skindeep_images/9229/92...,"AQUA, DISODIUM COCOAMPHODIACETATE, HEMP EXTRAC...","['WATER', 'DISODIUM COCOAMPHODIACETATE', 'SHRI...","WATER, DISODIUM COCOAMPHODIACETATE, SHRIMP EXT...",5
10734,Skin,Facial Cleanser,https://www.ewg.org/skindeep/products/917207-P...,Pacifica,Cosmic Hemp Balancing Face Wash,4,https://static.ewg.org/skindeep_images/9172/91...,"AQUA, DISODIUM COCOAMPHODIACETATE, HEMP EXTRAC...","['WATER', 'DISODIUM COCOAMPHODIACETATE', 'SHRI...","WATER, DISODIUM COCOAMPHODIACETATE, SHRIMP EXT...",5
5233,Skin,Hand Cream,https://www.ewg.org/skindeep/products/878116-D...,Duke,Cannon Bloody Knuckles Hand Repair Balm,3,https://static.ewg.org/skindeep_images/8781/87...,"WATER, GLYCERIN, STEARIC ACID, BUTYROSPERMUM P...","['WATER', 'GLYCERIN', 'STEARIC ACID', 'BUTYROS...","WATER, GLYCERIN, STEARIC ACID, BUTYROSPERMUM P...",4
13269,Skin,Moisturizer,https://www.ewg.org/skindeep/products/846427-A...,Aveeno,Active Naturals Sheer Hydration Daily Moisturi...,2,https://static.ewg.org/skindeep_images/8464/84...,"WATER, DICAPRYLYL ETHER, GLYCERIN, CAPRYLIC/CA...","['WATER', 'DICAPRYLYL ETHER', 'GLYCERIN', 'CAP...","WATER, DICAPRYLYL ETHER, GLYCERIN, CAPRIC TRIG...",3


In [165]:
df_res

Unnamed: 0,Ingredient_name,Functions,Rating,Irritancy,Comedogenicity,Quick_facts,Description
0,WATER,{'solvent': '/ingredient-functions/solvent'},No rating,,,,"['Good old water, aka H2O. The most common ski..."
1,CAPRYLIC/ CAPRIC TRIGLYCERIDE,[],No rating,,,[],[]
2,CETYL ALCOHOL,{'emollient': '/ingredient-functions/emollient...,No rating,2,2,,"['A so-called\xa0fatty (the good, non-drying k..."
3,PROPANEDIOL,"{'solvent': '/ingredient-functions/solvent', '...",No rating,,,,"[""Propanediol is a natural alternative for the..."
4,STEARYL ALCOHOL,{'emollient': '/ingredient-functions/emollient...,No rating,2,2,,"['A handy multi-tasker,\xa0white to light yell..."
5,GLYCERIN,{'skin-identical ingredient': '/ingredient-fun...,superstar,0,0,['A natural moisturizer that’s also in our ski...,['Glycerin doesn’t sound very glamorous but it...
6,SODIUM HYALURONATE,{'skin-identical ingredient': '/ingredient-fun...,goodie,0,0,,"[""It’s the - sodium form - cousin of the famou..."
7,ARGININE,{'skin-identical ingredient': '/ingredient-fun...,goodie,,,,"[""A semi-essential (infants cannot synthesize ..."
8,ASPARTIC ACID,{'skin-identical ingredient': '/ingredient-fun...,goodie,,,,['A non-essential\xa0amino acid \xa0(important...
9,GLYCINE,{'skin-identical ingredient': '/ingredient-fun...,goodie,,,,['A non-essential amino acid (the building blo...


In [166]:
input_rf, input_knn = transform(model_input)

In [167]:
Your code here
with open('../Model_product_evaluator/rf_200.pkl', 'rb') as model:
    reload_rf = pickle.load(model)

predictions = reload_rf.predict(input_rf)
print(predictions)
# print(f'Accuracy score: {accuracy_score(y_test, predictions)}')

[4]


# **Recommendation**