In [86]:
import pandas as pd
from tqdm import tqdm
import time
import re
import requests
import difflib
import operator
from difflib import SequenceMatcher

In [4]:

class FuzzyDict(dict):
    "Provides a dictionary that performs fuzzy lookup"
    def __init__(self, items = None, cutoff = .6):
        """Construct a new FuzzyDict instance

        items is an dictionary to copy items from (optional)
        cutoff is the match ratio below which mathes should not be considered
        cutoff needs to be a float between 0 and 1 (where zero is no match
        and 1 is a perfect match)"""
        super(FuzzyDict, self).__init__()

        if items:
            self.update(items)
        self.cutoff =  cutoff

        # short wrapper around some super (dict) methods
        self._dict_contains = lambda key: \
            super(FuzzyDict,self).__contains__(key)

        self._dict_getitem = lambda key: \
            super(FuzzyDict,self).__getitem__(key)

    def _search(self, lookfor, stop_on_first = False):
        """Returns the value whose key best matches lookfor

        if stop_on_first is True then the method returns as soon
        as it finds the first item
        """

        # if the item is in the dictionary then just return it
        if self._dict_contains(lookfor):
            return True, lookfor, self._dict_getitem(lookfor), 1

        # set up the fuzzy matching tool
        ratio_calc = difflib.SequenceMatcher()
        ratio_calc.set_seq1(lookfor)

        # test each key in the dictionary
        best_ratio = 0
        best_match = None
        best_key = None
        for key in self:

            # if the current key is not a string
            # then we just skip it
            try:
                # set up the SequenceMatcher with other text
                ratio_calc.set_seq2(key)
            except TypeError:
                continue

            # we get an error here if the item to look for is not a
            # string - if it cannot be fuzzy matched and we are here
            # this it is defintely not in the dictionary
            try:
            # calculate the match value
                ratio = ratio_calc.ratio()
            except TypeError:
                break

            # if this is the best ratio so far - save it and the value
            if ratio > best_ratio:
                best_ratio = ratio
                best_key = key
                best_match = self._dict_getitem(key)

            if stop_on_first and ratio >= self.cutoff:
                break

        return (
            best_ratio >= self.cutoff,
            best_key,
            best_match,
            best_ratio)


    def __contains__(self, item):
        "Overides Dictionary __contains__ to use fuzzy matching"
        if self._search(item, True)[0]:
            return True
        else:
            return False

    def __getitem__(self, lookfor):
        "Overides Dictionary __getitem__ to use fuzzy matching"
        matched, key, item, ratio = self._search(lookfor)

        if not matched:
            raise KeyError(
                "'%s'. closest match: '%s' with ratio %.3f"%
                    (str(lookfor), str(key), ratio))

        return item

In [5]:
def fuzzy_match_ingredients(ing_list, fuzdict):
    match_dict = {}
    for ing in tqdm(ing_list):
        if ing in match_dict.keys():
            continue
        upper_ing = ing.upper()
        if fuzdict.__contains__(upper_ing):
            match_dict[ing] = fuzdict[upper_ing]
    
    return match_dict

In [18]:
def cosing_lookup(ing_list, match_dict, desc, restrictions, functions):
    res = []
    for item in ing_list:
        key = match_dict.get(item, "")
        if key == "":
            continue
        description = desc.get(key, "unknown")
        restrictn = restrictions.get(key, "unknown")
        functn = functions.get(key, "unknown")
        res.extend([[key, description, restrictn, functn]])
    
    return res

In [19]:
def create_cosing_dict(ingredient_df, col_name):
    col_dict = {name.strip(): row[col_name] for (idx, row) in ingredient_df.iterrows() for name in row['INCI name'].split('/')} # .split('/')

    return col_dict

In [16]:
dfcosing = pd.read_csv('../Database/ingredient_cosing_37309.csv')
dfcosing.drop(columns=['Unnamed: 0'], inplace=True)
dfcosing

Unnamed: 0,COSING Ref No,INCI name,Chem/IUPAC Name / Description,Restriction,Function
0,94753.0,DISODIUM TETRAMETHYLHEXADECENYLCYSTEINE FORMYL...,Disodium Tetramethylhexadecenylcysteine Formyl...,,SKIN PROTECTING
1,96229.0,ASTROCARYUM VULGARE SEED BUTTER,Astrocaryum Vulgare Seed Butter is the fat obt...,,"SKIN CONDITIONING, SKIN CONDITIONING - EMOLLIENT"
2,89177.0,BARLEY SH-POLYPEPTIDE-17,Barley sh-Polypeptide-17 is a single chain rec...,,"HAIR CONDITIONING, SKIN CONDITIONING"
3,98580.0,DAUCUS CAROTA SATIVA LEAF EXTRACT,Daucus Carota Sativa (Carrot) Leaf Extract is ...,,SKIN CONDITIONING - MISCELLANEOUS
4,89078.0,GOSSYPIUM HIRSUTUM SEED EXTRACT,Gossypium Hirsutum Seed Extract is the extrac...,,SKIN CONDITIONING
...,...,...,...,...,...
37304,90014.0,PHELLODENDRON AMURENSE BARK,This is a filtrate of the product obtained by ...,,"HUMECTANT, SKIN CONDITIONING, SKIN PROTECTING"
37305,90014.0,LONICERA JAPONICA FLOWER,This is a filtrate of the product obtained by ...,,"HUMECTANT, SKIN CONDITIONING, SKIN PROTECTING"
37306,90014.0,CHAENOMELES SINENSIS FRUIT,This is a filtrate of the product obtained by ...,,"HUMECTANT, SKIN CONDITIONING, SKIN PROTECTING"
37307,90014.0,CAMELLIA SINENSIS LEAF,This is a filtrate of the product obtained by ...,,"HUMECTANT, SKIN CONDITIONING, SKIN PROTECTING"


In [20]:
# dfi3 = pd.read_csv('Database/ingredient_inci_1570.csv')
ingnames_dict = {name.strip(): name.strip() for name in dfcosing['INCI name']}
# print(ingnames_dict)
fd = FuzzyDict(ingnames_dict, cutoff = .55)
print(len(fd))

29908


In [15]:
len(ingnames_dict)

29908

In [None]:
# ingnames_dict = {name.strip(): name.strip() for name in df_ingredient['INCI name']}
# fd = FuzzyDict(ingnames_dict, cutoff = .55)

# # Compare product ingredient list and database
# # match_dict = find_matching_ingredient(ing_list, rating, 0.55)
# match_dict = fuzzy_match_ingredients(ing_list, fd)
# print(match_dict)

## **Load EWG ingredient list**

In [22]:
def clean_string_name(string):
    text = string.replace(' Size: 4 oz. * USDA Certified Organic Ingredient ** None remains after saponifying oils into soap and glycerin', '')
    pattern = "[\|\*\_\'\{}&]".format('"')
    regex1 = re.compile('\[.*]')
    regex2 = re.compile('\[.*}')
    text = re.sub(pattern, "", text)
    text = re.sub(regex1, "", text)
    text = re.sub(regex2, "", text)
    text = text.strip('\x0c')
    text = text.strip(' ')
    text = text.upper()
    
    return text

In [23]:
def clean_string(string):
    pattern = "[\|\*\_\'\{}&]".format('"')
    regex = re.compile('\\\S*')
    text = re.sub(pattern, "", string)
    text = re.sub(",, ", ", ", text)
    text = re.sub(regex, " ", text)
    text = text.split(':')[1]
#     text = text.split('.')[0]
    text = text.strip()
#     split = [x.strip(' ').lower() for x in text.split(',')]

    return text

def string_to_list(text):
    pattern = "[\|\*\_\'\{}]".format('"')
    text = re.sub(pattern, "", text)
    text = text.replace('Aqua/Water/Eau', 'Aqua')
#     split = [x.strip(' ').lower() for x in text.split([\\.,]\\s)split('.')]
    split = [x.strip(' ').lower().replace('aqua/water/eau', 'aqua') for x in re.split("[,.]", text)]
    return split

In [21]:
df_skin = pd.read_csv('../Database/ewg_skin_full_8237.csv')

In [24]:
example = df_skin[df_skin['ingredient_list'] != 'No ingredient list']['ingredient_list'].to_list()

In [37]:
sample = [name.upper().strip() for name in example[4747].split(',')]
sample

['DISTILLED WATER',
 'COCOS NUCIFERA (COCONUT*)',
 'PRUNUS ARMENIACA (APRICOT)',
 'MACADAMIA INTEGRIFOLIA (MACADAMIA) NUT OIL',
 'ALEURITES MOLUCCANA (KUKUI) SEED OIL',
 'CETEARYL ALCOHOL',
 'SORBITAN ESTER',
 'BEESWAX ORGANIC',
 'VEGETABLE GLYCERIN',
 'GEOGARD ULTRA',
 'MATRICARIA RECUTITA (CHAMOMILE) ORGANIC',
 'SANTALUM ALBUM (SANDALWOOD) ORGANIC',
 'COMMIPHORA MYRRHA (MYRRH) ORGANIC',
 'ANIBA ROSAEODORA (ROSEWOOD)',
 'ORGANIC CITRUS AURANTIUM (NEROLI)',
 'ORGANIC HELICRYSUM ITALICUUM (HELICHRYSUM)']

In [38]:
# match_dict = find_matching_ingredient(ing_list, rating, 0.55)
match_dict = fuzzy_match_ingredients(sample, fd)
print(match_dict)

100%|██████████| 16/16 [01:05<00:00,  4.09s/it]

{'DISTILLED WATER': 'STEM WATER', 'COCOS NUCIFERA (COCONUT*)': 'COCOS NUCIFERA FRUIT', 'PRUNUS ARMENIACA (APRICOT)': 'PRUNUS ARMENIACA FRUIT', 'MACADAMIA INTEGRIFOLIA (MACADAMIA) NUT OIL': 'MACADAMIA INTEGRIFOLIA SEED OIL', 'ALEURITES MOLUCCANA (KUKUI) SEED OIL': 'ALEURITES MOLUCCANUS SEED OIL', 'CETEARYL ALCOHOL': 'CETEARYL ALCOHOL', 'SORBITAN ESTER': 'SORBITAN STEARATE', 'BEESWAX ORGANIC': 'BEESWAX ACID', 'VEGETABLE GLYCERIN': 'VEGETABLE OIL', 'GEOGARD ULTRA': 'VINEGAR EXTRACT', 'MATRICARIA RECUTITA (CHAMOMILE) ORGANIC': 'MATRICARIA RECUTITA FLOWER OIL', 'SANTALUM ALBUM (SANDALWOOD) ORGANIC': 'SANTALUM ALBUM WOOD OIL', 'COMMIPHORA MYRRHA (MYRRH) ORGANIC': 'COMMIPHORA MYRRHA GUM OIL', 'ANIBA ROSAEODORA (ROSEWOOD)': 'ANIBA ROSAEODORA WOOD OIL', 'ORGANIC CITRUS AURANTIUM (NEROLI)': 'CITRUS AURANTIUM FLOWER OIL', 'ORGANIC HELICRYSUM ITALICUUM (HELICHRYSUM)': 'HELICHRYSUM ITALICUM FLOWER'}





In [62]:
for o in match_dict.values():
    print(o)

STEM WATER
COCOS NUCIFERA FRUIT
PRUNUS ARMENIACA FRUIT
MACADAMIA INTEGRIFOLIA SEED OIL
ALEURITES MOLUCCANUS SEED OIL
CETEARYL ALCOHOL
SORBITAN STEARATE
BEESWAX ACID
VEGETABLE OIL
VINEGAR EXTRACT
MATRICARIA RECUTITA FLOWER OIL
SANTALUM ALBUM WOOD OIL
COMMIPHORA MYRRHA GUM OIL
ANIBA ROSAEODORA WOOD OIL
CITRUS AURANTIUM FLOWER OIL
HELICHRYSUM ITALICUM FLOWER


In [92]:
df_skin

Unnamed: 0,cat_name,subcat_name,product_url,product_brand,product_name,product_score,product_img,ingredient_list
0,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701958-S...,Soap for Goodness Sake,"Handmade Soap, Olive Oil I",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Extra Virgin Olive Oil (Sod...
1,Skin,Bar Soap,https://www.ewg.org/skindeep/products/604412-M...,Makes 3 Organics,Unscented Super Shea Soap Bar,verified,https://static.ewg.org/skindeep_images/6044/60...,"Safflower* and/or Sunflower* Oils, Coconut* Oi..."
2,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701974-S...,Soap for Goodness Sake,"Handmade Soap, Shea and Oats",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Sunflower Oil (Sodium Sunfl...
3,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701929-S...,Soap for Goodness Sake,"Handmade Soap, Balancing Act",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Sunflower Oil (Sodium Sunfl...
4,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701942-S...,Soap for Goodness Sake,"Shampoo & Body Bar, French Lavender",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Expeller Pressed Babassu Oil (Orbig...
...,...,...,...,...,...,...,...,...
8232,Skin,Antiperspirant/Deodorant,https://www.ewg.org/skindeep/products/932610-C...,C & Co.,"C& Co. Handcrafted Skincare Natural Deodorant,...",01,https://static.ewg.org/skindeep_images/9326/93...,"Maranta Arundinacea (Arrowroot) Root Powder, C..."
8233,Skin,Antiperspirant/Deodorant,https://www.ewg.org/skindeep/products/859117-S...,Schmidt's,"Natural Deodorant, Geranium Flower",01,https://static.ewg.org/skindeep_images/8591/85...,"MAGNESIUM HYDROXIDE, COCOS NUCIFERA (COCONUT) ..."
8234,Skin,Antiperspirant/Deodorant,https://www.ewg.org/skindeep/products/802160-T...,thinksport,"Natural Deodorant, Bergamot Cedarwood",01,https://static.ewg.org/skindeep_images/8021/80...,"COCOS NUCIFERA (COCONUT OIL), EUPHORBIA CERIFE..."
8235,Skin,Antiperspirant/Deodorant,https://www.ewg.org/skindeep/products/885159-N...,Native,"Deodorant, Cucumber & Mint",01,https://static.ewg.org/skindeep_images/8851/88...,"CAPRYLIC/CAPRIC TRIGLYCERIDE, TAPIOCA STARCH,\..."


In [51]:
df_skin.drop(drop_index, inplace=True)

In [52]:
df_skin[df_skin['ingredient_list'] == 'No ingredient list']

Unnamed: 0,cat_name,subcat_name,product_url,product_brand,product_name,product_score,product_img,ingredient_list


In [63]:
def compare_ingredient(x):
    match_dict = fuzzy_match_ingredients(sample, fd)
    res = [i for i in match_dict.values()]
    
    return res

In [72]:
df_skin.to_csv('ewg_skin_full_8208.csv', index=False)

In [88]:
df_testing = df_skin.iloc[:10, :]
df_testing

Unnamed: 0,cat_name,subcat_name,product_url,product_brand,product_name,product_score,product_img,ingredient_list
0,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701958-S...,Soap for Goodness Sake,"Handmade Soap, Olive Oil I",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Extra Virgin Olive Oil (Sod...
1,Skin,Bar Soap,https://www.ewg.org/skindeep/products/604412-M...,Makes 3 Organics,Unscented Super Shea Soap Bar,verified,https://static.ewg.org/skindeep_images/6044/60...,"Safflower* and/or Sunflower* Oils, Coconut* Oi..."
2,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701974-S...,Soap for Goodness Sake,"Handmade Soap, Shea and Oats",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Sunflower Oil (Sodium Sunfl...
3,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701929-S...,Soap for Goodness Sake,"Handmade Soap, Balancing Act",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Sunflower Oil (Sodium Sunfl...
4,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701942-S...,Soap for Goodness Sake,"Shampoo & Body Bar, French Lavender",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Expeller Pressed Babassu Oil (Orbig...
5,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701976-S...,Soap for Goodness Sake,"Shampoo and Body Bar, Shea Butter",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Sunflower Oil ((Sodium Sunf...
6,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701977-S...,Soap for Goodness Sake,"Handmade Soap, Sugar",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Palm Kernel Oil (Sodium Pal...
7,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701959-S...,Soap for Goodness Sake,"Handmade Soap, Olive Oil II",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Extra Virgin Olive Oil (Sod...
8,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701955-S...,Soap for Goodness Sake,"Goat Milk Soap, Oatmeal and Honey",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Sunflower Oil (Sodium Sunfl...
9,Skin,Bar Soap,https://www.ewg.org/skindeep/products/604408-M...,Makes 3 Organics,Coriander Ginger Oatmeal Soap Bar,verified,https://static.ewg.org/skindeep_images/6044/60...,"Safflower* and/or Sunflower* Oils, Coconut* Oi..."


In [66]:
df_testing['new_ing_list'] = df_testing['ingredient_list'].apply(compare_ingredient)

100%|██████████| 16/16 [00:49<00:00,  3.12s/it]
100%|██████████| 16/16 [00:56<00:00,  3.56s/it]
100%|██████████| 16/16 [00:59<00:00,  3.75s/it]
100%|██████████| 16/16 [00:53<00:00,  3.33s/it]
100%|██████████| 16/16 [00:58<00:00,  3.68s/it]
100%|██████████| 16/16 [00:58<00:00,  3.63s/it]
100%|██████████| 16/16 [00:57<00:00,  3.57s/it]
100%|██████████| 16/16 [00:57<00:00,  3.60s/it]
100%|██████████| 16/16 [00:58<00:00,  3.66s/it]
100%|██████████| 16/16 [00:51<00:00,  3.23s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.



In [70]:
df_testing[['ingredient_list', 'new_ing_list']]

Unnamed: 0,ingredient_list,new_ing_list
0,Saponified Organic Extra Virgin Olive Oil (Sod...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
1,"Safflower* and/or Sunflower* Oils, Coconut* Oi...","[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
2,Saponified Organic Sunflower Oil (Sodium Sunfl...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
3,Saponified Organic Sunflower Oil (Sodium Sunfl...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
4,Saponified Expeller Pressed Babassu Oil (Orbig...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
5,Saponified Organic Sunflower Oil ((Sodium Sunf...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
6,Saponified Organic Palm Kernel Oil (Sodium Pal...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
7,Saponified Organic Extra Virgin Olive Oil (Sod...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
8,Saponified Organic Sunflower Oil (Sodium Sunfl...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
9,"Safflower* and/or Sunflower* Oils, Coconut* Oi...","[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."


In [69]:
df_testing

Unnamed: 0,cat_name,subcat_name,product_url,product_brand,product_name,product_score,product_img,ingredient_list,new_ing_list
0,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701958-S...,Soap for Goodness Sake,"Handmade Soap, Olive Oil I",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Extra Virgin Olive Oil (Sod...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
1,Skin,Bar Soap,https://www.ewg.org/skindeep/products/604412-M...,Makes 3 Organics,Unscented Super Shea Soap Bar,verified,https://static.ewg.org/skindeep_images/6044/60...,"Safflower* and/or Sunflower* Oils, Coconut* Oi...","[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
2,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701974-S...,Soap for Goodness Sake,"Handmade Soap, Shea and Oats",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Sunflower Oil (Sodium Sunfl...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
3,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701929-S...,Soap for Goodness Sake,"Handmade Soap, Balancing Act",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Sunflower Oil (Sodium Sunfl...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
4,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701942-S...,Soap for Goodness Sake,"Shampoo & Body Bar, French Lavender",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Expeller Pressed Babassu Oil (Orbig...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
5,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701976-S...,Soap for Goodness Sake,"Shampoo and Body Bar, Shea Butter",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Sunflower Oil ((Sodium Sunf...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
6,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701977-S...,Soap for Goodness Sake,"Handmade Soap, Sugar",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Palm Kernel Oil (Sodium Pal...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
7,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701959-S...,Soap for Goodness Sake,"Handmade Soap, Olive Oil II",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Extra Virgin Olive Oil (Sod...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
8,Skin,Bar Soap,https://www.ewg.org/skindeep/products/701955-S...,Soap for Goodness Sake,"Goat Milk Soap, Oatmeal and Honey",verified,https://static.ewg.org/skindeep_images/7019/70...,Saponified Organic Sunflower Oil (Sodium Sunfl...,"[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."
9,Skin,Bar Soap,https://www.ewg.org/skindeep/products/604408-M...,Makes 3 Organics,Coriander Ginger Oatmeal Soap Bar,verified,https://static.ewg.org/skindeep_images/6044/60...,"Safflower* and/or Sunflower* Oils, Coconut* Oi...","[STEM WATER, COCOS NUCIFERA FRUIT, PRUNUS ARME..."


## **SequenceMatcher**

In [83]:
def find_matching_ingredient(my_ingredients, rating_dict, thresh=0.5):
    ''' my_ingredients: list of product's ingredients
        Loop thru each ingredient in the ingredient list of the products
        then check if that ingredient appears in our ingredient list
        Calculate match_metric using SequenceMatcher and return the highest score and the best match
        Compare the match_metric with thresh > append to match_dict
    '''
    match_dict = {}
    for ingredient in tqdm(my_ingredients):
        if ingredient in match_dict.keys():
            continue
        match_metric = {key: SequenceMatcher(None, key, ingredient).ratio() for key in rating_dict.keys()}
        best_match, best_metric = max(match_metric.items(), key=operator.itemgetter(1)) # What is operator.itemgetter(1)
        if best_metric > thresh:
            match_dict[ingredient] = best_match
        else:
            match_dict[ingredient] = 'unknown'
    return match_dict

In [89]:
def compare_ingredient(x):
    x = clean_string_name(x)
    sample = [name.strip() for name in x.split(',')]
    match_dict = find_matching_ingredient(sample, fd)
    res = [i for i in match_dict.values()]
#     print(sample)
#     print(res)
    
    return res

In [90]:
df_testing['new_ing_list'] = df_testing['ingredient_list'].apply(compare_ingredient)

100%|██████████| 4/4 [00:11<00:00,  2.86s/it]
100%|██████████| 7/7 [00:16<00:00,  2.41s/it]
100%|██████████| 7/7 [00:24<00:00,  3.47s/it]
100%|██████████| 9/9 [00:32<00:00,  3.61s/it]
100%|██████████| 8/8 [00:28<00:00,  3.62s/it]
100%|██████████| 7/7 [00:23<00:00,  3.35s/it]
100%|██████████| 15/15 [00:26<00:00,  1.74s/it]
100%|██████████| 3/3 [00:07<00:00,  2.49s/it]
100%|██████████| 10/10 [00:31<00:00,  3.16s/it]
100%|██████████| 9/9 [00:21<00:00,  2.38s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.



In [91]:
df_skin['product_score'].value_counts()

04          1561
01          1347
03          1225
05          1222
02          1203
verified    1132
06           369
07           102
08            36
09            10
10             1
Name: product_score, dtype: int64