In [3]:
from difflib import SequenceMatcher
from ast import literal_eval
from collections import Counter
from tqdm import tqdm

import pandas as pd
import numpy as np
import re
import operator
import itertools
import gc
import pickle
import os.path

## **Preparing ingredient list**

**Read ingredients database and convert to dataframe**

In [None]:
cols = ['name','category','rating']
ingredient_df = pd.read_csv('../web_scraper/ingredients.csv', usecols=cols, converters={"category": literal_eval})
ingredient_df['name'] = ingredient_df['name'].str.strip()
ingredient_df['rating'].replace('GOOD','Good',inplace=True)
ingredient_df['rating_num'] = ingredient_df['rating'].map({'Poor':0, 'Average':1, 'Good':2, 'Best':3})
print("number of ingredient:",ingredient_df.shape[0])
ingredient_df.to_csv('ingredient_cleaned.csv', index=False)
ingredient_df.head()

In [7]:
columns = ['name', 'category', 'rating']
ingredient_df = pd.read_csv('ingredient_paula_nodescription.csv', usecols=columns, converters={'category': literal_eval})
ingredient_df.head()

Unnamed: 0,name,category,rating
0,"1, 2-Hexanediol",[Preservatives],Good
1,10-Hydroxydecanoic Acid,[Emollients],Good
2,4-T-butylcyclohexanol,"[Emollients, Skin-Soothing]",Good
3,Acacia farnesiana extract,"[Plant Extracts, Fragrance: Synthetic and Frag...",Poor
4,acacia senegal gum,"[Texture Enhancer, Plant Extracts, Skin-Soothing]",Good


In [8]:
ingredient_df['name'] = ingredient_df['name'].str.strip()
ingredient_df['rating'].replace('GOOD', 'Good', inplace=True)
ingredient_df['rating_score'] = ingredient_df['rating'].map({'Poor': 0,
                                                            'Average': 1,
                                                            'Good': 2,
                                                            'Best': 3})
ingredient_df.head()

Unnamed: 0,name,category,rating,rating_score
0,"1, 2-Hexanediol",[Preservatives],Good,2
1,10-Hydroxydecanoic Acid,[Emollients],Good,2
2,4-T-butylcyclohexanol,"[Emollients, Skin-Soothing]",Good,2
3,Acacia farnesiana extract,"[Plant Extracts, Fragrance: Synthetic and Frag...",Poor,0
4,acacia senegal gum,"[Texture Enhancer, Plant Extracts, Skin-Soothing]",Good,2


**Import cleaned data to as 'ingredient_paula_nodescription_cleaned.csv'**

In [9]:
ingredient_df.to_csv('ingredient_paula_nodescription_cleaned', index=False)

In [10]:
print('Number of ingredient: ', ingredient_df.shape[0])

Number of ingredient:  1833


## **Comparing ingredients**

Create a class where we can check if an ingredient matches our existing ingredient dictionary. if there is a match, find the ingredient's rating and category.

* Initialize the class with ingredient rating dictionary and category dictionary.
* Given a list of ingredient, find best matching ingredient that rating and category are avaliable. This is done by evaluating the similarity metric between the name of all existing ingredient and the name of given ingredient (use python function SequenceMatcher). If the similarity is below a thresh, then the given ingredient is labeled as 'unknown'.
* After building up the matching dictionary, we can find an ingredient's matching, rating and category by calling the lookup function.

In [19]:
class look_up_ingredient():
    
    def __init__(self, rating_dict, category_dict):
        self.rating_dict = rating_dict
        self.rating_dict['unknown'] = np.nan
        
        self.category_dict = category_dict
        self.category_dict['unknown'] = [] # Why this one is an empty list???
        
        self.rating = set([value for value in self.rating_dict.values()])
        self.category = set([value for values in self.category_dict.values() for value in values]) 
        # category can be a list of many functions that ingredient belongs to
        
        self.match_dict = {} # result after comparing
    
    def find_matching_ingredient(self, my_ingredients, thresh=0.25):
        ''' Loop thru each ingredient in the ingredient list of the products
            then check if that ingredient appears in our ingredient list
            Calculate match_metric using SequenceMatcher and return the highest score and the best match
            Compare the match_metric with thresh > append to match_dict
        '''
        for ingredient in tqdm(my_ingredients):
            if ingredient in self.match_dict.keys():
                continue
            match_metric = {key: SequenceMatcher(None, key, ingredient).ratio() for key in self.rating_dict.keys()}
            best_match, best_metric = max(match_metric.items(), key=operator.itemgetter(1)) # What is operator.itemgetter(1)
            if best_metric > thresh:
                self.match_dict[ingredient] = best_match
            else:
                self.match_dict[ingredient] = 'unknown'
    
    def lookup(self, ingredient, option=''):
        ''' Return the information we want
        '''
        key = self.match_dict.get(ingredient, 'unknown') # get the key and values of 'unknown'
        rating = self.rating_dict.get(key, -1) #key is the ingredient
        category = self.category_dict.get(key, [])
        
        if option == 'ingredient':
            return key
        elif option == 'rating':
            return rating
        elif option == 'category':
            return category
        else:
            return key, rating, category
    
    def save_match_dict(self, dictfile='ingredient_match_dict.pickle'):
        ''' Save result after comparing to a pickle file'''
        pickle_out = open(dictfile, 'wb') # wb: open and writing in binary mode
        pickle.dump(self.match_dict, pickle_out)
        pickle_out.close()
    
    def load_match_dict(self, dictfle='ingredient_match_dict.pickle'):
        ''' Read pickle file that we saved'''
        if os.path.isfile(dictfile):
            pickle_in = open(dictfile, 'rb')
            self.match_dict = pickle.load(pickle_in)

Create ingredient class, note for ingredient with alias we will duplicate the record.

For example, for "PEG/PPG-18/18 dimethicone" we will create three dict items, with different keys but same value.


In [16]:
# Ingredient_rating_dict: key is ingredient name, value is the rating_score
ingredient_rating_dict = {name: row['rating_score'] for (idx, row) in ingredient_df.iterrows() for name in row['name'].split('/')}
ingredient_rating_dict

{'1, 2-Hexanediol': 2,
 '10-Hydroxydecanoic Acid': 2,
 '4-T-butylcyclohexanol': 2,
 'Acacia farnesiana extract': 0,
 'acacia senegal gum': 2,
 'acai': 3,
 'acetic acid': 0,
 'acetone': 0,
 'acetylated castor oil': 2,
 'acetylated hydrogenated cottonseed glyceride': 2,
 'acetylated lanolin': 2,
 'acetylated lanolin alcohol': 2,
 'acetylated palm kernel glycerides': 2,
 'acetyl carnitine HCL': 3,
 'acetyl glucosamine': 3,
 'acetyl glyceryl ricinoleate': 2,
 'acetyl hexapeptide-1': 3,
 'acetyl hexapeptide-37': 3,
 'acetyl hexapeptide-8': 2,
 'acetyl octapeptide-3': 2,
 'acetyl tetrapeptide-11': 3,
 'acetyl tetrapeptide-5': 3,
 'acetyl tetrapeptide-9': 3,
 'acetyl tributyl citrate': 1,
 'acetyl tyrosine': 2,
 'Achillea millefolium': 0,
 'acid': 1,
 'acne soap': 0,
 'acrylate': 2,
 'acrylates': 2,
 'C10-30 alkyl acrylate crosspolymer': 2,
 'dimethicone copolymer': 2,
 'steareth-20 methacrylate copolymer': 2,
 'acrylates copolymer': 2,
 'Actaea racemosa': 1,
 'Actinidia chinensis (kiwi fruit

In [17]:
# Ingredient_category_dict: key is ingredient name, value is the list of category
ingredient_category_dict = {name: row['category'] for (idx, row) in ingredient_df.iterrows() for name in row['name'].split('/')}
ingredient_category_dict

{'1, 2-Hexanediol': ['Preservatives'],
 '10-Hydroxydecanoic Acid': ['Emollients'],
 '4-T-butylcyclohexanol': ['Emollients', 'Skin-Soothing'],
 'Acacia farnesiana extract': ['Plant Extracts',
  'Fragrance: Synthetic and Fragrant Plant Extracts'],
 'acacia senegal gum': ['Texture Enhancer', 'Plant Extracts', 'Skin-Soothing'],
 'acai': ['Antioxidants', 'Plant Extracts'],
 'acetic acid': ['Sensitizing'],
 'acetone': ['Sensitizing'],
 'acetylated castor oil': ['Texture Enhancer', 'Emollients'],
 'acetylated hydrogenated cottonseed glyceride': ['Texture Enhancer',
  'Emollients'],
 'acetylated lanolin': ['Emollients'],
 'acetylated lanolin alcohol': ['Emollients'],
 'acetylated palm kernel glycerides': ['Texture Enhancer', 'Emollients'],
 'acetyl carnitine HCL': ['Antioxidants'],
 'acetyl glucosamine': ['Skin-Replenishing',
  'Antioxidants',
  'Skin-Soothing',
  'Hydration'],
 'acetyl glyceryl ricinoleate': ['Texture Enhancer', 'Emollients'],
 'acetyl hexapeptide-1': ['Skin-Restoring'],
 'ac

In [20]:
lookup = look_up_ingredient(ingredient_rating_dict, ingredient_category_dict)

We also make a table where ingredient belongs to multiple categories are separated into different rows, this will be useful when we examine characters of different categories.

In [46]:
def split_ingredient_category(ingredient_df_):
    
    ''' Split ingredients that belong to multiple categories into separated rows'''
    
    ingredient_df = ingredient_df_.copy(deep=True)
    print('Copied')
    new_df = pd.DataFrame()
    
    for idx, row in ingredient_df.iterrows():
        if len(row['category']) == 1:
            ingredient_df.loc[idx, 'category'] = ingredient_df.loc[idx, 'category'][0] # index is to remove []

        elif len(row['category']) > 1:
#             print(len(row['category']))
            new_row = row
            for category in row['category']:
                new_row['category'] = category
                new_df = new_df.append(row)
            
            ingredient_df.drop(idx, axis=0, inplace=True) #remove existed row to prepare to append result above
    ingredient_df = ingredient_df.append(new_df)
    ingredient_df = ingredient_df.sort_values('name').reset_index(drop=True)
    return ingredient_df
        

In [47]:
ingredient_single_cat = split_ingredient_category(ingredient_df)
ingredient_single_cat

Copied


Unnamed: 0,name,category,rating,rating_score
0,"1, 2-Hexanediol",Preservatives,Good,2.0
1,10-Hydroxydecanoic Acid,Emollients,Good,2.0
2,4-T-butylcyclohexanol,Skin-Soothing,Good,2.0
3,4-T-butylcyclohexanol,Emollients,Good,2.0
4,AGE,Miscellaneous,Poor,0.0
...,...,...,...,...
3428,zinc stearate,Coloring Agents/Pigments,Good,2.0
3429,zinc stearate,Thickeners/Emulsifiers,Good,2.0
3430,zinc stearate,Texture Enhancer,Good,2.0
3431,zinc sulfate,Preservatives,Poor,0.0



Clean product data

    Drop products that are not "chemical" products, like makeup brushes, cleaning devices.
    Merge some categories.
    Split 'size' column to a number and unit, do unit conversion as necessary
    Compute 'price/size'
    Basic cleaning on ingredients:
        split inactive and active ingredient
        convert ingredients to a list
        find number of inactive and active ingredient
        check if the ingredients are in alphabatical order -- most companies like to list ingredient in a descending order of their quantity in the product, some companies just list ingredients alphabatically.
    Look up ingredients in our ingredient dictionary.
        get a set of all unique ingredients in the products dataframe
        find the match of all these ingredients
        for all product, we loop over its ingredient list and look up the matching ingredient, rating and ingredient category
        count how many ingredients in a product have a certain rating (how many ingredient rated as Good/Average etc.)
        count how many ingredients in a product belongs to a certain category (how many antioxidants/sunscreen etc.)
        compute average ingredient rating. For inactive ingredient, we also consider two kinds of weighted average.



'Emollients'