In [12]:
import pandas as pd 
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import pickle 

from collections import Counter
from tqdm import tqdm

In [2]:
import time
start_time = time.time()

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

In [3]:
def clean_string_name(string):
    text = string.replace(' Size: 4 oz. * USDA Certified Organic Ingredient ** None remains after saponifying oils into soap and glycerin', '')
    pattern = "[\|\*\_\'\{}&]".format('"')
    regex1 = re.compile('\[.*]')
    regex2 = re.compile('\[.*}')
    text = re.sub(pattern, "", text)
    text = re.sub(regex1, "", text)
    text = re.sub(regex2, "", text)
    text = text.strip('\x0c')
    text = text.strip(' ')
    text = text.upper()
    text = text.replace('INACTIVE INGREDIENTS:', '')
    text = text.replace('ACTIVE INGREDIENTS:', '')

    return text

In [8]:
mismatch = ['CAVIAR WATER', 'STEM WATER']
def convert_new_ing_list(x):
    trimed = [name.strip(" '").strip("''") for name in x.strip('[]').split(',')]
    res = ''

    for i in range(len(trimed)):
        if i != (len(trimed) -1):
            if trimed[i] in mismatch:
                res = res + 'WATER' + ', '
            else:
                res = res + trimed[i] + ', '
        else: 
            if trimed[i] in mismatch:
                res = res + 'WATER' + ', '
            else:
                res = res + trimed[i]
    return res

# **Load dataset**

In [15]:
df_skin = pd.read_csv('../Database/EWG/ewg_skin_products_transformed_8208.csv')
df_sun = pd.read_csv('../Database/EWG/ewg_sun_products_transformed_1565.csv')
df_makeup = pd.read_csv('../Database/EWG/ewg_makeup_products_transformed_10527.csv')

df_hair = pd.read_csv('../Database/EWG/ewg_hair_products_transformed_4908.csv')
df_or = pd.read_csv('../Database/EWG/ewg_oralcare_products_transformed_937.csv')
df_frag = pd.read_csv('../Database/EWG/ewg_fragrance_products_transformed_484.csv')

In [16]:
df_all = pd.concat([df_skin, df_sun, df_makeup, df_hair, df_or, df_frag])

In [17]:
df_all.drop_duplicates(inplace=True)
df_all['new_ing_list_fixed'] = df_all['new_ing_list'].apply(convert_new_ing_list)
df_all = df_all.sample(frac=1)

In [18]:
df_all.reset_index(drop=True, inplace=True)

In [13]:
with open('ingredient_idx.pickle', 'rb') as handle:
    ingredient_idx = pickle.load(handle)

In [14]:
len(ingredient_idx)

7433

# **Model**

In [28]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import NearestNeighbors

class IngredientTokenizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X): # X: nested list
        M = len(X)
        N = len(ingredient_idx)
        A = np.zeros((M, N), dtype=np.uint8)
        i= 0
        for ing_list in X:
            x = np.zeros(N, dtype=np.uint8)
            for ingredient in ing_list:
                # Get the index for each ingredient
                if ingredient in ingredient_idx.keys():
                    idx = ingredient_idx[ingredient]
                    x[idx] = 1
                else:
                    pass
            
            A[i, :] = x
            i += 1
        return A

In [26]:
def column_to_list(df, col_name):
    corpus = []
    for i in tqdm(range(len(df))):
        
        ingredients = df[col_name][i]
        if not isinstance(ingredients, str):
            print(ingredients)
            ingredients = str(ingredients)
        tokens = ingredients.split(', ')    
        corpus.append(tokens)
    return corpus

In [24]:
X = column_to_list(df_all, 'new_ing_list_fixed')

100%|██████████| 23260/23260 [00:02<00:00, 7854.31it/s] 


In [29]:
knn_pipeline = Pipeline([
        ('tokenizer', IngredientTokenizer()),
        ('KNN', NearestNeighbors(n_neighbors=5)),
    ])

In [30]:
knn_pipeline.fit(X)

Pipeline(steps=[('tokenizer', IngredientTokenizer()),
                ('KNN', NearestNeighbors())])

In [37]:
df_all.loc[90, :]

cat_name                                                            Sun
subcat_name                                        Moisturizer With SPF
product_url           https://www.ewg.org/skindeep/products/932934-F...
product_brand                                          First Aid Beauty
product_name               Ultra Repair Tinted Moisturizer, Tan, SPF 30
product_score                                                        03
product_img           https://static.ewg.org/skindeep_images/9329/93...
ingredient_list       Active Ingredients: Avobenzone 3%, Octinoxate ...
new_ing_list          ['AVOBENZONE', 'OCTINOXATE', 'OCTISALATE', 'OC...
new_ing_list_fixed    AVOBENZONE, OCTINOXATE, OCTISALATE, OCTOCRYLEN...
Name: 90, dtype: object

In [41]:
ext = df_all.iloc[90]['new_ing_list_fixed']
test = [[i for i in ext.split(' ')]]
test

[['AVOBENZONE,',
  'OCTINOXATE,',
  'OCTISALATE,',
  'OCTOCRYLENE,',
  'CYCLOPENTASILOXANE,',
  'DIMETHICONE,',
  'CETEARYL',
  'ALCOHOL,',
  'CETEARYL',
  'GLUCOSIDE,',
  'POTASSIUM',
  'CETYL',
  'PHOSPHATE,',
  'POLYGLYCERYL-4',
  'ISOSTEARATE,',
  'DIMETHICONE',
  'CROSSPOLYMER,',
  'TOCOPHERYL',
  'ACETATE,',
  'GLYCERIN,',
  'CETYL',
  'PEG-8',
  'DIMETHICONE,',
  'CETYL',
  'DIMETHICONE,',
  'POLYSORBATE',
  '60,',
  'BUTYLENE',
  'GLYCOL,',
  'HEXYL',
  'LAURATE,',
  'COLLOIDAL',
  'OATMEAL,',
  'PHENOXYETHANOL,',
  'SODIUM',
  'DEHYDROACETATE,',
  'SODIUM',
  'HYALURONATE,',
  'CAMELLIA',
  'SINENSIS',
  'LEAF',
  'EXTRACT,',
  'CARBOMER,',
  'TRIBEHENIN,',
  'LIMONENE,',
  'TETRASODIUM',
  'EDTA,',
  'AMINOMETHYL',
  'PROPANOL,',
  'CITRUS',
  'NOBILIS',
  'PEEL',
  'OIL,',
  'ALLANTOIN,',
  'SODIUM',
  'ASCORBATE,',
  'GLYCYRRHIZA',
  'GLABRA',
  'ROOT',
  'EXTRACT,',
  'CHRYSANTHEMUM',
  'PARTHENIUM',
  'FLOWER',
  'EXTRACT,',
  'MICA']]

In [44]:
distances, indices = knn_pipeline.kneighbors([test])
distances, indices

AttributeError: 'NearestNeighbors' object has no attribute 'predict'

In [None]:
df_all.iloc[90,:]

In [None]:
location = indices.tolist()[0]
df_all.iloc[location, :]