In [1]:
import pandas as pd 
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import pickle 

from collections import Counter
from tqdm import tqdm

In [2]:
import time
start_time = time.time()

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

In [3]:
def clean_string_name(string):
    text = convert_new_ing_list(string)
    text = text.replace(' Size: 4 oz. * USDA Certified Organic Ingredient ** None remains after saponifying oils into soap and glycerin', '')
    pattern = "[\|\*\_\'\{}&]".format('"')
    regex1 = re.compile('\[.*]')
    regex2 = re.compile('\[.*}')
    text = re.sub(pattern, "", text)
    text = re.sub(regex1, "", text)
    text = re.sub(regex2, "", text)
    text = text.strip('\x0c')
    text = text.strip(' ')
    text = text.upper()
    text = text.replace('ORGANIC', '')
    text = text.replace('SUSTAINABLE', '').replace('EXTRA VIRGIN', '').replace('SAPONIFIED', '')
    text = text.replace('AQUA (WATER)', 'WATER').replace('DISTILLED WATER', 'WATER').replace('WATER (AQUA)', 'WATER').replace('AQUA/WATER/EAU', 'WATER')
    text = text.replace('INACTIVE INGREDIENTS:', '')
    text = text.replace('ACTIVE INGREDIENTS:', '')

    return text

In [4]:
mismatch = ['CAVIAR WATER', 'STEM WATER']
def convert_new_ing_list(x):
    trimed = [name.strip(" '").strip("''") for name in x.strip('[]').split(',')]
    res = ''

    for i in range(len(trimed)):
        if i != (len(trimed) -1):
            if trimed[i] in mismatch:
                res = res + 'WATER' + ', '
            else:
                res = res + trimed[i] + ', '
        else: 
            if trimed[i] in mismatch:
                res = res + 'WATER' + ', '
            else:
                res = res + trimed[i]
    return res

# **Load dataset**

In [13]:
df_skin = pd.read_csv('../Database/EWG/ewg_skin_products_transformed_8208.csv')
df_sun = pd.read_csv('../Database/EWG/ewg_sun_products_transformed_1565.csv')
df_makeup = pd.read_csv('../Database/EWG/ewg_makeup_products_transformed_10527.csv')

df_hair = pd.read_csv('../Database/EWG/ewg_hair_products_transformed_4908.csv')
df_or = pd.read_csv('../Database/EWG/ewg_oralcare_products_transformed_937.csv')
df_frag = pd.read_csv('../Database/EWG/ewg_fragrance_products_transformed_484.csv')

In [14]:
df_all = pd.concat([df_skin, df_sun, df_makeup, df_hair, df_or, df_frag])

In [15]:
df_all.drop_duplicates(inplace=True)
df_all['new_ing_list_fixed'] = df_all['new_ing_list'].apply(convert_new_ing_list)
df_all['new_product_score'] = df_all['product_score'].map({'verified': 1,
                                                          '01': 2,
                                                          '02': 3,
                                                          '03': 4,
                                                          '04': 5,
                                                          '05': 6,
                                                          '06': 7,
                                                          '07': 8,
                                                          '08': 9,
                                                          '09': 10,
                                                          '10': 10,})
df_all.reset_index(drop=True, inplace=True)

In [16]:
drop_index = df_all[df_all['new_ing_list'] == '[]'].index.to_list()
df_all.drop(drop_index, inplace=True)
df_all.reset_index(drop=True, inplace=True)

In [17]:
df_all = df_all.sample(frac=1)
df_all.reset_index(drop=True, inplace=True)

In [18]:
df_dataset = df_all[df_all['new_product_score'] <=5]

In [20]:
df_dataset.reset_index(drop=True, inplace=True)

In [21]:
df_dataset

Unnamed: 0,cat_name,subcat_name,product_url,product_brand,product_name,product_score,product_img,ingredient_list,new_ing_list,new_ing_list_fixed,new_product_score
0,Skin,Body Wash/Cleanser,https://www.ewg.org/skindeep/products/877943-D...,,Dead Down Wind Unscented Body & Hair Soap,03,https://static.ewg.org/skindeep_images/8779/87...,"WATER, SODIUM LAURYL SULFATE, COCAMIDOPROPYL B...","['WATER', 'SODIUM LAURYL SULFATE', 'COCAMIDOPR...","WATER, SODIUM LAURYL SULFATE, COCAMIDOPROPYL B...",4
1,Makeup,Lipstick,https://www.ewg.org/skindeep/products/930086-M...,Milk Makeup,"Lip Color, Low Key",04,https://static.ewg.org/skindeep_images/9300/93...,"Ingrdients: Octyldodecanol, Cetyl Ricinoleate,...","['OCTYLDODECANOL', 'CETYL RICINOLEATE', 'KAOLI...","OCTYLDODECANOL, CETYL RICINOLEATE, KAOLIN, BUT...",5
2,Skin,Body Wash/Cleanser,https://www.ewg.org/skindeep/products/920817-D...,Dove,Deep Moisture Shower Foam Body Wash,04,https://static.ewg.org/skindeep_images/9208/92...,"WATER (AQUA), SODIUM LAUROYL GLUTAMATE, GLYCER...","['WATER', 'SODIUM LAUROYL GLUTAMATE', 'GLYCERI...","WATER, SODIUM LAUROYL GLUTAMATE, GLYCERIN, COC...",5
3,Makeup,Lipstick,https://www.ewg.org/skindeep/products/941592-D...,,Danessa Myricks Colorfix 24-Hour Cream Color M...,03,https://phorcys-static.ewg.org/image/contents/...,"Isododecane, Cyclopentasiloxane, Isononyl Ison...","['ISODODECANE', 'CYCLOPENTASILOXANE', 'ISONONY...","ISODODECANE, CYCLOPENTASILOXANE, ISONONYL ISON...",4
4,Skin,Facial Moisturizer/Treatment,https://www.ewg.org/skindeep/products/818761-i...,isoi,Sebum Care Essence Lotion,03,https://static.ewg.org/skindeep_images/8187/81...,"Water, Ethanol*, Glycerin, Cetyl Ethylhexanoat...","['WATER', 'ETHANOLUM', 'GLYCERIN', 'CETYL ETHY...","WATER, ETHANOLUM, GLYCERIN, CETYL ETHYLHEXANOA...",4
...,...,...,...,...,...,...,...,...,...,...,...
17215,Makeup,Lipstick,https://www.ewg.org/skindeep/products/927211-B...,Bitzy,"Matte Lipstick Crayon, Wine Me",02,https://static.ewg.org/skindeep/img/ewg_missin...,"MINERAL OIL , MICA , HYDROGENATED POLYDECENE ,...","['MINERAL OIL', 'MICA', 'HYDROGENATED POLYDECE...","MINERAL OIL, MICA, HYDROGENATED POLYDECENE, OZ...",3
17216,Skin,Mask,https://www.ewg.org/skindeep/products/922230-A...,Acure,Incredibly Clear Charcoal Lemonade Mask,03,https://static.ewg.org/skindeep_images/9222/92...,"WATER, KAOLIN, BUTYROSPERMUM PARKII (SHEA) BUT...","['WATER', 'KAOLIN', 'BUTYROSPERMUM PARKII BUTT...","WATER, KAOLIN, BUTYROSPERMUM PARKII BUTTER, GL...",4
17217,Skin,Makeup Remover,https://www.ewg.org/skindeep/products/671546-A...,AROMATICA,Natural Coconut Cleansing Oil,verified,https://static.ewg.org/skindeep_images/6715/67...,"HELIANTHUS ANNUUS (SUNFLOWER) SEED OIL, POLYGL...","['HELIANTHUS ANNUUS SEED OIL', 'POLYGLYCERYL-2...","HELIANTHUS ANNUUS SEED OIL, POLYGLYCERYL-2 SES...",1
17218,Makeup,Concealer,https://www.ewg.org/skindeep/products/810794-K...,Kaja,"Don't Settle Concealer, Coffee Bean",02,https://static.ewg.org/skindeep_images/8107/81...,"Water, Cyclopentasiloxane, Iron Oxides (CI 774...","['WATER', 'CYCLOPENTASILOXANE', 'IRON OXIDES',...","WATER, CYCLOPENTASILOXANE, IRON OXIDES, IRON O...",3


In [22]:
df_dataset.to_csv('recommendation_pool.csv', index=False)

In [23]:
with open('ingredient_idx.pickle', 'rb') as handle:
    ingredient_idx = pickle.load(handle)

In [24]:
len(ingredient_idx)

7433

In [25]:
corpus = []
for i in tqdm(range(len(df_dataset))):
    ingredients = df_dataset['new_ing_list_fixed'][i]
#     print(ingredients)
    tokens = ingredients.split(', ')    
    corpus.append(tokens)

100%|██████████| 17220/17220 [00:01<00:00, 12184.51it/s]


In [26]:
# Define the oh_encoder function
def oh_encoder(tokens):
    x = np.zeros(N)    
    for ingredient in tokens:
        if ingredient in ingredient_idx.keys():
            idx = ingredient_idx[ingredient]
            x[idx] = 1
        else:
            pass
    return x

In [27]:
# Get the number of items and tokens 
M = len(df_dataset)                 # The number of the items
N = len(ingredient_idx)     # The number of the ingredients

# Initialize a matrix of zeros
A = np.zeros((M, N), dtype=np.uint8)

# Make a document-term matrix
i = 0
for tokens in corpus:
    A[i, :] = oh_encoder(tokens)
    i += 1

In [28]:
A.shape

(17220, 7433)

In [29]:
# columns name
ingredient_idx_key = [key for key in ingredient_idx.keys()]
df_sorted = pd.DataFrame(A, columns = ingredient_idx_key) 

# append to the original df
df_vector = pd.concat([df_dataset, df_sorted], axis=1)

In [30]:
df_vector

Unnamed: 0,cat_name,subcat_name,product_url,product_brand,product_name,product_score,product_img,ingredient_list,new_ing_list,new_ing_list_fixed,...,ALPINIA FORMOSANA SEED POWDER,SARGASSUM FUSIFORME EXTRACT,ISOSTEARAMIDOMORPHOLINE STEARATE,GLUTAMIC ACID FERMENT FILTRATE,APRICOT KERNEL AMINO ACIDS,OROTIC ACID,ACTINIDIA CHINENSIS FRUIT JUICE,CORN ACID,PEANUT ACID,TARAXACUM OFFICINALE
0,Skin,Body Wash/Cleanser,https://www.ewg.org/skindeep/products/877943-D...,,Dead Down Wind Unscented Body & Hair Soap,03,https://static.ewg.org/skindeep_images/8779/87...,"WATER, SODIUM LAURYL SULFATE, COCAMIDOPROPYL B...","['WATER', 'SODIUM LAURYL SULFATE', 'COCAMIDOPR...","WATER, SODIUM LAURYL SULFATE, COCAMIDOPROPYL B...",...,0,0,0,0,0,0,0,0,0,0
1,Makeup,Lipstick,https://www.ewg.org/skindeep/products/930086-M...,Milk Makeup,"Lip Color, Low Key",04,https://static.ewg.org/skindeep_images/9300/93...,"Ingrdients: Octyldodecanol, Cetyl Ricinoleate,...","['OCTYLDODECANOL', 'CETYL RICINOLEATE', 'KAOLI...","OCTYLDODECANOL, CETYL RICINOLEATE, KAOLIN, BUT...",...,0,0,0,0,0,0,0,0,0,0
2,Skin,Body Wash/Cleanser,https://www.ewg.org/skindeep/products/920817-D...,Dove,Deep Moisture Shower Foam Body Wash,04,https://static.ewg.org/skindeep_images/9208/92...,"WATER (AQUA), SODIUM LAUROYL GLUTAMATE, GLYCER...","['WATER', 'SODIUM LAUROYL GLUTAMATE', 'GLYCERI...","WATER, SODIUM LAUROYL GLUTAMATE, GLYCERIN, COC...",...,0,0,0,0,0,0,0,0,0,0
3,Makeup,Lipstick,https://www.ewg.org/skindeep/products/941592-D...,,Danessa Myricks Colorfix 24-Hour Cream Color M...,03,https://phorcys-static.ewg.org/image/contents/...,"Isododecane, Cyclopentasiloxane, Isononyl Ison...","['ISODODECANE', 'CYCLOPENTASILOXANE', 'ISONONY...","ISODODECANE, CYCLOPENTASILOXANE, ISONONYL ISON...",...,0,0,0,0,0,0,0,0,0,0
4,Skin,Facial Moisturizer/Treatment,https://www.ewg.org/skindeep/products/818761-i...,isoi,Sebum Care Essence Lotion,03,https://static.ewg.org/skindeep_images/8187/81...,"Water, Ethanol*, Glycerin, Cetyl Ethylhexanoat...","['WATER', 'ETHANOLUM', 'GLYCERIN', 'CETYL ETHY...","WATER, ETHANOLUM, GLYCERIN, CETYL ETHYLHEXANOA...",...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17215,Makeup,Lipstick,https://www.ewg.org/skindeep/products/927211-B...,Bitzy,"Matte Lipstick Crayon, Wine Me",02,https://static.ewg.org/skindeep/img/ewg_missin...,"MINERAL OIL , MICA , HYDROGENATED POLYDECENE ,...","['MINERAL OIL', 'MICA', 'HYDROGENATED POLYDECE...","MINERAL OIL, MICA, HYDROGENATED POLYDECENE, OZ...",...,0,0,0,0,0,0,0,0,0,0
17216,Skin,Mask,https://www.ewg.org/skindeep/products/922230-A...,Acure,Incredibly Clear Charcoal Lemonade Mask,03,https://static.ewg.org/skindeep_images/9222/92...,"WATER, KAOLIN, BUTYROSPERMUM PARKII (SHEA) BUT...","['WATER', 'KAOLIN', 'BUTYROSPERMUM PARKII BUTT...","WATER, KAOLIN, BUTYROSPERMUM PARKII BUTTER, GL...",...,0,0,0,0,0,0,0,0,0,0
17217,Skin,Makeup Remover,https://www.ewg.org/skindeep/products/671546-A...,AROMATICA,Natural Coconut Cleansing Oil,verified,https://static.ewg.org/skindeep_images/6715/67...,"HELIANTHUS ANNUUS (SUNFLOWER) SEED OIL, POLYGL...","['HELIANTHUS ANNUUS SEED OIL', 'POLYGLYCERYL-2...","HELIANTHUS ANNUUS SEED OIL, POLYGLYCERYL-2 SES...",...,0,0,0,0,0,0,0,0,0,0
17218,Makeup,Concealer,https://www.ewg.org/skindeep/products/810794-K...,Kaja,"Don't Settle Concealer, Coffee Bean",02,https://static.ewg.org/skindeep_images/8107/81...,"Water, Cyclopentasiloxane, Iron Oxides (CI 774...","['WATER', 'CYCLOPENTASILOXANE', 'IRON OXIDES',...","WATER, CYCLOPENTASILOXANE, IRON OXIDES, IRON O...",...,0,0,0,0,0,0,0,0,0,0


# **KNN**

In [31]:
df_vector.iloc[:, 11:]

Unnamed: 0,WATER,PHENOXYETHANOL,GLYCERIN,IRON OXIDES,TITANIUM DIOXIDE,SILICA,TOCOPHERYL ACETATE,TOCOPHEROL,DIMETHICONE,CITRIC ACID,...,ALPINIA FORMOSANA SEED POWDER,SARGASSUM FUSIFORME EXTRACT,ISOSTEARAMIDOMORPHOLINE STEARATE,GLUTAMIC ACID FERMENT FILTRATE,APRICOT KERNEL AMINO ACIDS,OROTIC ACID,ACTINIDIA CHINENSIS FRUIT JUICE,CORN ACID,PEANUT ACID,TARAXACUM OFFICINALE
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17215,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17216,1,0,1,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
17217,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
17218,1,1,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
y = df_vector['new_product_score'].to_numpy()
X = df_vector.iloc[:, 11:].to_numpy()

In [33]:
print('X_train shape: ', X.shape)
print('y_train shape: ', y.shape)

X_train shape:  (17220, 7433)
y_train shape:  (17220,)


In [34]:
from sklearn.neighbors import NearestNeighbors

In [35]:
nbrs = NearestNeighbors(n_neighbors=5).fit(X)

## **Testing**

In [36]:
df_test = pd.read_csv('ewg_all_products_vector.csv')

In [37]:
df_test[df_test['new_product_score'] == 10]

Unnamed: 0,cat_name,subcat_name,product_url,product_brand,product_name,product_score,product_img,ingredient_list,new_ing_list,new_ing_list_fixed,...,ALPINIA FORMOSANA SEED POWDER,SARGASSUM FUSIFORME EXTRACT,ISOSTEARAMIDOMORPHOLINE STEARATE,GLUTAMIC ACID FERMENT FILTRATE,APRICOT KERNEL AMINO ACIDS,OROTIC ACID,ACTINIDIA CHINENSIS FRUIT JUICE,CORN ACID,PEANUT ACID,TARAXACUM OFFICINALE
266,Fragrance,Fragrance For Men,https://www.ewg.org/skindeep/products/892978-A...,Acqua Di Parma,Colonia Essenza,09,https://static.ewg.org/skindeep/img/ewg_missin...,"Alcohol Denat, Water, Fragrance, Limonene, Hyd...","['ALCOHOL DENAT.', 'WATER', 'GRANITE', 'LIMONE...","ALCOHOL DENAT., WATER, GRANITE, LIMONENE, HYDR...",...,0,0,0,0,0,0,0,0,0,0
338,Hair,Hair Color And Bleaching,https://www.ewg.org/skindeep/products/877042-F...,Fright Night,"Colored Hair Spray, Cryptic Blue",10,https://static.ewg.org/skindeep/img/ewg_missin...,"ALCOHOL DENAT., HYDROFLUOROCARBON 152A, POLYAC...","['ALCOHOL DENAT.', 'HYDROFLUOROCARBON 152A', '...","ALCOHOL DENAT., HYDROFLUOROCARBON 152A, POLYAC...",...,0,0,0,0,0,0,0,0,0,0
562,Fragrance,Fragrance For Women,https://www.ewg.org/skindeep/products/882929-M...,Marc Jacobs,"Eau De Toilette Spray, Daisy Love",10,https://static.ewg.org/skindeep_images/8829/88...,"ALCOHOL DENAT., FRAGRANCE, WATER, ETHYLHEXYL M...","['ALCOHOL DENAT.', 'GRANITE', 'WATER', 'ETHYLH...","ALCOHOL DENAT., GRANITE, WATER, ETHYLHEXYL MET...",...,0,0,0,0,0,0,0,0,0,0
752,Fragrance,Fragrance For Men,https://www.ewg.org/skindeep/products/879553-D...,Dolce & Gabbana,The One Grey Eau De Toilette Intense for Men,09,https://static.ewg.org/skindeep_images/8795/87...,"ALCOHOL, PARFUM (FRAGRANCE), DIPROPYLENE GLYCO...","['ALCOHOL', 'DIPROPYLENE GLYCOL', 'CAVIAR WATE...","ALCOHOL, DIPROPYLENE GLYCOL, WATER, LIMONENE, ...",...,0,0,0,0,0,0,0,0,0,0
1052,Sun,Beach & Sport Sunscreen,https://www.ewg.org/skindeep/products/928992-C...,Coppertone,"Sport Sunscreen Lotion, SPF 100",10,https://static.ewg.org/skindeep_images/9289/92...,"Active Ingredients: Avobenzone 3%, Homosalate ...","['AVOBENZONE', 'HOMOSALATE', 'OCTISALATE', 'OC...","AVOBENZONE, HOMOSALATE, OCTISALATE, OCTOCRYLEN...",...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21312,Sun,Beach & Sport Sunscreen,https://www.ewg.org/skindeep/products/928918-B...,Banana Boat,"Kids MAX Clear Sunscreen Spray, SPF 100",10,https://static.ewg.org/skindeep_images/9289/92...,"Active Ingredients: Avobenzone 3.0%, Homosalat...","['AVOBENZONE', 'HOMOSALATE', 'OCTISALATE', 'OC...","AVOBENZONE, HOMOSALATE, OCTISALATE, OCTOCRYLEN...",...,0,0,0,0,0,0,0,0,0,0
21574,Fragrance,Fragrance For Women,https://www.ewg.org/skindeep/products/876763-G...,Gucci,Ladies Eau De Parfum Vaporisateur Natural Spra...,09,https://static.ewg.org/skindeep_images/8767/87...,"ALCOHOL DENAT., PARFUM (FRAGRANCE), AQUA (WATE...","['ALCOHOL DENAT.', 'CAVIAR WATER', 'ETHYLHEXYL...","ALCOHOL DENAT., WATER, ETHYLHEXYL METHOXYCINNA...",...,0,0,0,0,0,0,0,0,0,0
21747,Skin,Facial Moisturizer/Treatment,https://www.ewg.org/skindeep/products/900897-F...,Fourth Ray Beauty,Mellow Milk Mist,09,https://static.ewg.org/skindeep_images/9008/90...,"Water, Butylene Glycol, Glycerin, Propanediol,...","['WATER', 'BUTYLENE GLYCOL', 'GLYCERIN', 'PROP...","WATER, BUTYLENE GLYCOL, GLYCERIN, PROPANEDIOL,...",...,0,0,0,0,0,0,0,0,0,0
21873,Hair,Hair Color And Bleaching,https://www.ewg.org/skindeep/products/877073-F...,Fright Night,"Colored Hair Spray, Rotten Red",10,https://static.ewg.org/skindeep/img/ewg_missin...,"ALCOHOL DENAT., HYDROFLUOROCARBON 152A, POLYAC...","['ALCOHOL DENAT.', 'HYDROFLUOROCARBON 152A', '...","ALCOHOL DENAT., HYDROFLUOROCARBON 152A, POLYAC...",...,0,0,0,0,0,0,0,0,0,0


In [38]:
df_test.iloc[9830, :10]

cat_name                                                         Makeup
subcat_name                                               Facial Powder
product_url           https://www.ewg.org/skindeep/products/811657-B...
product_brand                                                   Benefit
product_name          The POREfessional Agent Zero Shine Shine-Vanis...
product_score                                                        06
product_img           https://static.ewg.org/skindeep_images/8116/81...
ingredient_list       Talc, Nylon-12, Aqua (Water), Magnesium Myrist...
new_ing_list          ['TALC', 'NYLON-12', 'CAVIAR WATER', 'MAGNESIU...
new_ing_list_fixed    TALC, NYLON-12, WATER, MAGNESIUM MYRISTATE, LA...
Name: 9830, dtype: object

In [39]:
test = df_test.iloc[9830, 11:].to_numpy(dtype= np.uint8)
test.shape

(7433,)

In [40]:
distances, indices = nbrs.kneighbors([test])
distances, indices

(array([[3.87298335, 4.        , 4.        , 4.        , 4.        ]]),
 array([[15821, 14812, 16276, 12858, 15169]]))

In [41]:
location = indices.tolist()[0]
location

[15821, 14812, 16276, 12858, 15169]

In [42]:
df_dataset.iloc[location, :]

Unnamed: 0,cat_name,subcat_name,product_url,product_brand,product_name,product_score,product_img,ingredient_list,new_ing_list,new_ing_list_fixed,new_product_score
15821,Makeup,Facial Powder,https://www.ewg.org/skindeep/products/639286-B...,Beautycounter,Mattifying Powder,verified,https://static.ewg.org/skindeep_images/6392/63...,"Zea Mays (Corn) Starch, Lauroyl Lysine, Silica...","['ZEA MAYS STARCH', 'LAUROYL LYSINE', 'SILICA'...","ZEA MAYS STARCH, LAUROYL LYSINE, SILICA, PHENO...",1
14812,Makeup,Facial Powder,https://www.ewg.org/skindeep/products/815606-T...,tarte,"Shape Tape Setting Powder, Translucent Tan-Deep",03,https://static.ewg.org/skindeep_images/8156/81...,"Talc, magnesium myristate, nylon-12, caprylic/...","['TALC', 'MAGNESIUM MYRISTATE', 'NYLON-12', 'C...","TALC, MAGNESIUM MYRISTATE, NYLON-12, CAPRIC TR...",4
16276,Makeup,Facial Powder,https://www.ewg.org/skindeep/products/872193-J...,Juvia's Place,"I AM MAGIC Setting Powder, Namib",04,https://static.ewg.org/skindeep_images/8721/87...,"Talc, Magnesium Myristate, Nylon-12, Zea Mays ...","['TALC', 'MAGNESIUM MYRISTATE', 'NYLON-12', 'Z...","TALC, MAGNESIUM MYRISTATE, NYLON-12, ZEA MAYS ...",5
12858,Makeup,Facial Powder,https://www.ewg.org/skindeep/products/815607-T...,tarte,"Shape Tape Setting Powder, Translucent",03,https://static.ewg.org/skindeep_images/8156/81...,"Talc, magnesium myristate, nylon-12, caprylic/...","['TALC', 'MAGNESIUM MYRISTATE', 'NYLON-12', 'C...","TALC, MAGNESIUM MYRISTATE, NYLON-12, CAPRIC TR...",4
15169,Makeup,Facial Powder,https://www.ewg.org/skindeep/products/872196-J...,Juvia's Place,"I AM MAGIC Setting Powder, Kalahari",04,https://static.ewg.org/skindeep_images/8721/87...,"Talc, Magnesium Myristate, Nylon-12, Zea Mays ...","['TALC', 'MAGNESIUM MYRISTATE', 'NYLON-12', 'Z...","TALC, MAGNESIUM MYRISTATE, NYLON-12, ZEA MAYS ...",5


In [43]:
import pickle

pickle.dump(nbrs, open('recommendation.pkl', 'wb'))

In [152]:
# Your code here
with open('recommendation.pkl', 'rb') as model:
    reload_rf = pickle.load(model)

distances, indices = reload_rf.kneighbors([test])
distances, indices 

(array([[3.87298335, 4.        , 4.        , 4.        , 4.        ]]),
 array([[  592, 16665,  9619, 14174, 17070]]))

# **Model**

In [28]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import NearestNeighbors

class IngredientTokenizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X): # X: nested list
        M = len(X)
        N = len(ingredient_idx)
        A = np.zeros((M, N), dtype=np.uint8)
        i= 0
        for ing_list in X:
            x = np.zeros(N, dtype=np.uint8)
            for ingredient in ing_list:
                # Get the index for each ingredient
                if ingredient in ingredient_idx.keys():
                    idx = ingredient_idx[ingredient]
                    x[idx] = 1
                else:
                    pass
            
            A[i, :] = x
            i += 1
        return A

In [26]:
def column_to_list(df, col_name):
    corpus = []
    for i in tqdm(range(len(df))):
        
        ingredients = df[col_name][i]
        if not isinstance(ingredients, str):
            print(ingredients)
            ingredients = str(ingredients)
        tokens = ingredients.split(', ')    
        corpus.append(tokens)
    return corpus

In [24]:
X = column_to_list(df_all, 'new_ing_list_fixed')

100%|██████████| 23260/23260 [00:02<00:00, 7854.31it/s] 


In [29]:
knn_pipeline = Pipeline([
        ('tokenizer', IngredientTokenizer()),
        ('KNN', NearestNeighbors(n_neighbors=5)),
    ])

In [30]:
knn_pipeline.fit(X)

Pipeline(steps=[('tokenizer', IngredientTokenizer()),
                ('KNN', NearestNeighbors())])

In [37]:
df_all.loc[90, :]

cat_name                                                            Sun
subcat_name                                        Moisturizer With SPF
product_url           https://www.ewg.org/skindeep/products/932934-F...
product_brand                                          First Aid Beauty
product_name               Ultra Repair Tinted Moisturizer, Tan, SPF 30
product_score                                                        03
product_img           https://static.ewg.org/skindeep_images/9329/93...
ingredient_list       Active Ingredients: Avobenzone 3%, Octinoxate ...
new_ing_list          ['AVOBENZONE', 'OCTINOXATE', 'OCTISALATE', 'OC...
new_ing_list_fixed    AVOBENZONE, OCTINOXATE, OCTISALATE, OCTOCRYLEN...
Name: 90, dtype: object

In [41]:
ext = df_all.iloc[90]['new_ing_list_fixed']
test = [[i for i in ext.split(' ')]]
test

[['AVOBENZONE,',
  'OCTINOXATE,',
  'OCTISALATE,',
  'OCTOCRYLENE,',
  'CYCLOPENTASILOXANE,',
  'DIMETHICONE,',
  'CETEARYL',
  'ALCOHOL,',
  'CETEARYL',
  'GLUCOSIDE,',
  'POTASSIUM',
  'CETYL',
  'PHOSPHATE,',
  'POLYGLYCERYL-4',
  'ISOSTEARATE,',
  'DIMETHICONE',
  'CROSSPOLYMER,',
  'TOCOPHERYL',
  'ACETATE,',
  'GLYCERIN,',
  'CETYL',
  'PEG-8',
  'DIMETHICONE,',
  'CETYL',
  'DIMETHICONE,',
  'POLYSORBATE',
  '60,',
  'BUTYLENE',
  'GLYCOL,',
  'HEXYL',
  'LAURATE,',
  'COLLOIDAL',
  'OATMEAL,',
  'PHENOXYETHANOL,',
  'SODIUM',
  'DEHYDROACETATE,',
  'SODIUM',
  'HYALURONATE,',
  'CAMELLIA',
  'SINENSIS',
  'LEAF',
  'EXTRACT,',
  'CARBOMER,',
  'TRIBEHENIN,',
  'LIMONENE,',
  'TETRASODIUM',
  'EDTA,',
  'AMINOMETHYL',
  'PROPANOL,',
  'CITRUS',
  'NOBILIS',
  'PEEL',
  'OIL,',
  'ALLANTOIN,',
  'SODIUM',
  'ASCORBATE,',
  'GLYCYRRHIZA',
  'GLABRA',
  'ROOT',
  'EXTRACT,',
  'CHRYSANTHEMUM',
  'PARTHENIUM',
  'FLOWER',
  'EXTRACT,',
  'MICA']]

In [139]:
distances, indices = knn_pipeline.kneighbors([test])
distances, indices

NameError: name 'knn_pipeline' is not defined

In [None]:
df_all.iloc[90,:]

In [None]:
location = indices.tolist()[0]
df_all.iloc[location, :]