In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from joblib import dump, load

In [6]:
prods = pd.read_csv('../data/cosmetics.csv')
prods = prods[prods["Name"]!='#NAME?']
prods.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1


In [7]:
prods['Processed Ingredients'] = prods['Ingredients'].apply(lambda x: ' '.join(sorted(x.split(','))))
prods['Processed Ingredients'] = prods['Processed Ingredients'].str.lower()
prods["Processed Ingredients"]

0        alcohol denat.  aluminum distearate  benzyl s...
1        butylene glycol  methylparaben  pentylene gly...
2        1  acetyl glutamine  alanine  arginine  aspar...
3        acetyl carnitine hcl  acetyl hexapeptide-8  a...
4        1  aloe barbadensis leaf extract  aluminum hy...
                              ...                        
1462     alcohol denat.  ascorbyl palmitate  caprylic/...
1463     adenosine  alumina  benzoic acid  butylene gl...
1464     ci 77491 (iron oxides)  caramel  citrus auran...
1465     alpha-isomethyl ionone  ci 14700 (red 4)  ci ...
1466                        visit the dermaflash boutique
Name: Processed Ingredients, Length: 1467, dtype: object

In [11]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))
ingredient_vectors = vectorizer.fit_transform(prods['Processed Ingredients'])
dump(vectorizer,'rec_vectorizer.joblib')
dump(ingredient_vectors,'rec_matrix.joblib')

['rec_matrix.joblib']

In [31]:
similarity_matrix = cosine_similarity(ingredient_vectors)

In [32]:
def get_recs(target,top_n=5):
  target_processed = ' '.join(sorted(target))
  target_vector = vectorizer.transform([target_processed])
  similarities = cosine_similarity(target_vector,ingredient_vectors)[0]
  indices = similarities.argsort()[-top_n:][::-1]
  return prods['Name'].iloc[indices].tolist()

In [33]:
ings = ['acrylates/c10-30 alkyl acrylate crosspolymer',
 'aminomethyl propanol',
 'c13-14 isoparaffin',
 'dimethicone',
 'dimethicone crosspolymer',
 'dimethiconol',
 'disodium edta',
 'dmdm hydantoin',
 'laureth-4',
 'niacinamide',
 'palmitoyl pentapeptide-4',
 'polymethylsilsesquioxane',
 'polysorbate 20',
 'retinol',
 'retinyl propionate',
 'secale cereale seed extract',
 'tapioca starch',
 'titanium dioxide']
get_recs(ings)

['Rénergie Lift Multi-Action Sunscreen Broad Spectrum SPF 15 For All Skin Types',
 'GenOptics Spot Essence Serum',
 'Overnight Miracle Mask',
 'GenOptics Aura Essence Serum',
 'Wrinkle Revenge Eye Balm']