In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from math import pi

from bokeh.io import show, output_file
from bokeh.layouts import row
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource

# Read csvs

These are the csv's we cleaned up and combined in step 01. 

In [51]:
ings = pd.read_csv('data/ings.csv', encoding = "ISO-8859-1", index_col=0)
prods = pd.read_csv('data/prods.csv', encoding = "ISO-8859-1", index_col=0)
prod_ing = pd.read_csv('data/prod_ing.csv', encoding = "ISO-8859-1", index_col=0)
prod_ing = prod_ing.drop(['diff'], axis = 1)
print('Number of products: ', len(prods))
print('Number of unique ingredients: ', len(ings))
print('Number of total ingredients: ', len(prod_ing))

Number of products:  16438
Number of unique ingredients:  7325
Number of total ingredients:  545640


### Ings
Ings df is just a database of unique ingredients and their harmful score, function and notes.

In [52]:
#caught a rogue ingredient
ings['ingredient'] = ings['ingredient'].replace('Stearic Acid(Masking, Fragrance, Emulsion Stabilising, Emulsifying, Sufactant, Refatting, Surfactantsurfactant-Cleansing Agent Is Included As A Function For The Soap Form Of Stearic Acid.', 'Stearic Acid')
ings = ings.drop_duplicates(subset = 'ingredient')
print('Number of unique ingredients: ', len(ings))
ings.head()

Number of unique ingredients:  7324


Unnamed: 0,id,ingredient,ewg,cir,func_Abrasive,func_Antimicrobial,func_Antioxidant,func_Antistatic Agent,func_Astringent,func_Binding,...,notes_Comedogenic Rating (3),notes_Comedogenic Rating (4),notes_Comedogenic Rating (5),notes_Good for Dry Skin,notes_Good for Oily Skin,notes_Good for Sensitive Skin,notes_Paraben,notes_Promotes Wound Healing,notes_Sulfate,notes_UV Protection
0,0,Water,1,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,Hydrogenated Polyisobutene,1,A,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,Butylene Glycol,1,A,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,Sesamum Indicum (Sesame) Seed Oil,1,A,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,Ammonium Acryloyldimethyltaurate/VP Copolymer,1,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
ing_uniqueID = ings.loc[:,['ingredient']].reset_index()
ing_uniqueID = ing_uniqueID.drop(['index'], axis = 1)
ing_uniqueID['uniqueID'] = ing_uniqueID.index
ing_uniqueID.tail(3)

Unnamed: 0,ingredient,uniqueID
7321,Viola Odorata Leaf Wax,7321
7322,Starch Acetate,7322
7323,C12-18 Acid Triglyceride,7323


### Prods
Prods df is a database of all the products with various information including brand, price and rating

In [54]:
prods.head(3)

Unnamed: 0,id,product,brand,price,rating,ratingsCount,cat_Acne&BlemishTreatments,cat_Ampoules,cat_BodySkincare,cat_Cleansers,...,note_AlcoholFree,note_Anti-Aging,note_Brightening,note_ContainsAlcohol,note_ContainsParaben,note_ContainsSulfate,note_ParabenFree,note_PromotesWoundHealing,note_SulfateFree,note_UVProtection
0,0,Naturally Gentle Eye Makeup Remover,Clinique,20.0,5.0,16,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
2,2,Foot Cream Norwegian Formula,Neutrogena,23.82,4.9,84,0,0,0,0,...,1,0,1,0,1,0,0,1,1,0
3,3,Essential Power Skin Refiner_Moisture,Laneige,28.2,5.0,12,0,0,0,0,...,0,0,1,1,0,0,1,1,1,0


### Prod_ing
Prod ing is a database that essentiall links prods and ings. it contains the product id, ingredient and ingredient order.

In [55]:
prod_ing.tail(3)

Unnamed: 0,id,ingredient,order
545637,19926,Sodium Citrate,42
545638,19926,Disodium EDTA,43
545639,19926,Phenoxyethanol,44


### Add a column that is a unique ingredient identifier

In [56]:
#id is product id
#uniqueID is ingredient ID
prod_ing = pd.merge(prod_ing, ing_uniqueID, on='ingredient')
prod_ing.tail(3)

Unnamed: 0,id,ingredient,order,uniqueID
543453,19873,Viola Odorata Leaf Wax,12,7321
543454,19874,Starch Acetate,8,7322
543455,19875,C12-18 Acid Triglyceride,9,7323


## Reformatting Prod_ing
Instead of having each ingredient in it's own row, we want to create a list of ingredients for each product while retaining the order.

In [58]:
#cleanup of a rogue ingredient here as well
prod_ing['ingredient'] = prod_ing['ingredient'].replace('Stearic Acid(Masking, Fragrance, Emulsion Stabilising, Emulsifying, Sufactant, Refatting, Surfactantsurfactant-Cleansing Agent Is Included As A Function For The Soap Form Of Stearic Acid.', 'Stearic Acid')

##group and create list
prod_ing_lists = prod_ing.groupby('id')['ingredient'].apply(list)
prod_ing_ID_lists = prod_ing.groupby('id')['uniqueID'].apply(list)

##convert back to dataframe and reset index
prod_ing_df = prod_ing_lists.to_frame().reset_index()
prod_ing_ID_df = prod_ing_ID_lists.to_frame().reset_index()
##check to make sure unique id's are still in tact (id shouldn't be exactly == to index)
prod_ing_ID_df.tail()

Unnamed: 0,id,uniqueID
19858,19922,"[0, 1, 15, 16, 23, 26, 27, 43, 45, 50, 135, 14..."
19859,19923,"[0, 2, 5, 15, 16, 23, 26, 27, 45, 46, 49, 50, ..."
19860,19924,"[0, 2, 23, 53, 61, 97, 109, 143, 150, 160, 189..."
19861,19925,"[45, 98, 100, 102, 119, 120, 135, 136, 137, 13..."
19862,19926,"[0, 2, 5, 15, 16, 23, 41, 50, 73, 85, 87, 109,..."


In [59]:
prod_ing_df.tail()

Unnamed: 0,id,ingredient
19858,19922,"[Water, Hydrogenated Polyisobutene, Disodium E..."
19859,19923,"[Water, Butylene Glycol, Sucrose, Disodium EDT..."
19860,19924,"[Water, Butylene Glycol, Glycerin, Allantoin, ..."
19861,19925,"[Tocopherol (Vitamin E, Tocopheryl Acetate, Be..."
19862,19926,"[Water, Butylene Glycol, Sucrose, Disodium EDT..."


In [65]:
prod_ing_lists = pd.merge(prod_ing_df, prod_ing_ID_df, on='id')
prod_ing_lists = prod_ing_lists.rename(columns = {'ingredient': 'ingList', 'uniqueID': 'ing#List' })
prod_ing_lists.head()

Unnamed: 0,id,ingList,ing#List
0,0,"[Water, Hydrogenated Polyisobutene, Butylene G...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,1,"[Disodium EDTA, Phenoxyethanol, Mineral Oil, G...","[15, 16, 22, 23, 24, 25, 26, 27, 28, 29, 30, 3..."
2,2,"[Water, Sodium Chloride, Methylparaben, Propyl...","[0, 13, 17, 19, 23, 46, 47, 48, 49, 50, 51, 52..."
3,3,"[Water, Butylene Glycol, Sucrose, Arginine, Di...","[0, 2, 5, 9, 15, 16, 23, 49, 62, 63, 64, 65, 6..."
4,4,"[Butylene Glycol, Disodium EDTA, Phenoxyethano...","[2, 15, 16, 17, 19, 23, 33, 47, 49, 92, 93, 94..."


## Merge prod_ings and prods
We can now use the list to merge back with the products.

In [97]:
products_and_ingredients= pd.merge(prod_ing_lists, prods, on='id')
products_and_ingredients.tail(3)

Unnamed: 0,id,ingList,ing#List,product,brand,price,rating,ratingsCount,cat_Acne&BlemishTreatments,cat_Ampoules,...,note_AlcoholFree,note_Anti-Aging,note_Brightening,note_ContainsAlcohol,note_ContainsParaben,note_ContainsSulfate,note_ParabenFree,note_PromotesWoundHealing,note_SulfateFree,note_UVProtection
16435,19923,"[Water, Butylene Glycol, Sucrose, Disodium EDT...","[0, 2, 5, 15, 16, 23, 26, 27, 45, 46, 49, 50, ...",Bright Lover Rubber Mask,Dr. Jart+,12.51,2.3,4,0,0,...,1,1,1,0,0,0,1,1,1,1
16436,19925,"[Tocopherol (Vitamin E, Tocopheryl Acetate, Be...","[45, 98, 100, 102, 119, 120, 135, 136, 137, 13...",Born Lippy Balms,The Body Shop,7.0,2.0,3,0,0,...,1,1,1,0,0,0,1,1,1,1
16437,19926,"[Water, Butylene Glycol, Sucrose, Disodium EDT...","[0, 2, 5, 15, 16, 23, 41, 50, 73, 85, 87, 109,...",Fresh Pressed Daily Booster With Pure Vitamin ...,Clinique,22.99,2.0,3,0,0,...,1,1,1,0,0,0,1,0,1,0


### Add column for number of ingredients

In [98]:
products_and_ingredients['ingCount'] = products_and_ingredients['ingList'].apply(lambda x: len(x))
products_and_ingredients.head(3)

Unnamed: 0,id,ingList,ing#List,product,brand,price,rating,ratingsCount,cat_Acne&BlemishTreatments,cat_Ampoules,...,note_Anti-Aging,note_Brightening,note_ContainsAlcohol,note_ContainsParaben,note_ContainsSulfate,note_ParabenFree,note_PromotesWoundHealing,note_SulfateFree,note_UVProtection,ingCount
0,0,"[Water, Hydrogenated Polyisobutene, Butylene G...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",Naturally Gentle Eye Makeup Remover,Clinique,20.0,5.0,16,0,0,...,0,0,0,1,0,0,0,1,0,22
1,2,"[Water, Sodium Chloride, Methylparaben, Propyl...","[0, 13, 17, 19, 23, 46, 47, 48, 49, 50, 51, 52...",Foot Cream Norwegian Formula,Neutrogena,23.82,4.9,84,0,0,...,0,1,0,1,0,0,1,1,0,20
2,3,"[Water, Butylene Glycol, Sucrose, Arginine, Di...","[0, 2, 5, 9, 15, 16, 23, 49, 62, 63, 64, 65, 6...",Essential Power Skin Refiner_Moisture,Laneige,28.2,5.0,12,0,0,...,0,1,1,0,0,1,1,1,0,38


In [112]:
products_and_ingredients['ing#List'] = products_and_ingredients['ing#List'].apply(lambda x: str(x))
products_and_ingredients['ing#List'] = products_and_ingredients['ing#List'].str.replace(',', '')
products_and_ingredients['ing#List'] = products_and_ingredients['ing#List'].str.replace('[', '')
products_and_ingredients['ing#List'] = products_and_ingredients['ing#List'].str.replace(']', '')
products_and_ingredients = products_and_ingredients.drop_duplicates(subset = 'product')

products_and_ingredients.head()

Unnamed: 0,id,ingList,ing#List,product,brand,price,rating,ratingsCount,cat_Acne&BlemishTreatments,cat_Ampoules,...,note_Anti-Aging,note_Brightening,note_ContainsAlcohol,note_ContainsParaben,note_ContainsSulfate,note_ParabenFree,note_PromotesWoundHealing,note_SulfateFree,note_UVProtection,ingCount
0,0,"[Water, Hydrogenated Polyisobutene, Butylene G...",0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...,Naturally Gentle Eye Makeup Remover,Clinique,20.0,5.0,16,0,0,...,0,0,0,1,0,0,0,1,0,22
1,2,"[Water, Sodium Chloride, Methylparaben, Propyl...",0 13 17 19 23 46 47 48 49 50 51 52 53 54 55 56...,Foot Cream Norwegian Formula,Neutrogena,23.82,4.9,84,0,0,...,0,1,0,1,0,0,1,1,0,20
2,3,"[Water, Butylene Glycol, Sucrose, Arginine, Di...",0 2 5 9 15 16 23 49 62 63 64 65 66 67 68 69 70...,Essential Power Skin Refiner_Moisture,Laneige,28.2,5.0,12,0,0,...,0,1,1,0,0,1,1,1,0,38
3,4,"[Butylene Glycol, Disodium EDTA, Phenoxyethano...",2 15 16 17 19 23 33 47 49 92 93 94 95 96 97 98...,Daily Facials Gentle Clean 4-in-1 Water Activa...,Olay,32.25,4.9,17,0,0,...,1,0,0,1,0,0,1,1,0,21
4,5,"[Water, Butylene Glycol, Methylparaben, Glycer...",0 2 17 23 104 105 106 107 108 109 110,Gokujyun Lotion Refill Pouch,Hada Labo,13.26,4.9,55,0,0,...,0,0,0,1,0,0,1,1,0,11


# Reformatting product + ingredients

In [99]:
products_and_ingredients.iloc[1,2]

[0, 13, 17, 19, 23, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61]

In [96]:
import re, math
from collections import Counter
import numpy as np

text1 = products_and_ingredients.iloc[1,2]
text2 = products_and_ingredients.iloc[2,2]

class Similarity():
    def compute_cosine_similarity(self, string1, string2):
         # intersects the words that are common
         # in the set of the two words
         intersection = set(string1.keys()) & set(string2.keys())
         # dot matrix of vec1 and vec2
         numerator = sum([string1[x] * string2[x] for x in intersection])

         # sum of the squares of each vector
         # sum1 is the sum of text1 and same for sum2 for text2
         sum1 = sum([string1[x]**2 for x in string1.keys()])
         sum2 = sum([string2[x]**2 for x in string2.keys()])

         # product of the square root of both sum(s)
         denominator = math.sqrt(sum1) * math.sqrt(sum2)
         if not denominator:
            return 0.0
         else:
            return round(numerator/float(denominator),4)

    def text_to_vector(self,text):
        WORD = re.compile(r'\w+')
        words = WORD.findall(text)
        return Counter(words)

    # Jaccard Similarity
    def tokenize(self,string):
        return string.lower().split(" ")

    def jaccard_similarity(self, string1, string2):
        intersection = set(string1).intersection(set(string2))
        union = set(string1).union(set(string2))
        return len(intersection)/float(len(union))

similarity = Similarity()

# vector space
vector1 = similarity.text_to_vector(text1)
vector2 = similarity.text_to_vector(text2)

# split words into tokens
token1 = similarity.tokenize(text1)
token2 = similarity.tokenize(text2)

cosine = similarity.compute_cosine_similarity(vector1, vector2)
print ('Cosine Similarity:', cosine)

jaccard = similarity.jaccard_similarity(token1,token2)
print( 'Jaccard Similarity:', jaccard)

Cosine Similarity: 0.1088
Jaccard Similarity: 0.05454545454545454


In [113]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer()

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(products_and_ingredients['ing#List'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(15950, 6890)

In [114]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [115]:
indices = pd.Series(products_and_ingredients.index, index=products_and_ingredients['product']).drop_duplicates()

In [119]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(product, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[product]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    product_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return products_and_ingredients[['product', 'ingList']].iloc[product_indices]

In [120]:
get_recommendations('Naturally Gentle Eye Makeup Remover')

Unnamed: 0,product,ingList
5632,On-The-Go Pro Performance Makeup Wipes,"[Water, Phenoxyethanol, Methylparaben, Ethylpa..."
4969,Eye Makeup Remover Pads,"[Water, Butylene Glycol, Phenoxyethanol, Methy..."
3029,Expert Anti-Blemish Toner,"[Water, Disodium EDTA, Phenoxyethanol, Methylp..."
4719,Skin Brightening Deep Clean Gel,"[Water, Sodium Chloride, Disodium Phosphate, P..."
9289,Gentle Creme Eye Makeup Remover,"[Water, Acrylates/C10-30 Alkyl Acrylate Crossp..."
3173,Expert Anti-Blemish Night Moisturizer,"[Water, Acrylates/C10-30 Alkyl Acrylate Crossp..."
16226,Daily Moisture Therapy Cleanser,"[Water, Disodium Phosphate, Methylparaben, Pro..."
4260,Vitamin E Face Mist,"[Water, Butylene Glycol, Phenoxyethanol, Methy..."
3069,Expert Sensitive Hydrating Serum,"[Water, Disodium EDTA, Phenoxyethanol, Methylp..."
3979,Micro-Exfoliating Scrub,"[Water, Disodium EDTA, Phenoxyethanol, Methylp..."


In [118]:
products_and_ingredients['ingList'].iloc['On-The-Go Pro Performance Makeup Wipes']

TypeError: cannot do positional indexing on <class 'pandas.indexes.numeric.Int64Index'> with these indexers [On-The-Go Pro Performance Makeup Wipes] of <class 'str'>