## libraries

In [1]:
#!pip install nltk
#!pip install textdistance
import pandas as pd
import regex
import string
import nltk
from nltk.corpus import stopwords
import textdistance as td
import numpy as np

## cleaned dataset

In [2]:
# Load the raw Sephora cosmetics table and strip rows whose "Ingredients" text
# is a placeholder rather than a real ingredient list.
#
# Rows are removed with explicit boolean masks. The previous
# concat(...).drop_duplicates(keep=False) anti-join also deleted *every* copy of
# any product row that happened to be duplicated in the CSV, which was never the
# intent. The original (non-contiguous) index labels are preserved.
df = pd.read_csv('https://raw.githubusercontent.com/satyam9090/Comparing-Cosmetics-by-Ingredients/master/datasets/cosmetics.csv')

# 1) "Visit <brand site> for ingredients" style placeholders.
is_visit = df.Ingredients.str.startswith("Visit")
starts_with_visit = df[is_visit]
df = df[~is_visit]
assert not df.Ingredients.str.startswith("Visit").any()  # sanity check: none left

# 2) Spreadsheet artefacts and explicit "no information" markers.
df = df[~df.Ingredients.isin(["#NAME?", "No Info"])]

# 3) Legal disclaimers in place of an ingredient list.
is_subject_to_change = df.Ingredients.str.startswith("This ingredient list is subject to change")
starts_with_st_change = df[is_subject_to_change]
df = df[~is_subject_to_change]

# 4) Footnote-only entries (text starting with an asterisk).
is_asterisk = df.Ingredients.str.startswith("*")
starts_with_asteriks = df[is_asterisk]
df = df[~is_asterisk]

# 5) One marketing blurb that slipped into the ingredients column.
df = df[df.Ingredients != '-4MSK: Helps remove melanin trapped in dark spots and ion force complex works to slough off the surface cells that contain melanin.']

## test variables

In [55]:
# Sample query used to exercise the functions below.
# `ingredients` is a comma-separated ingredient list; `label` and `skintype`
# are lists of answers in which the literal 'Not sure' means "unspecified".
ingredients = (
    "Deionized, water, Aqua, SD, Alcohol, 40, Propanediol, Salicylic, Acid, "
    "Citric, Acid, Glycerin, Potassium, Sorbate, Sodium, Benzoate, Solanum, "
    "Lycopersicum, Tomato, Fruit, Leaf, Stem, Extract, Cellulose, "
    "Hydroxyethyicellulose, lsopropyl, Alcohol, Sodium, acetate, Polysorbate, 80, "
    "Sulfur, PEG, 12, Dimethicone, Acrylates, C10, 30, Alkyl, Acrylate, "
    "Crosspolymer, Ethylhexylglycerin, Phenoxyethanol, Triethanolamine, "
    "Butylene, Glycol, Polysorbate, 20, Micrococcus, Lysate, Sodium, Caproyl, "
    "Lauroyl, Lactylate, Triethyl, Citrate, Benzoic, Acid, Chlorphenesin, "
    "Allantoin, Tocopheryl, Acetate"
)
label = ["Moisturizer", "Not sure"]
skintype = ["Oily", "Dry", "Not sure"]

## cleaning the string

In [4]:
# Make sure the NLTK stop-word corpus is available, then load the English list
# that clean_string() filters tokens against.
nltk.download('stopwords')
stopwords_ = stopwords.words('english')
def clean_string(text):
    """Strip punctuation and stop words from ``text``.

    Every character found in ``string.punctuation`` is removed first. The
    remainder is split on whitespace, tokens that appear in the module-level
    ``stopwords_`` list are discarded (exact, case-sensitive comparison), and
    the surviving tokens are joined back together with single spaces.

    Parameters
    ----------
    text : str
        Raw text, e.g. a comma-separated ingredient list.

    Returns
    -------
    str
        The cleaned, space-separated text.
    """
    without_punctuation = ''.join(
        filter(lambda char: char not in string.punctuation, text)
    )
    kept_tokens = []
    for token in without_punctuation.split():
        if token in stopwords_:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)
# Normalise the sample ingredient list and print it for a visual check.
cleaned = clean_string(ingredients)
print(cleaned)

Deionized water Aqua SD Alcohol 40 Propanediol Salicylic Acid Citric Acid Glycerin Potassium Sorbate Sodium Benzoate Solanum Lycopersicum Tomato Fruit Leaf Stem Extract Cellulose Hydroxyethyicellulose lsopropyl Alcohol Sodium acetate Polysorbate 80 Sulfur PEG 12 Dimethicone Acrylates C10 30 Alkyl Acrylate Crosspolymer Ethylhexylglycerin Phenoxyethanol Triethanolamine Butylene Glycol Polysorbate 20 Micrococcus Lysate Sodium Caproyl Lauroyl Lactylate Triethyl Citrate Benzoic Acid Chlorphenesin Allantoin Tocopheryl Acetate


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## function

In [5]:
# Indicator columns of ``df``; a value of 1 is treated as "suits this skin type".
_SKIN_COLUMNS = ['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']
# Minimum best-match similarity (in percent) required before any verdict is given.
_MIN_MATCH_PERCENT = 60


def _skin_types_per_row(frame):
    """Return, for each row of ``frame``, the list of skin-type columns equal to 1."""
    return [
        [name for name, flag in zip(_SKIN_COLUMNS, flags) if flag == 1]
        for flags in frame[_SKIN_COLUMNS].itertuples(index=False, name=None)
    ]


def _closest_products(frame, cleaned_text):
    """Return ``(best_score, rows)``: the top Sorensen similarity and every row reaching it."""
    scores = np.array(
        [td.sorensen.normalized_similarity(cleaned_text, listed) for listed in frame.Ingredients],
        dtype=float,
    )
    if scores.size == 0:
        # No candidate products at all (e.g. unknown label): report "no match".
        return 0.0, frame
    best_score = scores.max()
    # Boolean mask selection is positional, so it is correct whatever the
    # frame's index labels are.
    return float(best_score), frame.loc[scores == best_score]


def cosmetics_with_ingredients(ingredients, label, skintype):
    """Match an ingredient list against the catalogue ``df`` and report the result.

    The ingredient text is cleaned with ``clean_string`` and compared with the
    ``Ingredients`` column of every candidate product using
    ``td.sorensen.normalized_similarity``. Only the product(s) with the highest
    similarity are considered, and only if that similarity is at least 60 %.
    What is reported depends on which answers are ``'Not sure'``:

    * label and skin type unknown - display a styled table of the best-matching
      products that cover the largest number of skin types;
    * label known, skin type unknown - print the skin types covered by the
      best-matching products (within the given labels) that cover the most
      skin types;
    * label and skin type known - print whether a best-matching product within
      the given labels suits every requested skin type;
    * label unknown, skin type known - same check against the whole catalogue.

    Parameters
    ----------
    ingredients : str
        Raw ingredient list; it is cleaned inside the function.
    label : list of str
        Product categories to search, or a list containing ``'Not sure'``.
    skintype : list of str
        Skin types the user has, or a list containing ``'Not sure'``.

    Returns
    -------
    None
        Results are printed / displayed. The global ``df`` is not modified.
    """
    label_unknown = 'Not sure' in label
    skin_unknown = 'Not sure' in skintype

    # Use the text that was actually passed in, not a module-level leftover.
    cleaned_text = clean_string(ingredients)
    candidates = df if label_unknown else df[df.Label.isin(label)]
    best_score, best_rows = _closest_products(candidates, cleaned_text)

    if best_score * 100 < _MIN_MATCH_PERCENT:
        print('Sorry, we are not familiar with such a combination of cosmetics.')
        return

    skins_per_row = _skin_types_per_row(best_rows)

    if skin_unknown:
        # Among equally similar products prefer those covering the most skin
        # types (compare by *count*, not by lexicographic order of the names).
        widest = max(len(skins) for skins in skins_per_row)
        is_widest = np.array([len(skins) == widest for skins in skins_per_row])

        if label_unknown:
            final = best_rows.loc[is_widest].reset_index(drop=True)
            final = final.style.set_properties(**{'background-color': 'thistle',
                                                  'color': 'navy',
                                                  'border-color': 'purple4'})
            print("Here is the table with the most matched ingredients that can help you make a decision")
            display(final)
        else:
            covered = {
                skin
                for skins, keep in zip(skins_per_row, is_widest) if keep
                for skin in skins
            }
            # Emit in fixed column order so the message is deterministic.
            final_skins = ', '.join(name for name in _SKIN_COLUMNS if name in covered)
            print('The given cosmetics matches with the following skin types:', final_skins)
        return

    # Skin type is known: does any best-matching product suit all requested types?
    wanted = set(skintype)
    fits = any(wanted.issubset(skins) for skins in skins_per_row)
    percent = int(best_score * 100)

    if not fits:
        print('This cosmetics does not match your skin type, please use our first page to get a recommendation from us')
    elif label_unknown:
        print('These ingredients are okay for your skin type! You can use the cosmetics with the given ingredients')
        print('The cosmetics matches with your skin with approximately', percent, '%')
    else:
        print('This cosmetics fits your skin type with', percent, '%')

## testing function


In [6]:
# Run the ingredient matcher on the sample inputs defined in the "test variables" cell.
cosmetics_with_ingredients(ingredients, label, skintype)

These ingredients are okay for your skin type! You can use the cosmetics with the given ingredients
The cosmetics matches with your skin with approximately 87 %


## what to improve
- When ingredients are given but both the skin type and the label are "Not sure", the function does not work properly.


In [60]:
def cosmetics(label, skintype):
    """Display a styled table of top-rated products for the given answers.

    Only products of ``df`` with ``Rank >= 4.5`` are considered. When ``label``
    does not contain ``'Not sure'`` the search is restricted to those labels.
    Products are then selected by skin type:

    * skin type known - keep products whose skin-type flags (columns equal to
      1) include every requested skin type;
    * skin type ``'Not sure'`` - keep the products flagged for the largest
      number of skin types.

    The selection is de-duplicated, sorted by ``Rank`` (descending) and reduced
    to one product per ``Label`` (label unknown) or per ``Brand`` (label
    known). Skin-type columns are shown only when the skin type is unknown;
    ``Label`` and ``Rank`` only when the label is unknown.

    Parameters
    ----------
    label : list of str
        Product categories to search, or a list containing ``'Not sure'``.
    skintype : list of str
        Skin types the user has, or a list containing ``'Not sure'``.

    Returns
    -------
    None
        The resulting table is shown with ``display``; it is empty when
        nothing qualifies.
    """
    skin_columns = ['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']
    label_unknown = 'Not sure' in label
    skin_unknown = 'Not sure' in skintype

    candidates = df[df.Rank >= 4.5]
    if not label_unknown:
        candidates = candidates[candidates.Label.isin(label)]

    # Skin types each candidate is flagged for, in row order.
    skins_per_row = [
        {name for name, flag in zip(skin_columns, flags) if flag == 1}
        for flags in candidates[skin_columns].itertuples(index=False, name=None)
    ]

    if skin_unknown:
        # Compare by number of skin types; default=0 keeps an empty candidate
        # set from raising.
        widest = max((len(skins) for skins in skins_per_row), default=0)
        keep = [len(skins) == widest for skins in skins_per_row]
    else:
        wanted = set(skintype)
        keep = [wanted.issubset(skins) for skins in skins_per_row]

    dedupe_key = 'Label' if label_unknown else 'Brand'
    columns = ['Label', 'Brand', 'Name', 'Price', 'Rank'] if label_unknown else ['Brand', 'Name', 'Price']
    if skin_unknown:
        columns = columns + ['Oily', 'Combination', 'Dry', 'Sensitive', 'Normal']

    # Chained, non-mutating pipeline: avoids in-place edits on a slice of df.
    final = (
        candidates.loc[np.array(keep, dtype=bool)]
        .drop_duplicates()
        .sort_values('Rank', ascending=False)
        .drop_duplicates(dedupe_key)[columns]
        .reset_index(drop=True)
    )

    # Border colours are kept exactly as each original branch specified them.
    border = 'black' if (label_unknown and not skin_unknown) else 'purple4'
    final = final.style.set_properties(**{'background-color': 'thistle',
                                          'color': 'navy',
                                          'border-color': border})
    display(final)

In [61]:
# Show recommendations for the sample label / skin-type answers.
cosmetics(label, skintype)

Unnamed: 0,Label,Brand,Name,Price,Rank,Oily,Combination,Dry,Sensitive,Normal
0,Sun protect,COOLA,Sport Continuous Spray SPF 30 - Unscented,32,5.0,1,1,1,1,1
1,Face Mask,DR ROEBUCK’S,Uluru Purifying Mask,28,5.0,1,1,1,1,1
2,Cleanser,ERNO LASZLO,Pore Refining Detox Double Cleanse,55,5.0,1,1,1,1,1
3,Treatment,BIOEFFECT,EGF Serum,160,5.0,1,1,1,1,1
4,Eye cream,BEAUTYBIO,Bright Eyes Collagen-Infused Brightening Colloidal Silver Eye Masks,40,5.0,1,1,1,1,1
5,Moisturizer,REN CLEAN SKINCARE,Evercalm™ Overnight Recovery Balm,48,4.9,1,1,1,1,1
