In [197]:
import numpy as np
import pandas as pd
import glob, re, string
from collections import Counter
import time

# https://medium.com/@asheeshmisra29/web-automation-selenium-webdriver-and-python-getting-started-part-3-a9c07143d36d
import unittest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

from string_grouper import match_strings, match_most_similar, group_similar_strings, StringGrouper
# from sklearn.feature_extraction.text import TfidfVectorizer
# from scipy.sparse import csr_matrix
# import sparse_dot_topn.sparse_dot_topn as sdt

cosmetic ingredient database: https://public.opendatasoft.com/explore/dataset/cosmetic-ingredient-database-ingredients-and-fragrance-inventory/table/

cosdna: https://cosdna.com/

In [2]:
file_list = glob.glob('./data/*.csv')
file_list

['./data\\brands.csv',
 './data\\recommendations_2020-05-05.csv',
 './data\\responses_2020-05-05.csv',
 './data\\responses_modified_2020-05-05.csv']

In [3]:
responses = pd.read_csv(file_list[2])

In [4]:
new_columns = [
    'timestamp',
    'email_address',
    'name',
    'order_number',
    'skin_type',
    'cannot_contain',
    'skin_concern',
    'am_routine',
    'pm_routine',
    'climate',
    'skin_sensitivities',
    'used_retinoids',
    'used_acids',
    'prone_to_breakouts',
    'miscellaneous',
    'permission'
]

responses.columns = new_columns
responses.drop(columns=['email_address','name'], inplace=True)
responses.head()

Unnamed: 0,timestamp,order_number,skin_type,cannot_contain,skin_concern,am_routine,pm_routine,climate,skin_sensitivities,used_retinoids,used_acids,prone_to_breakouts,miscellaneous,permission
0,4/30/2020 12:43:32,12846,Combination,,"Acne, Pigmentation, Fine lines/wrinkles","Trader Joe’s “all in one” cleanser, Bioderma S...","Trader Joe’s “all in one” cleanser, Bioderma S...","Temperate, a lovely 70F (lucky!)",I broke out from Cosrx snail serum and from Ma...,Differing 0.1% for four years,I prefer toners with acids as masks with AHA s...,I have mild to moderate acne that I have been ...,"I have a super oily forehead, oily nose and c...",Yes
1,4/30/2020 12:45:25,12837,Combination,No allergies,"Acne, Fine lines/wrinkles, Skin Texture (rough...",1. Wash with Biore Charcoal Acne Clearing Clea...,Collapse into bed,"Tropical, humid",No.,No.,No.,Yes. They are usually cystic and around my cyc...,I need help.,Yes
2,4/30/2020 12:50:26,12845,Oily,No.,"Acne, Pigmentation",Cleanser (Drunk Elephant Beste No. 9 Jelly Cle...,Oil cleanser (Dermalogica Precleanse); Cleanse...,Desert dry heat,No.,No.,I've used 2% BHA on and off; and daily I use a...,"Yes, occasional small whiteheads and blackhead...",I'm Black so hyperpigmentation and skin discol...,Yes
3,4/30/2020 13:02:43,12850,Normal,No,"Fine lines/wrinkles, Skin Texture (roughness, ...","Splash of water, occasional Paula's Choice enr...",Occasional Palmer's facial cleansing oil (if w...,"Tropical, humid",No,Almost through the purging/peeling phase with ...,I've tried (and still own) The Ordinary red pe...,"Not really - a very rare pimple, I tend more o...",I turn 43 in a month and am really trying to g...,Yes
4,4/30/2020 13:08:20,12842,Combination,"Ethylhexyl palmitate, strong sulfates, chemica...","Acne, Skin Texture (roughness, dullness), Other","Osea Ocean Cleanser, Renee Rouleau (RR) Elderb...","Renee Rouleau (RR) Luxe Mint Cleanser, Renee R...","Temperate, a lovely 70F (lucky!)",Overall I have fairly sensitive skin with horm...,"No, but I would like to start to address early...",Azelaic acid has been a good acid for me (not ...,"My acne is adult onset, primarily hormonal, an...","I just moved to Chicago, so lots of temperatur...",Yes


In [5]:
responses['skin_concern'] = responses['skin_concern'].str.lower().str.split(', ')
responses['skin_concern'][:5]

0            [acne, pigmentation, fine lines/wrinkles]
1    [acne, fine lines/wrinkles, skin texture (roug...
2                                 [acne, pigmentation]
3    [fine lines/wrinkles, skin texture (roughness,...
4    [acne, skin texture (roughness, dullness), other]
Name: skin_concern, dtype: object

# page objects

https://selenium-python.readthedocs.io/page-objects.html
https://www.youtube.com/watch?v=BURK7wMcCwU

In [None]:
class BasePage():
    
    def __init__(self, driver):
        self.driver = driver
        
class MainPage(BasePage):
    
    def is_title_matches(self):
        return 'cosdna' in self.driver.title

In [292]:
CHROMEDRIVER_PATH = 'C:/chromedriver_win32/chromedriver.exe'
BASE_URL = 'https://www.cosdna.com'
DRIVER = webdriver.Chrome(executable_path = CHROMEDRIVER_PATH)

In [321]:
class Product():
    """
    Create a product object containing various features scraped from cosdna.com
    
    Parameters
    ----------
    
    driver : current selenium.webdriver instance
    
    name : str, (default=None)
        name of product
        
    cosdna_id : str, (default=None)
        10-digit identifier at the end of the cosdna url
        
        
    Attributes
    ----------
    
    name : str
        user-defined name of Product
        
    cosdna_name : str
        full name of product as it appears on cosdna
        
    brand : str
        name of brand as it appears on cosdna
        
    product : str
        name of product as it appears on cosdna
        
    cosdna_id : str
        10-digit identifier at the end of the cosdna url
        
    cosdna_url : str
        full cosdna_url
        
    ingredients : dict
        full list of ingredients and their listed functions from cosdna
        
    linked : bool
        whether Product() has been linked with a cosdna listing
        
    synced : bool
        whether Product() has collected information from linked cosdna listing
    """
    
    def __init__(self, driver, name=None, cosdna_id=None):
        self.driver = driver 
        self.name = name
        self.cosdna_id = cosdna_id
        if cosdna_id != None:
            self.cosdna_url = 'https://cosdna.com/eng/cosmetic_' + cosdna_id + '.html'
            self.linked = True
        else:
            self.cosdna_url = None
            self.linked = False
        self.synced = False
    
    
    def link(self, name=None, sort='featured'):
        """
        Searches for top match on cosdna
        
        cosdna_url
        cosdna_id
        
        Parameters
        ----------
        
        name : str, (default=None)
            full name of product 
            if blank: uses name assigned when creating Product() instance
            if not blank: updates name
            
        Returns
        -------
        
        self : object
        """
        
        # if name=None, use existing name. if not, update name
        if name is not None:
            self.name = name
        
        # search for name in cosdna.com
        name = str(self.name).translate(str.maketrans('', '', string.punctuation)).lower()
        name = name.replace(' ', '+')
        search_url = 'https://cosdna.com/eng/product.php?q=' + name + '&sort=' + sort
        self.driver.get(search_url)
        
        # try to get the top result. if successful, changed linked to True
        try:
            top = self.driver.find_element_by_xpath(("//table[@class='table table-hover']/tbody/tr/td[1]/a"))
            self.cosdna_url = top.get_attribute('href')
            self.cosdna_id = re.findall(".*cosmetic_(.*).html", self.cosdna_url)[0]
            self.linked = True
            return self
        except:
            print('no results on cosdna')
            name = input('enter new search: ')
            if name == 'break':
                return self
            else:
                return self.link(name=name)
        
        
    def sync(self, cosdna_url=None):
        """
        Collects information from linked cosdna site:
        
        cosdna_url
        cosdna_id
        brand
        product
        cosdna_name
        ingredients
        synced
        
        Parameters
        ----------
        
        cosdna_url : str, (default=None)
            full cosdna url of product. 
            if blank: uses linked cosdna_url
            if not blank: updates cosdna_url
            
        Returns
        -------
        
        self : object
        """
        # navigate to url and get cosdna_id
        if self.cosdna_url == None:
            if new_url == None:
                return 'create instance with cosdna_id or .link() instance with cosdna_url'
            else:
                self.cosdna_url = cosdna_url
        self.cosdna_id = re.findall(".*cosmetic_(.*).html", self.cosdna_url)[0]
        self.driver.get(self.cosdna_url)

        # get brand and product information from page
        self.brand = self.driver.find_element_by_class_name('brand-name').text.lower()
        self.product = self.driver.find_element_by_class_name('prod-name').text.lower()
        cosdna_name = self.brand + ' ' + self.product
        self.cosdna_name = cosdna_name.strip()

        # get ingredients information from ingredients table
        self.ingredients = {}
        table = self.driver.find_element_by_class_name('chem-list')
        rows = table.find_elements_by_tag_name('tr')
        for row in rows:
            ingredient = row.find_elements_by_tag_name('td')[0].text.strip().lower()
            if 'no results' in ingredient:
                ingredient = row.find_elements_by_class_name('text-muted')[0].text.strip().lower()
                function = None
            else:
                try:
                    function_cell = row.find_elements_by_tag_name('td')[1]
                    function = function_cell.text.strip().lower().split(',')
                    if 'sunscreen' in function:
                        try:
                            uva = re.search("uv[ab]\d", function_cell.find_elements_by_tag_name('img')[0].get_attribute('src'))[0]
                            uvb = re.search("uv[ab]\d", function_cell.find_elements_by_tag_name('img')[1].get_attribute('src'))[0]
                            function.append(uva)
                            function.append(uvb)
                        except:
                            continue
                except:
                    function = None
            self.ingredients[ingredient] = function
        # change state to True
        self.synced = True
        return self
        
        
    def link_sync(self, sort='featured'):
        """
        Should operate the same way as .link().sync()
        """
        self.link(sort=sort)
        self.sync(self.cosdna_url)
        return self

## test usage: `link()` and `sync()`

In [337]:
product = Product(DRIVER, 'Trader Joeâ€™s â€œall in oneâ€ cleanser')
product.link()

no results on cosdna


enter new search:  trader joe's all in one cleanser


<__main__.Product at 0x1b6a7e98508>

In [338]:
product.cosdna_url

'https://cosdna.com/eng/cosmetic_aa6a484449.html'

In [339]:
product.sync()

<__main__.Product at 0x1b6a7e98508>

In [340]:
product.ingredients

{'aqua': ['solvent'],
 'peg-80 sorbitan laurate': ['surfactant', ' emulsifier'],
 'cocamidopropyl betaine': ['surfactant', ' viscosity control', ' antistatic'],
 'sodium trideceth sulfate': ['surfactant'],
 'sodium lauroamphoacetate': ['surfactant'],
 'peg 150 distearate': ['surfactant', ' viscosity control', ' emulsifier'],
 'ubiquinone': ['antioxidant'],
 'tocopheryl acetate': ['moisturizer', ' antioxidant'],
 'thioctic acid': ['antioxidant'],
 'retinyl palmitate': ['antioxidant'],
 'phytonadione': [''],
 'cholecalciferol': [''],
 'ascorbyl palmitate': ['antioxidant'],
 'citric acid': ['ph adjusters'],
 'panthenol': ['antistatic', ' moisturizer'],
 'sorbic acid': ['preservative'],
 'phenoxyethanol': ['preservative'],
 'sodium lauroyl glutamate': ['surfactant', ' antistatic'],
 'glycerin': ['solvent', ' moisturizer'],
 'sodium lauroyl oat amino acids': ['antistatic'],
 'camellia sinensis leaf extract': ['moisturizer',
  ' antioxidant',
  ' astringent',
  ' sunscreen',
  ' emollient'],

## test usage: `link_sync()`

In [341]:
product = Product(DRIVER, 'chemist confessions aquafix')
product.name

'chemist confessions aquafix'

In [342]:
product.link_sync()
print(product.brand)
print()
print(product.product)
print()
print(product.ingredients)



chemist confessions aquafix

{'water': ['solvent'], 'propanediol': ['solvent', ' viscosity control'], 'glycerin': ['solvent', ' moisturizer'], 'ethoxydiglycol': ['solvent', ' viscosity control'], 'sodium pca': ['antistatic', ' moisturizer'], 'panthenol': ['antistatic', ' moisturizer'], 'allantoin': ['anti-inflammatory', ' anti-allergic'], 'vinyl dimethicone/methicone silsesquioxane crosspolymer': ['viscosity control'], 'madecassoside': ['antioxidant', ' plant extract'], 'asiaticoside': ['fragrance', ' antioxidant'], 'polyacrylate crosspolymer-6': ['viscosity control'], 'sodium hyaluronate': ['moisturizer'], 'xanthan gum': ['viscosity control'], 'lecithin': ['emollient', ' emulsifier'], 'sclerotium gum': ['viscosity control'], 'pullulan': [''], 'hexanediol': ['solvent'], 'caprylhydroxamic acid': [''], 'disodium edta': ['']}


In [327]:
product.synced

True

# cosdna class

In [None]:
class CosDNA():
    
    

# routine class

In [343]:
popular_ingredients = [
    'Vitamin C',
    'L-ascorbic acid', 
    'tetrahexydecyl ascorbate', 
    '3-o-ethyl ascorbic acid', 
    'ascorbyl glucoside', 
    'magnesium ascorbyl phosphate', 
    'sodium ascorbyl phosphate', 
    'ascorbyl palmitate', 
    'Glycolic Acid', 
    'Lactic Acid', 
    'Mandelic Acid',
    'Niacinamide', 
    'Retinol', 
    'Hydroxypinocolone Retinoate', 
    'Bakuchiol', 
    'Retinyl Palmitate', 
    'Ceramides', 
    'Salicylic Acid', 
    'Willow bark Extract', 
    'Adapalene', 
    'Sodium Lauryl Sulfate', 
    'Cocoamidopropyl Betaine'
    ]
popular_ingredients = [ing.lower() for ing in popular_ingredients]

In [344]:
class Routine():
    """
    Create a container of Products with useful ingredient tabulation methods
    
    Parameters
    ----------
    
    driver : current selenium.webdriver instance
    
    name : str, (default=None)
        name of routine
        
        
    Attributes
    ----------
    
    name : str
        user-defined name of Routine
        
    routine : list
        list of Products in Routine
        
    ingredients : list
        list of all ingredients in Routine
        
    counts : Counter object
        Counter object of all ingredients in Routine
        
    linked : bool
        whether all Products have been linked with a cosdna listing
        
    synced : bool
        whether all Products have collected information from linked cosdna listing
    """
    
    def __init__(self, driver, name=None):
        self.driver = driver
        self.name = name
        self.routine = []
        self.linked = False
        self.synced = False
        
    def add(self, *products):
        """
        Adds a product or list of products to the routine.
        
        Parameters
        ----------
        
        *products: unpackable
        """
        for product in products:
            if type(product) == Product:
                self.routine.append(product)
            elif type(product) == str:
                self.routine.append(Product(self.driver, product))
        return self
    
    def link(self, sort='featured', force=False):
        """
        Searches each Product for top match on cosdna
        
        updates (for each Product):
        cosdna_url
        cosdna_id
        
        Parameters
        ----------
        
        sort : str
            sort algorithm for cosdna results
            
        Attributes
        ----------
        linked : bool
            returns True if all Products are linked
            
        Returns
        -------
        
        self : object
        """
        for product in self.routine:
            if force:
                product.link()
            else:
                if not product.linked:
                    product.link()
        self.cosdna_urls = [product.cosdna_url for product in self.routine]
        self.cosdna_ids = [product.cosdna_id for product in self.routine]
        if all([product.linked for product in self.routine]):
            self.linked = True
        return self
    
    def sync(self, force=False):
        """
        Collects information for each Product
        
        Parameters
        ----------
        
        sort : str
            sort algorithm for cosdna results
            
        Attributes
        ----------
        linked : bool
            returns True if all Products are linked
            
        Returns
        -------
        
        self : object
        """
        changes = False
        for product in self.routine:
            if force:
                product.sync()
                changes = True
            else:
                if not product.synced:
                    product.sync()
                    changes = True
        if changes:
#             self.brands = list(set([product.brand for product in routine]))
            self._analyze()
        else:
            return self
            
    def _analyze(self):
        """
        Tabulates frequency of ingredients across all Products
        """
        self._all_ingredients = []
        for product in self.routine:
            self._all_ingredients += [*product.ingredients]
        self._counts = Counter(all_ingredients)
        self.ingredients = list(set(all_ingredients))

        # make product vectors
        self._product_vectors = []
        for product in self.routine:
            product_vector = []
            for routine_ingredient in self.ingredients:
                if routine_ingredient in [*product.ingredients]:
                    product_vector.append(1)
                else:
                    product_vector.append(0)
            self._product_vectors.append(product_vector)
        if all([product.synced for product in self.routine]):
            self.synced = True
        return self
    
    def link_sync(self, sort='featured', force=False):
        self.link(sort=sort, force=force)
        self.sync(force=force)
        return self
    
    def top_ingredients(self, top=None, mask=None):
        if mask is not None:
            masked_counts = Counter([x for x in self._all_ingredients if x in mask])
            return masked_counts.most_common(top)
        return self._counts.most_common(top)
    
    def has(self, ingredient):
        isolated_products = []
        for i, product_vector in enumerate(self._product_vectors):
            if product_vector[self.ingredients.index(ingredient)] == 1:
                isolated_products.append(self.routine[i].name)
        return isolated_products

In [293]:
tick = time.time()
routine = Routine(DRIVER)
routine.add('Biore Charcoal Acne Clearing Cleanser', 
    "kiehl's powerful strength line-reducing concentrate 12.5", 
    'Neutrogena Rapid Clear Spot Gel',
    'Roc Retinol Correxion eye cream', 
    'Body Merry Retinol Surge Moisturizer')
routine.link_sync()
tock = time.time()
elapsed = tock - tick
print(elapsed)

7.802136182785034


In [294]:
routine.top_ingredients()

[('glycerin', 4),
 ('disodium edta', 3),
 ('ethylhexylglycerin', 2),
 ('sodium hydroxide', 2),
 ('water', 2)]

In [297]:
routine.top_ingredients(mask=popular_ingredients)

[('retinol', 2), ('ascorbyl glucoside', 1)]

In [298]:
routine.has('retinol')

['Roc Retinol Correxion eye cream', 'Body Merry Retinol Surge Moisturizer']

In [299]:
routine.add('la roche posay antihelios mineral')

<__main__.Routine at 0x1b6a7dbc348>

In [300]:
tick = time.time()
routine.link_sync()
tock = time.time()
elapsed = tock - tick
print(elapsed)

1.9447968006134033


In [301]:
routine.top_ingredients(mask=popular_ingredients)

[('retinol', 2), ('ascorbyl glucoside', 1)]

In [302]:
routine.top_ingredients()

[('glycerin', 4),
 ('disodium edta', 3),
 ('ethylhexylglycerin', 2),
 ('sodium hydroxide', 2),
 ('water', 2)]

# ingredient class

In [330]:
import pubchempy as pcp

In [335]:
pca = pcp.get_compounds('28874-51-3', 'name')[0]

In [336]:
pca.synonyms

['Sodium L-pyroglutamate',
 '28874-51-3',
 'Sodium pidolate',
 'Sodium pyroglutamate',
 'Sodium 5-oxo-L-prolinate',
 'L-Proline, 5-oxo-, monosodium salt',
 'Ajidew N-50',
 'L-5-Oxoproline monosodium salt',
 'Sodium pyrrolidone-5-carboxylate',
 '5-Oxo-L-proline monosodium salt',
 'UNII-1V74VH163T',
 'Sodium L-pyrrolidonecarboxylate',
 'Sodium (S)-5-oxopyrrolidine-2-carboxylate',
 'Proline, 5-oxo-, monosodium salt',
 'EINECS 249-277-1',
 'Sodium PCA',
 '1V74VH163T',
 'Proline, 5-oxo-, monosodium salt, L-',
 '5-Oxo-DL-proline, monosodium salt',
 'PCA Soda',
 'C5H6NNaO3',
 'sodium (2S)-5-oxo-2-pyrrolidinecarboxylate',
 'sodium (2S)-5-oxidanylidenepyrrolidine-2-carboxylate',
 'PDSI 101',
 'DL-Proline, 5-oxo-, monosodium salt',
 'EC 249-277-1',
 'SCHEMBL164853',
 'DTXSID50183074',
 'CRPCXAMJWCDHFM-DFWYDOINSA-M',
 'AKOS006277120',
 'Sodium DL-2-pyrrolidone-5-carboxylate',
 'LS-118970',
 'CS-0112985',
 'A819646',
 'A830250',
 'C-36564',
 'Q27252940',
 'UNII-469OTG57A2 component CRPCXAMJWCDHFM-

In [345]:
class Ingredient():
    
    def __init__(self, driver, name=None, cosdna_id=None, cas=None):
        self.driver = driver
        self.name = name
        self.cosdna_id = cosdna_id
        if cosdna_id != None:
            self.cosdna_url = 'https://cosdna.com/eng/' + cosdna_id + '.html'
            self.linked = True
        else:
            self.cosdna_url = None
            self.linked = False
        self.cas = cas
        self.synced = False
        
    def link(self, name=None):
        # if name=None, use existing name. if not, update name
        if name is not None:
            self.name = name
        
        # search for name in cosdna.com
        name = str(self.name).translate(str.maketrans('', '', string.punctuation)).lower()
        name = name.replace(' ', '+')
        search_url = 'https://cosdna.com/eng/stuff.php?q=' + name
        self.driver.get(search_url)
        
        # try to get the top result. if successful, changed linked to True
        try:
            top = self.driver.find_element_by_xpath(("//table[@class='table table-hover']/tbody/tr/td[1]/a"))
            self.cosdna_url = top.get_attribute('href')
            self.cosdna_id = re.findall(".*eng//(.*).html", self.cosdna_url)[0]
            self.linked = True
            return self
        except:
            print('no results on cosdna')
            name = input('enter new search: ')
            if name == 'break':
                return self
            else:
                return self.link(name=name)
            
    def sync(self, cosdna_url=None):

        # navigate to url and get cosdna_id
        if self.cosdna_url == None:
            if cosdna_url == None:
                return 'create instance with cosdna_id or .link() instance with cosdna_url'
            else:
                self.cosdna_url = cosdna_url
        self.cosdna_id = re.findall(".*cosmetic_(.*).html", self.cosdna_url)[0]
        self.driver.get(self.cosdna_url)

        # get brand and product information from page
        self.brand = self.driver.find_element_by_class_name('brand-name').text.lower()
        self.product = self.driver.find_element_by_class_name('prod-name').text.lower()
        cosdna_name = self.brand + ' ' + self.product
        self.cosdna_name = cosdna_name.strip()

        # get ingredients information from ingredients table
        self.ingredients = {}
        table = self.driver.find_element_by_class_name('chem-list')
        rows = table.find_elements_by_tag_name('tr')
        for row in rows:
            ingredient = row.find_elements_by_tag_name('td')[0].text.strip().lower()
            if 'no results' in ingredient:
                ingredient = row.find_elements_by_class_name('text-muted')[0].text.strip().lower()
                function = None
            else:
                try:
                    function_cell = row.find_elements_by_tag_name('td')[1]
                    function = function_cell.text.strip().lower().split(',')
                    if 'sunscreen' in function:
                        try:
                            uva = re.search("uv[ab]\d", function_cell.find_elements_by_tag_name('img')[0].get_attribute('src'))[0]
                            uvb = re.search("uv[ab]\d", function_cell.find_elements_by_tag_name('img')[1].get_attribute('src'))[0]
                            function.append(uva)
                            function.append(uvb)
                        except:
                            continue
                except:
                    function = None
            self.ingredients[ingredient] = function
        # change state to True
        self.synced = True
        return self

In [348]:
pca = Ingredient(DRIVER, 'sodium pca')

In [357]:
pca = DRIVER.get('https://cosdna.com/eng/stuff.php?q=sodium+pca')

In [358]:
top = DRIVER.find_element_by_xpath(("//table[@class='table table-hover']/tbody/tr/td[1]/a"))

In [359]:
top.get_attribute('href')

'https://cosdna.com/eng/dfa882349.html'

In [360]:
re.findall(".*eng/(.*).html", top.get_attribute('href'))[0]

'dfa882349'

In [349]:
pca.link()

no results on cosdna


enter new search:  break


<__main__.Ingredient at 0x1b6a7ea0e48>

# fast text matching

adapting `string_grouper` so that we don't have to use dataframes if we don't want to

In [None]:
string_grouper = StringGrouper('la-roche posay')

In [None]:


# PRODUCTS = {}

# brands = pd.read_csv('./data\\brands.csv')
# brands.head()

# matches = match_strings(brands['brand_name'])
# matches[matches.left_side != matches.right_side].head()

In [None]:
# test = 'LA ROCHE-POSAY'

# def ngrams(string, n=3):
#     string = string.encode('ascii', errors='ignore').decode()
#     string = string.lower()
    
#     chars_to_remove = [")","(",".","|","[","]","{","}","'"]
#     rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
#     string = re.sub(rx, '', string)
    
#     string = string.replace('&', 'and')
#     string = string.replace(',', ' ')
#     string = string.replace('-', ' ')
#     string = re.sub(' +',' ',string).strip()
#     string = ' '+ string +' '
#     string = re.sub(r'[,-./]|\sBD',r'', string)
    
#     ngrams = zip(*[string[i:] for i in range(n)])
            
#     return [''.join(ngram) for ngram in ngrams]

In [None]:
# # cosine similarity
# # https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

# def cosine_similarity_top(A, B, ntop, lower_bound=0):
#     A = A.tocsr()
#     B = B.tocsr()
#     M, _ = A.shape
#     _, N = B.shape
    
#     idx.dtype = np.int32
    
#     nnz_max = M * ntop
    
#     indptr = np.zeros(M+1, dtype=idx_dtype)
#     indices = np.zeros(nnz_max, dtype=idx_dtype)
#     data = np.zeros(nnz_max, dtype=A.dtype)
    
#     sdt.sparse_dot_topn(
#         M, N, np.asarray(A.indptr, dtype=idx_dtype),
#         np.asarray(A.indices, dtype=idx_dtype),
#         A.data,
#         np.asarray(B.indptr, dtype=idx_dtype),
#         np.asarray(B.indices, dtype=idx_dtype),
#         B.data,
#         ntop,
#         lower_bound,
#         indptr, indices, data)
    
#     return csr_matrix((data,indices,indptr),shape=(M,N))

# sample routine analysis

# fuzzy string comparison

# extra stuff

In [None]:
# class Product():
#     """
#     Create a product object containing various features scraped from cosdna.com
    
#     Parameters
#     ----------
    
#     driver : current selenium.webdriver instance
    
#     name : str, (default=None)
#         name of product
        
#     cosdna_id : str, (default=None)
#         10-digit identifier at the end of the cosdna url
        
        
#     Attributes
#     ----------
    
#     name : str
#         user-defined name of Product
        
#     cosdna_name : str
#         full name of product as it appears on cosdna
        
#     brand : str
#         name of brand as it appears on cosdna
        
#     product : str
#         name of product as it appears on cosdna
        
#     cosdna_id : str
#         10-digit identifier at the end of the cosdna url
        
#     cosdna_url : str
#         full cosdna_url
        
#     ingredients : dict
#         full list of ingredients and their listed functions from cosdna
        
#     linked : bool
#         whether Product() has been linked with a cosdna listing
        
#     synced : bool
#         whether Product() has collected information from linked cosdna listing
#     """
    
#     def __init__(self, driver, name=None, cosdna_id=None):
#         self.driver = driver 
#         self.name = name
#         self.cosdna_id = cosdna_id
#         if cosdna_id != None:
#             self.cosdna_url = 'https://cosdna.com/eng/cosmetic_' + cosdna_id + '.html'
#             self.linked = True
#         else:
#             self.cosdna_url = None
#             self.linked = False
#         self.synced = False
    
    
#     def link(self, name=None, sort='featured'):
#         """
#         Searches for top match on cosdna
        
#         cosdna_url
#         cosdna_id
        
#         Parameters
#         ----------
        
#         name : str, (default=None)
#             full name of product 
#             if blank: uses name assigned when creating Product() instance
#             if not blank: updates name
            
#         Returns
#         -------
        
#         self : object
#         """
        
#         # if name=None, use existing name. if not, update name
#         if name is not None:
#             self.name = name
        
#         # search for name in cosdna.com
#         name = str(self.name).translate(str.maketrans('', '', string.punctuation)).lower()
#         name = name.replace(' ', '+')
#         search_url = 'https://cosdna.com/eng/product.php?q=' + name + '&sort=' + sort
#         self.driver.get(search_url)
        
#         # try to get the top result. if successful, changed linked to True
#         try:
#             top = self.driver.find_element_by_xpath(("//table[@class='table table-hover']/tbody/tr/td[1]/a"))
#             self.cosdna_url = top.get_attribute('href')
#             self.cosdna_id = re.findall(".*cosmetic_(.*).html", self.cosdna_url)[0]
#             self.linked = True
#             return self
#         except:
#             print('no results on cosdna')
#             name = input('enter new search: ')
#             if name == 'break':
#                 return self
#             else:
#                 return self.link(name=name)
        
        
#     def sync(self, cosdna_url=None):
#         """
#         Collects information from linked cosdna site:
        
#         cosdna_url
#         cosdna_id
#         brand
#         product
#         cosdna_name
#         ingredients
#         synced
        
#         Parameters
#         ----------
        
#         cosdna_url : str, (default=None)
#             full cosdna url of product. 
#             if blank: uses linked cosdna_url
#             if not blank: updates cosdna_url
            
#         Returns
#         -------
        
#         self : object
#         """
#         # navigate to url and get cosdna_id
#         if self.cosdna_url == None:
#             if new_url == None:
#                 return 'create instance with cosdna_id or .link() instance with cosdna_url'
#             else:
#                 self.cosdna_url = cosdna_url
#         self.cosdna_id = re.findall(".*cosmetic_(.*).html", self.cosdna_url)[0]
#         self.driver.get(self.cosdna_url)

#         # get brand and product information from page
#         self.brand = self.driver.find_element_by_class_name('brand-name').text.lower()
#         self.product = self.driver.find_element_by_class_name('prod-name').text.lower()
#         cosdna_name = self.brand + ' ' + self.product
#         self.cosdna_name = cosdna_name.strip()

#         # get ingredients information from ingredients table
#         self.ingredients = {}
#         table = self.driver.find_element_by_class_name('chem-list')
#         rows = table.find_elements_by_tag_name('tr')
#         for row in rows:
#             ingredient = row.find_elements_by_tag_name('td')[0].text.strip().lower()
#             if 'no results' in ingredient:
#                 ingredient = row.find_elements_by_class_name('text-muted')[0].text.strip().lower()
#                 function = None
#             else:
#                 try:
#                     function_cell = row.find_elements_by_tag_name('td')[1]
#                     function = function_cell.text.strip().lower().split(',')
#                     if 'sunscreen' in function:
#                         try:
#                             uva = re.search("uv[ab]\d", function_cell.find_elements_by_tag_name('img')[0].get_attribute('src'))[0]
#                             uvb = re.search("uv[ab]\d", function_cell.find_elements_by_tag_name('img')[1].get_attribute('src'))[0]
#                             function.append(uva)
#                             function.append(uvb)
#                         except:
#                             continue
#                 except:
#                     function = None
#             self.ingredients[ingredient] = function
#         # change state to True
#         self.synced = True
#         return self
        
        
#     def link_sync(self, sort='featured'):
#         """
#         Should operate the same way as .link().sync()
#         """
#         self.link(sort=sort)
#         self.sync(self.cosdna_url)
#         return self

In [None]:
# class Routine():
#     """
#     Create a container of Products with useful ingredient tabulation methods
    
#     Parameters
#     ----------
    
#     driver : current selenium.webdriver instance
    
#     name : str, (default=None)
#         name of routine
        
        
#     Attributes
#     ----------
    
#     name : str
#         user-defined name of Routine
        
#     routine : list
#         list of Products in Routine
        
#     ingredients : list
#         list of all ingredients in Routine
        
#     counts : Counter object
#         Counter object of all ingredients in Routine
        
#     linked : bool
#         whether all Products have been linked with a cosdna listing
        
#     synced : bool
#         whether all Products have collected information from linked cosdna listing
#     """
    
#     def __init__(self, driver, name=None):
#         self.driver = driver
#         self.name = name
#         self.routine = []
#         self.linked = False
#         self.synced = False
        
#     def add(self, *products):
#         """
#         Adds a product or list of products to the routine.
        
#         Parameters
#         ----------
        
#         *products: unpackable
#         """
#         for product in products:
#             if type(product) == Product:
#                 self.routine.append(product)
#             elif type(product) == str:
#                 self.routine.append(Product(self.driver, product))
#         return self
    
#     def link(self, sort='featured', force=False):
#         """
#         Searches each Product for top match on cosdna
        
#         updates (for each Product):
#         cosdna_url
#         cosdna_id
        
#         Parameters
#         ----------
        
#         sort : str
#             sort algorithm for cosdna results
            
#         Attributes
#         ----------
#         linked : bool
#             returns True if all Products are linked
            
#         Returns
#         -------
        
#         self : object
#         """
#         for product in self.routine:
#             if force:
#                 product.link()
#             else:
#                 if not product.linked:
#                     product.link()
#         self.cosdna_urls = [product.cosdna_url for product in self.routine]
#         self.cosdna_ids = [product.cosdna_id for product in self.routine]
#         if all([product.linked for product in self.routine]):
#             self.linked = True
#         return self
    
#     def sync(self, force=False):
#         """
#         Collects information for each Product
        
#         Parameters
#         ----------
        
#         sort : str
#             sort algorithm for cosdna results
            
#         Attributes
#         ----------
#         linked : bool
#             returns True if all Products are linked
            
#         Returns
#         -------
        
#         self : object
#         """
#         changes = False
#         for product in self.routine:
#             if force:
#                 product.sync()
#                 changes = True
#             else:
#                 if not product.synced:
#                     product.sync()
#                     changes = True
#         if changes:
# #             self.brands = list(set([product.brand for product in routine]))
#             self._analyze()
#         else:
#             return self
            
#     def _analyze(self):
#         """
#         Tabulates frequency of ingredients across all Products
#         """
#         self._all_ingredients = []
#         for product in self.routine:
#             self._all_ingredients += [*product.ingredients]
#         self._counts = Counter(all_ingredients)
#         self.ingredients = list(set(all_ingredients))

#         # make product vectors
#         self._product_vectors = []
#         for product in self.routine:
#             product_vector = []
#             for routine_ingredient in self.ingredients:
#                 if routine_ingredient in [*product.ingredients]:
#                     product_vector.append(1)
#                 else:
#                     product_vector.append(0)
#             self._product_vectors.append(product_vector)
#         if all([product.synced for product in self.routine]):
#             self.synced = True
#         return self
    
#     def link_sync(self, sort='featured', force=False):
#         self.link(sort=sort, force=force)
#         self.sync(force=force)
#         return self
    
#     def top_ingredients(self, top=None, mask=None):
#         if mask is not None:
#             masked_counts = Counter([x for x in self._all_ingredients if x in mask])
#             return masked_counts.most_common(top)
#         return self._counts.most_common(top)
    
#     def has(self, ingredient):
#         isolated_products = []
#         for i, product_vector in enumerate(self._product_vectors):
#             if product_vector[self.ingredients.index(ingredient)] == 1:
#                 isolated_products.append(self.routine[i].name)
#         return isolated_products

In [None]:
# class Ingredient():
    
#     def __init__(self, driver, name=None, cosdna_id=None, cas=None):
#         self.driver = driver
#         self.name = name
#         self.cosdna_id = cosdna_id
#         if cosdna_id != None:
#             self.cosdna_url = 'https://cosdna.com/eng/' + cosdna_id + '.html'
#             self.linked = True
#         else:
#             self.cosdna_url = None
#             self.linked = False
#         self.cas = cas
#         self.synced = False
        
#     def link(self, name=None):
#         # if name=None, use existing name. if not, update name
#         if name is not None:
#             self.name = name
        
#         # search for name in cosdna.com
#         name = str(self.name).translate(str.maketrans('', '', string.punctuation)).lower()
#         name = name.replace(' ', '+')
#         search_url = 'https://cosdna.com/eng/stuff.php?q=' + name
#         self.driver.get(search_url)
        
#         # try to get the top result. if successful, changed linked to True
#         try:
#             top = self.driver.find_element_by_xpath(("//table[@class='table table-hover']/tbody/tr/td[1]/a"))
#             self.cosdna_url = top.get_attribute('href')
#             self.cosdna_id = re.findall(".*eng//(.*).html", self.cosdna_url)[0]
#             self.linked = True
#             return self
#         except:
#             print('no results on cosdna')
#             name = input('enter new search: ')
#             if name == 'break':
#                 return self
#             else:
#                 return self.link(name=name)
            
#     def sync(self, cosdna_url=None):

#         # navigate to url and get cosdna_id
#         if self.cosdna_url == None:
#             if cosdna_url == None:
#                 return 'create instance with cosdna_id or .link() instance with cosdna_url'
#             else:
#                 self.cosdna_url = cosdna_url
#         self.cosdna_id = re.findall(".*cosmetic_(.*).html", self.cosdna_url)[0]
#         self.driver.get(self.cosdna_url)

#         # get brand and product information from page
#         self.brand = self.driver.find_element_by_class_name('brand-name').text.lower()
#         self.product = self.driver.find_element_by_class_name('prod-name').text.lower()
#         cosdna_name = self.brand + ' ' + self.product
#         self.cosdna_name = cosdna_name.strip()

#         # get ingredients information from ingredients table
#         self.ingredients = {}
#         table = self.driver.find_element_by_class_name('chem-list')
#         rows = table.find_elements_by_tag_name('tr')
#         for row in rows:
#             ingredient = row.find_elements_by_tag_name('td')[0].text.strip().lower()
#             if 'no results' in ingredient:
#                 ingredient = row.find_elements_by_class_name('text-muted')[0].text.strip().lower()
#                 function = None
#             else:
#                 try:
#                     function_cell = row.find_elements_by_tag_name('td')[1]
#                     function = function_cell.text.strip().lower().split(',')
#                     if 'sunscreen' in function:
#                         try:
#                             uva = re.search("uv[ab]\d", function_cell.find_elements_by_tag_name('img')[0].get_attribute('src'))[0]
#                             uvb = re.search("uv[ab]\d", function_cell.find_elements_by_tag_name('img')[1].get_attribute('src'))[0]
#                             function.append(uva)
#                             function.append(uvb)
#                         except:
#                             continue
#                 except:
#                     function = None
#             self.ingredients[ingredient] = function
#         # change state to True
#         self.synced = True
#         return self

In [16]:
# class BasePage():
#     """This class is the parent class for all the pages in our application."""
#     """It contains all common elements and functionalities available to all pages."""

#     # this function is called every time a new object of the base class is created.
#     def __init__(self, driver):
#         self.driver=driver

#     # this function performs click on web element whose locator is passed to it.
#     def click(self, by_locator):
#         WebDriverWait(self.driver, 10).until(EC.visibility_of_element_located(by_locator)).click()
    
#     # this function asserts comparison of a web element's text with passed in text.
#     def assert_element_text(self, by_locator, element_text):
#         web_element=WebDriverWait(self.driver, 10).until(EC.visibility_of_element_located(by_locator))
#         assert web_element.text == element_text

#     # this function performs text entry of the passed in text, in a web element whose locator is passed to it.
#     def enter_text(self, by_locator, text):
#         return WebDriverWait(self.driver, 10).until(EC.visibility_of_element_located(by_locator)).send_keys(text)

#     # this function checks if the web element whose locator has been passed to it, is enabled or not and returns
#     # web element if it is enabled.
#     def is_enabled(self, by_locator):
#         return WebDriverWait(self.driver, 10).until(EC.visibility_of_element_located(by_locator))

#     # this function checks if the web element whose locator has been passed to it, is visible or not and returns
#     # true or false depending upon its visibility.
#     def is_visible(self,by_locator):
#         element=WebDriverWait(self.driver, 10).until(EC.visibility_of_element_located(by_locator))
#         return bool(element)
    
#     # this function moves the mouse pointer over a web element whose locator has been passed to it.
#     def hover_to(self, by_locator):
#         element = WebDriverWait(self.driver, 10).until(EC.visibility_of_element_located(by_locator))
#         ActionChains(self.driver).move_to_element(element).perform()