In [1]:
import re
import time
import json
import unidecode
import numpy as np

from collections import Counter, OrderedDict
from requests_html import HTMLSession
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# TODO: decide whether the __getattr__ fallbacks on Cosmetic/Product are actually needed

In [2]:
class OrderedCounter(Counter, OrderedDict):
    '''
    Counter whose keys also carry OrderedDict behaviour.

    Inheriting from both classes is all that is required to combine them,
    so the class body is intentionally empty.
    See https://stackoverflow.com/a/35448557
    '''

In [125]:
class Cosmetic(HTMLSession):
    '''
    Parent class for organizing Products and Ingredients from CosDNA.com.
    Not intended to be used on its own.

    Child classes are expected to provide:
    - ``_urls``  : dict with the 'base' and 'search' URL prefixes
    - ``_query`` : the string sent to the CosDNA search page
    '''

    _domain = 'https://cosdna.com'

    # query-string suffixes understood by the CosDNA product search
    _sort_dict = {
        'latest': '&sort=date',
        'featured': '&sort=featured',
        'clicks': '&sort=click',
        'reviews': '&sort=review'
    }

    # generic product-type words dropped from a query when the first
    # search attempt returns nothing
    _stop_words = [
        'cleanser',
        'cream',
        'lotion',
        'mask',
        'masque',
        'moisturizer',
        'serum',
        'sunblock',
        'sunscreen',
        'toner',
        'treatment'
    ]

    # local ingredient cache, read once when the class body is executed
    with open('./data/ingredients/ingredients.json', 'rb') as handle:
        INGREDIENTS = json.load(handle)

    def __init__(self, name=None, cosdna_id=None):
        super().__init__()
        self._name = self.clean(name)
        # an explicit declaration with CosDNA ID assumed to be valid
        self.cosdna_id = cosdna_id
        self._skip = False
        self._synced = False

    def __getattr__(self, name):
        '''
        Fallback for attributes that have not been set yet.

        Returns False for the state flags and None for everything else, so
        un-synced instances can be inspected without AttributeError.
        '''
        # Protocol probes (copy, pickle, numpy, IPython display, ...) must
        # see a real AttributeError instead of a bogus None attribute.
        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(name)
        if name in ('_skip', '_synced'):
            return False
        return None

    def link(self, sort=None, cosdna_url=None):
        '''
        Links the instance to a CosDNA page.

        - already linked (has a CosDNA ID): nothing happens
        - cosdna_url blank: searches CosDNA and stores the top hit in
          self._cosdna_url
        - cosdna_url given: extracts and stores the CosDNA ID from it

        Returns self.
        '''
        if self.linked:
            return self
        if not cosdna_url:
            self._search(sort=sort)
        elif self._urls['base'] in cosdna_url:
            self.cosdna_id = self._u2i(cosdna_url)
        else:
            print('Link with valid CosDNA URL or product name to proceed.')
        return self

    def _search(self, sort=None, _try_stopwords=True):
        '''
        Searches CosDNA for self._query and stores the first hit.

        If nothing is found the query is retried once without the generic
        stop words; after that the user is prompted for a new query (or
        'SKIP'). Returns self.
        '''
        # self._query defined in child classes
        if not self._query or self._skip:
            print('Link with valid CosDNA URL or product name to proceed.')
            return self

        search_url = self._get_search_url(query=self._query, sort=sort)
        self._r = self.get(search_url)
        top = self._r.html.find('td', first=True)
        href = top.xpath('//a/@href', first=True) if top else None
        if href:
            self._cosdna_url = Cosmetic._domain + href
            return self

        if _try_stopwords:
            stripped = re.sub('|'.join(self._stop_words), '', self._query)
            # only spend another request if the query actually changed
            if stripped != self._query:
                self._query = stripped
                return self._search(sort=sort, _try_stopwords=False)

        if self._r.html.find('.text-danger'):
            print(f'No results for {self._name} on CosDNA.')
            print("Enter new search (to skip search, enter 'SKIP' w/o quotes):")
            self._query = input(' ')
            if self._query == 'SKIP':
                self._skip = True
                return self
            return self._search(sort=sort)

        # no result table and no error banner: keep whatever page we got
        self._cosdna_url = self._r.url
        return self

    def _get_search_url(self, query=None, sort=None):
        '''
        Generates a php search_url directly from query
        '''
        query = self.clean(query, True)
        # Product() has different sort options, Ingredient() does not;
        # unknown or missing sort keys add nothing to the URL
        return self._urls['search'] + query + Cosmetic._sort_dict.get(sort, '')

    def sync(self):
        '''
        Sets up scrape from linked url
        Child classes define more actions
        '''
        if self._skip:
            return self
        if self.cosdna_url:
            self._r = self.get(self.cosdna_url)
            # more actions defined in child classes
        else:
            print('Initialize or link with valid CosDNA URL to proceed')
        return self

    def _u2i(self, url):
        '''
        Extracts the CosDNA ID from a page URL.

        The ID prefix that belongs to the child's base URL (e.g.
        'cosmetic_' for products) is stripped so that _i2u(_u2i(url))
        round-trips. Raises ValueError if url is not a CosDNA page URL.
        '''
        found = re.findall(r'eng/(.*)\.html', url)
        if not found:
            raise ValueError(f'Not a CosDNA page URL: {url!r}')
        id_ = found[0]
        urls = self._urls or {}
        prefix = urls.get('base', '').rsplit('/', 1)[-1]
        if prefix and id_.startswith(prefix):
            id_ = id_[len(prefix):]
        return id_

    def _i2u(self, id_):
        '''Builds the page URL that belongs to a CosDNA ID.'''
        return self._urls['base'] + id_ + '.html'

    @staticmethod
    def check(key, dict_):
        '''Returns True if key is present in dict_.'''
        return key in dict_

    @staticmethod
    def clean(string, query=False):
        '''
        Normalizes a name to lowercase ASCII.

        Parameters
        ----------
        string : str or None
            Text to normalize. None yields an empty string.

        query : bool, default False
            If True, returns a '+'-joined search string with the boolean
            operator words (and / or / not) removed. Otherwise returns a
            space-separated name with '&' spelled out as 'and'.
        '''
        # https://stackoverflow.com/a/2633310
        if string is None:
            return ''
        string = re.sub('[‘’]', "'", string)
        string = unidecode.unidecode(string.lower())
        if not query:
            # must happen before the filter below, which would drop the '&'
            string = string.replace('&', ' and ')
        string = re.sub(r"[^a-z0-9\s\-']", '', string)
        if query:
            # whole words only: 'sorbitol' must not turn into 'sbitol'
            string = re.sub(r'\b(?:and|or|not)\b', '', string)
            string = re.sub(r'\s+', '+', string.strip())
        else:
            string = re.sub(r'\s+', ' ', string).strip()
        return string

    @property
    def cosdna_url(self):
        '''
        Returns the CosDNA page URL of this entry.

        Built from the CosDNA ID when one is known, otherwise the URL found
        by the last search (None if there is none).

        Aliases of the same ingredient point to the same URL in the CosDNA
        database. In the absence of our own relational database, we rely on
        the ID inside this URL to collapse multiple aliases into a single
        entry, allowing us to:
        - collapse aliases together when analyzing routines
        - search for aliases
        '''
        if self.cosdna_id:
            return self._i2u(self.cosdna_id)
        return self._cosdna_url

    # linked and synced defined as properties
    # since Routine() will not share this behavior
    @property
    def linked(self):
        '''True once a CosDNA ID is known.'''
        return bool(self.cosdna_id)

    @property
    def synced(self):
        '''True once the linked page has been scraped.'''
        return self._synced

In [126]:
class Ingredient(Cosmetic):
    '''
    Syncs and stores ingredient information from CosDNA.com.

    Search for ingredients based on name, link to CosDNA page, and scrape
    information to store in instance.

    Parameters
    ----------
    name : str, default None
        Name of ingredient.

    cas_no : str, default None
        CAS Registry Number. Takes priority over name when searching.

    cosdna_id : str, default None
        ID of ingredient in CosDNA database. Assumed to be valid.

    >>> i = Ingredient('salicylic acid')
    >>> i.name                                     # returns assigned name
    'salicylic acid'
    >>> i.link_sync()                              # scrapes top result
    >>> i.name                                     # returns CosDNA name
    'bha'

    >>> j = Ingredient('capryloyl salicylic acid')
    >>> j.link('https://cosdna.com/eng/0f1b7f1402.html')    # directly set link
    >>> j.sync()
    >>> j.name
    'capryloyl salicylic acid'
    '''

    _urls = {
        'base': 'https://cosdna.com/eng/',
        'search': 'https://cosdna.com/eng/stuff.php?q='
    }

    def __init__(self, name=None, cas_no=None, cosdna_id=None):
        super().__init__(name=name, cosdna_id=cosdna_id)
        self.cas_no = cas_no
        # self._query separated from self._name
        # to give priority to search via CAS No.
        self._query = self.cas_no if self.cas_no else self._name

    def link(self, cosdna_url=None):
        '''
        Links Ingredient() to cosdna_url

        Parameters
        ----------
        cosdna_url : str, default None
            CosDNA URL of ingredient
            If cosdna_url is None, searches for cosdna_url using either:
                - cas_no (preferential), or
                - name

        Returns self.
        '''
        return super().link(cosdna_url=cosdna_url)

    def sync(self):
        '''
        Fills in ingredient information
        - name on CosDNA website
        - ingredient aliases
        - molar mass
        - hydro-/lipo-philic balance
        - CAS Registry Number
        - ingredient description

        Ingredients whose CosDNA ID is already in the local INGREDIENTS
        cache are loaded from there without a request; everything else is
        scraped from the linked URL. Returns self.

        Visit the following websites for more information:
        - molar mass: <https://en.wikipedia.org/wiki/Molar_mass>
        - HLB: <https://en.wikipedia.org/wiki/Hydrophilic-lipophilic_balance>
        - CAS No.: <https://en.wikipedia.org/wiki/CAS_Registry_Number>
        '''
        cached = self.INGREDIENTS.get(self.cosdna_id) if self.cosdna_id else None
        if cached is not None:
            self._load_cached(cached)
            return self

        super().sync()          # goes to cosdna_url
        if self._skip or not self.cosdna_url:
            # nothing was fetched, so there is nothing to scrape
            return self

        self._set_names()
        self._set_chemical_info()
        description = self._r.html.find(
            'div.chem.mb-5 > div.linkb1.ls-2.lh-1', first=True
        )
        self._description = description.text if description is not None else None
        self.cosdna_id = self._u2i(self._r.url)
        self._synced = True
        return self

    def _load_cached(self, entry):
        '''
        Helper function for self.sync()

        Copies one INGREDIENTS cache entry (same keys as self.info) into
        the instance and marks it as synced.
        '''
        self._cosdna_name = entry.get('cosdna_name')
        self._aliases = entry.get('aliases')
        self._mass = entry.get('mass')
        self._hlb = entry.get('hlb')
        self._cas_no = entry.get('cas_no')
        self._description = entry.get('description')
        self._synced = True
        return self

    def link_sync(self, cosdna_url=None):
        '''Calls self.link() followed by self.sync(). Returns self.'''
        self.link(cosdna_url=cosdna_url)
        self.sync()
        return self

    def _set_names(self):
        '''
        Helper function for self.sync()
        self.sync() > self._set_names()

        Stores the name and aliases of the ingredient as they appear in the
        linked URL (None / empty list when the page lacks them)
        '''
        header = self._r.html.find('.text-vampire', first=True)
        self._cosdna_name = header.text.lower() if header is not None else None
        alias_box = self._r.html.find('div.chem.mb-5 > div.mb-2', first=True)
        self._aliases = (
            alias_box.text.lower().split(', ') if alias_box is not None else []
        )
        return self

    def _set_chemical_info(self):
        '''
        Helper function for self.sync()
        self.sync() > self._set_chemical_info()

        Stores the molar mass, hydro-/lipo-philic balance, and CAS Registry
        Number as they appear in the linked URL. Values that are missing or
        cannot be parsed are stored as None.
        '''
        box = self._r.html.find('div.d-flex.justify-content-between', first=True)
        ci = box.text if box is not None else ''

        def first_group(pattern):
            match = re.search(pattern, ci)
            return match.group(1) if match else None

        mass = hlb = cas_no = None
        # decimals only: a bare integer right after the label could belong
        # to the next field (e.g. the first block of a CAS number)
        if 'Molecular Weight' in ci:
            mass = first_group(r'Weight[^\d.]+(\d+\.\d+)')
        if 'HLB' in ci:
            hlb = first_group(r'HLB[^\d.]+(\d+\.\d+)')
        if 'Cas No' in ci:
            cas_no = first_group(r'Cas No[^\d\-]+(\d+-\d+-\d+)')
        self._mass = float(mass) if mass is not None else None
        self._hlb = float(hlb) if hlb is not None else None
        self._cas_no = cas_no
        return self

    @property
    def name(self):
        '''
        Best available name: the cached name or CosDNA name once synced,
        otherwise the assigned name (prefixed with 'SKIP: ' if skipped).
        '''
        if self.synced:
            cached = self.INGREDIENTS.get(self.cosdna_id)
            if cached and cached.get('name'):
                return cached['name']
            return self._cosdna_name or self._name
        if self._skip:
            return 'SKIP: ' + self._name
        return self._name

    @property
    def info(self):
        '''
        Returns {cosdna_id: details}, taken from the INGREDIENTS cache when
        the ID is cached and from the instance attributes otherwise.
        '''
        cached = self.INGREDIENTS.get(self.cosdna_id)
        if cached is not None:
            return {self.cosdna_id: cached}
        sub_info = {
            'name': self.name,
            'cosdna_name': self._cosdna_name,
            'aliases': self._aliases,
            'mass': self._mass,
            'hlb': self._hlb,
            'cas_no': self._cas_no,
            'description': self._description,
        }
        return {self.cosdna_id: sub_info}

    def __str__(self):
        return str(self.name)

    def __repr__(self):
        # prefer the scraped CAS No., fall back to the one given at creation
        cas_no = self._cas_no or self.cas_no
        return f"""Ingredient(
    name='{self.name}',
    cas_no='{cas_no}',
    cosdna_id='{self.cosdna_id}'
)
"""

In [106]:
# Load the ingredient-names JSON file; the loop further down reads it as a
# mapping of CosDNA ID -> dict with a 'name' key.
with open('./data/ingredients/ingredient-names.json', 'rb') as handle:
    INGREDIENT_NAMES = json.load(handle)

In [113]:
with open('./data/ingredients/ingredients.json', 'rb') as handle:
    INGREDIENTS = json.load(handle)

# Backfill the ingredient cache: scrape every named ingredient that is not
# cached yet.
for k, v in INGREDIENT_NAMES.items():
    if k == 'unavailable' or k in INGREDIENTS:
        continue
    ingredient = Ingredient(name=v['name'], cosdna_id=k)
    ingredient.link_sync()
    INGREDIENTS.update(ingredient.info)
    # persist after every scrape so an interrupted run loses no work
    with open('./data/ingredients/ingredients.json', 'w') as handle:
        json.dump(INGREDIENTS, handle, indent=4)
    # be polite to the CosDNA server between requests
    time.sleep(5)


In [128]:
# Smoke test: build an Ingredient from a known CosDNA ID and show its info
# without calling sync() first.
test = Ingredient('salicylic acid', cosdna_id='466110332')
test.info

{'466110332': {'name': 'salicylic acid',
  'cosdna_name': 'bha',
  'aliases': ['salicylates',
   '2-hydroxybenzoic',
   'salicylic acid',
   'beta hydroxy acid'],
  'mass': 138.12,
  'hlb': None,
  'cas_no': '69-72-7',
  'description': 'Exfoliator, Whitening'}}

In [95]:
# Link and sync the `test` ingredient, then show its info again.
test.link_sync()
test.info

{'466110332': {'name': 'salicylic acid',
  'cosdna_name': 'bha',
  'aliases': ['salicylates',
   '2-hydroxybenzoic',
   'salicylic acid',
   'beta hydroxy acid'],
  'mass': 138.12,
  'hlb': None,
  'cas_no': '69-72-7',
  'description': 'Exfoliator, Whitening'}}

In [123]:
class Product(Cosmetic):
    '''
    Syncs and stores product information from CosDNA.com.

    Search for products based on name, link to CosDNA page, and scrape
    information to store in instance.

    Parameters
    ----------
    name : str, default None
        Name of product. Ignored when both brand and product are given.

    brand : str, default None
        Name of brand.

    product : str, default None
        Name of product. When brand and product are both given, the
        search name becomes '<brand> <product>'.

    cosdna_id : str, default None
        ID of product in CosDNA database. Assumed to be valid.
    '''

    _urls = {
        'base': 'https://cosdna.com/eng/cosmetic_',
        'search': 'https://cosdna.com/eng/product.php?q='
    }

    def __init__(self, name=None, brand=None, product=None, cosdna_id=None):
        self.brand = brand
        self.product = product
        if brand and product:
            name = brand + ' ' + product
        super().__init__(name=name, cosdna_id=cosdna_id)
        self._query = self._name

    def __getattr__(self, name):
        '''
        Fallback for unset attributes: an empty ingredient list for
        '_ings', otherwise whatever the parent class falls back to.
        '''
        if name == '_ings':
            return []
        return super().__getattr__(name)

    def link(self, sort='featured', cosdna_url=None):
        '''
        Links Product() to cosdna_url

        Parameters
        ----------
        sort : str, default 'featured'
            Selects the search parameters to use:
            - None (the default sort option on CosDNA)
            - 'latest' : most recent entries first
            - 'featured': seems to be a weighted average of 'latest' and
                    'clicks'
            - 'clicks' : most visited entries first
            - 'reviews' : most reviews first
        cosdna_url : str, default None
            CosDNA URL of product
            If cosdna_url is None, searches for cosdna_url using name
        '''
        return super().link(sort=sort, cosdna_url=cosdna_url)

    def sync(self):
        '''
        Scrapes information from linked URL
        - brand name
        - product name
        - ingredient names and corresponding URLs
        Saves ingredients as Ingredient()

        Returns self. Nothing is scraped when the product was skipped or
        has no CosDNA URL yet.
        '''
        super().sync()
        if self._skip:
            self._ings = []
            return self
        if not self.cosdna_url:
            # super().sync() already printed the hint; no page to scrape
            return self
        self.cosdna_id = self._u2i(self._r.url).replace('cosmetic_', '')
        self._set_name_brand_product()
        self._set_ingredients()
        self._synced = True
        return self

    def link_sync(self, sort='featured', cosdna_url=None):
        '''Calls self.link() followed by self.sync(). Returns self.'''
        self.link(sort=sort, cosdna_url=cosdna_url)
        self.sync()
        return self

    def _set_name_brand_product(self):
        """
        Helper function for self.sync()
        Stores the brand name and product name of the product as they appear
        in the linked URL
        """
        brand_el = self._r.html.find('.brand-name', first=True)
        product_el = self._r.html.find('.prod-name', first=True)
        brand = self.clean(brand_el.text) if brand_el is not None else ''
        product = self.clean(product_el.text) if product_el is not None else ''
        if brand:
            self.cosdna_brand, self.cosdna_product = brand, product
        self.cosdna_name = (brand + ' ' + product).strip()

    def _known_ingredient_names(self):
        '''
        Helper function for self._set_ingredients()

        Returns the CosDNA ID -> {'name': ...} mapping used to pick
        preferred ingredient names: a class attribute INGREDIENT_NAMES if
        one exists, else the module-level INGREDIENT_NAMES, else {}.
        '''
        names = getattr(type(self), 'INGREDIENT_NAMES', None)
        if not names:
            names = globals().get('INGREDIENT_NAMES')
        return names or {}

    def _set_ingredients(self):
        '''
        Helper function for self.sync()
        self.sync() > self._set_ingredients()

        Stores the ingredients listed on the product page as Ingredient()
        objects in self._ings. Rows without a usable ingredient link are
        stored by name in self._missing_ings.
        '''
        names = self._known_ingredient_names()
        self._ings = []
        self._missing_ings = []
        for row in self._r.html.find('.tr-i'):
            cells = row.find('td')
            if not cells:
                continue
            ing = cells[0]
            ing_id = None
            if len(cells) == 5:
                # ingredient, function, acne, irritant, safety
                href = ing.xpath('//a/@href', first=True)
                match = re.search(r'eng/(.*)\.html', href or '')
                ing_id = match.group(1) if match else None
            if ing_id:
                entry = names.get(ing_id)
                if isinstance(entry, dict) and entry.get('name'):
                    ing_name = entry['name']
                else:
                    ing_name = self.clean(ing.text)
                # add function, acne, irritant, safety
                # if deep, sync with ingredient page too and sleep
                self._ings.append(Ingredient(name=ing_name, cosdna_id=ing_id))
            else:
                muted = ing.find('.text-muted', first=True)
                label = muted.text if muted is not None else ing.text
                self._missing_ings.append(label.strip().lower())
        return self

    # def _get_function_info(self, fun):
    #     function = fun.text.strip().lower().split(',')
    #     if 'sunscreen' in function:
    #         try:
    #             uva = re.search("uv[ab]\d",
    #                             fun.xpath('//img')[0].attrs['src'])[0]
    #             uvb = re.search("uv[ab]\d",
    #                             fun.xpath('//img')[1].attrs['src'])[0]
    #             function.append(uva)
    #             function.append(uvb)
    #         except:
    #             pass
    #     return function

    @property
    def name(self):
        '''
        '<brand> <product>' when both are set, otherwise the cleaned name
        given at creation.
        '''
        if self.brand and self.product:
            return self.brand + ' ' + self.product
        return self._name

    @property
    def ingredients(self):
        '''Names of the scraped ingredients.'''
        return [ing.name for ing in self._ings]

    @property
    def cosdna_urls(self):
        '''CosDNA page URLs of the scraped ingredients.'''
        return [ing.cosdna_url for ing in self._ings]

    @property
    def _ing_ids(self):
        return [ing.cosdna_id for ing in self._ings]

    @property
    def info(self):
        '''
        Returns {cosdna_id: details} with the product's name, brand,
        product and the info dict of every ingredient.
        '''
        sub_info = {
            'name': self.name,
            'brand': self.brand,
            'product': self.product,
            'ingredients': [ing.info for ing in self._ings],
        }
        return {self.cosdna_id: sub_info}

    @property
    def ing_dict(self):
        '''Returns {ingredient cosdna_id: ingredient name}.'''
        return {
            ing.cosdna_id: ing.info[ing.cosdna_id]['name']
            for ing in self._ings
        }

    # def __str__(self):
    #     return f'{self.name}\n\n{self.ingredients}'

In [124]:
# Smoke test: build a Product from brand + product, link and sync it, then
# show the collected info.
test = Product(brand='chemist confessions', product='better oil')
test.link_sync()
test.info

{'5469385906': {'name': 'chemist confessions better oil',
  'brand': 'chemist confessions',
  'product': 'better oil',
  'ingredients': [{'18c16716561': {'name': 'hippophae rhamnoides seed oil',
     'cosdna_name': None,
     'aliases': None,
     'mass': None,
     'hlb': None,
     'cas_no': None,
     'description': None}},
   {'5358a014437': {'name': 'ribes nigrum seed oil',
     'cosdna_name': None,
     'aliases': None,
     'mass': None,
     'hlb': None,
     'cas_no': None,
     'description': None}},
   {'8a961d14427': {'name': 'rosa rubiginosa seed oil',
     'cosdna_name': None,
     'aliases': None,
     'mass': None,
     'hlb': None,
     'cas_no': None,
     'description': None}},
   {'828b29359': {'name': 'squalane',
     'cosdna_name': None,
     'aliases': None,
     'mass': None,
     'hlb': None,
     'cas_no': None,
     'description': None}},
   {'affbc2955': {'name': 'alpha-bisabolol',
     'cosdna_name': None,
     'aliases': None,
     'mass': None,
     'hlb'

In [98]:
# Show the CosDNA ID -> ingredient name mapping of `test`
# (presumably the Product built above; `test` is reused across cells).
test.ing_dict

{'18c16716561': 'hippophae rhamnoides seed oil',
 '5358a014437': 'ribes nigrum seed oil',
 '8a961d14427': 'rosa rubiginosa seed oil',
 '828b29359': 'squalane',
 'affbc2955': 'alpha-bisabolol',
 '0287c3229': 'linoleic acid',
 'ba78081902': 'linolenic acid',
 '3bf28a485': 'vitamin e',
 '12af7d17919': 'geranium oil',
 '4da61720318': 'salvia sclarea oil',
 '3f6f9914103': 'chamaecyparis obtusa oil'}

In [114]:
class Routine():
    '''
    Container for Product() and Ingredient() objects.

    Tabulates all ingredients in routine.

    Parameters
    ----------
    name : str, default None
        Name of routine. Optional

    routine : list, default None
        List of products in routine. Products are stored as Product() objects.
    '''

    def __init__(self, name=None, routine=None):
        self.name = name
        self.products = []
        self._changes = False
        if routine is not None:
            self.add(routine)

    def add(self, products):
        '''
        Adds products to the routine

        Parameters
        ----------
        products : str, Product or (nested) list of them
            Products to add. Strings are wrapped in Product(); strings of
            one character or less are ignored. Product() objects are
            stored as they are.
        '''
        for item in self._flatten(products):
            if isinstance(item, Product):
                self.products.append(item)
            elif isinstance(item, str) and len(item) > 1:
                self.products.append(Product(item))
        return self

    @staticmethod
    def _flatten(items):
        '''Yields the str / Product leaves of an arbitrarily nested input.'''
        if isinstance(items, (str, Product)):
            yield items
            return
        try:
            iterator = iter(items)
        except TypeError:
            # not a container (e.g. None or NaN): nothing to add
            return
        for item in iterator:
            yield from Routine._flatten(item)

    def link(self, sort='featured'):
        '''
        Calls Product.link() for all products in routine that are not
        linked yet

        Parameters
        ----------
        sort : str, default 'featured'
            Chooses how to sort the search results for each Product()
        '''
        return self.link_sync(sort=sort, _link=True, _sync=False)

    def sync(self, force=False, sleep=0.5):
        '''
        Calls Product.sync() for all products in routine

        Parameters
        ----------
        force : bool, default False
            Calls Product.sync() on Product() even if it has already been
            synced

        sleep : float, default 0.5
            Seconds to wait after each product that was synced
        '''
        return self.link_sync(force=force, sleep=sleep, _link=False, _sync=True)

    def link_sync(self, sort='featured', force=False, sleep=0.5,
                  _link=True, _sync=True):
        '''
        Calls Product.link().sync() for all products in routine
        Tabulates frequency of ingredients across entire routine

        Parameters
        ----------
        sort : str, default 'featured'
            Chooses how to sort the search results for each Product()

        force : bool, default False
            Calls Product.link() / Product.sync() on Product() even if it
            has already been linked / synced

        sleep : float, default 0.5
            Seconds to wait after each product that was linked or synced

        Returns self.
        '''
        for product in self.products:
            touched = False
            if _link and (force or not product.linked):
                product.link(sort=sort)
                touched = True
            if _sync and (force or not product.synced):
                product.sync()
                self._changes = True
                touched = True
            # only pause when a request may actually have been made
            if touched:
                time.sleep(sleep)
        if self._changes:
            self._analyze()
            self._changes = False
        return self

    def _analyze(self):
        '''
        Helper function for self.sync()
        self.sync() > self._analyze()

        Tabulates frequency of cosdna_ids across all Products
        '''
        self._set_routine_info()
        self._counts = self._translate_counter(self._ing_dict,
                                               Counter(self._ing_ids))
        self._set_product_vectors()
        return self

    def _set_routine_info(self):
        '''
        Helper function for self.sync()
        self.sync() > self._analyze() > self._set_routine_info()

        Creates dictionary to translate cosdna_id to ingredient name
        Creates _ing_ids, _ing_dict and info for entire routine
        '''
        self._ing_ids = []
        self._ing_dict = {}
        self.info = {}
        for product in self.products:
            if not product._skip:
                self._ing_ids += product._ing_ids
                self._ing_dict.update(product.ing_dict)
                self.info.update(product.info)
        return self

    def _translate_counter(self, translation_dict, counter):
        '''
        Helper function for self.sync()
        self.sync() > self._analyze() > self._translate_counter()

        Translates the keys of a Counter() using translation_dict; keys
        without a translation are kept. Counts of keys that translate to
        the same name are added up.
        Visit <https://stackoverflow.com/questions/51423217/> for more
        information
        '''
        translated = OrderedCounter()
        for key, count in counter.items():
            translated[translation_dict.get(key, key)] += count
        return translated

    def _set_product_vectors(self):
        '''
        Helper function for self.sync()
        self.sync() > self._analyze() > self._set_product_vectors()

        Generates one 0/1 vector per product, aligned with self._ing_ids,
        in order to quickly assess the presence of ingredients in a routine.
        '''
        self._product_vectors = []
        for product in self.products:
            own_ids = set(product._ing_ids)
            self._product_vectors.append(
                [1 if cosdna_id in own_ids else 0 for cosdna_id in self._ing_ids]
            )
        return self

    def top_ingredients(self, top=None, mask=None):
        '''
        Returns ingredient counts for the routine, keyed by ingredient name

        Parameters
        ----------
        top : int, default None
            Specifies number of most common ingredients to return.
            None returns all of them.

        mask : list, default None
            Names of the only ingredients to count. Every name is looked
            up on CosDNA, so aliases work as well.
        '''
        counts = getattr(self, '_counts', OrderedCounter())
        if mask:
            mask_ids = {Ingredient(x).link_sync().cosdna_id for x in mask}
            kept = [x for x in getattr(self, '_ing_ids', []) if x in mask_ids]
            counts = self._translate_counter(getattr(self, '_ing_dict', {}),
                                             Counter(kept))
        if top is not None:
            counts = OrderedCounter(dict(counts.most_common(top)))
        return counts

    def has(self, ingredient):
        '''
        Returns names of products which include ingredient

        Parameters
        ----------
        ingredient : str
            Name of ingredient. Works for aliases as long as they are present
            in CosDNA.
        '''
        # add 'AND' / 'OR' functionality!
        ingredient_id = Ingredient(ingredient).link_sync().cosdna_id
        ing_ids = getattr(self, '_ing_ids', [])
        if ingredient_id is None or ingredient_id not in ing_ids:
            print(f'Routine does not have {ingredient}.')
            return []
        ingredient_index = ing_ids.index(ingredient_id)
        return [
            product.name
            for product, product_vector in zip(self.products,
                                               self._product_vectors)
            if product_vector[ingredient_index] == 1
        ]

    @property
    def routine(self):
        '''Names of the products in the routine.'''
        return [getattr(product, 'name', product) for product in self.products]

    @property
    def cosdna_urls(self):
        return [product.cosdna_url for product in self.products]

    @property
    def brands(self):
        return [product.brand for product in self.products]

    @property
    def linked(self):
        '''True if every product is linked.'''
        return all(product.linked for product in self.products)

    @property
    def synced(self):
        '''True if every product is synced.'''
        return all(product.synced for product in self.products)

    @property
    def ingredients(self):
        '''Unique ingredient names of the routine, in order of appearance.'''
        return list(dict.fromkeys(getattr(self, '_ing_dict', {}).values()))

    @property
    def top(self):
        '''The 10 most common ingredients.'''
        return self.top_ingredients(10)

    def __str__(self):
        return f'Routine "{self.name}" with {len(self.products)} products'

    def __repr__(self):
        return f'Routine(name={self.name!r}, routine={self.routine!r})'

In [115]:
# Smoke test: build a Routine from three product names, then link and sync
# every product in it.
test = Routine(routine=[
    'chemist confessions cleanser',
    'chemist confessions aquafix',
    'chemist confessions better oil'
])
test.link_sync()

In [117]:
# Show the ingredient counts of the `test` routine.
test.top_ingredients()

OrderedCounter({'water': 2,
                'sodium lauryl glucose carboxylate': 1,
                'lauryl glucoside': 1,
                'glycereth-26': 1,
                'propanediol': 2,
                'disodium cocoyl glutamate': 1,
                'peg-7 glyceryl cocoate': 1,
                'ceteareth-60 myristyl glycol': 1,
                'dicaprylyl ether': 1,
                'lactic acid': 1,
                'acrylates copolymer': 1,
                '12-hexanediol': 1,
                'caprylhydroxamic acid': 2,
                'chlorphenesin': 1,
                'glycerin': 1,
                'ethoxydiglycol': 1,
                'sodium pca': 1,
                'panthenol': 1,
                'allantoin': 1,
                'vinyl dimethiconemethicone silsesquioxane crosspolymer': 1,
                'madecassoside': 1,
                'asiaticoside': 1,
                'polyacrylate crosspolymer-6': 1,
                'sodium hyaluronate': 1,
                'xanthan gum'

In [None]:
def cosdna_id(url):
    '''
    Returns a unique identifier based on a CosDNA URL

    Aliases of the same ingredient point to the same URL in the CosDNA
    database. In the absence of our own relational database, we rely on
    this identifier to collapse multiple aliases into a single entry,
    allowing us to:
    - collapse aliases together when analyzing routines
    - search for aliases

    Product pages (".../eng/cosmetic_<id>.html") give 'p_<id>'; any other
    ".../eng/<id>.html" page gives 'i_<id>'.

    Returns 'unavailable' when url is empty, is not a string, or does not
    contain an "eng/<id>.html" path.
    '''
    if not url or not isinstance(url, str):
        return 'unavailable'
    # the optional group captures the literal product prefix; the dot
    # before "html" is escaped so only a real ".html" suffix matches
    match = re.search(r'eng/(cosmetic_)?(.*)\.html', url)
    if match is None:
        return 'unavailable'
    prefix = 'p_' if match.group(1) else 'i_'
    return prefix + match.group(2)

In [None]:
print(cosdna_id('https://cosdna.com/eng/cosmetic_6cc9510258.html'))
print(cosdna_id('https://cosdna.com/eng/90172810251.html'))
print(cosdna_id(None))

In [None]:
asession.close()

In [None]:
import os

# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH
# environment variable to point somewhere else without editing this cell;
# the literal below is only the fallback.
# CHROMEDRIVER_PATH = '/Users/Jelly/Documents/googledrive/chromedriver/macos/chromedriver'
CHROMEDRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH', 'C:/chromedriver_win32/chromedriver.exe')

In [None]:
driver = webdriver.Chrome(CHROMEDRIVER_PATH)

In [None]:
# imported here so this cell also runs on a fresh kernel
from glob import glob

# every CSV file sitting directly under ./data
file_list = glob('./data/*.csv')
file_list

In [None]:
# imported here so this cell also runs on a fresh kernel
import pandas as pd

# NOTE(review): index 3 depends on the order in which file_list was built;
# confirm it still selects the intended survey export.
responses = pd.read_csv(file_list[3])

# short snake_case names, in the same order as the CSV's columns
new_columns = [
    'timestamp',
    'skin_type',
    'cannot_contain',
    'skin_concern',
    'am_routine',
    'pm_routine',
    'climate',
    'skin_sensitivities',
    'used_retinoids',
    'used_acids',
    'prone_to_breakouts',
    'miscellaneous',
    'permission'
]
responses.columns = new_columns
responses.head(2)

In [None]:
routine_columns = ['am_routine', 'pm_routine']

# lower-case each routine column, treat ';' and newlines as commas,
# then split every answer into a list on runs of commas
for col in routine_columns:
    lowered = responses[col].str.lower()
    comma_separated = lowered.str.replace(';', ',').str.replace('\n', ',')
    responses[col] = comma_separated.str.split(r'\,+\s*')
    print(responses[col][:2])

In [None]:
responses['am_routine'][:5]

In [None]:
def routine_analyzer(routine):
    '''
    Interactively builds Routine objects for a slice of routines.

    routine: indexable, sized collection of routines (for example
        responses['am_routine']); element i is passed on as
        Routine(routine=...).

    Prompts for a 1-indexed, inclusive start and end, then asks before
    each routine whether to proceed: 'break' stops the loop, 'skip' moves
    to the next routine, anything else builds the Routine and calls
    link_sync() on it.

    Returns (routine_objects, cosdna_names): the Routine objects that were
    built and the .routine attribute of each, in processing order.
    '''
    # Work from the argument. The body used to read a notebook-level
    # `routines` global and overwrite the parameter inside the loop.
    routines = routine
    print(f'{len(routines)} routines')
    start = int(input('start:'))
    end = int(input('end:'))
    # keep the requested range inside the collection
    start = max(start, 1)
    end = min(end, len(routines))
    routine_objects = []
    cosdna_names = []
    for i in range(start - 1, end):
        print(f'{i + 1} of {end} ({len(routines)})')
        text = input('proceed?')
        if text == 'break':
            break
        elif text == 'skip':
            continue
        else:
            built = Routine(name=i, routine=routines[i])
            built.link_sync()
            routine_objects.append(built)
            cosdna_names.append(built.routine)
    return routine_objects, cosdna_names

In [None]:
with open('./data/ingredients/ingredients.json', 'rb') as handle:
    master_dict = json.load(handle)

In [None]:
master_dict

In [None]:
# wrap every value of master_dict in a {'name': ...} record, same keys
new_master_dict = {
    key: {'name': value}
    for key, value in master_dict.items()
}

In [None]:
with open('ingredients.json', 'w') as handle:
    json.dump(new_master_dict, handle, indent=4)

In [None]:
# imported here so this cell also runs on a fresh kernel
import pickle

# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself.
with open('./data/am_routine_objects.pickle', 'rb') as handle:
    am_routine_objects = pickle.load(handle)

with open('./data/am_cosdna_names.pickle', 'rb') as handle:
    am_cosdna_names = pickle.load(handle)

In [None]:
len(am_cosdna_names)

In [None]:
am_routine_objects[0].__dict__

In [None]:
routines = responses['am_routine']

In [None]:
print(f'{len(routines)} routines')

start = int(input('start (1-index):'))
end = int(input('end (1-index):'))

# walk the requested slice, asking before each routine is processed;
# results are appended to the lists loaded from the pickles
for i in range(start - 1, end):
    print(f'{i + 1} of {end} ({len(routines)})')
    text = input('proceed?')
    if text == 'break':
        break
    if text == 'skip':
        continue
    routine = Routine(name=i, routine=routines[i])
    routine.link_sync()
    am_routine_objects.append(routine)
    am_cosdna_names.append(routine.routine)

In [None]:
routines[12]

In [None]:
twelve = Routine(routine=routines[12])
twelve.link_sync()

In [None]:
twelve.routine

In [None]:
twelve.top

In [None]:
stridex = Product('stridex maximum alcohol free')
stridex.link_sync()

In [None]:
stridex.ingredients

In [None]:
papaya = Product('papaya & vitamin c revive & brighten bar soap')

In [None]:
papaya.link_sync()

In [None]:
papaya._skip

In [None]:
am_routine_objects

In [None]:
# resume at 13 after debugging

with open('./data/am_routine_objects.pickle', 'wb') as handle:
    pickle.dump(am_routine_objects, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('./data/am_cosdna_names.pickle', 'wb') as handle:
    pickle.dump(am_cosdna_names, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# big_list = []
# for routine in responses['am_routine']:
#     for product in routine:
#         big_list.append(product)

In [None]:
# prior experience

# brands used as a starting point
popular_brands = [
    'La Roche Posay',
    'Cerave',
    'Cetaphil',
    'Timeless Skincare',
    'Skinceuticals',
    'Stratia Skincare',
    'Paula’s Choice',
    'CosRX',
    'The Ordinary',
    'Drunk Elephant',
    'Klairs',
    'Inkey List',
    'Farmacy',
    'First Aid Beauty'
]

# ingredients checked against routines below; spellings matter because
# they are looked up by name
popular_ingredients = [
    'Vitamin C',
    'Glycolic Acid',
    'Lactic Acid',
    'Mandelic Acid',
    'Niacinamide',
    'Retinol',
    'Hydroxypinacolone Retinoate',
    'Bakuchiol',
    'Retinyl Palmitate',
    'Ceramides',
    'Salicylic Acid',
    'Willow bark Extract',
    'Adapalene',
    'Sodium Lauryl Sulfate',
    'Cocamidopropyl Betaine',
]

In [None]:
tick = time.time()
n = 3
routine = Routine(name=n, routine=responses['am_routine'][n])
routine.link_sync()
print(routine.routine)
tock = time.time()
print(f'{tock - tick} s')

In [None]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
dv = DictVectorizer(sparse=True)
features = dv.fit_transform(dict(routine.top_ingredients()))

In [None]:
routine.has('1,3-butanediol')

In [None]:
routine.routine

In [None]:
for ingredient in popular_ingredients:
    print(ingredient)
    print(routine.has(ingredient))
    print()

## string grouper demo

`string_grouper` is giving me a hard time, so I am going to do the matching manually, following:
https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

In [None]:
import json
from string_grouper import match_strings, match_most_similar, group_similar_strings, StringGrouper

In [None]:
with open('./data/big_dict.json', 'rb') as handle:
    big_dict = json.load(handle)

In [None]:
product_list = pd.read_json('./data/big_dict.json', orient='index').reset_index(level=0)
product_list.columns = ['name', 'brand', 'product']
product_list.head()

In [None]:
product_list['name']

In [None]:
# matches = match_most_similar(product_list['name'], sample_list)
# pd.DataFrame({'sample_list': sample_list, 'matches': matches})

In [None]:
import re
import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [None]:
# def ngrams(string, n=3):
#     string = re.sub(r'[,-./]|\s',r'', string)
#     ngrams = zip(*[string[i:] for i in range(n)])
#     return [''.join(ngram) for ngram in ngrams]

def ngrams(string, n=3):
    '''
    Splits a cleaned-up version of string into overlapping character
    n-grams.

    Cleaning: transliterate with unidecode, lower-case, drop brackets,
    quotes, '.', '|', '?' and '!', turn '&' into 'and', turn ',' and '-'
    into spaces, then collapse runs of spaces. The result is padded with
    n-1 spaces on both sides before the n-grams are taken.
    '''
    cleaned = unidecode.unidecode(string).lower()

    # characters removed outright
    strip_chars = ')(.|[]{}\'"?!'
    cleaned = re.sub('[' + re.escape(strip_chars) + ']', '', cleaned)

    for old, new in (('&', 'and'), (',', ' '), ('-', ' ')):
        cleaned = cleaned.replace(old, new)
    cleaned = re.sub(' +', ' ', cleaned).strip()

    pad = ' ' * (n - 1)
    padded = pad + cleaned + pad

    # zip the string against itself shifted by 0..n-1 characters
    shifted = [padded[offset:] for offset in range(n)]
    return [''.join(chars) for chars in zip(*shifted)]

# vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
# matrix = vectorizer.fit_transform(sample_list)

In [None]:
ngrams('Lancôme')

In [None]:
print(matrix[0])
print(ngrams(sample_list[0]))

In [None]:
# https://caserta.com/data-blog/string-matching-record-linkage-python-strategies/

def tfidf_match(list1, list2, sort=False):
    """
    For each item in list1, find its nearest neighbour in list2.

    Both inputs are vectorised with TF-IDF over the character n-grams
    produced by ngrams(). With sort=True the words of every name are
    sorted before vectorising; the reported names are always the
    unsorted originals.

    Returns a DataFrame with columns 'distance' (rounded to 2 places),
    'input' and 'matched'.
    """

    def split_sorter(name):
        return ' '.join(sorted(re.split(' +', name)))

    queries = list1.copy()
    candidates = list2.copy()
    if sort:
        queries = [split_sorter(entry) for entry in queries]
        candidates = [split_sorter(entry) for entry in candidates]

    # the vocabulary is fitted on the candidates only
    vectorizer = TfidfVectorizer(analyzer=ngrams, lowercase=True)
    candidate_vectors = vectorizer.fit_transform(candidates)
    neighbours = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(candidate_vectors)
    distances, indices = neighbours.kneighbors(vectorizer.transform(queries))

    rows = []
    for i, nearest in enumerate(indices):
        rows.append((round(distances[i][0], 2), list1[i], list2[nearest[0]]))
    return pd.DataFrame(rows, columns=['distance', 'input', 'matched'])

In [None]:
def split_sorter(name):
    '''
    Returns name with its space-separated words in sorted order,
    joined back together with single spaces.
    '''
    words = sorted(re.split(' +', name))
    return ' '.join(words)

In [None]:
split_sorter("kiehl's vitamin c serum powerful-strength")

In [None]:
tick = time.time()
kiehls = [
    "Kiel's Powerful-Strength Line-Reducing Concentrate 12.5% Vitamin C Serum",
    "kiehl's vitamin c serum powerful-strength",
    "Kiehl's Calendula hydrating mask 5 minute",
    "Kiehl's Blue Herbal Acne Cleansing Treatment",
    "Kiehl's ultrafacial toner"
]
matches = tfidf_match(kiehls, product_list['name'], sort=True)
tock = time.time()
print(f'{tock - tick} s')
matches

In [None]:
split = kiehls[0].split(' ')
split.sort()

In [None]:
split

In [None]:
lrp = [
    "La Roche-Posay tinted Anthelios 50 Mineral Sunscreen",
    "La Roche Posay Medicated Gel Cleanser",
    "La Roche-Posay Tolerane makeup remover"
]
tfidf_match(lrp, product_list['name'])

In [None]:
english = [
    'first i use la roche posay effaclare cleanser', 
    'then moisturize using kiehls super multi corrective cream',
    'then kiehls facial fuel sunblock if i go outside']

tfidf_match(english, product_list['name'], sort=True)

## using rapidfuzz

In [None]:
from rapidfuzz import fuzz
from rapidfuzz import process

In [None]:
process.extractOne(
    "La Roche-Posay Tolerane makeup remover", 
    product_list['name']
    )

## common ingredient dictionary

In [None]:
with open('./data/master_dict.pickle', 'rb') as handle:
    master_dict = pickle.load(handle)

In [None]:
def ingredient_dict_builder(routine_dict, master_dict, browser=None):
    '''
    Interactively adds the entries of routine_dict that are missing from
    master_dict.

    For every key of routine_dict not already in master_dict, the page
    'https://cosdna.com/eng/<key>.html' is opened in the browser, the
    key/value pair is printed, and a preferred name is requested:
    - 'BREAK' stops immediately
    - an empty answer stores routine_dict's own value
    - anything else stores the typed name

    routine_dict: mapping whose keys are appended to the CosDNA base URL
    master_dict: mapping updated IN PLACE and also returned
    browser: object with a get(url) method; defaults to the notebook-level
        `driver` so existing two-argument calls behave as before

    Returns master_dict.
    '''
    if browser is None:
        # fall back to the driver created in an earlier cell
        browser = driver
    base_url = 'https://cosdna.com/eng/'
    # snapshot the missing entries so master_dict can be updated while looping
    pending = {k: v for k, v in routine_dict.items() if k not in master_dict}
    for k, v in pending.items():
        browser.get(base_url + k + '.html')
        print(f'{k}: {v}')
        ingredient = input('preferred name: ')
        if ingredient == 'BREAK':
            break
        master_dict[k] = ingredient if ingredient else v
    return master_dict

In [None]:
am_routines = []
# enumerate so each Routine is built from its own row; the loop used to
# ignore `rout` and index with whatever `i` an earlier cell left behind
for i, rout in enumerate(responses['am_routine']):
    routine = Routine(name=i, routine=rout)
    routine.link_sync()
#     master_dict = ingredient_dict_builder(routine._routine_dict, master_dict)
    am_routines.append(routine)

In [None]:
# rows 5-9 of the AM routines: link and sync each one, then add any
# ingredients missing from master_dict
for i in range(5, 10):
    am_row = responses['am_routine'][i]
    routine = Routine(name=i, routine=am_row)
    routine.link_sync()
    master_dict = ingredient_dict_builder(routine._routine_dict, master_dict)

In [None]:
master_dict

In [None]:
# saving pickle
with open('./data/master_dict.pickle', 'wb') as handle:
    pickle.dump(master_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)