# Extract features from html

In [33]:
import requests, bs4
from bs4 import BeautifulSoup as bs
import json
import re
import time
import random
import pandas as pd
import pickle 
import numpy as np

### Feautures to extract from html
* Predictor: Rating value --`rating_val`
* Brand --`brand`
1. number of reviews -- `review_count	`
2. product name -- `productName` -- after scraping find `wordCount_name`
3. product description word count -- `wordCount_descr`
4. price (USD) -- `price_USD`
5. size(volume) -- `size`
6. length of ingredients list -- `len_ingredients`
7. Number of benefits (count bullet points) -- `benefits_count`
8. Number of key Ingredients (count bullet points) -- `key_ingredients`
9. count of made_without_list (Formulated without) --`clean_formula`
10. count of Clinical Results --`clinical`

Feautures after scraping

12. Price per oz 
13. key_ingredients/len_ingredients (%). Is the product packed with key ingr or fillers?

## 1. Load pickle file and parse with BeautifulSoup

In [2]:
def create_soup(pickle_file_name):
    with open(pickle_file_name, "rb") as f:
        file = pickle.load(f)
    
    soup_list = []
    for page in file:
        soup = bs(page.text)
        soup_list.append(soup)
    return soup_list

### 1a. moisturizers

In [3]:
pickle_files_list = (['Number_1_10.pickle',
                      'Number_11_20.pickle',
                      'Number_21_80.pickle',
                      'Number_81_200.pickle',
                      'Number_201_400.pickle',
                      'Number_401_1000.pickle',
                      'Number_1001_end.pickle'])

In [4]:
moisturizer_soup = []
for name in pickle_files_list:
    output_list = create_soup(name)
    moisturizer_soup.extend(output_list)

In [5]:
#1349
len(moisturizer_soup)

1349

In [6]:
# to find the product name 
def find_name(idx):
    name = moisturizer_soup[idx].find('div', class_="ProductMainSection__productName").text
    return name

### 1B: load pickle of face serums

In [7]:
serum_pickles = (['face_page.pickle',
                 'face_page2_150.pickle',
                 'face_page2_300.pickle',
                 'face_page2_end.pickle'])

In [8]:
serum_soup = []
for name in serum_pickles:
    output_list = create_soup(name)
    serum_soup.extend(output_list)

In [9]:
#557
len(serum_soup)

557

## 2. Extract features 

### I. Product detail

Features to extract
* Predictor: Rating value --`rating_val`
* Brand --`brand`
1. number of reviews -- `review_count	`
2. product name -- `productName` -- after scraping find `wordCount_name`
3. product description word count -- `wordCount_descr`
4. price (USD) -- `price_USD`

Note:
* `rating_val` and `review_count` missing for products
* These products will be dropped in EDA part

In [10]:
# find product detail from url 
def productDetail(soup_product):
    
    detail_tag = (soup_product.find(class_ = 'ProductDetail__wrapper')
          .find('script', type="application/ld+json"))
    
    # convert detail_tag to string type to parse with json
    # returns review in dict
    detail_json = json.loads(detail_tag.text) 
    
    if 'aggregateRating' in detail_json:
        rating_val = detail_json['aggregateRating']['ratingValue']
        review_count = detail_json['aggregateRating']['reviewCount'] 
    else:
        rating_val = None
        review_count = None

    wordCount_descr = len(detail_json['description'].split(' '))
    brand = detail_json['brand'].lower()
    productName = detail_json['name'].lower()
    price_USD = detail_json['offers']['price']
    
    return brand, productName, rating_val, review_count, wordCount_descr, price_USD

### II. Product size
5. Product size - `size`

Note: 
* drop products that don't have a size, they are part of a bundle
https://www.ulta.com/p/glow-starts-here-bestselling-skin-essentials-kit-pimprod2024933
* some products don't have the size listed, but should have one, what should we do?
https://www.ulta.com/p/maracuja-tinted-moisturizer-pimprod2014942

In [11]:
def product_size(soup_product):

    # change make sure we are extracting Size 
    main_section = soup_product.find('div', class_="ProductMainSection__itemNumber").text
    
    # size is in the main section 
    if 'Size' in main_section:
        size = main_section.split('|')[0]
    
    # size is in another section called color_panel
    elif soup_product.find(class_ = 'ProductDetail__colorPanel') != None: 
        options_tag = soup_product.find(class_ = 'ProductDetail__colorPanel')
        size_loc = list(options_tag.children)[0].text
        
        if 'Size' in size_loc:
            size = (options_tag
                         .find(class_ = "Text Text--body-2 Text--left Text--small").text)
            
        #size is not her 
        else:
            size = 'NaN'
    
    # product has no size information
    else:
        size = 'NaN'
        
    return size

### III. Ingredients list
6. length of ingredients list -- `len_ingredients`

Note: products where `len_ingredients` is `None` are missing an Ingredients list

In [12]:
# return the count of the ingredients list
def ingredients_info(soup_product):
    try:
        ingredients_tag = (soup_product.find('div', class_="ProductDetail__ingredients")
                       .find('div', class_="ProductDetail__productContent"))
        ingredients_child = list(ingredients_tag.children)
    except:
        return 'none', None, None
    
    # on type of ingredients followed by <br>
    if len(ingredients_child) <= 2:
        ingredients_1 = ingredients_child[0].text.lower()
        ingredients_list = ingredients_1.split(',')
        active_ingredients = ingredients_list[:7]
        
    # when ingredients list has 2 types
    # 'Active' - child[0], break line <br> - child[1], Inactive - child[2]
    else:     
        ingredients_1 = ingredients_child[0].text.lower()
        ingredients_2 = ingredients_child[2].text.lower()
        ingredients_list_1 = ingredients_1.split(',')
        ingredients_list_2 = ingredients_2.split(',')
        ingredients_list = ingredients_list_1 + ingredients_list_2
        active_ingredients = ingredients_list[:7]

    return ingredients_list, len(ingredients_list), len(active_ingredients)

### III Details

7. Number of benefits (count bullet points) -- `benefits_count`
8. Number of key Ingredients (count bullet points) -- `key_ingredients`
9. count of made_without_list (Formulated without) --`clean_formula`
10. count of Clinical Results --`clinical`

In [13]:
# count how many times the li tag appears in the ul tag (unsorted list)
def count_li_tag(ul_tag):
    count = 0
    for child in list(ul_tag.children):
        if child.name == "li":
            count += 1
    return count

# count bullet points in details 
def details_count(soup_product):
    details_tag = soup_product.find('div', class_="ProductDetail__productContent")
    benefits_count = None
    key_ingredients = None
    clean_formula = None
    clinical = None 
    
    mode = "tag_search"
    tagSearch = "tag_search"
    benefits_ul_tag = "benefits_ul"
    ingredients_ul_tag = "ingredients_ul"
    formulated_ul_tag = "formulated_ul"
    clinical_ul_tag = "clinical_ul"
    
    for detail_child in list(details_tag.children):
        if mode == tagSearch:
            if detail_child.text.strip() == "Benefits":
                mode = benefits_ul_tag
                continue
            elif detail_child.text.strip() == "Key Ingredients":
                mode = ingredients_ul_tag
                continue
            elif detail_child.text.strip() == "Formulated Without":
                mode = formulated_ul_tag
                continue 
            elif detail_child.text.strip() == "Clinical Results":
                mode = clinical_ul_tag
                continue             
            else:
                continue

        # search within ul tag        
        if mode == benefits_ul_tag:
            if detail_child.name == "ul": #if the tag is ul
                benefits_count = count_li_tag(detail_child)
                mode = tagSearch
                continue
            else:
                continue
        if mode == ingredients_ul_tag:
            if detail_child.name == "ul": #if the tag is ul
                key_ingredients = count_li_tag(detail_child)
                mode = tagSearch
            else:
                continue      
        if mode == formulated_ul_tag:
            if detail_child.name == "ul":
                clean_formula = count_li_tag(detail_child)
                mode = tagSearch
        if mode == clinical_ul_tag:
            if detail_child.name == "ul":
                clinical = count_li_tag(detail_child)
                mode = tagSearch  
                
    # test statements if no ul was found
    if mode == benefits_ul_tag:
        print("Benefits with no ul:" + soup_product)
    elif mode == ingredients_ul_tag:
        print("Key Ingredients with no ul:" + soup_product)
    elif mode == formulated_ul_tag:
        print('Formuated wihout with no ul')
    elif mode == clinical_ul_tag:
        print('Clinical Results with no ul')    
    
    return benefits_count, key_ingredients, clean_formula, clinical

## 3. Putting all features together

* Predictor: Rating value --`rating_val`
* Brand --`brand`
1. number of reviews -- `review_count	`
2. product name -- `productName` -- after scraping find `wordCount_name`
3. product description word count -- `wordCount_descr`
4. price (USD) -- `price_USD`
5. size(volume) -- `size`
6. length of ingredients list -- `len_ingredients`
7. Number of benefits (count bullet points) -- `benefits_count`
8. Number of key Ingredients (count bullet points) -- `key_ingredients`
9. count of made_without_list (Formulated without) --`clean_formula`
10. count of Clinical Results --`clinical`

In [14]:
def products_features(product_soup, productCategory):  
    
    brand, productName, rating_val, review_count, wordCount_descr, price_USD\
    = productDetail(product_soup)
    
    size = product_size(product_soup)
    
    ingredients_list,len_ingredients,len_active = ingredients_info(product_soup)
    
    benefits_count, key_ingredients, clean_formula, clinical = details_count(product_soup)
    
    productType = productCategory
    
    product_dict = ({
        'brand':brand,
        'productName':productName,
        'price_USD':price_USD,
        'rating_val':rating_val,
        'review_count':review_count,
        'wordCount_descr':wordCount_descr,
        'size':size,
        'key_ingredients':key_ingredients,
        'ingredients_list':ingredients_list,
        'len_ingredients':len_ingredients, 
        'len_actives':len_active,
        'benefits_count':benefits_count,
        'clean_formula':clean_formula,
        'clinical':clinical,
        'productType':productType
    }) 
    
    return product_dict

## 4. Get features of pipeline soup, create df

### a) moisturizers

In [15]:
moisturizer_pipeline = []
for soup in moisturizer_soup:
    feature = products_features(soup, 'moisturizer')
    moisturizer_pipeline.append(feature)

In [16]:
len(moisturizer_pipeline)

1349

In [17]:
moisturizer_df = pd.DataFrame(moisturizer_pipeline)

In [18]:
moisturizer_df.head()

Unnamed: 0,brand,productName,price_USD,rating_val,review_count,wordCount_descr,size,key_ingredients,ingredients_list,len_ingredients,len_actives,benefits_count,clean_formula,clinical,productType
0,it cosmetics,confidence in a cream anti-aging moisturizer,49.5,4.5,4431.0,26,2.0 oz,10.0,"[aqua (water, eau), butylene glycol, cyclop...",102.0,7.0,7.0,,5.0,moisturizer
1,clinique,dramatically different moisturizing lotion+,32.5,4.4,4962.0,15,4.2 oz,2.0,"[water / aqua / eau, mineral oil / paraffinum...",23.0,7.0,9.0,7.0,,moisturizer
2,urban decay cosmetics,hydromaniac glowy tinted hydrator foundation,29.0,4.3,2200.0,23,Size 1.1 oz,2.0,"[50: aqua / water / eau, phenyl trimethicone,...",32.0,7.0,8.0,,4.0,moisturizer
3,strivectin,sd advanced plus intensive moisturizing concen...,79.0,4.6,613.0,27,2.0 oz,,"[aqua (water, eau), glycerin, pentaerythrit...",72.0,7.0,7.0,,,moisturizer
4,josie maran,whipped argan oil face butter,42.0,4.8,1188.0,28,Size 1.7 oz,2.0,"[aqua (water)**, argania spinosa (argan) kern...",36.0,7.0,4.0,,,moisturizer


In [19]:
moisturizer_df.shape

(1349, 15)

### b) face serums

In [20]:
serum_pipeline = []
for soup in serum_soup:
    feature = products_features(soup, 'serum')
    serum_pipeline.append(feature)

In [21]:
serum_df = pd.DataFrame(serum_pipeline)

In [22]:
serum_df.head(2)

Unnamed: 0,brand,productName,price_USD,rating_val,review_count,wordCount_descr,size,key_ingredients,ingredients_list,len_ingredients,len_actives,benefits_count,clean_formula,clinical,productType
0,clarins,double serum,169.0,4.3,9997.0,24,2.5 oz,7.0,"[aqua/water/eau, cetearyl isononanoate, glyc...",52.0,7.0,9.0,,,serum
1,strivectin,super-c retinol brighten & correct vitamin c s...,72.0,4.4,627.0,27,Size 1.0 oz,3.0,"[aqua (water, eau), sodium ascorbyl phosphat...",49.0,7.0,3.0,,6.0,serum


### Combining dataframes

In [50]:
skincare_df = pd.concat([moisturizer_df,serum_df], axis=0, join='outer')

In [51]:
skincare_df.head(1)

Unnamed: 0,brand,productName,price_USD,rating_val,review_count,wordCount_descr,size,key_ingredients,ingredients_list,len_ingredients,len_actives,benefits_count,clean_formula,clinical,productType
0,it cosmetics,confidence in a cream anti-aging moisturizer,49.5,4.5,4431.0,26,2.0 oz,10.0,"[aqua (water, eau), butylene glycol, cyclop...",102.0,7.0,7.0,,5.0,moisturizer


In [52]:
skincare_df.tail(1)

Unnamed: 0,brand,productName,price_USD,rating_val,review_count,wordCount_descr,size,key_ingredients,ingredients_list,len_ingredients,len_actives,benefits_count,clean_formula,clinical,productType
556,derma e,acne control treatment serum,11.89,4.4,266.0,15,Size 2.0 oz,2.0,[active: salicylic acid 0.5%. inactive: purifi...,19.0,7.0,2.0,,,serum


In [53]:
skincare_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1906 entries, 0 to 556
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   brand             1906 non-null   object 
 1   productName       1906 non-null   object 
 2   price_USD         1906 non-null   object 
 3   rating_val        1849 non-null   float64
 4   review_count      1849 non-null   float64
 5   wordCount_descr   1906 non-null   int64  
 6   size              1906 non-null   object 
 7   key_ingredients   1381 non-null   float64
 8   ingredients_list  1906 non-null   object 
 9   len_ingredients   1875 non-null   float64
 10  len_actives       1875 non-null   float64
 11  benefits_count    1722 non-null   float64
 12  clean_formula     708 non-null    float64
 13  clinical          326 non-null    float64
 14  productType       1906 non-null   object 
dtypes: float64(8), int64(1), object(6)
memory usage: 238.2+ KB


## Data cleanup and EDA

In [54]:
# drop rows where rating_val is none
# reset_index to keep original index
skincare_df = skincare_df.dropna(subset= 'rating_val', axis = 0)

In [55]:
skincare_df.head(1)

Unnamed: 0,brand,productName,price_USD,rating_val,review_count,wordCount_descr,size,key_ingredients,ingredients_list,len_ingredients,len_actives,benefits_count,clean_formula,clinical,productType
0,it cosmetics,confidence in a cream anti-aging moisturizer,49.5,4.5,4431.0,26,2.0 oz,10.0,"[aqua (water, eau), butylene glycol, cyclop...",102.0,7.0,7.0,,5.0,moisturizer


In [56]:
# Fill NaN values with 0 since it means that the product does not contain the feature,
skincare_df = skincare_df.fillna(0)

In [57]:
# clean size column - return int
def clean_size(size_val):
    if isinstance(size_val, str): 
        size_val = (size_val.replace('Size', " ").replace('oz', " ")
                    .replace('ct', " ").replace('pair', " ")
                    .replace('fl', " ").strip())
    return size_val

In [59]:
# problematic size
#skincare_df[skincare_df['size'] == 'Size 7 x 0.35 oz']
skincare_df = skincare_df.drop(531)

In [60]:
# convert string to int type
skincare_df['size_float'] = skincare_df['size'].apply(clean_size).astype(np.float64)

# find mode of size, then fill NaN with mode
mode_val = float(skincare_df['size_float'].mode())
skincare_df['size_float'].fillna(mode_val, inplace=True)

In [61]:
def replace_zero(val):
    if val == 0:
        val = mode_val
    return val

# replace zero with mode 
skincare_df['size_float'] = skincare_df['size_float'].apply(replace_zero)

In [62]:
del skincare_df['size']

In [63]:
print(skincare_df.shape)
skincare_df.sample(5)

(1847, 15)


Unnamed: 0,brand,productName,price_USD,rating_val,review_count,wordCount_descr,key_ingredients,ingredients_list,len_ingredients,len_actives,benefits_count,clean_formula,clinical,productType,size_float
893,specific beauty,dark spot go away target treatment,24.99,4.4,88.0,46,5.0,"[water, butylene glycol, dimethyl isosorbide...",24.0,7.0,7.0,4.0,0.0,moisturizer,0.5
656,aveeno,positively radiant daily moisturizer spf 15,19.99,4.3,407.0,32,2.0,"[active: avobenzone 3%, octinoxate 7.5%, oct...",33.0,7.0,5.0,0.0,1.0,moisturizer,4.0
176,fresh,sugar lip balm gift set,18.5,4.7,10.0,25,2.0,[sugar lip treatment advanced therapy: cera al...,72.0,7.0,2.0,0.0,0.0,moisturizer,1.0
101,perricone md,cold plasma plus+ sub-d / neck,135.0,4.2,402.0,20,3.0,none,0.0,0.0,14.0,0.0,3.0,moisturizer,2.0
1203,holika holika,good cera super ceramide mist,20.0,5.0,1.0,45,7.0,"[water, methylpropanediol, pentylene glycol,...",36.0,7.0,5.0,0.0,0.0,moisturizer,1.0


In [64]:
#update v
skincare_df.to_csv('skincare_products.csv', index = False)