In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
# from selenium import webdriver
# from selenium.webdriver.common.by import By
import time
from io import BytesIO
from PIL import Image
import random
import re
import json

In [2]:
# get all lipstick links from Sephora
def get_lipstick_links():
    url = 'https://www.sephora.com/shop/lips-makeup?pageSize=12&currentPage='
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    lipstick_links = list()
    page_number = 1
    while True:
        time.sleep(1)
        try:
            is_valid_page = False
            response = requests.get(url + str(page_number), headers = headers, timeout = 5)
            if not response.status_code == 200:
                return f'Http Errors: {response.status_code}'
            else:
                result_page = BeautifulSoup(response.content, 'lxml')
                all_a_tags = result_page.find_all('a')
                for tag in all_a_tags:
                    if tag.get('href').startswith('/product'):
                        is_valid_page = True
                        link = 'https://www.sephora.com' + tag.get('href')
                        lipstick_links.append(link)
            if is_valid_page:
                page_number += 1
            else:
                return lipstick_links
        except:
            return None

In [3]:
lipstick_links = get_lipstick_links()

In [5]:
# get basic lipstick info including product_name(str), brand_name(str), price(float), love_count(int)
def get_lipstick_info(lipstick_link):
    time.sleep(1)
    lipstick_info = dict()
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    response = requests.get(lipstick_link, headers = headers, timeout = 5)
    if not response.status_code == 200:
        return f'Http Errors: {response.status_code}'
    else:
        result_page = BeautifulSoup(response.content, 'lxml')
        try:
            product_name = result_page.find('span',class_='css-r4ddnb ').get_text().strip()
        except:
            product_name = np.nan
        try:
            brand_name = result_page.find('img', attrs = {'data-comp': 'BrandLogo Image Box'}).get('alt').strip().upper()
        except:
            brand_name = np.nan
        try:
            if len(list(result_page.find('div', attrs = {'data-comp': 'Price Box'}).children)) > 1:
                price = float(result_page.find('div', attrs = {'data-comp': 'Price Box'}).get_text()[:6].replace('$', ''))
            else:
                price = float(result_page.find('div', attrs = {'data-comp': 'Price Box'}).get_text().strip().replace('$', ''))
        except:
            price = np.nan
        try:
            love_count = int(result_page.find('span', attrs = {'data-at': 'product_love_count'}).get_text().strip())
        except:
            love_count = np.nan
        lipstick_info['product_name'] = product_name
        lipstick_info['brand_name'] = brand_name
        lipstick_info['price'] = price
        lipstick_info['love_count'] = love_count
        lipstick_info['link'] = lipstick_link
        return lipstick_info

In [6]:
# create a pandas data fram to store all the lipstick basic info and get the csv file
lipstick_info_df = pd.DataFrame({'product_name': [get_lipstick_info(url)['product_name'] for url in lipstick_links],
                                'brand_name': [get_lipstick_info(url)['brand_name'] for url in lipstick_links],
                                'price': [get_lipstick_info(url)['price'] for url in lipstick_links],
                                'love_count': [get_lipstick_info(url)['love_count'] for url in lipstick_links],
                                'link': [get_lipstick_info(url)['link'] for url in lipstick_links]})
lipstick_info_df.to_csv('lipstick_info.csv')

In [39]:
# get the text info of lipsticks, including product_name, details, how_to_use and about_the_brand
def get_lipstick_text_info(lipstick_link):
    time.sleep(1)
    lipstick_text_info = dict()
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    response = requests.get(lipstick_link, headers = headers, timeout = 5)
    if not response.status_code == 200:
        return f'Http Errors: {response.status_code}'
    else:
        result_page = BeautifulSoup(response.content, 'lxml')
        try:
            product_name = result_page.find('span', class_ = 'css-r4ddnb ').get_text().strip()
        except:
            product_name = np.nan
        try:
            details = result_page.find_all('div', class_ = 'css-192qj50')[0].get_text().strip()
        except:
            details = np.nan
        try:
            how_to_use = result_page.find_all('div', class_ = 'css-192qj50')[1].get_text().strip()
        except:
            how_to_use = np.nan
        try:
            about_the_brand = result_page.find('p', class_='css-1loxqbt').get_text().strip()
        except:
            about_the_brand = np.nan
        lipstick_text_info['product_name'] = product_name
        lipstick_text_info['details'] = details
        lipstick_text_info['how_to_use'] = how_to_use
        lipstick_text_info['about_the_brand'] = about_the_brand
        return lipstick_text_info

In [12]:
lipstick_text_info_df = pd.DataFrame({'product_name': [get_lipstick_text_info(url)['product_name'] for url in lipstick_links],
                                'details': [get_lipstick_text_info(url)['details'] for url in lipstick_links],
                                'how_to_use': [get_lipstick_text_info(url)['how_to_use'] for url in lipstick_links],
                                'about_the_brand': [get_lipstick_text_info(url)['about_the_brand'] for url in lipstick_links]})
lipstick_text_info_df.to_csv('lipstick_text_info.csv')

TypeError: string indices must be integers

In [40]:
# get the reviews of each lipstick with its sephora link and return a list of reviews
def get_lipstick_reviews(link):
    reviews = list()
    pattern = re.compile(r'P\d+')
    product_id = re.search(pattern, string = link).group()
    off_set = 0
    while True:
        api_url = ''.join(["https://api.bazaarvoice.com/data/reviews.json?Filter=ProductId%3A",
                          product_id,
                          "&Sort=Helpfulness%3Adesc&Limit=100&Offset=",
                           str(off_set),
                           "&Include=Products%2CComments&Stats=Reviews&passkey=rwbw526r2e7spptqd2qzbkp7&apiversion=5.4"])
        response = requests.get(api_url)
        results = json.loads(response.content, encoding = 'utf-8')['Results']
        if results:
            for review in results:
                reviews.append(review['ReviewText'])
        else:
            break
        off_set += 100
    return reviews

In [21]:
# get the rgb value of each color of a lipstick
def get_lipstick_colors(lipstick_link):
    lipstick_colors = dict()
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    response = requests.get(lipstick_link, headers = headers, timeout = 5)
    if not response.status_code == 200:
        return f'Http Errors: {response.status_code}'
    else:
        result_page = BeautifulSoup(response.content, 'lxml')
        try:
            product_name = result_page.find('span',class_='css-r4ddnb ').get_text().strip()
            colors = dict()
            if result_page.find('div', class_ = "css-gth5yg "):
                for tag in result_page.find('div', class_ = "css-gth5yg "):
                    color = tag.find('div', attrs = {'data-at': 'selected_swatch'})['aria-label']
                    img_src = ''.join(["https://www.sephora.com", tag.find('img')['src']])
                    response = requests.get(img_src, headers = headers, timeout = 5)
                    im = Image.open(BytesIO(response.content))
                    pix = im.load()
                    width = im.size[0]
                    height = im.size[1]
                    r, g, b = pix[width / 2, height / 2]
                    colors[color] = (r, g, b)
                    time.sleep(1)
        except:
            return None
    lipstick_colors['product_name'] = product_name
    lipstick_colors['colors'] = colors
    return lipstick_colors

In [34]:
lipstick_colors_df = pd.DataFrame({'product_name': p_list,
                                  'color': c_list,
                                  '(r, g, b)': rgb_list})
lipstick_colors_df.to_csv('lipstick_colors.csv')

In [13]:
# combine get_lispstick_info and get_lipstick_colors and generate a csv file containing all the previous attrs of lipsticks
def get_lipstick(lipstick_link):
    time.sleep(random.randint(1,3))
    lipstick = dict()
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    response = requests.get(lipstick_link, headers = headers, timeout = 10)
    result_page = BeautifulSoup(response.content, 'lxml')
    try:
        product_name = result_page.find('span',class_='css-r4ddnb ').get_text().strip()
    except:
        product_name = np.nan
    try:
        brand_name = result_page.find('img', attrs = {'data-comp': 'BrandLogo Image Box'}).get('alt').strip().upper()
    except:
        brand_name = np.nan
    try:
        if len(list(result_page.find('div', attrs = {'data-comp': 'Price Box'}).children)) > 1:
            price = float(result_page.find('div', attrs = {'data-comp': 'Price Box'}).get_text()[:6].replace('$', ''))
        else:
            price = float(result_page.find('div', attrs = {'data-comp': 'Price Box'}).get_text().strip().replace('$', ''))
    except:
        price = np.nan
    try:
        love_count = int(result_page.find('span', attrs = {'data-at': 'product_love_count'}).get_text().strip())
    except:
        love_count = np.nan
    colors = dict()
    if result_page.find('div', class_ = "css-gth5yg "):
        for tag in result_page.find('div', class_ = "css-gth5yg "):
            if tag.find('img'):
                color = tag.find('div', attrs = {'data-at': 'selected_swatch'})['aria-label']
                img_src = ''.join(["https://www.sephora.com", tag.find('img')['src']])
                response = requests.get(img_src, headers = headers, timeout = 5)
                im = Image.open(BytesIO(response.content))
                pix = im.load()
                width = im.size[0]
                height = im.size[1]
                r, g, b = pix[width / 2, height / 2]
                colors[color] = (r, g, b)
                time.sleep(0.5)

    lipstick['product_name'] = product_name
    lipstick['brand_name'] = brand_name
    lipstick['price'] = price
    lipstick['love_count'] = love_count
    lipstick['colors'] = colors
    lipstick['link'] = lipstick_link
    return lipstick

In [25]:
product_name_l = []
brand_name_l = []
price_l = []
love_count_l = []
color_l = []
rgb_l = []
link_l = []
for url in lipstick_links:
    try:
        lipstick_info = get_lipstick(url)
        count = len(lipstick_info['colors'])
        product_name_l.extend([lipstick_info['product_name']] * count)
        brand_name_l.extend([lipstick_info['brand_name']] * count)
        price_l.extend([lipstick_info['price']] * count)
        love_count_l.extend([lipstick_info['love_count']] * count)
        color_l.extend(list(lipstick_info['colors'].keys()))
        rgb_l.extend(list(lipstick_info['colors'].values()))
        link_l.extend([url] * count)
    except:
        print(url)
        continue
        
lipstick_df = pd.DataFrame({'product_name': product_name_l,
                           'brand_name': brand_name_l,
                           'price': price_l,
                           'love_count': love_count_l,
                           'color': color_l,
                           'rgb': rgb_l,
                           'link': link_l})
lipstick_df.to_csv("lipstick.csv")