### Libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re


### HTTP Requests

In [2]:
# Target site (a large e-commerce store): Sephora Indonesia search results for "moisturizer".
# NOTE(review): the original trailing note read "req can" — presumably "plain `requests` can fetch this page"; confirm.
web_sephora = 'https://www.sephora.co.id/search?q=moisturizer'


In [3]:
# Fetch the search page. Without a timeout, requests waits indefinitely on a
# stalled connection, which would hang the notebook on "Run All".
response = requests.get(web_sephora, timeout=30)

In [4]:
# HTTP status of the fetch above (the recorded run returned 200).
response.status_code

200

### Soup Object

In [5]:
# Parse the raw response bytes with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
# Echo the parsed document. NOTE(review): this dumps the entire page HTML into the cell output.
soup

<!DOCTYPE html>

<html data-n-head="%7B%22lang%22:%7B%22ssr%22:%22en%22%7D%7D" data-n-head-ssr="" lang="en">
<head>
<title>Pencarian | Sephora Indonesia</title><meta charset="utf-8" data-n-head="ssr"/><meta content="width=device-width, initial-scale=1" data-n-head="ssr" name="viewport"/><meta content="SEPHORA" data-n-head="ssr" name="author"/><meta content="#000" data-n-head="ssr" name="theme-color"/><meta content="https://static-reg.lximg.com/assets/sephora_og_image-34f1169559fdb47662226655a01e2f90dabe8622664f82b7f9b4ed5907237b3d.jpg" data-n-head="ssr" id="og-image" name="og:image" property="og:image"/><meta content="website" data-n-head="ssr" id="og-type" name="og:type" property="og:type"/><meta content="Sephora" data-n-head="ssr" id="og-site-name" property="og:site_name"/><meta content="false" data-n-head="ssr" property="al:web:should_fallback"/><meta charset="utf-8" data-hid="charset" data-n-head="ssr"/><meta content="yes" data-hid="mobile-web-app-capable" data-n-head="ssr" name="m

In [7]:
# Collect every <div> carrying the 'products-card-container' class.
results = soup.find_all('div', {'class': 'products-card-container'})

In [8]:
# Number of containers found (36 in the recorded run).
len(results)

36

In [9]:
# Inspect the first container's raw HTML to see which tags/attributes hold the data.
results[0]

<div class="products-card-container" data-v-2034eb20=""><div class="product-card" data-list="search" data-position="1" data-product-brand="Bobbi Brown" data-product-id="17518" data-product-name="Vitamin Enriched Face Base - Moisturizer and Primer" data-v-893a5fe2=""><div class="product-card-details" data-v-893a5fe2=""><div class="product-card-ad" data-v-893a5fe2=""><!-- --></div> <!-- --> <a aria-label="/products/bobbi-brown-vitamin-enriched-face-base-moisturizer-and-primer/v/7ml" class="product-card-image-link" data-v-893a5fe2="" href="/products/bobbi-brown-vitamin-enriched-face-base-moisturizer-and-primer/v/7ml"><img alt="Vitamin Enriched Face Base - Moisturizer and Primer" class="product-card-image" loading="lazy" src="https://image-optimizer-id.production.sephora-asia.net/eyJlZGl0cyI6eyJyZXNpemUiOnt9fX0=/images/product_images/default_1_Product_716170269443-Bobbi-Brown-Vitamin-Enriched-Face-Base-Face-Primer-7ml_8b82231c699f6994f5310ef4c7540690221f546f_1627052192.png"/></a> <!-- --> 

### Target Necessary Data

In [None]:
# Link              V
# Brand
# Product Name      V
# Price             V
# Rating            V
# Sold Count        V
# Sizes

In [None]:
def _search_int(pattern, text):
    """Return capture group 1 of `pattern` in `text` as an int, or 'N/A' if absent.

    Dots inside the captured digits are treated as thousands separators
    (e.g. "260.000" -> 260000) and removed before conversion.
    """
    match = re.search(pattern, text or '')
    if match is None:
        return 'N/A'
    return int(match.group(1).replace('.', ''))


def get_skincare_data(results):
    """Build a DataFrame of product details from scraped product-card containers.

    Parameters
    ----------
    results : iterable
        Parsed elements (e.g. the ``find_all`` result for
        ``div.products-card-container``). Each one is searched for nested
        ``div.product-card`` elements.

    Returns
    -------
    pandas.DataFrame
        One row per product card with the columns ``brand``, ``product_name``,
        ``price``, ``reviews_count`` and ``rating_percentage``. The three
        numeric fields are ints; a field that is missing or cannot be parsed
        holds the string ``'N/A'``. The columns are present even when no card
        is found.
    """
    columns = ['brand', 'product_name', 'price', 'reviews_count', 'rating_percentage']
    product_data = []

    for container in results:
        for card in container.find_all('div', {'class': 'product-card'}):
            # Look each element up once and reuse the handle.
            price_tag = card.find('span', class_='sell-price')
            reviews_tag = card.find('span', class_='reviews-count')
            stars_tag = card.find('div', class_='stars')

            # The page renders prices as "Rp 260.000" (see the prototype cell's
            # recorded output), so the dot after "Rp" must be optional; the
            # previous check for the literal 'Rp.' never matched that text.
            price_text = price_tag.get_text(strip=True) if price_tag is not None else ''
            sell_price = _search_int(r'Rp\.?\s*(\d[\d.]*)', price_text)

            # Review counts are rendered in parentheses, e.g. "(5562)".
            reviews_text = reviews_tag.get_text(strip=True) if reviews_tag is not None else ''
            reviews_count = _search_int(r'\((\d[\d.]*)\)', reviews_text)

            # The rating is encoded in the inline style as a CSS custom
            # property; .get avoids a KeyError when the div has no style.
            style = stars_tag.get('style', '') if stars_tag is not None else ''
            highlighted_percentage = _search_int(r'--highlighted-percentage:\s*(\d+)%', style)

            product_data.append({
                'brand': card.get('data-product-brand'),
                'product_name': card.get('data-product-name'),
                'price': sell_price,
                'reviews_count': reviews_count,
                'rating_percentage': highlighted_percentage
            })

    # Passing the column list keeps the schema stable for empty input.
    return pd.DataFrame(product_data, columns=columns)

# Call the function and print the DataFrame.
# NOTE(review): the recorded output below shows capitalised headers ("Brand", "Product Name"),
# which do not match the lowercase keys built in get_skincare_data — the output looks stale; re-run to confirm.
print(get_skincare_data(results))


                 Brand                                       Product Name  \
0          Bobbi Brown  Vitamin Enriched Face Base - Moisturizer and P...   
1        Laura Mercier            Tinted Moisturizer Blurred Matte SPF 30   
2             Caudalie      Vinoperfect Dark Spot Niacinamide Moisturizer   
3         bareMinerals  Complexion Rescue Tinted Moisturizer Mineral S...   
4                FRESH  Lotus Youth Preserve Moisturizer With Multi-Ac...   
5         Estée Lauder  Revitalizing Supreme+ Youth Power Soft Creme M...   
6        Laura Mercier             Tinted Moisturizer Natural Dewy SPF 30   
7              KIEHL'S     Expertly Clear Moisturizer For Acne Prone Skin   
8              LANEIGE  Water Bank Blue Hyaluronic Moisturizer Cream R...   
9             Nudestix                  Citrus-C Mask & Daily Moisturizer   
10              La Mer       The Hydrating Infused Emulsion - Moisturizer   
11   Benefit Cosmetics           The Porefessional Smooth Sip Moisturizer   

In [None]:
# origin — prototype that inspects only the first container

# Every product card nested inside the first container
product_cards = results[0].find_all('div', {'class':'product-card'})

if not product_cards:
    print('No product card found.')

for card in product_cards:
    # Brand and name are stored as data-* attributes on the card element
    product_brand = card.get('data-product-brand')
    product_name = card.get('data-product-name')

    # Raw selling-price text, left unparsed
    price_tag = card.find('span', class_='sell-price')
    sell_price = price_tag.get_text(strip=True) if price_tag else 'N/A'

    # Raw reviews-count text, left unparsed
    reviews_tag = card.find('span', class_='reviews-count')
    reviews_count = reviews_tag.get_text(strip=True) if reviews_tag else 'N/A'

    # The rating lives in the inline style of the "stars" div
    stars_tag = card.find('div', class_='stars')
    rating = stars_tag['style'] if stars_tag else 'N/A'

    # Pull the percentage (including the % sign) out of the style string
    match = None if rating == 'N/A' else re.search(r'--highlighted-percentage:(\d+%)', rating)
    highlighted_percentage = match.group(1) if match else 'N/A'

    print(f'Product Name: {product_name}, Brand: {product_brand}, Sell Price: {sell_price}, Reviews Count: {reviews_count}, Highlighted Percentage: {highlighted_percentage}')

Product Name: Vitamin Enriched Face Base - Moisturizer and Primer, Brand: Bobbi Brown, Sell Price: Rp 260.000, Reviews Count: (5562), Highlighted Percentage: 92%


In [None]:
# Product Name, Rating, & Seller
# NOTE(review): the 'bl-link' selector and the recorded output below (a food product, not skincare)
# suggest this cell was written for a different site — presumably Bukalapak, cf. `df_bukalapak`.
# Against the Sephora `results` it likely finds no anchors, so name_data[0] would raise IndexError — confirm.
name_text = results[5].find_all('a', {'class':'bl-link'})

name_data = []

for i in name_text:
    # Keep only letters, digits, dots and spaces, then trim both ends
    text = re.sub('[^A-Za-z0-9. ]+', '', i.get_text())
    wipe_space = text.strip()
    name_data.append(wipe_space)

# Positional assumption: the anchors appear in the order name, rating, seller
prod_name = name_data[0]
rating = float(name_data[1])
seller = name_data[2]

print(prod_name)
print(rating)
print(seller)

KERUPUK RAMBAK KULIT SAPI MENTAH
4.8
Barcode Excellent


In [None]:
# Price: read the text of the first <p> inside the price block.
# NOTE(review): the 'bl-product-card__description-price' class looks like it belongs to a different
# site than the Sephora `results`; if nothing matches, price_div is None and the next line raises
# AttributeError — confirm.
price_div = results[0].find('div', class_='bl-product-card__description-price')
price_text = price_div.find('p').text.strip()
print(price_text)

Rp17.400


In [None]:
# Sold Count
# NOTE(review): the 'bl-product-card__...' class looks like it belongs to a different site than the
# Sephora `results`; if nothing matches, sold_data stays empty and sold_data[0] raises IndexError — confirm.
sold_text = results[0].find_all('div', {'class':'bl-product-card__description-rating-and-sold'})

sold_data = []

for i in sold_text:
    # Keep only letters, digits, dots and spaces, then trim both ends
    text = re.sub('[^A-Za-z0-9. ]+', '', i.get_text())
    wipe_space = text.strip()

    # Take the token right after 'Terjual' (Indonesian for "sold") as the count
    parts = wipe_space.split()
    if 'Terjual' in parts:
        index = parts.index('Terjual')
        cleaned = ' '.join(parts[index:index+2])
        final_clean = cleaned.replace('Terjual ', '')
        # int() raises ValueError if the token after 'Terjual' is missing or not purely digits
        sold_data.append(int(final_clean))

sold = sold_data[0]

print(sold)

4


In [None]:
# Location
# NOTE(review): the 'bl-product-card__description-store' class looks like it belongs to a different
# site than the Sephora `results`; if nothing matches, location_data[0] raises IndexError — confirm.
location_text = results[0].find_all('div', {'class':'bl-product-card__description-store'})

location_data = []

for i in location_text:
    # Keep only letters, digits, dots and spaces, then trim both ends
    text = re.sub('[^A-Za-z0-9. ]+', '', i.get_text())
    wipe_space = text.strip()

    # Split the text and take only the first two parts
    # (the recorded output is "Kab. Bandung"; any further words are dropped)
    parts = wipe_space.split()
    cleaned = ' '.join(parts[:2])
    location_data.append(cleaned)

location = location_data[0]

print(location)

Kab. Bandung


### Formulization

In [None]:
def get_link_and_categories(web_reqs, index):
    """Return ``[link, main_category, sub_category]`` for one scraped item.

    The link is the ``href`` of the first ``a.bl-link`` anchor inside
    ``web_reqs[index]``. The two category values are the 4th and 6th tokens
    obtained by splitting that URL on runs of ``: / ? = - &``.

    Parameters
    ----------
    web_reqs : sequence
        Parsed elements, one per product card.
    index : int
        Position of the element to inspect.

    Returns
    -------
    list
        ``[href, main_category, sub_category]``. A category is ``None`` when
        the URL has too few tokens. An empty list is returned (and a message
        printed) when no anchor with an ``href`` exists.
    """
    anchors = web_reqs[index].find_all('a', {'class': 'bl-link'}, href=True)

    # Only the first usable link matters. The previous `while` loop around the
    # href check could spin forever if an anchor ever lacked an href.
    href = next((a['href'] for a in anchors if a.get('href') is not None), None)
    if href is None:
        print('No link found or no href attribute present.')
        return []

    parts = re.split(r'[\:/?=\-&]+', href)

    # Guard the positional lookups so a short or relative URL yields None
    # instead of raising IndexError.
    main_category = parts[3] if len(parts) > 3 else None
    sub_category = parts[5] if len(parts) > 5 else None

    return [href, main_category, sub_category]

In [None]:
def get_name_rating_seller(web_reqs, index):
    """Return the cleaned text of every ``a.bl-link`` anchor in one item.

    Each anchor's text is reduced to letters, digits, dots and spaces, then
    stripped of surrounding whitespace. Per the original note, the caller
    reads the list positionally as ``[prod_name, rating, seller]`` and must
    convert the rating string to ``float`` itself.
    """
    anchors = web_reqs[index].find_all('a', {'class':'bl-link'})
    return [
        re.sub('[^A-Za-z0-9. ]+', '', anchor.get_text()).strip()
        for anchor in anchors
    ]


In [None]:
def get_price(web_reqs, index):
    """Return the prices found in one item as a list of ints.

    For every ``div.bl-product-card__description-price`` inside
    ``web_reqs[index]``, all characters other than letters, digits and spaces
    are removed (which drops the dot separators), the ``Rp`` prefix is
    stripped, and the remainder is converted with ``int``.
    """
    price_blocks = web_reqs[index].find_all('div', {'class':'bl-product-card__description-price'})

    def to_int(block):
        # "Rp17.400" -> "Rp17400" -> "17400" -> 17400
        alnum_only = re.sub('[^A-Za-z0-9 ]+', '', block.get_text())
        return int(alnum_only.strip().replace('Rp', ''))

    return [to_int(block) for block in price_blocks]

In [None]:
def get_sold_count(web_reqs, index):
    """Return the sold counts found in one item as a list of ints.

    For every ``div.bl-product-card__description-rating-and-sold`` inside
    ``web_reqs[index]``, the text is reduced to letters, digits, dots and
    spaces and the token directly after ``'Terjual'`` is taken as the count.
    Blocks without ``'Terjual'``, or whose following token is missing or not
    purely digits, are skipped rather than raising ``ValueError``.
    """
    sold_blocks = web_reqs[index].find_all('div', {'class': 'bl-product-card__description-rating-and-sold'})

    sold_data = []

    for block in sold_blocks:
        tokens = re.sub('[^A-Za-z0-9. ]+', '', block.get_text()).split()
        if 'Terjual' not in tokens:
            continue

        # Use a separate name so the `index` parameter is not overwritten.
        keyword_pos = tokens.index('Terjual')
        following = tokens[keyword_pos + 1:keyword_pos + 2]

        # The slice is empty when 'Terjual' is the last token.
        if following and following[0].isdigit():
            sold_data.append(int(following[0]))

    return sold_data


In [None]:
def get_location(web_reqs, index):
    """Return the seller locations found in one item as a list of strings.

    For every ``div.bl-product-card__description-store`` inside
    ``web_reqs[index]``, the text is reduced to letters, digits, dots and
    spaces, and only its first two whitespace-separated words are kept.
    """
    store_blocks = web_reqs[index].find_all('div', {'class':'bl-product-card__description-store'})

    def first_two_words(block):
        words = re.sub('[^A-Za-z0-9. ]+', '', block.get_text()).split()
        return ' '.join(words[:2])

    return list(map(first_two_words, store_blocks))

### Compiling

In [None]:
# Empty frame that fixes the column order for the compiled results.
df_bukalapak = pd.DataFrame({
    column: []
    for column in (
        'seller',
        'location',
        'product_name',
        'main_category',
        'sub_category',
        'rating',
        'price',
        'sold_count',
        'link',
    )
})

In [None]:
# Confirm the frame is empty and carries the expected column order.
print(df_bukalapak)

Empty DataFrame
Columns: [seller, location, product_name, main_category, sub_category, rating, price, sold_count, link]
Index: []


In [None]:
# TODO: fill the frame from each element of `results`. The loop header was
# missing its colon and body (a SyntaxError that stopped "Run All"), so a
# placeholder body keeps the cell valid until the logic is written.
for i in results:
    pass

### Notes

In [None]:
# Sample input containing punctuation and symbols
text = "Hello, World! This is a test string with special characters: #, @, $, %..."

# Character class matching anything that is NOT an ASCII letter, a digit or a space
pattern = r'[^a-zA-Z0-9 ]'

# Compile the pattern, then delete every matching character from the sample
cleaned_text = re.compile(pattern).sub('', text)