In [1]:
import boto3
import re
import logging
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup, ResultSet
from functools import partial
from datetime import datetime

logging.basicConfig(level=logging.INFO)


def get_df(data: dict) -> pd.DataFrame:
    return pd.DataFrame(data)


def get_json(data, file_name: str = 'bags-data.json'):
    df = get_df(data)
    return df.to_json(file_name, orient='records')


def upload_to_s3(
    s3_bucket: str, 
    s3_prefix: str,
    file_path: str = 'bags-data.json',
    aws_profile: str = None
):
    if aws_profile:
        boto_session = boto3.Session(profile_name=aws_profile)
        s3 = boto_session.client('s3')
    else:
        s3 = boto3.client('s3')

    s3.upload_file(
        file_path, 
        s3_bucket, 
        s3_prefix,
    )

## Local Environment Setup (On Mac)
1. Please have Google Chrome installed on your laptop
2. Create an python virtual environment with `python -m venv venv` 
3. Prepare the working environment with 
   1. `source venv/bin/activate`
   2. `pip install -r requirements.txt`



In [2]:
class MKBag(object):

    def __init__(self):
        driver_options = Options()
        driver_options.add_argument("--headless=new")
        driver_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36")
        self.driver = webdriver.Chrome(options=driver_options)

    def browse_web(
        self,
        url: str = 'https://www.michaelkors.global/hk/en/women/handbags/?pmin=1.00&start=0&sz={max_item}',
        item_count: int = 50,
    ) -> ResultSet:
        self.driver.get(url.format(max_item=item_count))
        return BeautifulSoup(self.driver.page_source, 'html.parser')
    
    def cleanup(self):
        self.driver.quit()

    @staticmethod
    def extract_item_info(raw_result: ResultSet, path: str, get_item: str) -> list:
        found = raw_result.select(path)
        if not found:
            return []
        if get_item == 'text':
            return [txt.text for txt in found]
        else:
            return [item.get(get_item) for item in found]

    def item_data_constructor(self, raw_result: ResultSet, get_all_details: bool) -> dict:

        extract_info_partial = partial(self.extract_item_info, raw_result=raw_result)
        product_link=f"https://www.michaelkors.global/{extract_info_partial(path='div.pdp-link a', get_item='href')[0]}"

        brand = extract_info_partial(path='div.product-brand a', get_item='text')
        item_name=extract_info_partial(path='div.pdp-link a', get_item='text')[0]

        logging.info(f'Pulling Basic information of item: {item_name}')

        basic_details = dict(
            brand=brand[0] if brand else 'MICHAEL Michael Kors',
            item_name=item_name,
            product_link=product_link,
            default_price=max(extract_info_partial(path='span.default-price .value', get_item='content')),
            current_price=min(extract_info_partial(path='span.default-price .value', get_item='content')),
            colors=extract_info_partial(path='div.swatches img', get_item='title'),
            product_images=extract_info_partial(path='div.image-container img', get_item='src'),
            timestamp=datetime.now(),
        )

        if get_all_details:
            logging.info(f'Pulling dtailed information of item: {item_name}')
            extra_details = self.get_item_details(product_link)
            basic_details.update(extra_details)
        
        return basic_details

    def get_all_bags(self, get_all_details: bool = False) -> list:

        total_bags = self.browse_web().select_one('span.results-count-value').text

        logging.info(f'Start pulling total {total_bags} bags data')
 
        lst_bags = self.browse_web(item_count=total_bags).findAll('div', class_='product-tile')

        bags_data = []

        for idx, bags in enumerate(lst_bags):
            logging.info(f'Working on {idx+1}/{total_bags} bags')
            bags_data.append(self.item_data_constructor(bags, get_all_details))

        return bags_data
    
    def get_item_details(self, product_link: str) -> dict:
        raw_product_detail = self.browse_web(product_link)

        extra_data = dict(
            desceiption=raw_product_detail.select('div.col-12.value.content')[0].text.strip(),
            availability=raw_product_detail.select('div.availability')[0].get('data-available'),
            product_details=raw_product_detail.select('div.col-sm-12.col-md-8.col-lg-12.value.content')[0].text.strip().split('\n')
        )
        dim_expression = r'(\d+(?:\.\d+)?)\S\s?([WHD])'
        dimension = [
            re.findall(dim_expression, details) 
            for details in extra_data['product_details'] 
            if re.findall(dim_expression, details)
        ]

        extra_data['dimension'] = {item[1]: item[0] for item in dimension[0]} if dimension else {}

        return extra_data

        

Starts Scraping

- It will scrape below attributes by default:
  - item name
  - brand
  - default price
  - current price
  - colors
  - product link
  - product image
  - timestamp
- You may choose to retrieve extra details with arg "get_all_details=True" (it takes 10x longer):
  - dimension
  - desceiption
  - availability
- It returns a list of dictionary

**Default Scraping (Basic information)**

In [3]:
mk_obj = MKBag()
bags_data = mk_obj.get_all_bags(get_all_details=False)
mk_obj.cleanup()

INFO:root:Start pulling total 400 bags data
INFO:root:Working on 1/400 bags
INFO:root:Pulling Basic information of item: Ruthie Large Pebbled Leather Satchel
INFO:root:Working on 2/400 bags
INFO:root:Pulling Basic information of item: Ruthie Large Signature Logo Satchel
INFO:root:Working on 3/400 bags
INFO:root:Pulling Basic information of item: Jet Set Large Leather Crossbody Bag
INFO:root:Working on 4/400 bags
INFO:root:Pulling Basic information of item: Jet Set Large Smartphone Convertible Crossbody Bag
INFO:root:Working on 5/400 bags
INFO:root:Pulling Basic information of item: Chantal Medium Pebbled Leather Satchel
INFO:root:Working on 6/400 bags
INFO:root:Pulling Basic information of item: Colby Medium Leather Shoulder Bag
INFO:root:Working on 7/400 bags
INFO:root:Pulling Basic information of item: Colby Medium Two-Tone Neoprene Shoulder Bag
INFO:root:Working on 8/400 bags
INFO:root:Pulling Basic information of item: Colby Medium Leather Shoulder Bag
INFO:root:Working on 9/400 ba

In [43]:
# Check the result
get_df(bags_data)

Unnamed: 0,brand,item_name,product_link,default_price,current_price,colors,product_images,timestamp
0,MICHAEL Michael Kors,Ruthie Large Pebbled Leather Satchel,https://www.michaelkors.global//hk/en/ruthie-l...,3400.00,3400.00,[],[https://michaelkors.scene7.com/is/image/Micha...,2024-08-12 12:09:01.268511
1,MICHAEL Michael Kors,Ruthie Large Signature Logo Satchel,https://www.michaelkors.global//hk/en/ruthie-l...,3400.00,3400.00,"[BRN/ACORN, VANILLA/ACORN]",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-12 12:09:01.271057
2,MICHAEL Michael Kors,Jet Set Large Leather Crossbody Bag,https://www.michaelkors.global//hk/en/jet-set-...,4200.00,4200.00,"[BLACK, LUGGAGE, LT CREAM, POWDER BLUSH]",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-12 12:09:01.273774
3,MICHAEL Michael Kors,Jet Set Large Smartphone Convertible Crossbody...,https://www.michaelkors.global//hk/en/jet-set-...,4100.00,4100.00,"[VANILLA, BROWN]",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-12 12:09:01.276095
4,MICHAEL Michael Kors,Chantal Medium Pebbled Leather Satchel,https://www.michaelkors.global//hk/en/chantal-...,4600.00,4600.00,[],"[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",2024-08-12 12:09:01.278132
...,...,...,...,...,...,...,...,...
395,MICHAEL Michael Kors,Empire Medium Crocodile Embossed Leather Chain...,https://www.michaelkors.global//hk/en/empire-m...,2600.00,1300.00,"[OPTIC WHITE, PALE PEANUT]","[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",2024-08-12 12:09:02.738705
396,MICHAEL Michael Kors,Astor Large Studded Leather Shoulder Bag,https://www.michaelkors.global//hk/en/astor-la...,,4700.00,[],"[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",2024-08-12 12:09:02.741164
397,MICHAEL Michael Kors,Jet Set Charm Small Empire Logo Jacquard Pochette,https://www.michaelkors.global//hk/en/jet-set-...,,2340.00,[],"[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",2024-08-12 12:09:02.750256
398,MICHAEL Michael Kors,Empire Medium Frayed Denim Chain-Link Pochette,https://www.michaelkors.global//hk/en/empire-m...,,2600.00,[],"[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",2024-08-12 12:09:02.778904


**Detailed Scraping (Basic + Extra information)**

By enabling 'get_all_details=True', the script will drill into the product page and extract some extra information there. For example, dimension and decription

In [3]:
mk_obj = MKBag()
bags_data_detailed = mk_obj.get_all_bags(get_all_details=True)
mk_obj.cleanup()

INFO:root:Start pulling total 10 bags data
INFO:root:Working on 1/10 bags
INFO:root:Pulling Basic information of item: Ruthie Large Pebbled Leather Satchel
INFO:root:Pulling dtailed information of item: Ruthie Large Pebbled Leather Satchel
INFO:root:Working on 2/10 bags
INFO:root:Pulling Basic information of item: Ruthie Large Signature Logo Satchel
INFO:root:Pulling dtailed information of item: Ruthie Large Signature Logo Satchel
INFO:root:Working on 3/10 bags
INFO:root:Pulling Basic information of item: Jet Set Large Leather Crossbody Bag
INFO:root:Pulling dtailed information of item: Jet Set Large Leather Crossbody Bag
INFO:root:Working on 4/10 bags
INFO:root:Pulling Basic information of item: Jet Set Large Smartphone Convertible Crossbody Bag
INFO:root:Pulling dtailed information of item: Jet Set Large Smartphone Convertible Crossbody Bag
INFO:root:Working on 5/10 bags
INFO:root:Pulling Basic information of item: Chantal Medium Pebbled Leather Satchel
INFO:root:Pulling dtailed info

In [4]:
# Check the result with details
get_df(bags_data_detailed)

Unnamed: 0,brand,item_name,product_link,default_price,current_price,colors,product_images,timestamp,desceiption,availability,product_details,dimension
0,MICHAEL Michael Kors,Ruthie Large Pebbled Leather Satchel,https://www.michaelkors.global//hk/en/ruthie-l...,3400.0,3400.0,[],[https://michaelkors.scene7.com/is/image/Micha...,2024-08-12 16:54:55.995881,"Designed to hold a day’s worth of essentials, ...",True,"[• Satchel, • Pebbled leather, • 100% leather,...","{'W': '13.75', 'H': '9.75', 'D': '4.25'}"
1,MICHAEL Michael Kors,Ruthie Large Signature Logo Satchel,https://www.michaelkors.global//hk/en/ruthie-l...,3400.0,3400.0,"[BRN/ACORN, VANILLA/ACORN]",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-12 16:54:57.419647,"Designed to hold a day’s worth of essentials, ...",True,"[• Satchel, • Logo-print canvas, • 90% coated ...","{'W': '13.75', 'H': '9.75', 'D': '4.25'}"
2,MICHAEL Michael Kors,Jet Set Large Leather Crossbody Bag,https://www.michaelkors.global//hk/en/jet-set-...,4200.0,4200.0,"[BLACK, LUGGAGE, LT CREAM, POWDER BLUSH]",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-12 16:54:58.547290,Sporty chic meets modern minimalism on this Je...,True,"[• Crossbody bag, • Leather, • 100% leather, •...","{'W': '8.25', 'H': '5', 'D': '1.75'}"
3,MICHAEL Michael Kors,Jet Set Large Smartphone Convertible Crossbody...,https://www.michaelkors.global//hk/en/jet-set-...,4100.0,4100.0,"[VANILLA, BROWN]",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-12 16:55:00.104466,Our Jet Set crossbody wallet is a small wonder...,True,"[• Smartphone crossbody bag, • Logo-print canv...","{'W': '8.25', 'H': '5', 'D': '1.75'}"
4,MICHAEL Michael Kors,Chantal Medium Pebbled Leather Satchel,https://www.michaelkors.global//hk/en/chantal-...,4600.0,4600.0,[],[https://michaelkors.scene7.com/is/image/Micha...,2024-08-12 16:55:01.113624,A spacious interior and timeless silhouette ma...,True,"[• Satchel, • Pebbled leather, • 100% leather ...","{'W': '12.75', 'H': '9.75', 'D': '6.25'}"
5,MICHAEL Michael Kors,Colby Medium Leather Shoulder Bag,https://www.michaelkors.global//hk/en/colby-me...,4300.0,4300.0,"[DK CHAMBRAY, CAMEL, SMOKEY OLIVE, OPTIC ORANG...",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-12 16:55:02.417892,The modern-yet-classic Colby shoulder bag evok...,True,"[• Shoulder bag, • Leather, • 100% leather, • ...","{'W': '10.5', 'H': '6.75', 'D': '2.75'}"
6,MICHAEL Michael Kors,Colby Medium Two-Tone Neoprene Shoulder Bag,https://www.michaelkors.global//hk/en/colby-me...,4300.0,4300.0,[],[https://michaelkors.scene7.com/is/image/Micha...,2024-08-12 16:55:03.696196,The modern-yet-classic Colby shoulder bag evok...,True,"[• Shoulder bag, • Neoprene, • 100% polyester,...","{'W': '10.5', 'H': '6.75', 'D': '2.75'}"
7,MICHAEL Michael Kors,Colby Medium Leather Shoulder Bag,https://www.michaelkors.global//hk/en/colby-me...,4300.0,4300.0,"[BRIGHT DANDELION, BLACK, LUGGAGE, CERISE, OPT...",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-12 16:55:04.743719,The modern-yet-classic Colby shoulder bag evok...,True,"[• Shoulder bag, • Leather, • 100% leather, • ...","{'W': '10.5', 'H': '6.75', 'D': '2.75'}"
8,MICHAEL Michael Kors,Mercer Extra-Small Logo and Leather Crossbody Bag,https://www.michaelkors.global//hk/en/mercer-e...,3510.0,3510.0,"[VANILLA, BROWN]","[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",2024-08-12 16:55:05.888630,Our scaled-down Mercer crossbody allows you to...,True,"[• Crossbody bag, • Logo-print canvas/leather,...","{'W': '6', 'H': '6.75', 'D': '2.5'}"
9,MICHAEL Michael Kors,Mercer Extra-Small Pebbled Leather Crossbody Bag,https://www.michaelkors.global//hk/en/mercer-e...,3510.0,3510.0,"[LUGGAGE, NAVY, LT CREAM, POWDER BLUSH]","[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",2024-08-12 16:55:07.436006,Our scaled-down Mercer crossbody bag will enab...,False,"[• Crossbody bag, • Pebbled leather, • 100% le...","{'W': '6', 'H': '6.75', 'D': '2.5'}"


# Upload result data to S3

By default, we transform the data into JSON and store in S3. Please replace BUCKET_NAME & FILE_NAME for file destination on Cloud


In [40]:
# Transform result dictionary into JSON file
get_json(bags_data)

# Create S3 client and upload to destination
boto_session = boto3.Session(profile_name='sit')
s3 = boto_session.client('s3')
BUCKET_NAME = 'efsg-data-analytics-ap-southeast-1-215702702661-sit'
FILE_NAME = 'test-data/bags-data.json'

s3.upload_file(
    'bags-data.json', 
    BUCKET_NAME, 
    FILE_NAME,
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
