In [92]:
import re
import logging
import pandas as pd

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup, ResultSet
from functools import partial
from datetime import datetime

logging.basicConfig(level=logging.INFO)

In [95]:
class MKBag(object):

    def __init__(self):
        driver_options = Options()
        driver_options.add_argument("--headless")
        driver_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36")
        self.driver = webdriver.Firefox(options=driver_options)

    def browse_web(
        self,
        url: str = 'https://www.michaelkors.global/hk/en/women/handbags/?pmin=1.00&start=0&sz={max_item}',
        item_count: int = 50,
    ) -> ResultSet:
        self.driver.get(url.format(max_item=item_count))
        return BeautifulSoup(self.driver.page_source, 'html.parser')
    
    def cleanup(self):
        self.driver.quit()

    @staticmethod
    def extract_item_info(raw_result: ResultSet, path: str, get_item: str) -> list:
        found = raw_result.select(path)
        if not found:
            return []
        if get_item == 'text':
            return [txt.text for txt in found]
        else:
            return [item.get(get_item) for item in found]

    def item_data_constructor(self, raw_result: ResultSet, get_all_details: bool) -> dict:

        extract_info_partial = partial(self.extract_item_info, raw_result=raw_result)
        product_link=f"https://www.michaelkors.global/{extract_info_partial(path='div.pdp-link a', get_item='href')[0]}"

        brand = extract_info_partial(path='div.product-brand a', get_item='text')
        item_name=extract_info_partial(path='div.pdp-link a', get_item='text')[0]

        logging.info(f'Pulling Basic information of item: {item_name}')

        basic_details = dict(
            brand=brand[0] if brand else 'MICHAEL Michael Kors',
            item_name=item_name,
            product_link=product_link,
            default_price=max(extract_info_partial(path='span.default-price .value', get_item='content')),
            current_price=min(extract_info_partial(path='span.default-price .value', get_item='content')),
            colors=extract_info_partial(path='div.swatches img', get_item='title'),
            product_images=extract_info_partial(path='div.image-container img', get_item='src'),
            timestamp=datetime.now(),
        )

        if get_all_details:
            logging.info(f'Pulling dtailed information of item: {item_name}')
            extra_details = self.get_item_details(product_link)
            basic_details.update(extra_details)
        
        return basic_details

    def get_all_bags(self, get_all_details: bool = False) -> list:

        total_bags = self.browse_web().select_one('span.results-count-value').text

        logging.info(f'Start pulling total {total_bags} bags data')
 
        lst_bags = self.browse_web(item_count=total_bags).findAll('div', class_='product-tile')

        bags_data = []

        for idx, bags in enumerate(lst_bags):
            logging.info(f'Working on {idx+1}/{total_bags} bags')
            bags_data.append(self.item_data_constructor(bags, get_all_details))

        return bags_data
    
    def get_item_details(self, product_link: str) -> dict:
        raw_product_detail = self.browse_web(product_link)

        extra_data = dict(
            desceiption=raw_product_detail.select('div.col-12.value.content')[0].text.strip(),
            availability=raw_product_detail.select('div.availability')[0].get('data-available'),
            product_details=raw_product_detail.select('div.col-sm-12.col-md-8.col-lg-12.value.content')[0].text.strip().split('\n')
        )
        dim_expression = r'(\d+(?:\.\d+)?)\S\s?([WHD])'
        dimension = [
            re.findall(dim_expression, details) 
            for details in extra_data['product_details'] 
            if re.findall(dim_expression, details)
        ]

        logging.info(dimension) ##
      
        extra_data['dimension'] = {item[1]: item[0] for item in dimension[0]} if dimension else {}

        return extra_data

        

In [96]:
mk_obj = MKBag()
bags_data = mk_obj.get_all_bags(get_all_details=False)
mk_obj.cleanup()

INFO:root:Start pulling total 402 bags data
INFO:root:Working on 1/402 bags
INFO:root:Pulling Basic information of item: Ruthie Large Pebbled Leather Satchel
INFO:root:Working on 2/402 bags
INFO:root:Pulling Basic information of item: Ruthie Large Signature Logo Satchel
INFO:root:Working on 3/402 bags
INFO:root:Pulling Basic information of item: Jet Set Large Leather Crossbody Bag
INFO:root:Working on 4/402 bags
INFO:root:Pulling Basic information of item: Jet Set Large Smartphone Convertible Crossbody Bag
INFO:root:Working on 5/402 bags
INFO:root:Pulling Basic information of item: Chantal Medium Pebbled Leather Satchel
INFO:root:Working on 6/402 bags
INFO:root:Pulling Basic information of item: Colby Medium Leather Shoulder Bag
INFO:root:Working on 7/402 bags
INFO:root:Pulling Basic information of item: Colby Medium Two-Tone Neoprene Shoulder Bag
INFO:root:Working on 8/402 bags
INFO:root:Pulling Basic information of item: Colby Medium Leather Shoulder Bag
INFO:root:Working on 9/402 ba

In [97]:
pd.DataFrame(bags_data)

Unnamed: 0,brand,item_name,product_link,default_price,current_price,colors,product_images,timestamp
0,MICHAEL Michael Kors,Ruthie Large Pebbled Leather Satchel,https://www.michaelkors.global//hk/en/ruthie-l...,3400.00,3400.00,[],[https://michaelkors.scene7.com/is/image/Micha...,2024-08-11 22:57:35.050116
1,MICHAEL Michael Kors,Ruthie Large Signature Logo Satchel,https://www.michaelkors.global//hk/en/ruthie-l...,3400.00,3400.00,"[BRN/ACORN, VANILLA/ACORN]",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-11 22:57:35.052801
2,MICHAEL Michael Kors,Jet Set Large Leather Crossbody Bag,https://www.michaelkors.global//hk/en/jet-set-...,4200.00,4200.00,"[BLACK, LUGGAGE, LT CREAM, POWDER BLUSH]",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-11 22:57:35.055581
3,MICHAEL Michael Kors,Jet Set Large Smartphone Convertible Crossbody...,https://www.michaelkors.global//hk/en/jet-set-...,4100.00,4100.00,"[VANILLA, BROWN]",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-11 22:57:35.057854
4,MICHAEL Michael Kors,Chantal Medium Pebbled Leather Satchel,https://www.michaelkors.global//hk/en/chantal-...,4600.00,4600.00,[],"[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",2024-08-11 22:57:35.059890
...,...,...,...,...,...,...,...,...
397,MICHAEL Michael Kors,Chantal Extra-Small Logo Messenger Bag,https://www.michaelkors.global//hk/en/chantal-...,3200.00,3200.00,"[BRN/ACORN, VANILLA/ACORN]","[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",2024-08-11 22:57:36.515580
398,MICHAEL Michael Kors,Empire Medium Crocodile Embossed Leather Chain...,https://www.michaelkors.global//hk/en/empire-m...,2600.00,1300.00,"[OPTIC WHITE, PALE PEANUT]","[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",2024-08-11 22:57:36.521234
399,MICHAEL Michael Kors,Astor Large Studded Leather Shoulder Bag,https://www.michaelkors.global//hk/en/astor-la...,,4700.00,[],"[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",2024-08-11 22:57:36.526455
400,MICHAEL Michael Kors,Jet Set Charm Small Empire Logo Jacquard Pochette,https://www.michaelkors.global//hk/en/jet-set-...,,2340.00,[],"[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",2024-08-11 22:57:36.529356
