In [80]:
import re
import logging
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup, ResultSet
from functools import partial
from datetime import datetime

logging.basicConfig(level=logging.INFO)

In [85]:
class MKBag(object):

    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless=new")
        chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36")
        self.driver = webdriver.Chrome(options=chrome_options)

    def browse_web(
        self,
        url: str = 'https://www.michaelkors.global/hk/en/women/handbags/?pmin=1.00&start=0&sz={max_item}',
        item_count: int = 50,
    ) -> ResultSet:
        self.driver.get(url.format(max_item=item_count))
        return BeautifulSoup(self.driver.page_source, 'html.parser')
    
    def cleanup(self):
        self.driver.quit()

    @staticmethod
    def extract_item_info(raw_result: ResultSet, path: str, get_item: str) -> list:
        found = raw_result.select(path)
        if not found:
            return []
        if get_item == 'text':
            return [txt.text for txt in found]
        else:
            return [item.get(get_item) for item in found]

    def item_data_constructor(self, raw_result: ResultSet, get_all_details: bool) -> dict:

        extract_info_partial = partial(self.extract_item_info, raw_result=raw_result)
        product_link=f"https://www.michaelkors.global/{extract_info_partial(path='div.pdp-link a', get_item='href')[0]}"

        brand = extract_info_partial(path='div.product-brand a', get_item='text')
        item_name=extract_info_partial(path='div.pdp-link a', get_item='text')[0]

        logging.info(f'Pulling Basic information of item: {item_name}')

        basic_details = dict(
            brand=brand[0] if brand else 'MICHAEL Michael Kors',
            item_name=item_name,
            product_link=product_link,
            default_price=max(extract_info_partial(path='span.default-price .value', get_item='content')),
            current_price=min(extract_info_partial(path='span.default-price .value', get_item='content')),
            colors=extract_info_partial(path='div.swatches img', get_item='title'),
            product_images=extract_info_partial(path='div.image-container img', get_item='src'),
            timestamp=datetime.now(),
        )

        if get_all_details:
            logging.info(f'Pulling dtailed information of item: {item_name}')
            extra_details = self.get_item_details(product_link)
            basic_details.update(extra_details)
        
        return basic_details

    def get_all_bags(self, get_all_details: bool = False) -> list:

        total_bags = self.browse_web().select_one('span.results-count-value').text

        logging.info(f'Start pulling total {total_bags} bags data')
 
        lst_bags = self.browse_web(item_count=total_bags).findAll('div', class_='product-tile')

        bags_data = []

        for idx, bags in enumerate(lst_bags):
            logging.info(f'Working on {idx+1}/{total_bags} bags')
            bags_data.append(self.item_data_constructor(bags, get_all_details))

        return bags_data
    
    def get_item_details(self, product_link: str) -> dict:
        raw_product_detail = self.browse_web(product_link)

        extra_data = dict(
            desceiption=raw_product_detail.select('div.col-12.value.content')[0].text.strip(),
            availability=raw_product_detail.select('div.availability')[0].get('data-available'),
            product_details=raw_product_detail.select('div.col-sm-12.col-md-8.col-lg-12.value.content')[0].text.strip().split('\n')
        )
        dim_expression = r'(\d+(?:\.\d+)?)\S\s?([WHD])'
        dimension = [
            re.findall(dim_expression, details) 
            for details in extra_data['product_details'] 
            if re.findall(dim_expression, details)
        ]

        logging.info(dimension) ##
      
        extra_data['dimension'] = {item[1]: item[0] for item in dimension[0]} if dimension else {}

        return extra_data

        

In [90]:
mk_obj = MKBag()
bags_data = mk_obj.get_all_bags(get_all_details=True)
mk_obj.cleanup()

INFO:root:Start pulling total 401 bags data
INFO:root:Working on 1/401 bags
INFO:root:Pulling Basic information of item: Ruthie Large Pebbled Leather Satchel
INFO:root:Pulling dtailed information of item: Ruthie Large Pebbled Leather Satchel
INFO:root:[[('13.75', 'W'), ('9.75', 'H'), ('4.25', 'D')]]
INFO:root:Working on 2/401 bags
INFO:root:Pulling Basic information of item: Ruthie Large Signature Logo Satchel
INFO:root:Pulling dtailed information of item: Ruthie Large Signature Logo Satchel
INFO:root:[[('13.75', 'W'), ('9.75', 'H'), ('4.25', 'D')]]
INFO:root:Working on 3/401 bags
INFO:root:Pulling Basic information of item: Jet Set Large Leather Crossbody Bag
INFO:root:Pulling dtailed information of item: Jet Set Large Leather Crossbody Bag
INFO:root:[[('8.25', 'W'), ('5', 'H'), ('1.75', 'D')]]
INFO:root:Working on 4/401 bags
INFO:root:Pulling Basic information of item: Jet Set Large Smartphone Convertible Crossbody Bag
INFO:root:Pulling dtailed information of item: Jet Set Large Smar

In [None]:
pd.DataFrame(bags_data)

Unnamed: 0,brand,item_name,product_link,default_price,current_price,colors,product_images,timestamp,desceiption,availability,product_details,dimension
0,[MICHAEL Michael Kors],Ruthie Large Pebbled Leather Satchel,https://www.michaelkors.global//hk/en/ruthie-l...,3400.0,3400.0,[],[https://michaelkors.scene7.com/is/image/Micha...,2024-08-10 13:14:08.654318,"Designed to hold a day’s worth of essentials, ...",True,"[• Satchel, • Pebbled leather, • 100% leather,...","{'W': '13.75', 'H': '9.75', 'D': '4.25'}"
1,[MICHAEL Michael Kors],Ruthie Large Signature Logo Satchel,https://www.michaelkors.global//hk/en/ruthie-l...,3400.0,3400.0,"[BRN/ACORN, VANILLA/ACORN]",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-10 13:14:09.809778,"Designed to hold a day’s worth of essentials, ...",True,"[• Satchel, • Logo-print canvas, • 90% coated ...","{'W': '13.75', 'H': '9.75', 'D': '4.25'}"
2,[],Jet Set Travel Large Signature Logo Messenger Bag,https://www.michaelkors.global//hk/en/jet-set-...,,4690.0,[],[https://michaelkors.scene7.com/is/image/Micha...,2024-08-10 13:14:10.563454,The Jet Set Travel messenger bag offers a time...,False,"[• Messenger bag, • Logo-print canvas, • 89.4%...","{'W': '9.75', 'H': '10', 'D': '3'}"
3,[],Jet Set Large Leather Crossbody Bag,https://www.michaelkors.global//hk/en/jet-set-...,4200.0,4200.0,"[BLACK, LUGGAGE, LT CREAM, POWDER BLUSH]",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-10 13:14:11.744079,Sporty chic meets modern minimalism on this Je...,True,"[• Crossbody bag, • Leather, • 100% leather, •...","{'W': '8.25', 'H': '5', 'D': '1.75'}"
4,[],Jet Set Large Smartphone Convertible Crossbody...,https://www.michaelkors.global//hk/en/jet-set-...,4100.0,4100.0,"[VANILLA, BROWN]",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-10 13:14:12.534287,Our Jet Set crossbody wallet is a small wonder...,True,"[• Smartphone crossbody bag, • Logo-print canv...","{'W': '8.25', 'H': '5', 'D': '1.75'}"
5,[MICHAEL Michael Kors],Chantal Medium Logo Satchel,https://www.michaelkors.global//hk/en/chantal-...,,4600.0,"[VANILLA/ACORN, BROWN/BLK]",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-10 13:14:13.423801,A spacious interior and timeless silhouette ma...,False,"[• Satchel, • Logo-print canvas, • 90% coated ...","{'W': '12.75', 'H': '9.75', 'D': '6.25'}"
6,[MICHAEL Michael Kors],Chantal Medium Pebbled Leather Satchel,https://www.michaelkors.global//hk/en/chantal-...,4600.0,4600.0,[],[https://michaelkors.scene7.com/is/image/Micha...,2024-08-10 13:14:14.565630,A spacious interior and timeless silhouette ma...,True,"[• Satchel, • Pebbled leather, • 100% leather ...","{'W': '12.75', 'H': '9.75', 'D': '6.25'}"
7,[MICHAEL Michael Kors],Colby Medium Leather Shoulder Bag,https://www.michaelkors.global//hk/en/colby-me...,4300.0,4300.0,"[DK CHAMBRAY, CAMEL, SMOKEY OLIVE, OPTIC ORANG...",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-10 13:14:15.557786,The modern-yet-classic Colby shoulder bag evok...,True,"[• Shoulder bag, • Leather, • 100% leather, • ...","{'W': '10.5', 'H': '6.75', 'D': '2.75'}"
8,[MICHAEL Michael Kors],Colby Medium Two-Tone Neoprene Shoulder Bag,https://www.michaelkors.global//hk/en/colby-me...,4300.0,4300.0,[],[https://michaelkors.scene7.com/is/image/Micha...,2024-08-10 13:14:16.666505,The modern-yet-classic Colby shoulder bag evok...,True,"[• Shoulder bag, • Neoprene, • 100% polyester,...","{'W': '10.5', 'H': '6.75', 'D': '2.75'}"
9,[MICHAEL Michael Kors],Colby Medium Leather Shoulder Bag,https://www.michaelkors.global//hk/en/colby-me...,4300.0,4300.0,"[BRIGHT DANDELION, BLACK, LUGGAGE, CERISE, OPT...",[https://michaelkors.scene7.com/is/image/Micha...,2024-08-10 13:14:17.477086,The modern-yet-classic Colby shoulder bag evok...,True,"[• Shoulder bag, • Leather, • 100% leather, • ...","{'W': '10.5', 'H': '6.75', 'D': '2.75'}"


In [87]:
mk_obj2 = MKBag()
lst = mk_obj2.get_item_details('https://www.michaelkors.global/hk/en/rain-stain-protector/32H6MNYN4P.html')
lst

INFO:root:[[('3', 'H')]]


{'desceiption': 'Come rain or shine, lengthen the life of your handbags and shoes courtesy of our rain and stain protector, formulated exclusively for Michael Kors leather, suede and fabric products.',
 'availability': '0.0',
 'product_details': ['• Fabric protector ',
  '• 6 oz. bottle',
  '• Use on leather, nubuck and suede fabrics',
  '• Imported',
  '        ',
  '        • Style #',
  '        32H6MNYN4P'],
 'dimension': {'H': '3'}}

In [88]:
lst2 = mk_obj2.get_item_details('https://www.michaelkors.global/hk/en/bardot-mini-leather-hobo-shoulder-bag/31S4GBRH1L.html?astc=true')
lst2

INFO:root:[[('10.43', 'W'), ('7.48', 'H'), ('3.15', 'D')]]


{'desceiption': 'The mini Bardot bag evokes the chic, laid-back feel of a great vacation. The softly structured hobo silhouette, smooth leather, and ring-embellished strap channel a boho-glam sensibility that works year-round, in the city or at the beach. The striking carryall features a suede-lined interior sized to hold an evening’s worth of essentials. Made in Italy.',
 'availability': 'true',
 'product_details': ['• Hobo bag',
  '• Leather',
  '• 100% leather',
  '• Gold-tone hardware',
  '• 10.43"W X 7.48"H X 3.15"D ',
  '• Handle drop: 11”',
  '• Interior details: slip pocket',
  '• Lining: 100% suede',
  '• Snap fastening',
  '• Made in Italy',
  '        ',
  '        • Style #',
  '        31S4GBRH1L'],
 'dimension': {'W': '10.43', 'H': '7.48', 'D': '3.15'}}

In [89]:
lst3 = mk_obj2.get_item_details('https://www.michaelkors.global/hk/en/rhea-medium-color-block-logo-backpack/30S0GEZB2V.html?dwvar_30S0GEZB2V_color=2620')
lst3

INFO:root:[]


{'desceiption': 'Laid-back yet luxe, our Rhea backpack redefines big-city accessorizing. We love the combination of color-blocked logo print twill and polished hardware. With its multiple zipper pockets and delicate shoulder straps, it’s a stylish update to the enduring design.',
 'availability': 'true',
 'product_details': ['• Backpack',
  '• Logo-print canvas',
  '• 69% coated canvas/17% polyester/13% cotton/1% polyurethane',
  '• Gold-tone hardware',
  '• 9.5"" W X 12.5"" H X 5"" D',
  '• Handle drop: 1""',
  '• Exterior details: 2 front zip compartments',
  '• Interior details:  back zip pocket, padded tech compartment, 2 front slip pockets',
  '• Lining: 100% polyester',
  '• Zip fastening',
  '• Dust bag not included',
  '• Imported',
  '        ',
  '        • Style #',
  '        30S0GEZB2V'],
 'dimension': {}}

In [71]:
extra_data = lst3
dim_expression = r'(\d+(?:\.\d+)?)\S\s?([WHD])'
dimension = [
    re.findall(dim_expression, details) 
    for details in extra_data['product_details'] 
    if re.findall(dim_expression, details)
]
dimension

[[('18', 'W'), ('18', 'H')]]

In [72]:
{item[1]: item[0] for item in dimension[0]}


{'W': '18', 'H': '18'}

In [55]:
dimension[0]

[('3', 'H')]