In [20]:
import csv
import time
import uuid
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class ProductScraper:
    """Crawl a paginated product collection and export the results to CSV.

    The scraper drives a Selenium browser through every listing page that
    is reachable from ``base_url`` via the "next" pagination link, opens
    each product page found there, parses it with BeautifulSoup and
    accumulates one dict per product in ``all_data``.

    The instance can be used as a context manager so that the browser is
    always released::

        with ProductScraper(url) as scraper:
            scraper.scrap_product()
    """

    # Columns that every record carries; item-specific columns are appended
    # after these in first-seen order when the CSV is written.
    BASE_FIELDS = ('UUID', 'Brand name', 'Title', 'Product image', 'Condition', 'Price')
    # Seconds to wait for the marker element of a page to appear.
    WAIT_SECONDS = 10
    # Pause between two listing pages.
    PAGE_DELAY_SECONDS = 2
    # Default CSV destination; overridable per instance through __init__.
    output_path = 'mygemma_product_data.csv'

    def __init__(self, base_url, driver=None, output_path=None):
        """Create a scraper.

        Args:
            base_url: URL of the first listing page of the collection.
            driver: Optional, already constructed WebDriver. When omitted a
                new ``webdriver.Chrome()`` is started.
            output_path: Optional CSV destination. Defaults to
                ``mygemma_product_data.csv`` in the working directory.
        """
        self.base_url = base_url
        self.driver = driver if driver is not None else webdriver.Chrome()
        if output_path is not None:
            self.output_path = output_path
        self.all_data = []

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Quit the browser, if one is still attached to this scraper."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
            self.driver = None

    @staticmethod
    def _find_child(parent, *args, **kwargs):
        """Return ``parent.find(...)`` or ``None`` when ``parent`` is missing."""
        if parent is None:
            return None
        return parent.find(*args, **kwargs)

    @staticmethod
    def _text_or(element, default):
        """Return the stripped text of ``element`` or ``default`` if it is ``None``."""
        if element is None:
            return default
        return element.text.strip()

    def _sizing_details(self, soup):
        """Collect the key/value pairs listed in the item-specifics lists."""
        details = {}
        for ul in soup.find_all('ul', class_='item-specifics-container'):
            for li in ul.find_all('li'):
                # The label is the first child of the <li>; it may be a bare
                # string or (defensively) a nested tag.
                first = li.contents[0] if li.contents else ''
                if isinstance(first, str):
                    key = first.strip()
                else:
                    key = first.get_text(strip=True)
                if not key:
                    # An unlabeled entry would produce an empty CSV header.
                    continue
                value_element = li.find('span', class_='font-normal')
                details[key] = self._text_or(value_element, 'No Value')
        return details

    def product_detail(self, product_url):
        """Load one product page and append its parsed record to ``all_data``.

        Missing optional elements fall back to placeholder values instead
        of discarding the product. Any error while loading or parsing the
        page is reported on stdout and the product is skipped, so a single
        bad page does not abort the crawl.

        Args:
            product_url: Absolute URL of the product page.
        """
        try:
            self.driver.get(product_url)
            WebDriverWait(self.driver, self.WAIT_SECONDS).until(
                EC.presence_of_element_located((By.XPATH, '//*[@x-ref="thumbnails"]'))
            )
            soup_product = BeautifulSoup(self.driver.page_source, 'html.parser')

            thumbnail = self._find_child(soup_product.find('div', class_='thumbs'), 'img')
            src = thumbnail.get('src') if thumbnail is not None else None
            # urljoin resolves protocol-relative ("//cdn...") and relative
            # sources against the page URL and leaves absolute ones intact.
            img_src = urljoin(product_url, src) if src else 'No Image'

            brand_name = self._text_or(
                self._find_child(soup_product.find('div', class_='pdp-brand-name'), 'a'),
                'Unknown Brand',
            )
            product_title = self._text_or(
                soup_product.find('h1', class_='pdp-product-title'),
                'Untitled Product',
            )
            price = self._text_or(
                self._find_child(soup_product.find('span', id='price'), 'span', class_='money'),
                'No Price',
            )
            condition = self._text_or(
                soup_product.find('li', class_='flex justify-center cond-item cond-item-active'),
                'No Condition',
            )

            product_info = {
                'UUID': str(uuid.uuid4()),
                'Brand name': brand_name,
                'Title': product_title,
                'Product image': img_src,
                'Condition': condition,
                'Price': price,
            }
            product_info.update(self._sizing_details(soup_product))
            self.all_data.append(product_info)
        except Exception as e:
            # Per-product boundary: report and continue with the next one.
            print(f"Error fetching product details: {e}")

    def _product_urls(self, soup, page_url):
        """Return the absolute product URLs linked from a listing page."""
        cards = soup.find_all('div', class_='group divide-y divide-gray-50 relative product-card')
        cards += soup.find_all('div', class_='group divide-y divide-gray-50 relative product-card preorder')

        urls = []
        for card in cards:
            link = card.find('a', class_='absolute inset-0')
            href = link.get('href') if link is not None else None
            if href:
                urls.append(urljoin(page_url, href))
        return urls

    def _next_page_url(self, soup, page_url):
        """Return the absolute URL of the next listing page, or ``None``."""
        anchor = self._find_child(soup.find('li', class_='nav-next'), 'a')
        href = anchor.get('href') if anchor is not None else None
        return urljoin(page_url, href) if href else None

    def scrap_product(self):
        """Walk all listing pages, scrape every product and write the CSV.

        The crawl stops when there is no next-page link, when a listing
        page fails to load or parse, or when the next link points to a page
        that was already visited. Collected data is saved in every case,
        including when the crawl is interrupted.
        """
        page_url = self.base_url
        visited = set()
        try:
            while page_url and page_url not in visited:
                visited.add(page_url)
                print(f"Processing URL: {page_url}")
                try:
                    self.driver.get(page_url)
                    WebDriverWait(self.driver, self.WAIT_SECONDS).until(
                        EC.presence_of_element_located((By.CLASS_NAME, 'pagination'))
                    )
                    soup = BeautifulSoup(self.driver.page_source, 'html.parser')

                    for product_url in self._product_urls(soup, page_url):
                        self.product_detail(product_url)

                    page_url = self._next_page_url(soup, page_url)
                except Exception as e:
                    print(f"Error while processing page: {e}")
                    break

                if page_url:
                    time.sleep(self.PAGE_DELAY_SECONDS)
        finally:
            # Persist whatever was gathered, even on KeyboardInterrupt.
            self.save_data()

    def save_data(self):
        """Write ``all_data`` to ``output_path`` as UTF-8 CSV.

        The header starts with ``BASE_FIELDS`` followed by every additional
        key found in the records, in first-seen order. Records lacking a
        column get an empty cell. Nothing is written when there is no data.
        """
        if not self.all_data:
            print("No data to save.")
            return

        # A dict keeps insertion order and gives O(1) duplicate detection.
        columns = dict.fromkeys(self.BASE_FIELDS)
        for record in self.all_data:
            columns.update(dict.fromkeys(record))

        with open(self.output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=list(columns))
            writer.writeheader()
            writer.writerows(self.all_data)

        print(f"Data saved to {self.output_path}")
        print(f"Number of records stored: {len(self.all_data)}")

if __name__ == "__main__":
    # Entry point: scrape the handbags collection and write the CSV.
    base_url = 'https://mygemma.com/collections/handbags'
    scraper = ProductScraper(base_url)
    try:
        scraper.scrap_product()
    finally:
        # Always shut the browser down; otherwise the Chrome process started
        # by the scraper outlives the script.
        if scraper.driver is not None:
            scraper.driver.quit()

Processing URL: https://mygemma.com/collections/handbags
Processing URL: https://mygemma.com/collections/handbags?_=pf&page=2
Processing URL: https://mygemma.com/collections/handbags?_=pf&page=3
Processing URL: https://mygemma.com/collections/handbags?_=pf&page=4
Processing URL: https://mygemma.com/collections/handbags?_=pf&page=5
Processing URL: https://mygemma.com/collections/handbags?_=pf&page=6
Processing URL: https://mygemma.com/collections/handbags?_=pf&page=7
Processing URL: https://mygemma.com/collections/handbags?_=pf&page=8
Processing URL: https://mygemma.com/collections/handbags?_=pf&page=9
Processing URL: https://mygemma.com/collections/handbags?_=pf&page=10
Processing URL: https://mygemma.com/collections/handbags?_=pf&page=11
Processing URL: https://mygemma.com/collections/handbags?_=pf&page=12
Processing URL: https://mygemma.com/collections/handbags?_=pf&page=13
Processing URL: https://mygemma.com/collections/handbags?_=pf&page=14
Processing URL: https://mygemma.com/colle

In [None]:
1353