In [None]:
import csv
import os
import random
import time
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.microsoft import EdgeChromiumDriverManager

def configure_driver():
    """
    Starts a Microsoft Edge browser session through Selenium.

    No options are passed, so the driver is created with Selenium's defaults.

    Returns:
        WebDriver: The newly created Edge WebDriver instance.
    """
    return webdriver.Edge()

def get_categories(driver, url):
    """
    Loads the "/browse" page and collects the category links shown on it.

    Each category's text is printed as it is found.

    Args:
        driver (WebDriver): Initialized WebDriver instance used to load the page.
        url (str): Base URL of the Coles website (without a trailing "/browse").

    Returns:
        list: The BeautifulSoup "a" elements carrying the category CSS class.
    """
    browse_url = url + "/browse"
    driver.get(browse_url)

    page = BeautifulSoup(driver.page_source, "html.parser")
    category_links = page.find_all(
        "a",
        class_="coles-targeting-ShopCategoriesShopCategoryStyledCategoryContainer",
    )

    for link in category_links:
        print(link.text)

    return category_links

def _extract_products(soup, url):
    """
    Collects the product rows found on one parsed listing page.

    Args:
        soup (BeautifulSoup): Parsed HTML of a category listing page.
        url (str): Base URL prepended to each relative product link.

    Returns:
        list: Tuples of (product_code, name, price, link), one per product
            header that has a title, a price and a link.
    """
    rows = []
    for product in soup.find_all("header", class_="product__header"):
        name = product.find("h2", class_="product__title")
        price = product.find("span", class_="price__value")
        anchor = product.find("a", class_="product__link")
        # Look the anchor up before reading its href: a tile without a link
        # would otherwise raise TypeError and abort the whole run.
        product_link = anchor.get("href") if anchor else None
        if not (name and price and product_link):
            continue
        # The product code is the last dash-separated piece of the link.
        product_code = product_link.split("-")[-1]
        rows.append(
            (product_code, name.text.strip(), price.text.strip(), url + product_link)
        )
    return rows


def _last_page_number(soup):
    """
    Reads the number of the last page from the pagination list.

    Args:
        soup (BeautifulSoup): Parsed HTML of a category listing page.

    Returns:
        int: Value of the second-to-last pagination item, or 1 when there is
            no pagination list or that item is missing or not a number.
    """
    pagination = soup.find("ul", class_="coles-targeting-PaginationPaginationUl")
    if not pagination:
        return 1
    items = pagination.find_all("li")
    try:
        return int(items[-2].text.strip())
    except (IndexError, ValueError):
        return 1


def scrape_products_in_category(driver, category, url, output_dir="D:\\Documents\\Budget"):
    """
    Scrapes products within a given category and writes them to a CSV file.

    The first page is loaded from the category link; further pages are
    requested as "<category link>?page=N" up to the last page number shown in
    the pagination list. Rows are appended to "<category text>.csv" inside
    ``output_dir``. The "/browse/tobacco" category is skipped, as is a
    category element without an href.

    Args:
        driver (WebDriver): Initialized WebDriver instance.
        category (BeautifulSoup): BeautifulSoup element representing a category.
        url (str): Base URL of the Coles website.
        output_dir (str): Directory receiving the CSV file; created if missing.
    """
    category_path = category.get("href")
    if not category_path or category_path == "/browse/tobacco":
        return
    category_link = url + category_path
    print(category_link)
    driver.get(category_link)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    os.makedirs(output_dir, exist_ok=True)
    # Strip the category text so stray whitespace does not end up in the file name.
    filepath = os.path.join(output_dir, category.text.strip() + ".csv")

    # Explicit UTF-8 so product names outside the platform default encoding
    # cannot make the write fail.
    with open(filepath, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(_extract_products(soup, url))

        last_page = _last_page_number(soup)
        if last_page > 1:
            print(last_page)

        # Page 1 is already written; an empty range simply skips this loop.
        for page in range(2, last_page + 1):
            driver.get(f"{category_link}?page={page}")
            soup = BeautifulSoup(driver.page_source, "html.parser")
            writer.writerows(_extract_products(soup, url))
            # Random pause between page requests.
            time.sleep(random.randint(1, 6))

def main():
    """
    Main function to orchestrate the scraping process.

    Opens the browser, reads the category list and scrapes each category in
    turn, pausing a random 1-6 seconds between categories. Any exception is
    reported on stdout, and the browser is always closed on the way out.
    """
    url = "https://www.coles.com.au"
    driver = configure_driver()

    try:
        print("Here we go...")
        for category in get_categories(driver, url):
            scrape_products_in_category(driver, category, url)
            # Random pause between categories.
            time.sleep(random.randint(1, 6))
        print("Finished")
    except Exception as e:
        # Top-level boundary: report the failure, then fall through to cleanup.
        print(f"An error occurred: {e}")
    finally:
        driver.quit()

# Start the CSV scraper only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


In [None]:
import os
import time
import random
import sqlite3
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from webdriver_manager.microsoft import EdgeChromiumDriverManager

def configure_driver():
    """
    Creates the Selenium WebDriver used for scraping.

    The Edge driver is constructed without any arguments.

    Returns:
        WebDriver: Initialized Edge WebDriver instance.
    """
    edge_driver = webdriver.Edge()
    return edge_driver

def get_categories(driver, url):
    """
    Fetches the category anchors from the site's "/browse" page.

    The text of every category found is printed, one per line.

    Args:
        driver (WebDriver): Initialized WebDriver instance.
        url (str): Base URL of the Coles website; "/browse" is appended to it.

    Returns:
        list: BeautifulSoup "a" elements that carry the category CSS class.
    """
    category_css = "coles-targeting-ShopCategoriesShopCategoryStyledCategoryContainer"

    driver.get(url + "/browse")
    parsed = BeautifulSoup(driver.page_source, "html.parser")
    found = parsed.find_all("a", class_=category_css)

    for anchor in found:
        print(anchor.text)
    return found

def create_database(db_path="products.db"):
    """
    Creates a SQLite database to store product information.

    The ``products`` table is created only if it does not already exist, so
    calling this on an existing database leaves its rows untouched.

    Args:
        db_path (str): Path of the SQLite database file (default "products.db").

    Returns:
        str: Path to the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Create products table if it doesn't exist
        conn.execute('''CREATE TABLE IF NOT EXISTS products (
                    product_code INTEGER PRIMARY KEY,
                    category TEXT,
                    name TEXT,
                    price TEXT,
                    link TEXT,
                    on_special BOOLEAN DEFAULT 0
                 )''')
        conn.commit()
    finally:
        # Close the connection even when table creation fails.
        conn.close()

    return db_path

def insert_product(conn, category, product_code, name, price, link, on_special=False):
    """
    Stores one product row, overwriting the row with the same product code if any.

    A lookup by ``product_code`` decides between an UPDATE of the existing row
    and an INSERT of a new one; the change is committed before returning.

    Args:
        conn (sqlite3.Connection): SQLite database connection.
        category (str): Product category.
        product_code (str): Product code used as the lookup key.
        name (str): Product name.
        price (str): Product price.
        link (str): Product link.
        on_special (bool): Whether the product is on special (default is False).
    """
    cursor = conn.cursor()

    # Look for a row that already uses this product code.
    cursor.execute("SELECT * FROM products WHERE product_code=?", (product_code,))
    already_stored = cursor.fetchone() is not None

    if already_stored:
        # Overwrite every column of the existing row.
        statement = "UPDATE products SET category=?, name=?, price=?, link=?, on_special=? WHERE product_code=?"
        values = (category, name, price, link, on_special, product_code)
    else:
        # No such product yet: add it.
        statement = "INSERT INTO products (category, product_code, name, price, link, on_special) VALUES (?, ?, ?, ?, ?, ?)"
        values = (category, product_code, name, price, link, on_special)

    cursor.execute(statement, values)
    conn.commit()

def _extract_products(soup, url):
    """
    Collects the product rows found on one parsed listing page.

    Args:
        soup (BeautifulSoup): Parsed HTML of a category listing page.
        url (str): Base URL prepended to each relative product link.

    Returns:
        list: Tuples of (product_code, name, price, link), one per product
            header that has a title, a price and a link.
    """
    rows = []
    for product in soup.find_all("header", class_="product__header"):
        name = product.find("h2", class_="product__title")
        price = product.find("span", class_="price__value")
        anchor = product.find("a", class_="product__link")
        # Look the anchor up before reading its href: a tile without a link
        # would otherwise raise TypeError and abort the whole run.
        product_link = anchor.get("href") if anchor else None
        if not (name and price and product_link):
            continue
        # The product code is the last dash-separated piece of the link.
        product_code = product_link.split("-")[-1]
        rows.append(
            (product_code, name.text.strip(), price.text.strip(), url + product_link)
        )
    return rows


def _last_page_number(soup):
    """
    Reads the number of the last page from the pagination list.

    Args:
        soup (BeautifulSoup): Parsed HTML of a category listing page.

    Returns:
        int: Value of the second-to-last pagination item, or 1 when there is
            no pagination list or that item is missing or not a number.
    """
    pagination = soup.find("ul", class_="coles-targeting-PaginationPaginationUl")
    if not pagination:
        return 1
    items = pagination.find_all("li")
    try:
        return int(items[-2].text.strip())
    except (IndexError, ValueError):
        return 1


def scrape_products_in_category(driver, category, url, conn):
    """
    Scrapes products within a given category and inserts them into the SQLite database.

    The first page is loaded from the category link; further pages are
    requested as "<category link>?page=N" up to the last page number shown in
    the pagination list. Products of the "Specials" category are stored with
    the on_special flag set on every page, including the first. The
    "/browse/tobacco" category is skipped, as is a category element without
    an href.

    Args:
        driver (WebDriver): Initialized WebDriver instance.
        category (BeautifulSoup): BeautifulSoup element representing a category.
        url (str): Base URL of the Coles website.
        conn (sqlite3.Connection): SQLite database connection.
    """
    category_name = category.text.strip()
    category_path = category.get("href")
    if not category_path or category_path == "/browse/tobacco":
        return
    category_link = url + category_path
    print(category_link)

    # Decide the flag once so the first page is flagged like all later pages.
    on_special = 1 if category_name == "Specials" else 0

    def store_page(soup):
        # Persist every product row parsed from one listing page.
        for product_code, name, price, link in _extract_products(soup, url):
            insert_product(conn, category_name, product_code, name, price, link, on_special)

    driver.get(category_link)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    store_page(soup)

    last_page = _last_page_number(soup)
    if last_page > 1:
        print(last_page)

    # Page 1 is already stored; an empty range simply skips this loop.
    for page in range(2, last_page + 1):
        driver.get(f"{category_link}?page={page}")
        store_page(BeautifulSoup(driver.page_source, "html.parser"))
        # Random pause between page requests.
        time.sleep(random.randint(1, 5))

def main():
    """
    Main function to orchestrate the scraping process.

    Opens the browser and the SQLite database, reads the category list and
    scrapes each category in turn, pausing a random 1-5 seconds between
    categories. Any exception, including a failure to create or open the
    database, is reported on stdout. The database connection (if opened) and
    the browser are always closed on the way out.
    """
    url = "https://www.coles.com.au"
    driver = configure_driver()
    conn = None

    try:
        # Open the database inside the try so a failure here still reaches
        # the cleanup below and the browser is not left running.
        conn = sqlite3.connect(create_database())

        print("Here we go...")
        for category in get_categories(driver, url):
            scrape_products_in_category(driver, category, url, conn)
            # Random pause between categories.
            time.sleep(random.randint(1, 5))
        print("Finished")
    except Exception as e:
        # Top-level boundary: report the failure, then fall through to cleanup.
        print(f"An error occurred: {e}")
    finally:
        try:
            if conn is not None:
                conn.close()
        finally:
            # Quit the browser even if closing the connection raises.
            driver.quit()

# Start the SQLite scraper only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()
