In [None]:
import asyncio
import nest_asyncio
import random
import requests
import time
import re
import math
import json
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from fake_useragent import UserAgent
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_random,
)
nest_asyncio.apply()

# Extra HTTP headers applied to every Playwright page through
# page.set_extra_http_headers(headers) in the scraping methods below.
# NOTE(review): the Cookie value embeds what looks like a session-bound
# aws-waf-token, and the User-Agent is drawn at random once at import time
# while the Sec-Ch-Ua hints are fixed -- confirm both are intentional.
headers = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "max-age=0",
    "User-Agent": UserAgent().random,
    "Priority": "u=0, i",
    "Upgrade-Insecure-Requests": "1",
    "Cookie": (
        "wp_ga4_customerGroup=NOT+LOGGED+IN; "
        "private_content_version=1fd9b0bb9111f815fb7cc0a2e1b795ad; "
        "aws-waf-token=b91f6c13-c3c0-4ec3-a7d8-550794a8bab3:BgoAk3cBOREiAAAA:PKYZTtk3ZLHvTjqbebPH7ufj4dmmpWy1IlXw54gtVwoFfEp98V/nK037tyC5DSvl9OzpfidRLN+1piQTIY2t/NTfkI0XZyGaSMZ/3npm2dZE+AjvhGX6qTmw1wqTX5LRfU22N36ziK2KEU9pZAHu9DJzmdRP1i7Crd1RGecYNV/y3r+7tDwE0A2HqpfwOIMBWFw="
    ),
    "referer": "https://www.google.com/",
    "Sec-Ch-Ua": '"Not(A:Brand";v="99", "Opera GX";v="118", "Chromium";v="133"',
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": '"Windows"',
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
}

# Retry and pacing knobs for the scraper defined below.
# Upper limit on attempts, fed to tenacity's stop_after_attempt.
MAX_RETRIES = 10
# Bounds, in seconds, for the random wait between attempts/requests.
MIN_WAIT_BETWEEN_REQ, MAX_WAIT_BETWEEN_REQ = 0, 2
# Request timeout budget; presumably seconds -- confirm against the call sites.
REQUEST_TIMEOUT = 30

class ProductsETL:
    """Fetch product pages and listings and hand them over as parsed HTML.

    Two fetch paths are offered: a headless Chromium session driven by
    Playwright (for JavaScript-rendered pages) and a plain ``requests``
    session. All fetch methods are best-effort: on any failure they print the
    error and return ``None`` instead of raising.
    """

    def __init__(self):
        # Single session reused by every extract_from_url() call.
        self.session = requests.Session()

    # NOTE(review): this retry policy cannot fire as written -- the method
    # body catches every Exception and returns None, and the Playwright calls
    # do not go through `requests`. Kept unchanged so the wrapped callable
    # stays the same; decide whether failures should propagate and be retried.
    @retry(
        wait=wait_random(min=MIN_WAIT_BETWEEN_REQ, max=MAX_WAIT_BETWEEN_REQ),
        stop=stop_after_attempt(MAX_RETRIES),
        retry=retry_if_exception_type(requests.RequestException),
        reraise=True,
    )
    async def extract_scrape_content(self, url, selector):
        """Render ``url`` in headless Chromium and return its parsed HTML.

        Args:
            url: Page to open.
            selector: CSS selector that must appear (within 30 s) before the
                page is considered loaded.

        Returns:
            A ``BeautifulSoup`` of the rendered page, or ``None`` when
            anything fails (the error is printed).
        """
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=True,
                    args=["--disable-blink-features=AutomationControlled"],
                )
                # Close the browser while the Playwright driver is still
                # running, i.e. before leaving the `async with` block.
                try:
                    context = await browser.new_context(
                        locale="en-US",
                        user_agent=UserAgent().random,
                        viewport={"width": 1280, "height": 800},
                        device_scale_factor=1,
                        is_mobile=False,
                        has_touch=False,
                        screen={"width": 1280, "height": 800},
                        timezone_id="Asia/Manila",
                    )

                    page = await context.new_page()

                    # Hide the automation flag before any page script runs.
                    await page.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                    await page.set_extra_http_headers(headers)
                    await page.goto(url, wait_until="domcontentloaded")
                    await page.wait_for_selector(selector, timeout=30000)

                    # Randomised wheel scrolls and pointer moves with short
                    # pauses before the HTML snapshot is taken.
                    for _ in range(random.randint(3, 6)):
                        await page.mouse.wheel(0, random.randint(300, 700))
                        await asyncio.sleep(random.uniform(0.5, 1))

                    for _ in range(random.randint(5, 10)):
                        await page.mouse.move(random.randint(0, 800), random.randint(0, 600))
                        await asyncio.sleep(random.uniform(0.5, 1))

                    rendered_html = await page.content()
                    return BeautifulSoup(rendered_html, "html.parser")
                finally:
                    await browser.close()

        except Exception as e:
            # Best-effort contract: report and signal failure with None.
            print(f"An error occurred: {e}")
            return None

    async def _scroll_products(self, url):
        """Scroll an infinite-scroll listing until it stops growing.

        Opens ``url`` in headless Chromium, waits for ``#root-product-list``,
        then keeps scrolling until the number of product tiles has not
        increased for three consecutive checks.

        Args:
            url: Listing page to open.

        Returns:
            The list of product-tile ``div`` tags found in the rendered page,
            or ``None`` when anything fails (the error is printed).
        """
        # Class name shared by the in-page counter and the final extraction.
        item_class = "item-siminia-product-grid-item-3do"

        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=True,
                    args=["--disable-blink-features=AutomationControlled"],
                )
                # Close the browser while the Playwright driver is still
                # running, i.e. before leaving the `async with` block.
                try:
                    context = await browser.new_context(
                        user_agent=UserAgent().random,
                        viewport={
                            "width": random.randint(1200, 1600),
                            "height": random.randint(800, 1200),
                        },
                        locale="en-US",
                    )

                    page = await context.new_page()
                    await page.set_extra_http_headers(headers)

                    await page.goto(url, wait_until="domcontentloaded")
                    await page.wait_for_selector('#root-product-list', timeout=30000)

                    print(
                        "Starting to scrape the product list (Infinite scroll scrape)...")

                    scroll_step = 1500
                    scroll_delay = 5

                    previous_count = 0
                    same_count_retries = 0
                    max_retries = 3

                    while True:
                        # Scroll down by the current step, then give the page
                        # time to load further items.
                        await page.evaluate(f'window.scrollBy(0, {scroll_step})')
                        await asyncio.sleep(scroll_delay)

                        # Count the product tiles currently in the DOM.
                        current_count = await page.evaluate(
                            f"() => document.querySelectorAll('div.{item_class}').length"
                        )

                        print(f"Current item count: {current_count}")

                        if current_count > previous_count:
                            previous_count = current_count
                            # The step doubles after every successful load.
                            scroll_step += scroll_step
                            same_count_retries = 0
                        else:
                            same_count_retries += 1
                            print(f"No new items loaded. Retry {same_count_retries}/{max_retries}")

                            if same_count_retries >= max_retries:
                                print("No more items being loaded. Done scrolling.")
                                break

                    print("Scraping complete. Extracting content...")

                    rendered_html = await page.content()
                    print(
                        f"Successfully extracted data from {url}"
                    )
                    soup = BeautifulSoup(rendered_html, "html.parser")
                    return soup.find_all('div', class_=item_class)
                finally:
                    await browser.close()

        except Exception as e:
            # Best-effort contract: report and signal failure with None.
            print(f"An error occurred: {e}")
            return None

    def extract_from_url(self, method: str, url: str, params: dict = None, data: dict = None, headers: dict = None, verify: bool = True) -> BeautifulSoup:
        """Fetch ``url`` with the shared ``requests`` session and parse it.

        After a successful fetch the call pauses for a random interval between
        ``MIN_WAIT_BETWEEN_REQ`` and ``MAX_WAIT_BETWEEN_REQ`` seconds to pace
        consecutive requests.

        Args:
            method: HTTP method name passed to ``Session.request``.
            url: Target URL.
            params: Optional query-string parameters.
            data: Optional request body.
            headers: Optional request headers (this parameter shadows the
                module-level ``headers`` dict inside this method).
            verify: Whether to verify TLS certificates.

        Returns:
            A ``BeautifulSoup`` of the response body, or ``None`` when the
            request or parsing fails (the error is printed).
        """
        try:
            response = self.session.request(
                method=method,
                url=url,
                params=params,
                data=data,
                headers=headers,
                verify=verify,
                # Without a timeout a stalled server would block forever.
                timeout=REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            print(
                f"Successfully extracted data from {url} {response.status_code}"
            )
            sleep_time = random.uniform(
                MIN_WAIT_BETWEEN_REQ, MAX_WAIT_BETWEEN_REQ)
            print(f"Sleeping for {sleep_time} seconds...")
            # Actually perform the pause that the message announces.
            time.sleep(sleep_time)
            return soup

        except Exception as e:
            # Best-effort contract: report and signal failure with None.
            print(f"Error in parsing {url}: {e}")
            return None

    def transform(self, soup, url):
        """Placeholder for the transform step; not implemented, returns ``None``."""
        pass

    def extract_links(self, url: str) -> pd.DataFrame:
        """Placeholder for link extraction; not implemented, returns ``None``."""
        pass
    

# Ad-hoc driver: build the ETL helper and push one product page through it.
e = ProductsETL()
# e.extract_links('https://western.com.ph/shop/gadgets/smartphones/')

# Sample product pages for manual runs; only the first one is used below.
product_list = [
    f"https://western.com.ph/product/{slug}/"
    for slug in (
        "apple-iphone-16-256gb-teal",
        "oppo-a3bk",
        "realme-realmec75go",
        "samsung-a06ltbl",
    )
]

# NOTE(review): '#produduct-info' looks like a typo for a product-info id;
# left untouched because the page markup is not visible here -- confirm.
q = asyncio.run(
    e.extract_scrape_content(product_list[0], '#produduct-info')
)
e.transform(q, product_list[0])