# In [None]:
import re
import time
import pandas as pd
from bs4 import BeautifulSoup

# ---------- Selenium setup ----------
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ------------ Config -------------
# Site root and the paginated search URL; "{}" is filled with the page number.
BASE_URL = "https://www.dubizzle.com.om"
URL_TEMPLATE = f"{BASE_URL}/en/vehicles/cars-for-sale/q-cars-for-sale/?page={{}}"

MAX_PAGES = 199        # increase if needed
RENDER_TIMEOUT = 20    # seconds to wait for listings to appear
DETAIL_TIMEOUT = 15    # seconds to wait for detail content
POLITE_DELAY = 2       # seconds between pages / requests

# Data storage: one independent, initially empty list per output column.
dubizzle_cars = {
    column: []
    for column in (
        'car_title',
        'location',
        'year',
        'price',
        'kilometers',
        'transmission_type',
        'number_of_doors',
        'source',   # GCC, US, etc.
        'model',
        'color',
        'contact_number',
    )
}

# ---------- Utilities ----------
def norm(s: str) -> str:
    """Collapse every run of whitespace in *s* to one space and trim both ends.

    Falsy input (``None`` or ``""``) yields an empty string.
    """
    text = s if s else ''
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def get_value_by_label(container, label_keywords, label_tag='span'):
    """
    Find a label/value pair inside *container* and return the value text.

    Every <div>, <li>, <section> and <article> under *container* is inspected.
    For each one, its direct-child *label_tag* elements are used (falling back
    to all descendant *label_tag* elements when fewer than two are direct
    children). The first element is treated as the label ('Transmission') and
    the second as the value ('Automatic').

    Args:
        container: BeautifulSoup tag or soup to search.
        label_keywords: Iterable of keywords; a label matches when any keyword
            is a substring of the normalised, lowercased label text. Keywords
            are compared case-insensitively.
        label_tag: Tag name used for both the label and the value element.

    Returns:
        The whitespace-normalised value text of the first match with a
        non-empty value, or None when nothing matches.
    """
    # The label is lowercased below, so lowercase the keywords once up front.
    keywords = [kw.lower() for kw in label_keywords]

    for box in container.find_all(['div', 'li', 'section', 'article'], recursive=True):
        cells = box.find_all(label_tag, recursive=False)
        if len(cells) < 2:
            cells = box.find_all(label_tag, recursive=True)
        if len(cells) < 2:
            continue

        label_text = norm(cells[0].get_text()).lower()
        if not any(kw in label_text for kw in keywords):
            continue

        value_text = norm(cells[1].get_text())
        if value_text:
            return value_text
    return None

# ---------- Field-specific extractors ----------
# Patterns used by extract_kilometers, compiled once instead of on every call.
_KM_SPAN_TEXT_RE = re.compile(r'\d+.*km|kilometers', re.IGNORECASE)
_KM_WITH_UNIT_RE = re.compile(r'(\d[\d,]*)\s*(?:km|kilometers)', re.IGNORECASE)
_DIGIT_RUN_RE = re.compile(r'(\d[\d,]*)')
_KM_ARIA_RE = re.compile(r'kilometer', re.IGNORECASE)


def _km_number(text):
    """Return the digit run attached to a km unit in *text*, else the first digit run, else None."""
    match = _KM_WITH_UNIT_RE.search(text) or _DIGIT_RUN_RE.search(text)
    return match.group(1) if match else None


def extract_kilometers(listing):
    """
    Extract the kilometers value from a listing using several strategies.

    Strategies are tried in order and the first non-empty result wins:
      A) generic label -> value lookup via get_value_by_label
      B) known class pattern (div._948d9e0a / span._8206696c)
      C) any <span> whose own string mentions a number + "km" or "kilometers"
      D) a <span> whose aria-label mentions "kilometer", using its inner <span>
      E) free scan of every <span> for "<digits> km|kilometers"

    Returns:
        The extracted text (strategies A, B, D) or digit string (C, E), or
        "N/A" when no strategy yields a non-empty value.
    """
    # A) Generic label -> value
    generic = get_value_by_label(listing, ['kilometer', 'km', 'mileage'])
    if generic:
        return generic

    # B) Known class pattern (harmless if classes differ)
    for block in listing.select('div._948d9e0a'):
        spans = block.select('span._8206696c')
        if len(spans) >= 2 and spans[0].get_text(strip=True).lower() in ("kilometers", "km", "mileage"):
            value = spans[1].get_text(strip=True)
            # An empty value must not block the remaining strategies.
            if value:
                return value

    # C) Any span with "km" or "kilometers"
    for span in listing.find_all('span', string=_KM_SPAN_TEXT_RE):
        # Prefer the number next to the unit so "2015 · 120,000 km" gives 120,000.
        number = _km_number(span.get_text(strip=True))
        if number:
            return number

    # D) aria-label
    km_tag = listing.find('span', attrs={'aria-label': _KM_ARIA_RE})
    if km_tag:
        km_span = km_tag.find('span')
        if km_span:
            value = km_span.text.strip()
            if value:
                return value

    # E) Final free regex scan
    for span in listing.find_all('span'):
        match = _KM_WITH_UNIT_RE.search(span.get_text(strip=True))
        if match:
            return match.group(1)

    return "N/A"

def extract_transmission(listing):
    """
    Return the transmission value from a listing using a strict label rule.

    For each <div>, take its direct-child <span> elements (or all descendant
    <span> elements when fewer than two are direct children). If the FIRST
    span reads 'Transmission' or 'Transmission Type' (case-insensitive,
    whitespace-normalised), return the normalised text of the SECOND span.
    Returns "N/A" when no div matches.
    """
    accepted_labels = ('transmission', 'transmission type')

    for div in listing.find_all('div'):
        candidates = div.find_all('span', recursive=False)
        if len(candidates) < 2:
            candidates = div.find_all('span', recursive=True)
        if len(candidates) < 2:
            continue

        first_text = norm(candidates[0].get_text()).lower()
        if first_text not in accepted_labels:
            continue

        return norm(candidates[1].get_text())

    return "N/A"

def extract_doors(listing):
    """
    Extract the number of doors from a listing.

    Priority:
      1) Class-based: <div class="_9a8eacd9"><span>Number of doors</span><span>4/5</span></div>
      2) Generic 'first span = label / second = value'
      3) aria-label fallback

    The label is verified in step 1 because the same class is also used for
    other label/value rows in this file (e.g. "Source"); returning the first
    such block unchecked could yield an unrelated value.

    Returns:
        The whitespace-normalised door value, or "N/A" when no strategy finds
        a non-empty value.
    """
    door_labels = ('number of doors', 'doors')

    # --- 1) Class-based extraction: check every block, and check its label ---
    for block in listing.find_all('div', class_='_9a8eacd9'):
        spans = block.find_all('span')
        if len(spans) >= 2 and norm(spans[0].get_text()).lower() in door_labels:
            value = norm(spans[1].get_text())
            if value:
                return value

    # --- 2) Generic rule ---
    for box in listing.find_all('div'):
        spans = box.find_all('span', recursive=False)
        if len(spans) < 2:
            spans = box.find_all('span', recursive=True)
        if len(spans) >= 2 and norm(spans[0].get_text()).lower() in door_labels:
            value = norm(spans[1].get_text())
            if value:
                return value

    # --- 3) Fallback: aria-label ---
    tag = listing.find('span', attrs={'aria-label': re.compile(r'door', re.IGNORECASE)})
    if tag:
        inner = tag.find('span')
        if inner:
            value = norm(inner.get_text())
            if value:
                return value

    return "N/A"

def extract_source(container):
    """
    Extract Source (e.g., GCC, US) from:
      <div class="_9a8eacd9"><span>Source</span><span>GCC</span></div>
    Falls back to a generic label->value search.
    Works for either a listing card or a detail page soup.

    Every block with the class is inspected, not just the first one: the same
    class is also used for other label/value rows in this file (e.g. number of
    doors), so the Source row is not necessarily the first match.

    Returns:
        The whitespace-normalised source value, or "N/A" when none is found.
    """
    # 1) Class-based blocks: pick the one whose label is exactly "source"
    for block in container.find_all('div', class_='_9a8eacd9'):
        spans = block.find_all('span')
        if len(spans) >= 2 and norm(spans[0].get_text()).lower() == 'source':
            value = norm(spans[1].get_text())
            # An empty value falls through to the generic search.
            if value:
                return value

    # 2) Generic fallback (covers other wordings too)
    generic = get_value_by_label(container, ['source', 'regional specs', 'regional specification'])
    if generic:
        return generic

    return "N/A"

# ---------- Selenium helpers ----------
def make_driver():
    """Create an undetected-chromedriver Chrome instance configured for headless scraping.

    The browser is started with a fixed window size, GPU/sandbox/dev-shm
    workarounds, and a fixed desktop user-agent string.
    """
    chrome_flags = (
        "--headless=new",
        "--disable-gpu",
        "--window-size=1366,768",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        # Same UA helps reduce bot checks
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36",
    )

    options = uc.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)
    return uc.Chrome(options=options)

def get_rendered_soup(driver, url, wait_selector="li[aria-label='Listing'], div[class*='listing'], article"):
    """Load *url* in *driver* and return the rendered page as a BeautifulSoup tree.

    After navigating, the page is scrolled slightly and the function waits up
    to RENDER_TIMEOUT seconds for an element matching *wait_selector*. A
    timeout is ignored: whatever has rendered by then is parsed and returned.
    """
    driver.get(url)
    driver.execute_script("window.scrollTo(0, 300);")

    listing_present = EC.presence_of_element_located((By.CSS_SELECTOR, wait_selector))
    waiter = WebDriverWait(driver, RENDER_TIMEOUT)
    try:
        waiter.until(listing_present)
    except TimeoutException:
        # Best effort: parse the page as-is and let the caller handle missing listings.
        pass

    html = driver.page_source
    return BeautifulSoup(html, 'html.parser')

def get_detail_soup(driver, url):
    """Load a detail page in *driver* and return it as a BeautifulSoup tree.

    Waits up to DETAIL_TIMEOUT seconds for either a heading (h1/h2) or a
    div/section element to be present; a timeout is ignored. The page is then
    scrolled and given half a second before its source is parsed.
    """
    driver.get(url)

    heading_present = EC.presence_of_element_located((By.CSS_SELECTOR, "h1, h2"))
    container_present = EC.presence_of_element_located((By.CSS_SELECTOR, "div, section"))
    try:
        WebDriverWait(driver, DETAIL_TIMEOUT).until(
            EC.any_of(heading_present, container_present)
        )
    except TimeoutException:
        # Best effort: fall through and parse whatever is there.
        pass

    driver.execute_script("window.scrollTo(0, 600);")
    time.sleep(0.5)

    html = driver.page_source
    return BeautifulSoup(html, 'html.parser')

# ---------- Main scrape ----------
def _find_listings(soup):
    """Return listing elements from a results page, or [] when no selector matches.

    Selectors are tried in order; the matches of the first selector that finds
    anything are returned.
    """
    listing_selectors = [
        ('li', {"aria-label": "Listing"}),
        ('div', {"class": re.compile(r'listing', re.IGNORECASE)}),
        ('article', {}),
        ('div', {"class": re.compile(r'item|card|post', re.IGNORECASE)}),
        ('li', {"class": re.compile(r'listing|item|card', re.IGNORECASE)})
    ]

    for tag, attrs in listing_selectors:
        potential = soup.find_all(tag, attrs) if attrs else soup.find_all(tag)
        print(f"Found {len(potential)} with selector: {tag} {attrs}")
        if potential:
            print(f"Using selector: {tag} {attrs}")
            return potential
    return []


def _fill_from_detail_page(driver, detail_url, row):
    """Fetch the detail page and update *row* in place with model, color, source, phone and any missing fields."""
    detail_soup = get_detail_soup(driver, detail_url)

    def get_detail(label_list):
        return get_value_by_label(detail_soup, [label.lower() for label in label_list])

    model_val = get_detail(['Model'])
    if model_val:
        row['model'] = model_val

    color_val = get_detail(['Color'])
    if color_val:
        row['color'] = color_val

    # extract_source already performs the generic label lookup itself, so no
    # second lookup with the same labels is needed here.
    src_val = extract_source(detail_soup)
    if src_val != "N/A":
        row['source'] = src_val

    # Phone (simple version from aria-label=call; keep as-is)
    phone_tag = detail_soup.find('a', href=True, attrs={'aria-label': re.compile(r'call', re.IGNORECASE)})
    if phone_tag:
        row['contact_number'] = norm(phone_tag.get_text())

    # Fields the listing card did not provide are retried on the detail page.
    fallbacks = (
        ('transmission_type', ['Transmission', 'Transmission Type']),
        ('number_of_doors', ['Number of doors', 'Doors']),
        ('kilometers', ['Kilometers', 'KM', 'Mileage']),
    )
    for column, labels in fallbacks:
        if row[column] == "N/A":
            value = get_detail(labels)
            if value:
                row[column] = value


def _scrape_listing(driver, listing):
    """Build one output row (dict keyed like dubizzle_cars) from a listing card plus its detail page."""
    # Title
    title_tag = listing.find('h2') or listing.find('h3')

    # Location (best-effort)
    loc_tag = listing.find('span', class_=re.compile(r'f7d5e47e')) \
        or listing.find('span', string=re.compile(r',\s*Muscat|AL', re.IGNORECASE))

    # Year
    year_tag = listing.find('span', attrs={'aria-label': re.compile(r'year', re.IGNORECASE)})
    year_span = year_tag.find('span') if year_tag else None

    # Price
    price_tag = listing.find('span', class_=re.compile(r'ddc1b288')) \
        or listing.find('span', string=re.compile(r'OMR|\d'))

    row = {
        'car_title': norm(title_tag.get_text()) if title_tag else "N/A",
        'location': norm(loc_tag.get_text()) if loc_tag else "N/A",
        'year': norm(year_span.get_text()) if year_span else "N/A",
        'price': norm(price_tag.get_text()) if price_tag else "N/A",
        'kilometers': extract_kilometers(listing),
        'transmission_type': extract_transmission(listing),
        'number_of_doors': extract_doors(listing),
        'source': "N/A",
        'model': "N/A",
        'color': "N/A",
        'contact_number': "N/A",
    }

    # Detail URL
    link_tag = listing.find('a', href=True)
    detail_url = None
    if link_tag:
        href = link_tag['href']
        detail_url = BASE_URL + href if href.startswith('/') else href

    # Detail page for more info; a failure here keeps the card-level data.
    if detail_url:
        try:
            _fill_from_detail_page(driver, detail_url, row)
        except Exception as e:
            print(f":warning: Detail fetch failed: {detail_url} -> {e}")

    return row


def _print_extraction_stats(df):
    """Print how many rows have a non-'N/A' value for the key extracted columns."""
    total = len(df)
    if total == 0:
        print("No data to analyze - no listings were scraped.")
        return

    tracked = (
        ("Kilometers", 'kilometers'),
        ("Transmission", 'transmission_type'),
        ("Doors", 'number_of_doors'),
        ("Source", 'source'),
    )
    for label, column in tracked:
        extracted = (df[column] != 'N/A').sum()
        print(f"{label} extracted: {extracted}/{total} ({extracted/total*100:.1f}%)")


def main():
    """Scrape the paginated car listings, then export them to 'dubizzle_cars.csv'.

    Pages are fetched from page 1 up to MAX_PAGES, stopping early at the first
    page with no listings (its HTML is saved to debug_page_<n>.html). Each
    listing is appended to the module-level dubizzle_cars columns. The browser
    is always shut down, even if scraping raises, before the CSV is written
    and extraction statistics are printed.
    """
    driver = make_driver()
    try:
        current_page = 1
        while True:
            url = URL_TEMPLATE.format(current_page)
            print(f"Fetching page {current_page}: {url}")

            soup = get_rendered_soup(driver, url)
            listings = _find_listings(soup)

            if not listings:
                # Write the file first so the message below is accurate.
                with open(f"debug_page_{current_page}.html", "w", encoding="utf-8") as f:
                    f.write(soup.prettify())
                print(f":x: No listings found on page {current_page}. Saved HTML for inspection.")
                break

            print(f":white_check_mark: {len(listings)} listings found")

            for listing in listings:
                row = _scrape_listing(driver, listing)
                doors = row['number_of_doors']

                print(f"Title: {row['car_title'][:40]} | KM: {row['kilometers']} | Trans: {row['transmission_type']} | Doors: {doors} | Source: {row['source']}")

                # Prevent Excel auto-date for values like "4/5"
                if re.match(r'^\d+/\d+$', doors):
                    row['number_of_doors'] = f"'{doors}"

                # Save row
                for column, values in dubizzle_cars.items():
                    values.append(row[column])

            current_page += 1
            if current_page > MAX_PAGES:
                print(":white_check_mark: Reached MAX_PAGES.")
                break

            time.sleep(POLITE_DELAY)
    finally:
        # Always release the browser process, including on errors or Ctrl-C.
        driver.quit()

    # Export
    df = pd.DataFrame(dubizzle_cars)
    df.to_csv("dubizzle_cars.csv", index=False)
    print(f"\n:white_check_mark: Scraping complete. {len(df)} car listings saved to 'dubizzle_cars.csv'")

    _print_extraction_stats(df)

# Run the scraper only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
