In [1]:
"""
Dubizzle Oman car listings scraper using Selenium.

Install:
    pip install selenium webdriver-manager
Optional:
    pip install undetected-chromedriver
"""

import argparse
import csv
import json
import os
import re
import time
from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List, Optional

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Optional helpers (best-effort): each import is attempted once and the
# outcome is recorded in a module-level flag that get_driver() consults.
try:
    from webdriver_manager.chrome import ChromeDriverManager
    # When True, get_driver() obtains the chromedriver path via ChromeDriverManager.
    _USE_WDM = True
except Exception:
    _USE_WDM = False

try:
    import undetected_chromedriver as uc  # type: ignore
    # When True, get_driver() returns a uc.Chrome instance instead of webdriver.Chrome.
    _USE_UC = True
except Exception:
    _USE_UC = False

# Safe base directory for both scripts and notebooks:
# __file__ is not defined in a notebook cell, so the NameError branch falls
# back to the current working directory.
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = os.getcwd()

# The User-Agent value is passed to Chrome as a --user-agent flag in get_driver().
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0 Safari/537.36"
    )
}

def _is_notebook() -> bool:
    """Report whether the active IPython shell is a ZMQ (kernel-backed) shell.

    Returns False when IPython cannot be imported, when no interactive shell
    is active, or when probing the shell fails for any reason.
    """
    try:
        from IPython import get_ipython  # type: ignore

        shell = get_ipython()
        # No active shell yields an empty name, which can never match below.
        shell_name = "" if shell is None else type(shell).__name__
    except Exception:
        return False
    return "ZMQInteractiveShell" in shell_name


@dataclass
class CarAd:
    """Representation of a single car advertisement.

    Only ``ad_id`` is required; every other field defaults to ``None`` (or an
    empty list for ``extra_features``) so partially scraped ads can still be
    represented.
    """
    ad_id: str
    title: Optional[str] = None
    location: Optional[str] = None
    price: Optional[int] = None
    year: Optional[int] = None
    mileage: Optional[int] = None
    condition: Optional[str] = None
    fuel_type: Optional[str] = None
    transmission: Optional[str] = None
    make: Optional[str] = None
    model: Optional[str] = None
    version: Optional[str] = None
    color: Optional[str] = None
    doors: Optional[str] = None
    seats: Optional[int] = None
    interior: Optional[str] = None
    warranty: Optional[str] = None
    owners: Optional[int] = None
    body_type: Optional[str] = None
    payment_option: Optional[str] = None
    source: Optional[str] = None
    extra_features: List[str] = field(default_factory=list)

    @classmethod
    def from_data_layer(cls, data: Dict[str, Any]) -> "CarAd":
        """Construct a CarAd instance from the dataLayer dictionary.

        Args:
            data: Mapping of dataLayer keys (``ad_id``, ``ad_title``,
                ``price`` ...) to raw values. Missing keys are tolerated.

        Returns:
            A CarAd with numeric fields coerced to ``int`` where possible.
            ``location`` and ``condition`` are not read from ``data`` and are
            left at their defaults.
        """
        def to_int(x: Any) -> Optional[int]:
            """Coerce ``x`` to int, returning None when no number is present."""
            if x is None:
                return None
            try:
                return int(x)
            except (TypeError, ValueError, OverflowError):
                pass
            # Decimal strings such as "6000.0" must be read as 6000; merely
            # stripping non-digits would turn them into 60000.
            text = re.sub(r"[,\s]", "", str(x))
            try:
                return int(float(text))
            except (ValueError, OverflowError):
                pass
            # Fall back to the digits embedded in free text ("190,000 km").
            digits = re.sub(r"[^\d]", "", text)
            return int(digits) if digits else None

        raw_id = data.get("ad_id")
        # An explicit null must not become the literal string "None".
        ad_id = "" if raw_id is None else str(raw_id)

        raw_features = data.get("extra_features") or []
        if isinstance(raw_features, (str, bytes)):
            # A lone string is one feature, not a sequence of characters.
            raw_features = [raw_features]
        try:
            features = [str(feat) for feat in raw_features]
        except TypeError:
            # Non-iterable scalar: keep it as a single feature.
            features = [str(raw_features)]

        # has_warranty arrives as a code: 1 means yes, 2 means no.
        warranty_flag = str(data.get("has_warranty"))
        warranty = {"1": "Yes", "2": "No"}.get(warranty_flag)

        return cls(
            ad_id=ad_id,
            title=data.get("ad_title"),
            price=to_int(data.get("price")),
            year=to_int(data.get("year")),
            mileage=to_int(data.get("mileage")),
            make=data.get("make"),
            model=data.get("model"),
            version=data.get("version"),
            seats=to_int(data.get("seats")),
            owners=to_int(data.get("owners")),
            # NOTE(review): the "petrol" key is preferred over "fuel_type";
            # presumably that is where the site exposes the value - confirm.
            fuel_type=data.get("petrol") or data.get("fuel_type"),
            transmission=data.get("transmission"),
            color=data.get("color"),
            doors=data.get("doors"),
            interior=data.get("interior"),
            warranty=warranty,
            body_type=data.get("body_type"),
            payment_option=data.get("payment_option"),
            source=data.get("source"),
            extra_features=features,
        )


def get_driver(headless: bool = True) -> webdriver.Chrome:
    """Build a Chrome WebDriver configured for scraping.

    When undetected-chromedriver was importable it is used directly; otherwise
    a regular Selenium Chrome driver is created (through webdriver-manager if
    that was importable) and the ``navigator.webdriver`` property is masked.

    Args:
        headless: Add the ``--headless=new`` flag when True.

    Returns:
        The started driver instance.
    """
    # Flags common to both driver flavours, in the order they are applied.
    shared_flags: List[str] = ["--headless=new"] if headless else []
    shared_flags += [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        f"--user-agent={DEFAULT_HEADERS['User-Agent']}",
    ]

    if _USE_UC:
        stealth_opts = uc.ChromeOptions()
        for flag in shared_flags:
            stealth_opts.add_argument(flag)
        stealth_opts.add_argument("--disable-blink-features=AutomationControlled")
        return uc.Chrome(options=stealth_opts)  # type: ignore

    chrome_opts = Options()
    for flag in shared_flags:
        chrome_opts.add_argument(flag)

    if not _USE_WDM:
        browser = webdriver.Chrome(options=chrome_opts)
    else:
        chromedriver_path = ChromeDriverManager().install()
        browser = webdriver.Chrome(service=Service(chromedriver_path), options=chrome_opts)

    # Light evasion: hide navigator.webdriver on every new document (best-effort).
    hide_webdriver_js = "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
    try:
        browser.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": hide_webdriver_js},
        )
    except Exception:
        pass
    return browser


def extract_ad_links_from_page(driver: webdriver.Chrome) -> List[str]:
    """Return the unique listing URLs on the current results page.

    Anchors whose ``href`` contains ``-ID`` are collected; anchors without an
    ``href`` are ignored. The first occurrence of each URL is kept, in page
    order.
    """
    anchors = driver.find_elements(By.CSS_SELECTOR, "a[href*='-ID']")
    hrefs = (anchor.get_attribute("href") for anchor in anchors)
    # dict keys keep insertion order, giving an order-preserving de-dupe.
    return list(dict.fromkeys(href for href in hrefs if href))


def resolve_ad_url(ad_id: str) -> str:
    """Return the slug-less listing URL for the given ad id."""
    listing_url = f"https://www.dubizzle.com.om/en/ad/ID{ad_id}.html"
    return listing_url


def _assign_kv_to_car(car: "CarAd", label: str, value: str) -> None:
    """Apply one label/value pair from an on-page spec block to ``car``.

    Spec blocks look like:
    <div class="_9a8eacd9"><span>Model</span><span>Sportage</span></div>

    Known labels (matched case-insensitively after trimming) set the
    corresponding attribute; the numeric ones are reduced to their digits.
    Unknown labels with a non-empty value are appended to
    ``car.extra_features`` as ``"Label: value"``.

    An empty value, or a numeric value containing no digits, leaves the
    attribute untouched, so data gathered earlier from a richer source is not
    wiped out by a blank span.

    Args:
        car: Object to update in place.
        label: Text of the first span (the field name).
        value: Text of the second span (the field value).
    """
    clean_label = (label or "").strip()
    key = clean_label.lower()
    val = (value or "").strip()

    label_map = {
        "model": "model",
        "version": "version",
        "payment options": "payment_option",
        "payment option": "payment_option",
        "make": "make",
        "color": "color",
        "doors": "doors",
        "seats": "seats",
        "warranty": "warranty",
        "body type": "body_type",
        "fuel type": "fuel_type",
        "transmission": "transmission",
        "interior": "interior",
        "owners": "owners",
        "year": "year",
        "mileage": "mileage",
        "location": "location",
        "condition": "condition",
        "price": "price",
    }
    numeric_attrs = {"seats", "owners", "year", "mileage", "price"}

    attr = label_map.get(key)
    if attr is None:
        if clean_label and val:
            car.extra_features.append(f"{clean_label}: {val}")
        return

    if not val:
        # Nothing usable on the page; keep whatever the car already holds.
        return

    if attr in numeric_attrs:
        digits = re.sub(r"[^\d]", "", val)
        if digits:
            setattr(car, attr, int(digits))
        return

    setattr(car, attr, val)


# Captures the body of every <script type="application/ld+json"> tag.
_LD_JSON_SCRIPT_RE = re.compile(
    r'<script[^>]+type="application/ld\+json"[^>]*>(.*?)</script>',
    re.DOTALL | re.IGNORECASE,
)


def _json_ld_int(value: Any) -> Optional[int]:
    """Coerce a JSON-LD field to int, accepting decimal strings like "6000.0"."""
    if value is None:
        return None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        pass
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return None


def _enrich_from_kv_blocks(driver: webdriver.Chrome, car: CarAd) -> bool:
    """Feed every on-page label/value spec block into ``car``.

    Returns True when the page was walked without error, False otherwise.
    """
    try:
        for blk in driver.find_elements(By.CSS_SELECTOR, "div._9a8eacd9"):
            spans = blk.find_elements(By.TAG_NAME, "span")
            if len(spans) >= 2:
                _assign_kv_to_car(car, spans[0].text, spans[1].text)
    except Exception:
        # Deliberately best-effort: enrichment must never abort a listing.
        return False
    return True


def _fill_title_from_h1(driver: webdriver.Chrome, car: CarAd) -> None:
    """Use the page's first <h1> as the title when the car has none."""
    if car.title:
        return
    try:
        car.title = driver.find_element(By.TAG_NAME, "h1").text.strip() or None
    except Exception:
        pass


def _car_from_json_ld(obj: Dict[str, Any], ad_id: str) -> CarAd:
    """Build a CarAd from a JSON-LD ``Product`` object."""
    year = _json_ld_int(obj["modelDate"]) if obj.get("modelDate") else None

    odometer = obj.get("mileageFromOdometer")
    mileage = _json_ld_int(odometer.get("value")) if isinstance(odometer, dict) else None

    offers = obj.get("offers")
    price = None
    if isinstance(offers, dict) and offers.get("price"):
        price = _json_ld_int(offers["price"])

    # "brand" is either an object with a name or a bare string.
    brand = obj.get("brand")
    if isinstance(brand, dict):
        make = brand.get("name")
    elif isinstance(brand, str):
        make = brand
    else:
        make = None

    return CarAd(
        ad_id=ad_id,
        title=obj.get("name"),
        make=make,
        model=obj.get("model"),
        year=year,
        mileage=mileage,
        fuel_type=obj.get("fuelType"),
        price=price,
    )


def parse_listing(driver: webdriver.Chrome, ad_url: str) -> Optional[CarAd]:
    """
    Navigate to a listing page and extract detailed information.
    Tries window.dataLayer first, then JSON-LD, then on-page key/value blocks.

    Args:
        driver: Driver used to load and inspect the page.
        ad_url: Listing URL; the digits after ``ID`` are used as the ad id
            whenever the page data does not provide one.

    Returns:
        The parsed CarAd, or None when navigation fails or no source yields
        any identifying data.
    """
    try:
        driver.get(ad_url)
    except Exception:
        return None

    # Wait a bit for dynamic content
    try:
        WebDriverWait(driver, 8).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "body"))
        )
    except Exception:
        pass

    id_match = re.search(r"ID(\d+)", ad_url)
    url_ad_id = id_match.group(1) if id_match else ""

    # 1) window.dataLayer
    try:
        data_layer = driver.execute_script(
            "return (window.dataLayer && window.dataLayer[0]) || null;"
        )
    except Exception:
        data_layer = None

    car: Optional[CarAd] = None
    if isinstance(data_layer, dict):
        try:
            car = CarAd.from_data_layer(data_layer)
        except (TypeError, ValueError):
            # Malformed dataLayer payload: fall through to the other sources
            # instead of aborting the whole scrape.
            car = None

    if car is not None:
        # The dataLayer entry may lack an id even though the URL carries it.
        if car.ad_id in ("", "None"):
            car.ad_id = url_ad_id
        _fill_title_from_h1(driver, car)
        # Location fallback
        if not car.location:
            try:
                loc = driver.find_element(
                    By.CSS_SELECTOR,
                    "[data-testid*='location'], a[href*='/locations/'], span[aria-label*='location' i]"
                )
                car.location = loc.text.strip() or None
            except Exception:
                pass
        _enrich_from_kv_blocks(driver, car)
        return car

    # 2) JSON-LD fallback
    html = driver.page_source
    for match in _LD_JSON_SCRIPT_RE.finditer(html):
        try:
            data = json.loads(match.group(1).strip())
        except Exception:
            continue

        for obj in (data if isinstance(data, list) else [data]):
            if not isinstance(obj, dict) or obj.get("@type") != "Product":
                continue
            car = _car_from_json_ld(obj, url_ad_id)
            _enrich_from_kv_blocks(driver, car)
            _fill_title_from_h1(driver, car)
            return car

    # 3) Last resort: KV blocks only
    car = CarAd(ad_id=url_ad_id)
    if not _enrich_from_kv_blocks(driver, car):
        return None
    # Only accept the ad when the spec blocks identified the vehicle at all.
    if any([car.model, car.version, car.payment_option, car.make]):
        _fill_title_from_h1(driver, car)
        return car

    return None


def scrape_cars(driver: webdriver.Chrome, delay: float = 1.0, max_pages: int = 199) -> List[CarAd]:
    """Iterate over result pages and collect car adverts.

    Pages are fetched in order starting at 1. The crawl stops after
    ``max_pages`` pages, after two consecutive pages without any listing
    link, or when a results page cannot be loaded. A listing that is linked
    from more than one results page is parsed only once.

    Args:
        driver: Driver used for all navigation.
        delay: Seconds to sleep between result pages.
        max_pages: Highest page number to request (default 199).

    Returns:
        The successfully parsed ads, in the order they were encountered.
    """
    all_ads: List[CarAd] = []
    seen_keys = set()  # ad ids (or raw URLs) already visited on earlier pages
    base_url = "https://www.dubizzle.com.om/en/vehicles/cars-for-sale/"
    empty_hits = 0

    for page in range(1, max_pages + 1):
        page_url = base_url if page == 1 else f"{base_url}?page={page}"
        print(f"[INFO] Fetching page {page}: {page_url}")
        try:
            driver.get(page_url)
        except Exception as exc:
            print(f"[WARN] Could not navigate to {page_url}: {exc}")
            break

        try:
            WebDriverWait(driver, 8).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "body"))
            )
        except Exception:
            pass

        time.sleep(1.5)  # allow lazy content
        ad_links = extract_ad_links_from_page(driver)

        if not ad_links:
            empty_hits += 1
            print(f"[INFO] No advertisement links found on page {page}. ({empty_hits}x)")
            if empty_hits >= 2:
                print("[INFO] Stopping due to consecutive empty pages.")
                break
            time.sleep(delay)
            continue

        empty_hits = 0
        print(f"[INFO] Page {page}: found {len(ad_links)} ads")

        for ad_url in ad_links:
            id_match = re.search(r"ID(\d+)", ad_url)
            # Key on the ad id so URL variations of one ad still collapse.
            key = id_match.group(1) if id_match else ad_url
            if key in seen_keys:
                continue
            seen_keys.add(key)

            ad = parse_listing(driver, ad_url)
            if ad:
                all_ads.append(ad)
                print(f"[INFO] Parsed ad {ad.ad_id or 'UNKNOWN'}: {ad.title}")
            else:
                print(f"[WARN] Failed to parse ad {key}")

            time.sleep(0.6)  # be gentle

        time.sleep(delay)

    return all_ads


def save_to_csv(cars: List["CarAd"], output_path: str) -> None:
    """Save a list of CarAd objects to a CSV file.

    The header is taken from the fields of the first item. Nothing is written
    (only a warning is printed) when ``cars`` is empty.

    Args:
        cars: Dataclass instances to serialise, one row each.
        output_path: Destination file. Missing parent directories are
            created; a bare filename is written to the current directory.
    """
    if not cars:
        print("[WARN] No car data to write.")
        return

    rows = [asdict(car) for car in cars]

    # os.makedirs("") raises FileNotFoundError, so only create a parent
    # directory when the path actually names one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0]))
        writer.writeheader()
        writer.writerows(rows)
    print(f"[INFO] Wrote {len(cars)} ads to {output_path}")


def run_in_notebook(output: Optional[str] = None, delay: float = 1.0, headless: bool = True) -> str:
    """Convenience runner for Jupyter/REPL.

    Args:
        output: Destination CSV path. Defaults to
            ``ml-test/data/cars.csv`` under ``BASE_DIR``.
        delay: Seconds to sleep between result pages.
        headless: Run Chrome without a visible window when True.

    Returns:
        The path the CSV was written to.
    """
    if output is None:
        output = os.path.join(BASE_DIR, "ml-test", "data", "cars.csv")
    if not os.path.dirname(output):
        # A bare filename has an empty dirname and os.makedirs("") raises
        # FileNotFoundError; anchor it to the current directory explicitly.
        output = os.path.join(os.curdir, output)
    os.makedirs(os.path.dirname(output), exist_ok=True)

    driver = get_driver(headless=headless)
    try:
        cars = scrape_cars(driver, delay=delay)
    finally:
        # Always release the browser, even if scraping raised.
        driver.quit()
    save_to_csv(cars, output)
    return output


def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point.

    Arguments are parsed with ``parse_known_args`` so that unrecognised
    options (such as those injected by Jupyter) are ignored. When ``argv`` is
    None an empty argument list is parsed, i.e. all defaults apply.
    """
    default_csv = os.path.join(BASE_DIR, "ml-test", "data", "cars.csv")

    cli = argparse.ArgumentParser(description="Scrape Dubizzle Oman car adverts using Selenium")
    cli.add_argument(
        "--output",
        default=default_csv,
        type=str,
        help="Path to output CSV file (default: ml-test/data/cars.csv relative to this script or CWD)",
    )
    cli.add_argument(
        "--delay",
        default=1.0,
        type=float,
        help="Delay in seconds between result page navigations (default: 1.0)",
    )
    # --headless and --ui write to the same destination; headless is the default.
    cli.add_argument(
        "--headless",
        action="store_true",
        dest="headless",
        default=True,
        help="Run in headless mode (default)",
    )
    cli.add_argument(
        "--ui",
        action="store_false",
        dest="headless",
        help="Run with a visible browser window",
    )

    options, _ignored = cli.parse_known_args([] if argv is None else argv)

    browser = get_driver(headless=options.headless)
    try:
        scraped = scrape_cars(browser, delay=options.delay)
    finally:
        browser.quit()
    save_to_csv(scraped, options.output)


# --- Entry point shared by script and notebook execution ---
if __name__ == "__main__":
    if not _is_notebook():
        # Plain script: forward the real command-line arguments.
        import sys
        main(sys.argv[1:])
    else:
        # Notebook: bypass argparse and run with the defaults.
        out_path = run_in_notebook(headless=True, delay=1.0)
        print("Saved to:", out_path)


[INFO] Fetching page 1: https://www.dubizzle.com.om/en/vehicles/cars-for-sale/
[INFO] Page 1: found 34 ads
[INFO] Parsed ad 130597916: CHERY ARRIZO 6 PRO 1.5T+CVT FLAGSHIP - MY:2024 - UNUSED
[INFO] Parsed ad 130597996: SUZUKI GRAND VITARA 1.6 5DR : MY 2020
[INFO] Parsed ad 130617836: Jeep GRAND CHEROKEE LTD - MY: 2015
[INFO] Parsed ad 129976145: Porsche - Cayenne 2019
[INFO] Parsed ad 129978253: Porsche Taycan 4S 2021
[INFO] Parsed ad 130472674: تويوتا لاند كروزر بيك أب 2025
[INFO] Parsed ad 130531373: Land Rover Range Rover Velar 2022
[INFO] Parsed ad 130580052: تويوتا راف فور LE  موديل 2022 مع ضمان 3 سنوات
[INFO] Parsed ad 130570159: مرسيدس S450 موديل 2019 مع ضمان 3 سنوات
[INFO] Parsed ad 130285029: مرسيدس c300  2023 ماشي 1400 كيلو
[INFO] Parsed ad 129879386: Nissan Sentra 2020
[INFO] Parsed ad 130339949: لاند روفر رينج روڤر ڤيلار 2018 صبغ الوكالة بدون حوادث خليجي
[INFO] Parsed ad 130190937: تويوتا كورولا 2023 خليجي زيرو كيلو متر
[INFO] Parsed ad 130581640: شيفروليه سلفرادو 2022
[INF

In [2]:
# Load the scraped listings into a DataFrame for inspection.
import pandas as pd

# NOTE(review): hard-coded absolute Windows path; presumably the CSV written by
# the scraper cell above (same ml-test/data/cars.csv suffix) - confirm before
# running on another machine.
path = r"C:\Users\bbuser\Desktop\driver\ml-test\data\cars.csv"
df = pd.read_csv(path)

In [3]:
df.head(10)

Unnamed: 0,ad_id,title,location,price,year,mileage,condition,fuel_type,transmission,make,...,color,doors,seats,interior,warranty,owners,body_type,payment_option,source,extra_features
0,130597916.0,CHERY ARRIZO 6 PRO 1.5T+CVT FLAGSHIP - MY:2024...,"Ruwi, Muscat",6000.0,2024.0,100.0,,3.0,2.0,134,...,Grey,4-5,5.0,Part Leather,No,1.0,Sedan,Cash,gcc,"['3', '1', '21', '17', '27', '20', '18', '19',..."
1,130597996.0,SUZUKI GRAND VITARA 1.6 5DR : MY 2020,"Ruwi, Muscat",3300.0,2020.0,190000.0,,3.0,2.0,36,...,White,4-5,5.0,,No,1.0,SUV,Cash,gcc,"['17', '19', '20', '3', '1', '8', 'Brand: Suzu..."
2,130617836.0,Jeep GRAND CHEROKEE LTD - MY: 2015,"Ruwi, Muscat",4100.0,2015.0,169957.0,,3.0,2.0,64,...,Red,4-5,5.0,Part Leather,No,1.0,SUV,Cash,gcc,"['17', '20', '19', '18', '3', '1', '28', '26',..."
3,129976145.0,Porsche - Cayenne 2019,"Al Ghubrah, Muscat",28000.0,2019.0,58000.0,,3.0,2.0,30,...,Purple,4-5,5.0,Full Leather,No,1.0,SUV,Installments,company,"['Brand: Porsche', 'Power (hp): 340', 'Air Con..."
4,129978253.0,Porsche Taycan 4S 2021,"Al Ghubrah, Muscat",39900.0,2021.0,43000.0,,2.0,2.0,30,...,White,4-5,5.0,Full Leather,No,,Sedan,Installments,company,"['Brand: Porsche', 'Air Conditioning: Automati..."
5,130472674.0,تويوتا لاند كروزر بيك أب 2025,"Al Masn'a, Al Batinah",20800.0,2025.0,0.0,,1.0,1.0,65,...,Beige,4-5,,Cloth,No,1.0,Pickup,Cash,gcc,"['Brand: Toyota', 'Power (hp): 201', 'Consumpt..."
6,130531373.0,Land Rover Range Rover Velar 2022,"Al Khoud, Muscat",19400.0,2022.0,32000.0,,3.0,2.0,77,...,Other,4-5,6.0,Full Leather,No,1.0,SUV,Cash,us,"['34', '36', '17', '19', '20', '27', '11', '18..."
7,130580052.0,تويوتا راف فور LE موديل 2022 مع ضمان 3 سنوات,"Al Seeb, Muscat",8200.0,2022.0,36000.0,,3.0,2.0,65,...,White,4-5,5.0,Velour,Yes,,SUV,Cash,us,"['36', '17', '19', '11', '18', '25', '1', '3',..."
8,130570159.0,مرسيدس S450 موديل 2019 مع ضمان 3 سنوات,"Al Khaboura, Al Batinah",16200.0,2019.0,67000.0,,3.0,2.0,33,...,Black,4-5,5.0,Full Leather,Yes,,Sedan,Cash,us,"['36', '17', '19', '20', '11', '25', '1', '3',..."
9,130285029.0,مرسيدس c300 2023 ماشي 1400 كيلو,"Al Maabilah, Muscat",16600.0,2023.0,1400.0,,3.0,2.0,33,...,White,,,,Yes,,Sedan,Installments,us,"['Brand: Mercedes-Benz', 'Source: US']"


In [4]:
df.shape

(6732, 22)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6732 entries, 0 to 6731
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ad_id           6731 non-null   float64
 1   title           6732 non-null   object 
 2   location        6706 non-null   object 
 3   price           6709 non-null   float64
 4   year            6709 non-null   float64
 5   mileage         6709 non-null   float64
 6   condition       0 non-null      float64
 7   fuel_type       6709 non-null   float64
 8   transmission    6709 non-null   float64
 9   make            6709 non-null   object 
 10  model           6731 non-null   object 
 11  version         3820 non-null   object 
 12  color           6731 non-null   object 
 13  doors           5144 non-null   object 
 14  seats           4617 non-null   float64
 15  interior        5399 non-null   object 
 16  warranty        6731 non-null   object 
 17  owners          3685 non-null   f