In [2]:
# import necessary libraries
from selenium import webdriver
import undetected_chromedriver as uc
import chromedriver_autoinstaller
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import re
import random
import threading

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
vehicleResults = pd.DataFrame(columns=['branch', 'model', 'price','yearOfConstruction','mileStone','transmission','cilinders','prevOwners','link'])
def retryGetElement(func, times = 3, waitTime = 10): 
    count = 0
    accWaitTime = 0
    while count < times:
        try:
            count += 1
            return func()
        except Exception as e:
            print("Execute fail: ", e)
            accWaitTime += waitTime
            driver.refresh()
            time.sleep(accWaitTime)
    print(f'Retry with {times} but still failed') 
    return 'retryFailed'            
        
def extract_price_value(price):
    pattern = r'([\d,]+(?:\.\d+)?)'
    match = re.search(pattern, price)

    if match:
        numeric_value = match.group(1).replace('.', '')
        return numeric_value
    else:
        return '0'

def extract_numeric_value(string):
    # Remove any non-digit characters except the comma
    numeric_string = ''

    for char in string:
        if char.isdigit():
            numeric_string += char
        elif char == ' ':
            break    

    return numeric_string

def extract_year(date):
    # Split the date into month and year parts
    month, year = date.split('/')

    return year
    
def getValueFromKeyFeatureDiv(selector):
    divEle = retryGetElement(lambda: driver.find_element(By.CSS_SELECTOR, selector))
    return retryGetElement(lambda: divEle.find_element(By.CSS_SELECTOR, "div.key-feature__value").text)

def scrapeVehicleData():

    # global vehicleResults

    #Wait page load
    time.sleep(15)
    #Init data
    vehicleData = {}

    #Get branch and model
    branchModelTitle = retryGetElement(lambda: driver.find_element(By.ID, "ad-title").text)
    branchModelSplitArray = branchModelTitle.split()
    branch = branchModelSplitArray[0]
    model = ' '.join(branchModelSplitArray[1:])
    
    #Get price
    price = retryGetElement(lambda: driver.find_element(By.CSS_SELECTOR, "span[data-testid='prime-price']").text)

    #Get year of construction
    yearOfConstruction = retryGetElement(lambda: getValueFromKeyFeatureDiv("div.key-feature--firstRegistration"))

    #Get mileStone
    mileStone = retryGetElement(lambda: getValueFromKeyFeatureDiv("div.key-feature--mileage"))

    #Get transmission
    transmission = retryGetElement(lambda: getValueFromKeyFeatureDiv("div.key-feature--transmission"))

    #Get cilinders
    cilinders = retryGetElement(lambda: driver.find_element(By.ID, "cubicCapacity-v").text)

    #Get prevOwners
    prevOwners = '0'
    try:
        prevOwners = retryGetElement(lambda: getValueFromKeyFeatureDiv("div.key-feature--numberOfPreviousOwners"))
    except:
        print("Vehicle doesn't have preowners")    

    vehicleData["branch"] = branch
    vehicleData["model"] = model
    vehicleData["price"] = extract_price_value(price)
    vehicleData["yearOfConstruction"] = extract_year(yearOfConstruction)
    vehicleData["mileStone"] = extract_numeric_value(mileStone)
    vehicleData["transmission"] = transmission
    vehicleData["cilinders"] = extract_numeric_value(cilinders)
    vehicleData["prevOwners"] = prevOwners
    # vehicleData["link"] = linkHref

    # vehicleResults = pd.concat([vehicleResults, pd.DataFrame([vehicleData])], ignore_index=True)

    print("VehicleData: ", vehicleData)
    return vehicleData

def scrapeFromVehicleLinks():
    global vehicleResults
    time.sleep(10)
    
    # container = retryGetElement(lambda: driver.find_element(By.XPATH, "//article[@data-testid='result-list-container']"))
    # time.sleep(10)

    # Find all <a> tags with href starting with "/fahrzeuge/details.html" inside the container
    links = retryGetElement(lambda: driver.find_elements(By.CSS_SELECTOR, "a[href^='/fahrzeuge/details.html']"))
    print("Begin process new links", links)
    time.sleep(10)
    size = len(links)
    step = 250
    for x in range(size):
        # driver.execute_script(f"window.scrollBy(0,{step})")
        linkHref = links[x].get_attribute("href")
        print("Link: ", linkHref)
        # links[x].click()
        # driver.get(link.get_attribute("href"))

        #Focus to the detail vehicle tab
        # driver.switch_to.window(driver.window_handles[1])

        #Get data
        # print("Scraped data:", scrapeVehicleData(linkHref))

        vehicleData = {}
        vehicleData["link"] = linkHref
        
        vehicleResults = pd.concat([vehicleResults, pd.DataFrame([vehicleData])], ignore_index=True)

        # driver.close()

        # driver.switch_to.window(driver.window_handles[0])

        #Wait page load
        # time.sleep(10)

        # links = retryGetElement(lambda: driver.find_elements(By.CSS_SELECTOR, "a[href^='/fahrzeuge/details.html']"))
        step += 300

def getPageNumberSearchLink(pageNumber):
    return f'https://suchen.mobile.de/fahrzeuge/search.html?cn=DE&door=FOUR_OR_FIVE&ecol=BLACK&ecol=GREY&ecol=SILVER&ecol=WHITE&ft=PETROL&gn=45899%2C+Gelsenkirchen%2C+Nordrhein-Westfalen&isSearchRequest=true&ll=51.5308428%2C7.0328265&ml=%3A200000&p=%3A7500&pageNumber={pageNumber}&rd=100&s=Car&sb=rel&vc=Car'        



# Get all vehicle links and save into the csv first
def getAllVehicleLinks():
    driver.get(getPageNumberSearchLink(pageNum))

    while True:
        try:
            scrapeFromVehicleLinks()
            pageNum += 1
            driver.get(getPageNumberSearchLink(pageNum))
        except Exception as e:
            print("Exception when trying to get next page", e)
            break

    vehicleResults.to_csv('output.csv', index=False)


#Brute force to get detail information of each vehicle
    
def scrollElementRandomly():
    count = 0
    while count < 5:
        try:
            driver.execute_script(f"window.scrollBy(0,{random.randint(100, 1500)})")
            count+=1
            time.sleep(5)
        except Exception as e:
            print("Error while scroll", e)    


# webdriver_manager = ChromeDriverManager().install()


    
    





Execute fail:  Message: no such element: Unable to locate element: {"method":"css selector","selector":"div.key-feature--numberOfPreviousOwners"}
  (Session info: chrome=121.0.6167.86)
Stacktrace:
	GetHandleVerifier [0x008C1673+52979]
	(No symbol) [0x00847961]
	(No symbol) [0x0072DD3D]
	(No symbol) [0x00765FBB]
	(No symbol) [0x007660FB]
	(No symbol) [0x0079CF92]
	(No symbol) [0x00784534]
	(No symbol) [0x0079B3FE]
	(No symbol) [0x00784286]
	(No symbol) [0x0075C063]
	(No symbol) [0x0075CECD]
	GetHandleVerifier [0x00BD8D83+3294723]
	GetHandleVerifier [0x00C16CC2+3548482]
	GetHandleVerifier [0x00C11C9C+3527964]
	GetHandleVerifier [0x0095870E+671630]
	(No symbol) [0x00851EB4]
	(No symbol) [0x0084D808]
	(No symbol) [0x0084D92D]
	(No symbol) [0x0083F7E0]
	BaseThreadInitThunk [0x76A0FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x77997C6E+286]
	RtlGetAppContainerNamedObjectPath [0x77997C3E+238]

Execute fail:  Message: no such element: Unable to locate element: {"method":"css selector","selecto

  outputDf.at[start_index, "branch"] = vehicleData["branch"]
  outputDf.at[start_index, "model"] = vehicleData["model"]
  outputDf.at[start_index, "price"] = vehicleData["price"]
  outputDf.at[start_index, "yearOfConstruction"] = vehicleData["yearOfConstruction"]
  outputDf.at[start_index, "mileStone"] = vehicleData["mileStone"]
  outputDf.at[start_index, "transmission"] = vehicleData["transmission"]
  outputDf.at[start_index, "cilinders"] = vehicleData["cilinders"]
  outputDf.at[start_index, "prevOwners"] = vehicleData["prevOwners"]


VehicleData:  {'branch': 'BMW', 'model': '116', 'price': '7499', 'yearOfConstruction': '2014', 'mileStone': '99999', 'transmission': 'Automatik', 'cilinders': '1598', 'prevOwners': '2'}
The number of row updated 1
VehicleData:  {'branch': 'Volkswagen', 'model': 'up!', 'price': '7490', 'yearOfConstruction': '2015', 'mileStone': '58329', 'transmission': 'Schaltgetriebe', 'cilinders': '999', 'prevOwners': '2'}
The number of row updated 2
VehicleData:  {'branch': 'Volkswagen', 'model': 'Golf', 'price': '5900', 'yearOfConstruction': '2015', 'mileStone': '86076', 'transmission': 'Automatik', 'cilinders': '1395', 'prevOwners': '1'}
The number of row updated 3
Execute fail:  Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="ad-title"]"}
  (Session info: chrome=121.0.6167.86)
Stacktrace:
	GetHandleVerifier [0x008C1673+52979]
	(No symbol) [0x00847961]
	(No symbol) [0x0072DD3D]
	(No symbol) [0x00765FBB]
	(No symbol) [0x007660FB]
	(No symbol) [0x0079CF92

In [3]:
outputDf = pd.read_csv("output.csv")
def fetchData(start, end, name):

    def retryGetElement(func, times = 3, waitTime = 10): 
        count = 0
        accWaitTime = 0
        while count < times:
            try:
                count += 1
                return func()
            except Exception as e:
                print("Execute fail: ", e)
                accWaitTime += waitTime
                driver.refresh()
                time.sleep(accWaitTime)
        print(f'Retry with {times} but still failed') 
        return 'retryFailed'            
        
    def extract_price_value(price):
        pattern = r'([\d,]+(?:\.\d+)?)'
        match = re.search(pattern, price)

        if match:
            numeric_value = match.group(1).replace('.', '')
            return numeric_value
        else:
            return '0'

    def extract_numeric_value(string):
        # Remove any non-digit characters except the comma
        numeric_string = ''

        for char in string:
            if char.isdigit():
                numeric_string += char
            elif char == ' ':
                break    

        return numeric_string

    def extract_year(date):
        # Split the date into month and year parts
        month, year = date.split('/')

        return year
        
    def getValueFromKeyFeatureDiv(selector):
        divEle = retryGetElement(lambda: driver.find_element(By.CSS_SELECTOR, selector))
        return retryGetElement(lambda: divEle.find_element(By.CSS_SELECTOR, "div.key-feature__value").text)

    def scrapeVehicleData():

        # global vehicleResults

        #Wait page load
        time.sleep(15)
        #Init data
        vehicleData = {}

        #Get branch and model
        branchModelTitle = retryGetElement(lambda: driver.find_element(By.ID, "ad-title").text)
        branchModelSplitArray = branchModelTitle.split()
        branch = branchModelSplitArray[0]
        model = ' '.join(branchModelSplitArray[1:])
        
        #Get price
        price = retryGetElement(lambda: driver.find_element(By.CSS_SELECTOR, "span[data-testid='prime-price']").text)

        #Get year of construction
        yearOfConstruction = retryGetElement(lambda: getValueFromKeyFeatureDiv("div.key-feature--firstRegistration"))

        #Get mileStone
        mileStone = retryGetElement(lambda: getValueFromKeyFeatureDiv("div.key-feature--mileage"))

        #Get transmission
        transmission = retryGetElement(lambda: getValueFromKeyFeatureDiv("div.key-feature--transmission"))

        #Get cilinders
        cilinders = retryGetElement(lambda: driver.find_element(By.ID, "cubicCapacity-v").text)

        #Get prevOwners
        prevOwners = '0'
        try:
            prevOwners = retryGetElement(lambda: getValueFromKeyFeatureDiv("div.key-feature--numberOfPreviousOwners"))
        except:
            print("Vehicle doesn't have preowners")    

        vehicleData["branch"] = branch
        vehicleData["model"] = model
        vehicleData["price"] = extract_price_value(price)
        vehicleData["yearOfConstruction"] = extract_year(yearOfConstruction)
        vehicleData["mileStone"] = extract_numeric_value(mileStone)
        vehicleData["transmission"] = transmission
        vehicleData["cilinders"] = extract_numeric_value(cilinders)
        vehicleData["prevOwners"] = prevOwners
        # vehicleData["link"] = linkHref

        # vehicleResults = pd.concat([vehicleResults, pd.DataFrame([vehicleData])], ignore_index=True)

        print("VehicleData: ", vehicleData)
        return vehicleData

    def scrollElementRandomly():
        count = 0
        while count < 1:
            try:
                driver.execute_script(f"window.scrollBy(0,{random.randint(100, 1500)})")
                count+=1
                time.sleep(5)
            except Exception as e:
                print("Error while scroll", e) 

    def initDriver():
        global driver
        options = webdriver.ChromeOptions()
        version_main = int(chromedriver_autoinstaller.get_chrome_version().split(".")[0])
        #driver_executable_path=webdriver_manager
        driver = uc.Chrome(headless=False,use_subprocess=False,version_main=version_main,options=options,driver_executable_path="C:/Users/Hii/Downloads/chromedriver-win64/chromedriver.exe")
        driver.get("https://suchen.mobile.de/?lang=en")

        driver.delete_all_cookies()

        time.sleep(10)

        try:
            acceptBtn = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(By.XPATH, "//button[text()='Accept']"))
            acceptBtn.click() 
        except Exception as e:
            print(e)    

    pageNum = 1

    time.sleep(5)

    #Read current csv
    outputDf = pd.read_csv("output.csv")

    start_index = start

    record = 0

    while start_index < end:
        
            
        for index in range(start_index, end):
            try:
                link = outputDf.at[index, 'link']
                driver.get(link)
                time.sleep(10)
                scrollElementRandomly()
                driver.execute_script(f"window.scrollBy(0,200)")
                vehicleData = scrapeVehicleData()

                if(vehicleData["branch"] == "" or vehicleData["branch"] == "retryFailed"):
                    raise Exception("Can not get branch, retry")

                outputDf.at[index, "branch"] = vehicleData["branch"]
                outputDf.at[index, "model"] = vehicleData["model"]
                outputDf.at[index, "price"] = vehicleData["price"]
                outputDf.at[index, "yearOfConstruction"] = vehicleData["yearOfConstruction"]
                outputDf.at[index, "mileStone"] = vehicleData["mileStone"]
                outputDf.at[index, "transmission"] = vehicleData["transmission"]
                outputDf.at[index, "cilinders"] = vehicleData["cilinders"]
                outputDf.at[index, "prevOwners"] = vehicleData["prevOwners"]

                print(f"The number of row updated {index}")
                record += 1
                if (record == 20):
                    outputDf.to_csv(f'export-progress-{name}.csv', index=False)
                    record = 0
            except Exception as e:
                start_index = index
                print(f'Got stuck at row {index}', e)
                driver.quit()
                time.sleep(300)

thread1 = threading.Thread(target=fetchData, args=(0, 999, 'thread1'))
thread2 = threading.Thread(target=fetchData, args=(1000, len(outputDf), 'thread2'))
thread1.start()
thread2.start()

thread1.join()
thread2.join()


Exception in thread Thread-5:
Traceback (most recent call last):
  File "c:\Users\Hii\AppData\Local\Programs\Python\Python39\lib\threading.py", line 950, in _bootstrap_inner
    self.run()
  File "C:\Users\Hii\AppData\Roaming\Python\Python39\site-packages\ipykernel\ipkernel.py", line 761, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\Hii\AppData\Local\Programs\Python\Python39\lib\threading.py", line 888, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Hii\AppData\Local\Temp\ipykernel_11428\78111194.py", line 140, in fetchData
  File "C:\Users\Hii\AppData\Local\Temp\ipykernel_11428\78111194.py", line 126, in initDriver
  File "c:\Users\Hii\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webelement.py", line 94, in click
    self._execute(Command.CLICK_ELEMENT)
  File "c:\Users\Hii\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webelement.py", line 403, in _execute
    return se

Execute fail:  Message: no such element: Unable to locate element: {"method":"css selector","selector":"div.key-feature--numberOfPreviousOwners"}
  (Session info: chrome=121.0.6167.86)
Stacktrace:
	GetHandleVerifier [0x008C1673+52979]
	(No symbol) [0x00847961]
	(No symbol) [0x0072DD3D]
	(No symbol) [0x00765FBB]
	(No symbol) [0x007660FB]
	(No symbol) [0x0079CF92]
	(No symbol) [0x00784534]
	(No symbol) [0x0079B3FE]
	(No symbol) [0x00784286]
	(No symbol) [0x0075C063]
	(No symbol) [0x0075CECD]
	GetHandleVerifier [0x00BD8D83+3294723]
	GetHandleVerifier [0x00C16CC2+3548482]
	GetHandleVerifier [0x00C11C9C+3527964]
	GetHandleVerifier [0x0095870E+671630]
	(No symbol) [0x00851EB4]
	(No symbol) [0x0084D808]
	(No symbol) [0x0084D92D]
	(No symbol) [0x0083F7E0]
	BaseThreadInitThunk [0x76A0FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x77997C6E+286]
	RtlGetAppContainerNamedObjectPath [0x77997C3E+238]

Execute fail:  Message: no such element: Unable to locate element: {"method":"css selector","selecto

  outputDf.at[index, "branch"] = vehicleData["branch"]
  outputDf.at[index, "model"] = vehicleData["model"]
  outputDf.at[index, "price"] = vehicleData["price"]
  outputDf.at[index, "yearOfConstruction"] = vehicleData["yearOfConstruction"]
  outputDf.at[index, "mileStone"] = vehicleData["mileStone"]
  outputDf.at[index, "transmission"] = vehicleData["transmission"]
  outputDf.at[index, "cilinders"] = vehicleData["cilinders"]
  outputDf.at[index, "prevOwners"] = vehicleData["prevOwners"]


VehicleData:  {'branch': 'Mercedes-Benz', 'model': 'A 160', 'price': '5650', 'yearOfConstruction': '2010', 'mileStone': '144800', 'transmission': 'Schaltgetriebe', 'cilinders': '1498', 'prevOwners': '3'}
The number of row updated 1001
VehicleData:  {'branch': 'Toyota', 'model': 'Yaris', 'price': '6990', 'yearOfConstruction': '2013', 'mileStone': '147300', 'transmission': 'Schaltgetriebe', 'cilinders': '998', 'prevOwners': '2'}
The number of row updated 1002
VehicleData:  {'branch': 'Peugeot', 'model': '3008', 'price': '6900', 'yearOfConstruction': '2010', 'mileStone': '120000', 'transmission': 'Schaltgetriebe', 'cilinders': '1598', 'prevOwners': '2'}
The number of row updated 1003
VehicleData:  {'branch': 'Opel', 'model': 'Zafira', 'price': '6750', 'yearOfConstruction': '2011', 'mileStone': '132864', 'transmission': 'Schaltgetriebe', 'cilinders': '1796', 'prevOwners': '4'}
The number of row updated 1004
Execute fail:  Message: no such element: Unable to locate element: {"method":"css s

In [None]:
outputDf = pd.read_csv("output.csv")

start_index = 0

while start_index < len(outputDf):
    for index in range(start_index, len(outputDf)):
        try:
            link = outputDf.at[index, 'link']
            print("Link" + link)
        except Exception as e:
            start_index = index
            print(f'Got stuck at row {index}', e)
            # outputDf.to_csv(f'export-{index}.csv', index=False)
            time.sleep(20)
            driver.refresh()


In [29]:
# Create ChromeOptions object
options = webdriver.ChromeOptions()
driver = webdriver.Chrome()
driver.get("http://Mobile.de?lang=en")
acceptBtn = driver.find_element(By.XPATH, "//button[text()='Accept']").text

acceptBtn.click()

resultsBtn = driver.find_element(By.XPATH, "//button[@data-testid='qs-submit-button']")
# time.sleep(10)
resultsBtn.click()

resultsBtn.get_attribute("href")

driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.COMMAND + 't') 

