In [None]:
# import necessary libraries
from selenium import webdriver
import undetected_chromedriver as uc
import chromedriver_autoinstaller
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import re
import random
from pywinauto import taskbar
from pywinauto.application import Application
from pywinauto import Desktop
import subprocess
import os

In [None]:
# Constants
DRIVER_EXECUTE_PATH_ENV_NAME="DRIVER_EXECUTE_PATH"
MOBILE_DE_HOMEPAGE_LINK="https://suchen.mobile.de/?lang=en"
VEHICLE_DETAIL_LINK_CSS_SELECTOR="a[href^='/fahrzeuge/details.html']"
ACCEPT_TERM_AND_CONDITION_BTN_PATH_SELECTOR="//button[text()='Accept']"

In [None]:
class Utils:
    # Return a vehicle search link with provided pageNumber
    def getSearchResultWithPageNumber(pageNumber):
        return f"https://suchen.mobile.de/fahrzeuge/search.html?cn=DE&door=FOUR_OR_FIVE&ecol=BLACK&ecol=GREY&ecol=SILVER&ecol=WHITE&ft=PETROL&gn=45899%2C+Gelsenkirchen%2C+Nordrhein-Westfalen&isSearchRequest=true&ll=51.5308428%2C7.0328265&ml=%3A200000&p=%3A7500&pageNumber={pageNumber}&rd=100&s=Car&sb=rel&vc=Car"
    
    def waitInSecondsForLoading(sec):
        time.sleep(sec)

In [None]:
# This class is created to provide utility for:
# + Get all vehicle links
# + Scraping vehicle detail data from each link (TODO)
class MobileDeScrapingEngine:
    pageNumber = 1
    vehiclesDataFrame = pd.DataFrame(columns=['branch','model','price','yearOfConstruction','mileStone','transmission','cilinders','prevOwners','link'])
    def __init__(self) -> None:
        # Init Chrome web driver
        version_main = int(chromedriver_autoinstaller.get_chrome_version().split(".")[0])
        self.driver = uc.Chrome(headless=False,use_subprocess=False,version_main=version_main,driver_executable_path=os.environ.get(DRIVER_EXECUTE_PATH_ENV_NAME))
        self.driver.get(MOBILE_DE_HOMEPAGE_LINK)
        self.driver.delete_all_cookies()

        Utils.waitInSecondsForLoading(10)

        self.acceptTermAndCondition()

    def acceptTermAndCondition(self):
        # Get Accept Term&Conditions button
        acceptBtn = self.driver.find_element(By.XPATH, ACCEPT_TERM_AND_CONDITION_BTN_PATH_SELECTOR)
        acceptBtn.click()   

    # Each vehicle search results will redirect us to first page in the pagination list
    # We will access each page to scraping the data        
    def accessEachPageNumber(self):
        while True:
            try:
                searchLink = Utils.getSearchResultWithPageNumber(self.pageNumber)
                self.driver.get(searchLink)
                Utils.waitInSecondsForLoading(10)
                self.getAllVehicleLinksInPage()
                # Increase pageNumber to continue go to next page
                self.pageNumber += 1
            except Exception as e:
                print("Exception when trying to get next page", e)
                break    

    def getAllVehicleLinksInPage(self):
        # Find all <a> tags with href starting with "/fahrzeuge/details.html" inside the search page
        links = self.retry(lambda: self.driver.find_elements(By.CSS_SELECTOR, VEHICLE_DETAIL_LINK_CSS_SELECTOR))
        for link in links:
            # Get href attribute of link element
            linkHref = link.get_attribute("href")
            print("Link: ", linkHref)

            # Set href value to link column
            vehiclesDataFrame = pd.concat([vehiclesDataFrame, pd.DataFrame([{"link": linkHref}])], ignore_index=True)
    
    def retry(self, func, maxRetryTimes = 3, waitingTime = 10):
        timesHasRetried = 0
        accumulateWaitingTime = 0
        while timesHasRetried < maxRetryTimes:
            try:
                count += 1
                return func()
            except Exception as e:
                print("Execute failed: ", e)
                accumulateWaitingTime += waitingTime
                self.driver.refresh()
                Utils.waitInSecondsForLoading(accumulateWaitingTime)
        print(f'Retry with {timesHasRetried} but still failed') 
        return None        

In [3]:
def fetchData(start, end, name):

    def retryGetElement(func, times = 3, waitTime = 10): 
        count = 0
        accWaitTime = 0
        while count < times:
            try:
                count += 1
                return func()
            except Exception as e:
                print("Execute fail, msg from my code: ", e)
                accWaitTime += waitTime
                driver.refresh()
                time.sleep(accWaitTime)
        print(f'Retry with {times} but still failed') 
        return 'retryFailed'            
        
    def extract_price_value(price):
        pattern = r'([\d,]+(?:\.\d+)?)'
        match = re.search(pattern, price)

        if match:
            numeric_value = match.group(1).replace('.', '')
            return numeric_value
        else:
            return '0'

    def extract_numeric_value(string):
        # Remove any non-digit characters except the comma
        numeric_string = ''

        for char in string:
            if char.isdigit():
                numeric_string += char
            elif char == ' ':
                break    

        return numeric_string

    def extract_year(date):
        # Split the date into month and year parts
        month, year = date.split('/')

        return year
        
    def getValueFromKeyFeatureDiv(selector):
        divEle = driver.find_element(By.CSS_SELECTOR, selector)
        return divEle.find_element(By.CSS_SELECTOR, "div.key-feature__value").text

    def scrapeVehicleData():

        # global vehicleResults

        #Wait page load
        # time.sleep(15)
        #Init data
        vehicleData = {}

        #Get branch and model
        branchModelTitle = retryGetElement(lambda: driver.find_element(By.ID, "ad-title").text)
        branchModelSplitArray = branchModelTitle.split()
        branch = branchModelSplitArray[0]
        model = ' '.join(branchModelSplitArray[1:])

        if(branchModelTitle == "retryFailed"):
            raise Exception("Can not get branch, retry")
        
        #Get price
        price = retryGetElement(lambda: driver.find_element(By.CSS_SELECTOR, "span[data-testid='prime-price']").text)

        #Get year of construction
        yearOfConstruction = retryGetElement(lambda: getValueFromKeyFeatureDiv("div.key-feature--firstRegistration"))

        #Get mileStone
        mileStone = retryGetElement(lambda: getValueFromKeyFeatureDiv("div.key-feature--mileage"))

        #Get transmission
        transmission = retryGetElement(lambda: getValueFromKeyFeatureDiv("div.key-feature--transmission"))

        #Get cilinders
        cilinders = retryGetElement(lambda: driver.find_element(By.ID, "cubicCapacity-v").text)

        #Get prevOwners
        prevOwners = '0'
        try:
            print("Get prevOwners only once")
            prevOwners = getValueFromKeyFeatureDiv("div.key-feature--numberOfPreviousOwners")
        except Exception as e:
            print("Vehicle doesn't have preowners")    

        vehicleData["branch"] = branch
        vehicleData["model"] = model
        vehicleData["price"] = extract_price_value(price)
        vehicleData["yearOfConstruction"] = extract_year(yearOfConstruction)
        vehicleData["mileStone"] = extract_numeric_value(mileStone)
        vehicleData["transmission"] = transmission
        vehicleData["cilinders"] = extract_numeric_value(cilinders)
        vehicleData["prevOwners"] = prevOwners
        # vehicleData["link"] = linkHref

        # vehicleResults = pd.concat([vehicleResults, pd.DataFrame([vehicleData])], ignore_index=True)

        print("VehicleData: ", vehicleData)
        return vehicleData

    def scrollElementRandomly():
        count = 0
        while count < 1:
            try:
                driver.execute_script(f"window.scrollBy(0,{random.randint(100, 1500)})")
                count+=1
                time.sleep(5)
            except Exception as e:
                print("Error while scroll", e) 

    def initDriver():
        global driver
        options = webdriver.ChromeOptions()
        version_main = int(chromedriver_autoinstaller.get_chrome_version().split(".")[0])
        #driver_executable_path=webdriver_manager
        driver = uc.Chrome(headless=False,use_subprocess=False,version_main=version_main,options=options,driver_executable_path="C:/Users/Hii/Downloads/chromedriver-win64/chromedriver.exe")
        driver.get("https://suchen.mobile.de/?lang=en")

        driver.delete_all_cookies()

        time.sleep(10)

        acceptBtn = driver.find_element(By.XPATH, "//button[text()='Accept']")
        acceptBtn.click()

    def openChrome():
        app = Application(backend="uia").connect(title_re=".*Google Chrome")
        chrome_window = app.window(title_re=".*Google Chrome")
        chrome_window.click_input()

    def closeChrome():
        app = Application(backend="uia").connect(title_re=".*Google Chrome")
        chrome_window = app.window(title_re=".*Google Chrome")
        chrome_window.close() 

    def openVsCode():
        app = Application(backend="uia").connect(title_re=".*Visual Studio Code")
        chrome_window = app.window(title_re=".*Visual Studio Code")
        chrome_window.click_input()    

    def switchTab():
        openChrome()
        time.sleep(5)
        openVsCode()
        # openChrome()

         

    pageNum = 1

    # time.sleep(5)

    #Read current csv
    outputDf = pd.read_csv("export-progress-thread1.csv")

    start_index = start

    while start_index < end:
        try:
            initDriver()
        except Exception as e:
            print("Init driver fail", e)
            continue    
        for index in range(start_index, end):
            try:
                link = outputDf.at[index, 'link']
                driver.get(link)
                time.sleep(10)
                # scrollElementRandomly()
                switchTab()
                driver.execute_script(f"window.scrollBy(0,200)")
                vehicleData = scrapeVehicleData()

                if(vehicleData["branch"] == "" or vehicleData["branch"] == "retryFailed"):
                    raise Exception("Can not get branch, retry")

                outputDf.at[index, "branch"] = vehicleData["branch"]
                outputDf.at[index, "model"] = vehicleData["model"]
                outputDf.at[index, "price"] = vehicleData["price"]
                outputDf.at[index, "yearOfConstruction"] = vehicleData["yearOfConstruction"]
                outputDf.at[index, "mileStone"] = vehicleData["mileStone"]
                outputDf.at[index, "transmission"] = vehicleData["transmission"]
                outputDf.at[index, "cilinders"] = vehicleData["cilinders"]
                outputDf.at[index, "prevOwners"] = vehicleData["prevOwners"]

                print(f"The number of row updated {index}")
                outputDf.to_csv(f'export-progress-{name}.csv', index=False)
            except Exception as e:
                start_index = index + 1
                print(f'Got stuck at row {index}', e)
                driver.quit()
                subprocess.call("TASKKILL /f  /IM  CHROMEDRIVER.EXE")
                subprocess.call("TASKKILL /f  /IM  CHROME.EXE")
                # wait CPU reset
                time.sleep(10)
                break

fetchData(268, 1000, 'thread1')


Get prevOwners only once
VehicleData:  {'branch': 'Mercedes-Benz', 'model': 'B 160', 'price': '4250', 'yearOfConstruction': '2010', 'mileStone': '261687', 'transmission': 'Schaltgetriebe', 'cilinders': '1498', 'prevOwners': '2'}
The number of row updated 242


  outputDf.at[index, "price"] = vehicleData["price"]
  outputDf.at[index, "yearOfConstruction"] = vehicleData["yearOfConstruction"]
  outputDf.at[index, "mileStone"] = vehicleData["mileStone"]
  outputDf.at[index, "cilinders"] = vehicleData["cilinders"]


KeyboardInterrupt: 

In [None]:
outputDf = pd.read_csv("output.csv")

start_index = 0

while start_index < len(outputDf):
    for index in range(start_index, len(outputDf)):
        try:
            link = outputDf.at[index, 'link']
            print("Link" + link)
        except Exception as e:
            start_index = index
            print(f'Got stuck at row {index}', e)
            # outputDf.to_csv(f'export-{index}.csv', index=False)
            time.sleep(20)
            driver.refresh()


In [29]:
# Create ChromeOptions object
options = webdriver.ChromeOptions()
driver = webdriver.Chrome()
driver.get("http://Mobile.de?lang=en")
acceptBtn = driver.find_element(By.XPATH, "//button[text()='Accept']").text

acceptBtn.click()

resultsBtn = driver.find_element(By.XPATH, "//button[@data-testid='qs-submit-button']")
# time.sleep(10)
resultsBtn.click()

resultsBtn.get_attribute("href")

driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.COMMAND + 't') 

