In [None]:
# import necessary libraries
from selenium import webdriver
import undetected_chromedriver as uc
import chromedriver_autoinstaller
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import re
import random
from pywinauto import taskbar
from pywinauto.application import Application
from pywinauto import Desktop
import subprocess
import os

In [None]:
# Constants

# Name of the environment variable read to locate the chromedriver executable
DRIVER_EXECUTE_PATH_ENV_NAME = "DRIVER_EXECUTE_PATH"

# Landing page opened right after the browser starts (English UI)
MOBILE_DE_HOMEPAGE_LINK = "https://suchen.mobile.de/?lang=en"

# CSS selector matching every anchor whose href starts with the vehicle detail path
VEHICLE_DETAIL_LINK_CSS_SELECTOR = "a[href^='/fahrzeuge/details.html']"

# XPath of the button whose text is exactly "Accept" (consent dialog)
ACCEPT_TERM_AND_CONDITION_BTN_PATH_SELECTOR = "//button[text()='Accept']"

In [None]:
class Utils:
    """Stateless helpers: search URL building, text parsing and window focusing.

    All methods are static, so they can be called as ``Utils.method(...)``
    (the way the rest of the notebook uses them) or on an instance.
    """

    @staticmethod
    def getSearchResultWithPageNumber(pageNumber):
        """Return the vehicle search link for the provided page number.

        Args:
            pageNumber: value interpolated into the ``pageNumber`` query parameter.

        Returns:
            The full search URL as a string.
        """
        return f"https://suchen.mobile.de/fahrzeuge/search.html?cn=DE&door=FOUR_OR_FIVE&ecol=BLACK&ecol=GREY&ecol=SILVER&ecol=WHITE&ft=PETROL&gn=45899%2C+Gelsenkirchen%2C+Nordrhein-Westfalen&isSearchRequest=true&ll=51.5308428%2C7.0328265&ml=%3A200000&p=%3A7500&pageNumber={pageNumber}&rd=100&s=Car&sb=rel&vc=Car"

    @staticmethod
    def waitInSecondsForLoading(sec):
        """Block the current thread for ``sec`` seconds."""
        time.sleep(sec)

    @staticmethod
    def extractPriceValue(price):
        """Extract the whole-number part of a price as a string of digits.

        The first numeric token of ``price`` is taken; both ``.`` and ``,`` are
        treated as grouping separators and removed, so ``"€7,500"`` and
        ``"7.500 €"`` both give ``"7500"``. A trailing fraction of one or two
        digits (``",00"`` / ``".50"``) is dropped.

        Args:
            price: raw price text; may be ``None`` or empty.

        Returns:
            The digits of the price, or ``'0'`` when no number can be found.
        """
        if not price:
            return '0'

        match = re.search(r'\d[\d.,]*', price)
        if not match:
            return '0'

        # A separator at the very end is punctuation, not part of the number
        token = match.group(0).rstrip('.,')
        # A separator followed by 1-2 digits at the end is a decimal fraction
        token = re.sub(r'[.,]\d{1,2}$', '', token)
        # Whatever separators are left are thousands separators
        return re.sub(r'[.,]', '', token)

    @staticmethod
    def extractNumericValue(string):
        """Collect the digits of the first space-delimited token of ``string``.

        Non-digit characters (such as ``,`` or ``.``) inside that token are
        skipped; scanning stops at the first space, so a unit suffix such as
        ``" km"`` is ignored. Leading/trailing whitespace is stripped first.

        Args:
            string: raw text such as ``"125,000 km"``; may be ``None`` or empty.

        Returns:
            The collected digits, or ``''`` when there are none.
        """
        if not string:
            return ''

        digits = []
        for char in string.strip():
            if char == ' ':
                break
            if char.isdigit():
                digits.append(char)

        return ''.join(digits)

    @staticmethod
    def extractYear(date):
        """Return the part of ``date`` after its last ``/``.

        ``"05/2015"`` gives ``"2015"``; a value without any ``/`` is returned
        as-is (stripped) instead of raising.

        Args:
            date: raw date text; may be ``None`` or empty.

        Returns:
            The trailing component as a string, or ``''`` for missing input.
        """
        if not date:
            return ''

        return date.rsplit('/', 1)[-1].strip()

    @staticmethod
    def _findAppWindow(appName):
        """Connect to a running application and return its window.

        ``appName`` is interpolated into a regular expression that the window
        title must match (``.*<appName>``), so regex metacharacters in it are
        interpreted as such.
        """
        title_pattern = f".*{appName}"
        app = Application(backend="uia").connect(title_re=title_pattern)
        return app.window(title_re=title_pattern)

    @staticmethod
    def openTaskBarApp(appName):
        """Click on the window whose title matches ``appName``."""
        Utils._findAppWindow(appName).click_input()

    @staticmethod
    def closeTaskBarApp(appName):
        """Close the window whose title matches ``appName``."""
        Utils._findAppWindow(appName).close()

    @staticmethod
    def actLikeHumans():
        """Switch focus between Chrome and Visual Studio Code via pywinauto.

        Per the original author's note, this is meant to help bypass Akamai
        bot detection. No window is closed; they are only clicked.
        """
        Utils.openTaskBarApp("Google Chrome")
        # Wait a little bit before switching, to act like a human
        time.sleep(5)
        Utils.openTaskBarApp("Visual Studio Code")


In [None]:
# This class provides utilities to:
# + Get all vehicle links
# + Scrape vehicle detail data from each link
class MobileDeScrapingEngine:
    """Drive a Chrome session to collect vehicle links and scrape their detail pages.

    Typical flow: construct the engine (starts the browser), call
    ``accessEachPageNumber()`` to fill the ``link`` column of
    ``vehiclesDataFrame``, then ``scrapeAllVehicleDetailPages()`` to fill the
    remaining columns.
    """

    # Class-level defaults, kept for backward compatibility. Every instance
    # gets its own copies in __init__ so engines never share one DataFrame.
    pageNumber = 1
    vehiclesDataFrame = pd.DataFrame(columns=['branch','model','price','yearOfConstruction','mileStone','transmission','cilinders','prevOwners','link'])

    def __init__(self) -> None:
        """Start the browser and load the resume index from ``lastIndex.txt``."""
        self.pageNumber = type(self).pageNumber
        self.vehiclesDataFrame = type(self).vehiclesDataFrame.copy()
        self.driver = None

        # Init Chrome web driver
        self.initDriver()

        self.startIndex = self.getLastIndexBeforeCrash()

    def initDriver(self):
        """Create ``self.driver``, open the homepage and accept the consent dialog.

        Raises:
            Exception: whatever the driver raises. If the failure happens after
                the browser was created, the browser is quit first so it does
                not leak, and ``self.driver`` is reset to ``None``.
        """
        version_main = int(chromedriver_autoinstaller.get_chrome_version().split(".")[0])
        self.driver = uc.Chrome(headless=False,use_subprocess=False,version_main=version_main,driver_executable_path=os.environ.get(DRIVER_EXECUTE_PATH_ENV_NAME))
        try:
            self.driver.get(MOBILE_DE_HOMEPAGE_LINK)
            self.driver.delete_all_cookies()
            Utils.waitInSecondsForLoading(10)
            self.acceptTermAndCondition()
        except Exception:
            self._shutdownDriver()
            raise

    def _shutdownDriver(self, forceKill=False):
        """Quit the current driver (if any) and set ``self.driver`` to ``None``.

        Args:
            forceKill: when true, additionally TASKKILL every chromedriver and
                chrome process, because the driver does not always shut down
                by itself. This also kills unrelated Chrome windows.
        """
        driver = getattr(self, "driver", None)
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                # A dead driver must not prevent the restart
                print("Driver quit failed", e)
        self.driver = None

        if forceKill:
            subprocess.call("TASKKILL /f  /IM  CHROMEDRIVER.EXE")
            subprocess.call("TASKKILL /f  /IM  CHROME.EXE")

    def acceptTermAndCondition(self):
        """Click the Accept button of the Terms & Conditions dialog."""
        # Get Accept Term&Conditions button
        acceptBtn = self.driver.find_element(By.XPATH, ACCEPT_TERM_AND_CONDITION_BTN_PATH_SELECTOR)
        acceptBtn.click()

    # Each vehicle search result redirects us to the first page of the pagination list
    # We access each page to scrape the data
    def accessEachPageNumber(self):
        """Visit search result pages one by one and collect their vehicle links.

        Paging stops when a page raises, or when a page contributes no link
        that is not already stored (find_elements returns an empty list rather
        than raising, so an exception alone is not a reliable stop signal).
        """
        while True:
            try:
                searchLink = Utils.getSearchResultWithPageNumber(self.pageNumber)
                self.driver.get(searchLink)
                Utils.waitInSecondsForLoading(10)
                newLinkCount = self.getAllVehicleLinksInPage()
            except Exception as e:
                print("Exception when trying to get next page", e)
                break

            if not newLinkCount:
                print(f"No new vehicle links on page {self.pageNumber}, stop paging")
                break

            # Increase pageNumber to continue with the next page
            self.pageNumber += 1

    def getAllVehicleLinksInPage(self):
        """Append the vehicle detail links of the current page to the DataFrame.

        Links without an href and links already stored are skipped.
        NOTE(review): a result page may expose the same href through several
        anchors — confirm against the live markup.

        Returns:
            The number of links that were added.
        """
        # Find all <a> tags with href starting with "/fahrzeuge/details.html" inside the search page
        links = self.retry(lambda: self.driver.find_elements(By.CSS_SELECTOR, VEHICLE_DETAIL_LINK_CSS_SELECTOR))
        if not links:
            return 0

        knownLinks = set(self.vehiclesDataFrame["link"].dropna())
        newRows = []
        for link in links:
            # Get href attribute of link element
            linkHref = link.get_attribute("href")
            if not linkHref or linkHref in knownLinks:
                continue
            knownLinks.add(linkHref)
            print("Link: ", linkHref)
            newRows.append({"link": linkHref})

        if newRows:
            # Object dtype keeps the still-empty columns able to hold strings later
            newFrame = pd.DataFrame(newRows, columns=self.vehiclesDataFrame.columns).astype(object)
            self.vehiclesDataFrame = pd.concat([self.vehiclesDataFrame, newFrame], ignore_index=True)

        return len(newRows)

    def getValueFromKeyFeatureDiv(self, selector):
        """Return the text of ``div.key-feature__value`` inside the element matching ``selector``."""
        divEle = self.driver.find_element(By.CSS_SELECTOR, selector)
        return divEle.find_element(By.CSS_SELECTOR, "div.key-feature__value").text

    @staticmethod
    def _parseOptional(parser, rawValue, default=''):
        """Apply ``parser`` to ``rawValue``; return ``default`` when it is missing or unparseable."""
        if rawValue is None:
            return default
        try:
            return parser(rawValue)
        except (TypeError, ValueError, AttributeError) as e:
            print("Can not parse value", rawValue, e)
            return default

    def scrapeVehicleDetailPage(self, index):
        """Scrape the currently loaded detail page into row ``index``.

        Args:
            index: row label of ``vehiclesDataFrame`` to fill.

        Raises:
            Exception: when the ad title (branch/model) can not be read, since
                the branch is the must-have information. Other fields fall
                back to a default when they are missing.
        """
        #Get branch and model
        branchModelTitle = self.retry(lambda: self.driver.find_element(By.ID, "ad-title").text)

        # retry() returns None once all attempts failed => throw to move on to another link
        branchModelSplitArray = (branchModelTitle or "").split()
        if not branchModelSplitArray:
            raise Exception("Can not get branch, retry")
        branch = branchModelSplitArray[0]
        model = ' '.join(branchModelSplitArray[1:])

        #Get price
        price = self.retry(lambda: self.driver.find_element(By.CSS_SELECTOR, "span[data-testid='prime-price']").text)

        #Get year of construction
        yearOfConstruction = self.retry(lambda: self.getValueFromKeyFeatureDiv("div.key-feature--firstRegistration"))

        #Get mileStone
        mileStone = self.retry(lambda: self.getValueFromKeyFeatureDiv("div.key-feature--mileage"))

        #Get transmission
        transmission = self.retry(lambda: self.getValueFromKeyFeatureDiv("div.key-feature--transmission"))

        #Get cilinders
        cilinders = self.retry(lambda: self.driver.find_element(By.ID, "cubicCapacity-v").text)

        #Get prevOwners
        prevOwners = '0'
        try:
            prevOwners = self.getValueFromKeyFeatureDiv("div.key-feature--numberOfPreviousOwners")
        except Exception as e:
            print("Vehicle doesn't have preowners", e)

        self.vehiclesDataFrame.at[index, "branch"] = branch
        self.vehiclesDataFrame.at[index, "model"] = model
        self.vehiclesDataFrame.at[index, "price"] = self._parseOptional(Utils.extractPriceValue, price, '0')
        self.vehiclesDataFrame.at[index, "yearOfConstruction"] = self._parseOptional(Utils.extractYear, yearOfConstruction)
        self.vehiclesDataFrame.at[index, "mileStone"] = self._parseOptional(Utils.extractNumericValue, mileStone)
        self.vehiclesDataFrame.at[index, "transmission"] = transmission
        self.vehiclesDataFrame.at[index, "cilinders"] = self._parseOptional(Utils.extractNumericValue, cilinders)
        self.vehiclesDataFrame.at[index, "prevOwners"] = prevOwners

        print("VehicleData: ", self.vehiclesDataFrame.loc[[index]])

    def scrapeAllVehicleDetailPages(self, maxInitFailures=5):
        """Scrape every stored link starting at ``self.startIndex``.

        When a row fails, progress is exported to ``export-progress-<index>.csv``,
        the row is skipped, the browser is force-killed and a fresh one is
        started for the remaining rows. The resume index is persisted after
        every failure and once more at the end.

        Args:
            maxInitFailures: number of consecutive browser start failures
                tolerated before the last error is re-raised.
        """
        initFailures = 0
        while self.startIndex < len(self.vehiclesDataFrame):
            # Reuse the browser opened by __init__; only start one when none is alive
            if getattr(self, "driver", None) is None:
                try:
                    self.initDriver()
                    initFailures = 0
                except Exception as e:
                    initFailures += 1
                    print("Init driver fail", e)
                    if initFailures >= maxInitFailures:
                        self.saveLastIndexBeforeCrash(self.startIndex)
                        raise
                    continue

            for index in range(self.startIndex, len(self.vehiclesDataFrame)):
                try:
                    link = self.vehiclesDataFrame.at[index, 'link']
                    self.driver.get(link)
                    Utils.waitInSecondsForLoading(10)
                    Utils.actLikeHumans()
                    self.scrollPage(0,200)
                    self.scrapeVehicleDetailPage(index)
                    print(f"The number of rows have been updated {index}")
                except Exception as e:
                    self.vehiclesDataFrame.to_csv(f'export-progress-{index}.csv', index=False)
                    # index failed, so we won't start with it any more
                    self.startIndex = index + 1
                    self.saveLastIndexBeforeCrash(self.startIndex)
                    print(f'Got stuck at row {index}', e)
                    # Before opening a new driver, force quit the old one because it is not automatically shut down
                    self._shutdownDriver(forceKill=True)
                    # wait CPU reset
                    time.sleep(10)
                    break
            else:
                # Every remaining row was processed: let the outer loop finish
                self.startIndex = len(self.vehiclesDataFrame)

        # startIndex already points to the next row to process
        self.saveLastIndexBeforeCrash(self.startIndex)

    def scrollPage(self, x, y):
        """Scroll the page by ``x`` / ``y`` via ``window.scrollBy``."""
        self.driver.execute_script(f"window.scrollBy({x},{y})")

    def saveLastIndexBeforeCrash(self, lastIndex):
        """Write ``lastIndex`` to ``lastIndex.txt`` in the working directory."""
        with open("lastIndex.txt", "w") as f:
            f.write(str(lastIndex))

    def getLastIndexBeforeCrash(self):
        """Read the resume index from ``lastIndex.txt``.

        Returns:
            The stored integer, or ``0`` when the file is missing or invalid.
        """
        try:
            with open("lastIndex.txt", "r") as f:
                return int(f.readline())
        except (OSError, ValueError) as e:
            print("Fail to read file", e)
        return 0

    def retry(self, func, maxRetryTimes = 3, waitingTime = 10):
        """Call ``func`` until it succeeds or ``maxRetryTimes`` attempts are used.

        Between attempts the page is refreshed and the wait grows by
        ``waitingTime`` seconds each time. Nothing is refreshed or waited for
        after the last attempt, since no further call would benefit from it.

        Args:
            func: zero-argument callable to execute.
            maxRetryTimes: maximum number of attempts.
            waitingTime: seconds added to the wait after each failed attempt.

        Returns:
            The result of ``func()``, or ``None`` when every attempt failed.
        """
        timesHasRetried = 0
        accumulateWaitingTime = 0
        while timesHasRetried < maxRetryTimes:
            timesHasRetried += 1
            try:
                return func()
            except Exception as e:
                print("Execute failed: ", e)
                if timesHasRetried >= maxRetryTimes:
                    break
                accumulateWaitingTime += waitingTime
                self.driver.refresh()
                Utils.waitInSecondsForLoading(accumulateWaitingTime)
        print(f'Retry with {timesHasRetried} but still failed')
        return None