# DDR Final Project
# KSL Classifieds - Web Scraping 
# Used Cars Toyota listings
----
## Web Scraping Procedure


In [None]:
#pip install selenium

In [None]:
#pip install webdriver-manager


In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import os

from selenium.common.exceptions import ElementClickInterceptedException




In [2]:
# user agent
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"

# set up 
options = Options()
options.add_argument(f"user-agent={user_agent}")


#options.add_argument("--headless")

# initialize the WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Open website
# This URL was formed after we experimented on KSL's website
# There was no need for us to invoke filters in a selenium session
# As the filters are directly reflected in url itself
# This makes our job easier

# in the future, if we need to scrape data for any other auto maker
# all we need to do is replace 'Toyota' with make of interest for ex: 'Porsche'
driver.get("https://cars.ksl.com/search/make/Toyota/sellerType/For+Sale+By+Owner/newUsed/Used")

# Wait for the page to load contents

time.sleep(20) 

# Scroll down to load more items. We will try to grab as many listings as we can
# The page is dynamic in nature, there is no end to how much can scroll down

for _ in range(30):  
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(15)  # Wait time before we scroll down next iteration
    
# grab the data_id of the listings. data id is all we need to explore a listing

listings = driver.find_elements(By.CSS_SELECTOR, 'div[data-id]')

data_ids = [listing.get_attribute('data-id') for listing in listings]

# close session
driver.quit()


In [3]:
# Output the extracted data-ids
print(data_ids)
print(f"Total listings extracted: {len(data_ids)}")

['9106261', '9079910', '9172112', '9172091', '9143811', '9143151', '9115103', '9172045', '9172057', '9172048', '9172036', '9137197', '9172021', '9172015', '9172007', '9171995', '9171937', '9171989', '9171973', '9171962', '9171952', '9139831', '9171902', '9171907', '9167880', '9171889', '9171874', '9106850', '9171840', '9171839', '9171789', '9171759', '9171779', '9171740', '9110629', '9171722', '9171690', '9171702', '9171691', '9171696', '9171670', '9171667', '9064956', '9171664', '9171643', '9132974', '9171637', '9171629', '9167880', '9129285', '9171592', '9086879', '9171493', '9171485', '9075497', '9171466', '9171454', '9171440', '9171416', '9171367', '9113653', '9051911', '8990547', '9170900', '9170899', '9170898', '9170891', '9170804', '9170875', '9170872', '9170869', '9170818', '9106261', '9170794', '9082916', '9170772', '9170741', '9170735', '9170720', '9170698', '9170677', '9170637', '9170645', '9043716', '9170602', '9170600', '9170598', '9170585', '9170582', '9170562', '9170548'

------
### At this point, we have grabbed the listings we are interested in
### We have about 864 ids of listings
### Now we will query / request each data id via a web request
### After querying the request, we will save the html response of each listing to our local directory
------

In [None]:
#testList = data_ids[0:5]

In [None]:
#testList

-----

In [4]:
# create local directory for saving listings

folder_path = 'Toyota Used'

os.makedirs(folder_path, exist_ok=True)

-----

### Below code launches a web driver sesssion
### In this web session we will load a listing
### Next we will let it load with ample amount of time
### We save the html response of the loaded content
### Launch another web url request in same webdriver session

In [None]:
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"

# Set up Chrome options
options = Options()
options.add_argument(f"user-agent={user_agent}")

# init webdriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

## ***************************************************************************************************** ##

## replace testList with data_ids for full scraping

# Iterate over each data_id to fetch and save the HTML content

for data_id in data_ids:
    
    listing_url = f"https://cars.ksl.com/listing/{data_id}"
    
    driver.get(listing_url)
    
    # accept cookie
#WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))).click()

    
    time.sleep(5)
      
    # Execute JavaScript to scroll halfway through the page
    ## the see more button needs to be in view, in order to be executed
    
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight / 2);")

    # Find all elements with the class name 'seeMore'

        
# *****************************************************************#

    see_more_buttons = driver.find_elements(By.CLASS_NAME, 'seeMore')

    if len(see_more_buttons) > 1:
        try:
            time.sleep(2)
            
            see_more_buttons[1].click()
            
            time.sleep(2)
            
            see_more_buttons[0].click()
            
        except ElementClickInterceptedException as e:
            
            print("ElementClickInterceptedException caught: ", e)
        
    elif len(see_more_buttons) == 1:
        try:
            see_more_buttons[0].click()
        except ElementClickInterceptedException as e:
            print("ElementClickInterceptedException caught: ", e)
        


# *****************************************************************#

     
    # Get the page src - html
    html_content = driver.page_source
    
    #define  file path
    file_path = os.path.join(folder_path, f"listing_{data_id}.html")
    
    # save the file to a html
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(html_content)
    
    print(f"Saved HTML for listing {data_id}.")
    
    
    ## wait time before we query next car listing
    time.sleep(10)

# close session
driver.quit()

Saved HTML for listing 9106261.
Saved HTML for listing 9079910.
Saved HTML for listing 9172112.
Saved HTML for listing 9172091.
Saved HTML for listing 9143811.
Saved HTML for listing 9143151.
Saved HTML for listing 9115103.
Saved HTML for listing 9172045.
Saved HTML for listing 9172057.
Saved HTML for listing 9172048.
Saved HTML for listing 9172036.
Saved HTML for listing 9137197.
Saved HTML for listing 9172021.
Saved HTML for listing 9172015.
Saved HTML for listing 9172007.
Saved HTML for listing 9171995.
Saved HTML for listing 9171937.
Saved HTML for listing 9171989.
Saved HTML for listing 9171973.
Saved HTML for listing 9171962.
Saved HTML for listing 9171952.
Saved HTML for listing 9139831.
Saved HTML for listing 9171902.
Saved HTML for listing 9171907.
Saved HTML for listing 9167880.
Saved HTML for listing 9171889.
Saved HTML for listing 9171874.
Saved HTML for listing 9106850.
Saved HTML for listing 9171840.
Saved HTML for listing 9171839.
Saved HTML for listing 9171789.
Saved HT

Saved HTML for listing 9167728.
Saved HTML for listing 9167724.
Saved HTML for listing 9167660.
Saved HTML for listing 9128631.
Saved HTML for listing 9167614.
Saved HTML for listing 9052961.
Saved HTML for listing 9167619.
Saved HTML for listing 9106261.
Saved HTML for listing 9167578.
Saved HTML for listing 9080307.
Saved HTML for listing 9167517.
Saved HTML for listing 9135063.
Saved HTML for listing 9075326.
Saved HTML for listing 9167500.
Saved HTML for listing 9047446.
Saved HTML for listing 8999901.
Saved HTML for listing 9167408.
Saved HTML for listing 9167369.
Saved HTML for listing 9167317.
Saved HTML for listing 9010792.
Saved HTML for listing 8953755.
Saved HTML for listing 9078225.
Saved HTML for listing 9166321.
Saved HTML for listing 9166294.
Saved HTML for listing 9166276.
Saved HTML for listing 9166270.
Saved HTML for listing 9166227.
Saved HTML for listing 9166199.
Saved HTML for listing 9166201.
Saved HTML for listing 9138058.
Saved HTML for listing 9166150.
Saved HT

------

### Now at this point, we have saved html files of our listings
### This concludes the Web Scraping part of our job
### Up next, we will process the html files to our data needs
### We will do this procedure in a separate python file !
