# Use selenium to scrape carmax.com

In [633]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup

import time
import os
import numpy as np
import pandas as pd
import timeit
import pickle

chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

### Scape the list of URLs

In [635]:
driver = webdriver.Chrome(chromedriver)
SearchURL = "https://www.carmax.com/search"
driver.get(SearchURL)
time.sleep(1);
link_lists = []
endofresult = False

max_value = driver.find_element_by_xpath('//div[@data-max-price="100000"]')
time.sleep(1)
max_value.send_keys(Keys.DOWN)
time.sleep(0.5)
max_value.send_keys(Keys.DOWN)
time.sleep(0.5)
max_value.send_keys(Keys.DOWN)
max_value.send_keys(Keys.RETURN)

In [636]:
while(endofresult == False):
    try:
        driver.find_element_by_css_selector(".pagination--next-disable")
        endofresult = True
        print ("This is the end of search result!")
        with open("URL_Links_All.pkl".format(count), 'wb') as picklefile: # b means binary
            pickle.dump(link_lists, picklefile)
    except Exception as e:
        next_element = driver.find_element_by_class_name("pagination--next")
        next_element.click()
        time.sleep(3)
        link_elements = driver.find_elements_by_xpath('//h3[@class="vehicle-browse--result-title"]/a')
        link_lists = link_lists + [element.get_attribute('href') for element in link_elements]
        count = len(link_lists)
        if (count % 100 == 0):
            with open("URL_Links_{}.pkl".format(count), 'wb') as picklefile: # b means binary
                pickle.dump(link_lists, picklefile)

This is the end of search result!


### Scape the individual Car main pages

In [638]:
with open('./data/URL_Links_All.pkl', 'rb') as picklefile: 
    links = pickle.load(picklefile)

In [646]:
individual_links = ["https://www.carmax.com/car/"+x.split("/")[4]+"#ratingsReviews" for x in links]

In [649]:
subset = individual_links[0:20]

In [None]:
driver = webdriver.Chrome(chromedriver)
driver.get("https://www.carmax.com/car/16061803")

In [596]:
def find_classes(URL, classname_function_dict):
    driver.get(URL)
    review_button = driver.find_element_by_xpath('//a[@class="kmx-button kmx-button--secondary kmx-button--large kmx-button--raised"]')
    review_button.click()
    result = {}
    for classname in sorted(classname_function_dict.keys()):
        result[classname] = driver.find_elements_by_class_name(classname)
    return result

In [597]:
testurl = 'https://www.carmax.com/car/16061803'
testurl = 'https://www.carmax.com/car/16104605'

In [315]:
result = find_classes(testurl, classname_function_dict)

In [569]:
def process_mileage_value(input):
    price = (input[0]
                 .text
                 .replace('$','')
                 .replace('*','')
                 .replace(',',''))
    mileage = (input[1]
                 .text
                 .replace('K',''))
    return [price, mileage]

In [570]:
def process_year_make(input):
    splited = input[0].text.split(' ')
    year = splited[0]
    brand = splited[1]
    return [year, brand]

In [571]:
def process_min(input):
    return [input[0].text]

In [626]:
def process_location(input):
    if(len(input) == 0):
        return ['N/A']
    else:
        return [input[0].text]

In [572]:
def process_image_count(input):
    return [int(input[0].text.split(" ")[2])]

In [573]:
def process_stock_vin(input):
    splited = input[0].text.split("|")
    stock = splited[0].split(" ")[2]
    vin = splited[1].split(" ")[2]
    return [stock, vin]

In [574]:
def process_city_highway(input):
    splited = input[0].text.split("\n")
    city = splited[0]
    highway = splited[2]
    return [city, highway]

In [575]:
def process_base_spec(input):
    color = input[0].text.split("\n")
    exterior = color[1].split(" ")[1]
    interior = color[2].split(" ")[1]
    basespec = input[1].text.split("\n")
    transmission = basespec[1].split(" ")[1]
    drive = basespec[2].split(" ")[1]
    engine = basespec[3].split(" ")[1].replace("L","")
    cylinder = basespec[4].split(" ")[1]
    horsepower = basespec[5].split(" ")[1]
    torque = basespec[6].split(" ")[1]
    return [exterior, interior, transmission, drive, engine, cylinder, horsepower, torque]

In [576]:
def process_user_rating(input):
    user_star_5 = int(input[0].text.split("\n")[1])
    user_star_4 = int(input[1].text.split("\n")[1])
    user_star_3 = int(input[2].text.split("\n")[1])
    user_star_2 = int(input[3].text.split("\n")[1])
    user_star_1 = int(input[4].text.split("\n")[1])
    return [user_star_5, user_star_4, user_star_3, user_star_2, user_star_1]

In [651]:
classname_function_dict = {
    "car-page-header__car-title__year-make":process_year_make,
    "car-page-header__car-title__model-trim":process_min,
    "price-mileage--value":process_mileage_value,
    "action-bar--image-counter":process_image_count,
    "store-information--location":process_location,
    "about-this-car--header__subtitle":process_stock_vin,
    "gas-mileage-container":process_city_highway,
    "kmx-table":process_base_spec,
    "linear-rating":process_user_rating
}

In [620]:
def get_car_basics(URL_lists, classname_function_dict):
    result = []
    for URL in URL_lists:
        driver = webdriver.Chrome(chromedriver)
        driver.get(URL)
        time.sleep(5)
        review_button = driver.find_element_by_xpath('//a[@class="kmx-button kmx-button--secondary kmx-button--large kmx-button--raised"]')
        review_button.click()
        time.sleep(2)
        record = []
        for classname in sorted(classname_function_dict.keys()):
            element = driver.find_elements_by_class_name(classname)
            # print (classname)
            record = record + (classname_function_dict[classname](element))
            # print (record)
        result.append(record)
        driver.close()
    return result

In [621]:
URL_lists = [
 'https://www.carmax.com/car/15086949',
 'https://www.carmax.com/car/15585428',
 'https://www.carmax.com/car/15895613',
 'https://www.carmax.com/car/15777631',
 'https://www.carmax.com/car/16057238',
 'https://www.carmax.com/car/15867327',
 'https://www.carmax.com/car/16056848',
 'https://www.carmax.com/car/15867099',
 'https://www.carmax.com/car/15867057']

In [625]:
start_time = timeit.default_timer()

result = get_car_basics(URL_lists, classname_function_dict)

elapsed = timeit.default_timer() - start_time

about-this-car--header__subtitle
['15086949', 'WAUCFAFC5DN062406']
action-bar--image-counter
['15086949', 'WAUCFAFC5DN062406', 21]
car-page-header__car-title__model-trim
['15086949', 'WAUCFAFC5DN062406', 21, 'A6 Premium']
car-page-header__car-title__year-make
['15086949', 'WAUCFAFC5DN062406', 21, 'A6 Premium', '2013', 'Audi']
gas-mileage-container
['15086949', 'WAUCFAFC5DN062406', 21, 'A6 Premium', '2013', 'Audi', '25', '33']
kmx-table
['15086949', 'WAUCFAFC5DN062406', 21, 'A6 Premium', '2013', 'Audi', '25', '33', 'White', 'Black', 'Automatic', '2WD', '2.0', '4', '211', '258']
linear-rating
['15086949', 'WAUCFAFC5DN062406', 21, 'A6 Premium', '2013', 'Audi', '25', '33', 'White', 'Black', 'Automatic', '2WD', '2.0', '4', '211', '258', 2, 0, 0, 0, 1]
price-mileage--value
['15086949', 'WAUCFAFC5DN062406', 21, 'A6 Premium', '2013', 'Audi', '25', '33', 'White', 'Black', 'Automatic', '2WD', '2.0', '4', '211', '258', 2, 0, 0, 0, 1, '19998', '59']
store-information--location
['15086949', 'WAUCFA

about-this-car--header__subtitle
['15867099', 'WBA2F9C31HV984059']
action-bar--image-counter
['15867099', 'WBA2F9C31HV984059', 22]
car-page-header__car-title__model-trim
['15867099', 'WBA2F9C31HV984059', 22, '230 I']
car-page-header__car-title__year-make
['15867099', 'WBA2F9C31HV984059', 22, '230 I', '2017', 'BMW']
gas-mileage-container
['15867099', 'WBA2F9C31HV984059', 22, '230 I', '2017', 'BMW', '24', '35']
kmx-table
['15867099', 'WBA2F9C31HV984059', 22, '230 I', '2017', 'BMW', '24', '35', 'Gray', 'Black', 'Automatic', '2WD', '2.0', '4', '248', '258']
linear-rating
['15867099', 'WBA2F9C31HV984059', 22, '230 I', '2017', 'BMW', '24', '35', 'Gray', 'Black', 'Automatic', '2WD', '2.0', '4', '248', '258', 0, 0, 0, 0, 0]
price-mileage--value
['15867099', 'WBA2F9C31HV984059', 22, '230 I', '2017', 'BMW', '24', '35', 'Gray', 'Black', 'Automatic', '2WD', '2.0', '4', '248', '258', 0, 0, 0, 0, 0, '28998', '16']
store-information--location
['15867099', 'WBA2F9C31HV984059', 22, '230 I', '2017', 'BM

In [629]:
elapsed

118.49955037800828

In [587]:
URL = "https://www.carmax.com/car/16169822#ratingsReviews"
driver = webdriver.Chrome(chromedriver)
driver.get(URL)
element = driver.find_elements_by_class_name("store-information--location")

In [590]:
len(element) is None

False

In [601]:
pd.DataFrame(result)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,15879910,19UDE2F71GA000579,22,ILX,2016,Acura,25,35,Silver,Black,...,201,180,1,0,0,0,0,21998,25,"San Francisco, CA"
1,16061803,19UUA8F5XDA013730,23,TL,2013,Acura,20,29,White,Black,...,280,254,6,3,1,0,0,18998,42,"San Francisco, CA"


### Scape the recall page

URL = "https://www.nhtsa.gov/recalls?vin=4T1BK36B38U322693" <br>
Pages are guarded by robot detector

### Scape the autocheck page

In [496]:
driver = webdriver.Chrome(chromedriver)
URL = "https://www.carmax.com/car/15165784/vehicle-history"
driver.get(URL)

In [497]:
check = driver.find_element_by_xpath("//table/tbody")
car_summary = check.text.split("\n")

In [440]:

accident_check = car_summary[0].replace("Accident Check ", "")
cal_owner = car_summary[1].replace("Calculated Owners ", "")
problemcheck = car_summary[2].replace("Title and ProblemCheck ", "")
odometer_check = car_summary[3].replace("OdometerCheck ", "")
event_check = car_summary[4].replace("Use/EventCheck ", "")

In [498]:
len(car_summary)

6

In [513]:
recall_check = car_summary[5]
if("Open Safety Recalls" in recall_check):
    Open_recall = int(recall_check.replace("Open Safety Recalls Reported Recalls: ", ""))
else:
    open_recall = int(recall_check.replace("Reported Recalls: ", ""))

In [None]:
O

In [502]:
link_lists = [
 'https://www.carmax.com/car/15165784',
 'https://www.carmax.com/car/15210066',
 'https://www.carmax.com/car/15824549',
 'https://www.carmax.com/car/15867329',
 'https://www.carmax.com/car/15380380',
 'https://www.carmax.com/car/15867035',
 'https://www.carmax.com/car/15944508',
 'https://www.carmax.com/car/16169822',
 'https://www.carmax.com/car/15086949',
 'https://www.carmax.com/car/15585428',
 'https://www.carmax.com/car/15895613',
 'https://www.carmax.com/car/15777631',
 'https://www.carmax.com/car/16057238']

In [503]:
car_id = [x.split("/")[-1] for x in link_lists]

In [504]:
car_history = ["https://www.carmax.com/car/"+element+"/vehicle-history" for element in car_id]

In [505]:
car_history

['https://www.carmax.com/car/15165784/vehicle-history',
 'https://www.carmax.com/car/15210066/vehicle-history',
 'https://www.carmax.com/car/15824549/vehicle-history',
 'https://www.carmax.com/car/15867329/vehicle-history',
 'https://www.carmax.com/car/15380380/vehicle-history',
 'https://www.carmax.com/car/15867035/vehicle-history',
 'https://www.carmax.com/car/15944508/vehicle-history',
 'https://www.carmax.com/car/16169822/vehicle-history',
 'https://www.carmax.com/car/15086949/vehicle-history',
 'https://www.carmax.com/car/15585428/vehicle-history',
 'https://www.carmax.com/car/15895613/vehicle-history',
 'https://www.carmax.com/car/15777631/vehicle-history',
 'https://www.carmax.com/car/16057238/vehicle-history']

In [515]:
def get_carhistory(car_history):
    result = []
    for URL in car_history:
        driver = webdriver.Chrome(chromedriver)
        driver.get(URL)
        time.sleep(2)
        check = driver.find_element_by_xpath("//table/tbody")
        car_summary = check.text.split("\n")
        
        accident_check = car_summary[0].replace("Accident Check ", "")
        if (accident_check == 'No Accidents Reported'):
            accident_check = 1
        else:
            accident_check = int(accident_check.split(": ")[1])
            
        cal_owner = car_summary[1].replace("Calculated Owners ", "")
        if (cal_owner == "Only One Owner"):
            cal_owner = 1
        else:
            cal_owner = int(cal_owner)
            
        problem_check = car_summary[2].replace("Title and ProblemCheck ", "")
        if (problem_check == "Your vehicle checks out!"):
            problem_check = 'OK'
            
        odometer_check = car_summary[3].replace("OdometerCheck ", "")
        if (odometer_check == "Your vehicle checks out!"):
            odometer_check = 'OK'
            
        event_check = car_summary[4].replace("Use/EventCheck ", "")
        if(event_check == "No vehicle use or event information reported"):
            event_check = 'OK'
        
        open_recall = 0
        if(len(car_summary)==6):
            recall_check = car_summary[5]
            open_recall = int(recall_check.replace("Open Safety Recalls Reported Recalls: ", ""))
            
        result.append([accident_check,cal_owner,problem_check,odometer_check,event_check,open_recall])
        driver.close()
    return result

In [516]:
result = get_carhistory(car_history)

In [517]:
pd.DataFrame(result)

Unnamed: 0,0,1,2,3,4,5
0,1,1,OK,OK,Vehicle Use or event information available,1
1,1,1,OK,OK,Vehicle Use or event information available,0
2,1,1,OK,OK,OK,0
3,1,2,OK,OK,Vehicle Use or event information available,0
4,1,1,OK,OK,Vehicle Use or event information available,1
5,1,3,OK,OK,Vehicle Use or event information available,0
6,1,1,OK,OK,Vehicle Use or event information available,1
7,1,1,OK,OK,Vehicle Use or event information available,2
8,2,2,OK,OK,Vehicle Use or event information available,1
9,1,1,OK,OK,Vehicle Use or event information available,1
