# Use selenium to scrape carmax.com

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup

import time
import os
import numpy as np
import pandas as pd
import timeit
import pickle

# Location of the chromedriver binary; the webdriver.Chrome(...) calls below pass it explicitly.
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
# Also publish the path through the process environment.
# NOTE(review): presumably meant to help Selenium locate the driver -- confirm it is still needed,
# since the path is already handed to webdriver.Chrome directly.
os.environ["webdriver.chrome.driver"] = chromedriver

### Scrape the list of URLs

In [83]:
# Open the CarMax search page in a fresh Chrome session.
driver = webdriver.Chrome(chromedriver)
SearchURL = "https://www.carmax.com/search"
driver.get(SearchURL)
time.sleep(1)

# State consumed by the pagination loop in the next cell.
link_lists = []
endofresult = False

# Grab the element tagged data-max-price="100000".
# NOTE(review): presumably the max-price control, so the key presses narrow the price filter -- confirm.
max_value = driver.find_element_by_xpath('//div[@data-max-price="100000"]')
time.sleep(1)

# Three DOWN presses (half a second of pause after each of the first two), then RETURN.
for pause_after in (True, True, False):
    max_value.send_keys(Keys.DOWN)
    if pause_after:
        time.sleep(0.5)
max_value.send_keys(Keys.RETURN)

In [None]:
# Walk the paginated search results and collect the detail-page link of every listing.
#
# Each iteration harvests the page currently shown *before* advancing, so the first
# page of results is included.  The loop stops once the page carries the
# ".pagination--next-disable" marker, at which point the full list is saved.
while not endofresult:
    link_elements = driver.find_elements_by_xpath('//h3[@class="vehicle-browse--result-title"]/a')
    link_lists = link_lists + [element.get_attribute('href') for element in link_elements]
    count = len(link_lists)

    # Periodic checkpoint so an interrupted crawl does not lose everything.
    # Skipped while nothing has been collected, to avoid writing an empty "_0" file.
    if count and count % 100 == 0:
        with open("LA_URL_Links_{}.pkl".format(count), 'wb') as picklefile: # b means binary
            pickle.dump(link_lists, picklefile)

    # find_elements_* returns an empty list when nothing matches, so the end-of-results
    # test needs no exception handling and cannot hide unrelated errors.
    if driver.find_elements_by_css_selector(".pagination--next-disable"):
        endofresult = True
        print ("This is the end of search result!")
        with open("LA_URL_Links_All.pkl", 'wb') as picklefile: # b means binary
            pickle.dump(link_lists, picklefile)
    else:
        next_element = driver.find_element_by_class_name("pagination--next")
        next_element.click()
        time.sleep(3)  # give the next page time to render

### Scrape the individual car main pages

In [2]:
# Load the listing URLs that were pickled earlier.
# NOTE(review): the crawl cell above writes "LA_URL_Links_<count>.pkl" to the working directory,
# while this reads from ./data -- presumably the file was moved there by hand; confirm.
with open('./data/LA_URL_Links_2400.pkl', 'rb') as picklefile: 
    links = pickle.load(picklefile)

In [3]:
# Construct URLs for individual cars.
# The fifth "/"-separated piece of each link is kept as the car identifier; any "#fragment"
# on it is dropped first, so links that already end in "#ratingsReviews" (e.g. after the
# pickle has been overwritten with `individual_links`) do not get a second fragment appended.
individual_links = [
    "https://www.carmax.com/car/" + x.split("/")[4].split("#")[0] + "#ratingsReviews"
    for x in links
]

In [4]:
# First 20 car URLs, kept as a small sample.
subset = individual_links[:20]

In [5]:
def process_mileage_value(input):
    """Return ``[price, mileage]`` as cleaned-up strings.

    ``input[0].text`` is taken as the price and has every '$', '*' and ','
    removed; ``input[1].text`` is taken as the mileage and has every 'K'
    removed.  Neither value is converted to a number.
    """
    price = input[0].text
    for symbol in ('$', '*', ','):
        price = price.replace(symbol, '')
    mileage = input[1].text.replace('K', '')
    return [price, mileage]

In [6]:
def process_year_make(input):
    """Return ``[year, brand]``: the first two space-separated words of ``input[0].text``."""
    words = input[0].text.split(' ')
    return [words[0], words[1]]

In [7]:
def process_min(input):
    """Return the text of the first element, wrapped in a one-item list."""
    first = input[0]
    return [first.text]

In [8]:
def process_location(input):
    """Return ``[text of the first element]``, or ``['N/A']`` when no element was found."""
    if len(input) != 0:
        return [input[0].text]
    return ['N/A']

In [9]:
def process_image_count(input):
    """Return ``[n]`` where ``n`` is the third space-separated word of the first
    element's text converted to ``int``; ``[0]`` when no element was found.
    """
    if len(input) == 0:
        return [0]
    words = input[0].text.split(" ")
    return [int(words[2])]

In [10]:
def process_stock_vin(input):
    """Return ``[stock, vin]`` parsed from the first element's text.

    The text is split on "|"; the third space-separated word of the left part
    is the stock number and the third space-separated word of the right part
    is the VIN.
    """
    halves = input[0].text.split("|")
    return [halves[0].split(" ")[2], halves[1].split(" ")[2]]

In [11]:
def process_city_highway(input):
    """Return ``[city, highway]``: the first and third lines of ``input[0].text``."""
    rows = input[0].text.split("\n")
    return [rows[0], rows[2]]

In [12]:
def process_base_spec(input):
    """Return the eight base-spec fields parsed from two text blocks.

    Every field is the second space-separated word of one line:
    lines 1-2 of ``input[0].text`` give exterior and interior; lines 1-6 of
    ``input[1].text`` give transmission, drive, engine, cylinder, horsepower
    and torque.  Every "L" is removed from the engine value.  Line 0 of each
    block is ignored.

    Returns ``[exterior, interior, transmission, drive, engine, cylinder,
    horsepower, torque]`` (all strings).
    """
    def second_word(line):
        return line.split(" ")[1]

    color_rows = input[0].text.split("\n")
    exterior, interior = [second_word(color_rows[i]) for i in (1, 2)]

    spec_rows = input[1].text.split("\n")
    transmission, drive, engine, cylinder, horsepower, torque = [
        second_word(spec_rows[i]) for i in range(1, 7)
    ]
    engine = engine.replace("L", "")

    return [exterior, interior, transmission, drive, engine, cylinder, horsepower, torque]

In [13]:
def process_user_rating(input):
    """Return the five star-rating counts, ordered 5-star down to 1-star.

    Each count is the second line of the corresponding element's text,
    converted to ``int``; elements 0..4 map to 5..1 stars.  When no element
    was found the result is ``[0, 0, 0, 0, 0]``.
    """
    if len(input) == 0:
        return [0, 0, 0, 0, 0]
    return [int(input[position].text.split("\n")[1]) for position in range(5)]

In [14]:
# Maps each CSS class name to the parser that turns the web elements found under
# that class into a list of record fields.
# get_car_specs walks the keys in sorted order, so the field order inside a record
# follows the alphabetical order of the class names, not the order written here.
classname_function_dict = {
    "car-page-header__car-title__year-make": process_year_make,
    "car-page-header__car-title__model-trim": process_min,
    "price-mileage--value": process_mileage_value,
    "action-bar--image-counter": process_image_count,
    "store-information--location": process_location,
    "about-this-car--header__subtitle": process_stock_vin,
    "gas-mileage-container": process_city_highway,
    "kmx-table": process_base_spec,
    "linear-rating": process_user_rating,
}

In [15]:
# Retrieve the spec record of every car in a list of URLs

def get_car_specs(URL_lists, classname_function_dict, lastrecord):
    """Scrape one record per car URL, checkpointing to disk every 20 cars.

    Args:
        URL_lists: car page URLs; the fifth "/"-separated piece (minus any
            "#fragment") is used as the stock id for the vehicle-history URL.
        classname_function_dict: maps a CSS class name to a parser that takes
            the list of elements found for that class and returns a list of
            fields.  Classes are processed in sorted-name order, which fixes
            the field order of a record.
        lastrecord: index of the last URL already covered by a checkpoint.
            ``0`` means a fresh run; any other value resumes from
            ``LA_car_specs_<lastrecord>.pkl`` and continues at the next index.

    Returns:
        The list of records (checkpointed records first, then new ones).  Each
        record is the concatenated parser outputs followed by the six fields
        returned by ``get_carhistory``.

    Raises:
        Whatever Selenium, the parsers or ``get_carhistory`` raise; the browser
        of the failing car is closed before the exception propagates.
    """
    result = []
    if lastrecord != 0:
        # Checkpoint written by an earlier run of this function.
        with open('LA_car_specs_{}.pkl'.format(lastrecord), 'rb') as picklefile:
            result = pickle.load(picklefile)

    # A fresh run has no checkpoint, so it must begin with the very first URL;
    # only a resumed run skips past the index the checkpoint already covers.
    start = lastrecord + 1 if lastrecord != 0 else 0

    for count in range(start, len(URL_lists)):
        # One browser session per car, always closed again (even on errors).
        driver = webdriver.Chrome(chromedriver)
        try:
            driver.get(URL_lists[count])
            stockid = URL_lists[count].split("/")[4].split("#")[0]
            time.sleep(7)  # let the page finish rendering

            record = []
            for classname in sorted(classname_function_dict):
                elements = driver.find_elements_by_class_name(classname)
                record = record + classname_function_dict[classname](elements)

            car_history_URL = "https://www.carmax.com/car/" + stockid + "/vehicle-history"
            record = record + get_carhistory(car_history_URL, driver)
            result.append(record)

            if count != 0 and count % 20 == 0:
                with open('LA_car_specs_{}.pkl'.format(count), 'wb') as picklefile: # b means binary
                    pickle.dump(result, picklefile)
        finally:
            driver.quit()

    return result

In [27]:
def get_carhistory(URL, driver):
    """Load a vehicle-history page and parse its summary table.

    Navigates ``driver`` to ``URL``, waits five seconds, then reads the text
    of the first ``//table/tbody`` and interprets it line by line.

    Returns:
        ``[accidents, owners, problem, odometer, event, recalls]`` where
        accidents/owners/recalls are ints and the three checks are the strings
        'OK' or 'NOT OK'.  ``recalls`` is 0 unless the table has exactly six
        lines.

    Raises:
        Whatever ``find_element_by_xpath`` raises when the page has no
        ``//table/tbody``; IndexError/ValueError on unexpected line formats.
    """
    driver.get(URL)
    time.sleep(5)
    summary_table = driver.find_element_by_xpath("//table/tbody")
    rows = summary_table.text.split("\n")

    # Line 0: accident count ("No Accidents Reported" counts as zero).
    accidents = rows[0].replace("Accident Check ", "")
    accidents = 0 if accidents == 'No Accidents Reported' else int(accidents.split(": ")[1])

    # Line 1: number of owners ("Only One Owner" counts as one).
    owners = rows[1].replace("Calculated Owners ", "")
    owners = 1 if owners == "Only One Owner" else int(owners)

    # Lines 2-4: pass/fail checks, each compared against its "clean" wording.
    title_text = rows[2].replace("Title and ProblemCheck ", "")
    title_status = 'OK' if title_text == "Your vehicle checks out!" else 'NOT OK'

    odometer_text = rows[3].replace("OdometerCheck ", "")
    odometer_status = 'OK' if odometer_text == "Your vehicle checks out!" else 'NOT OK'

    event_text = rows[4].replace("Use/EventCheck ", "")
    event_status = 'OK' if event_text == "No vehicle use or event information reported" else 'NOT OK'

    # Optional line 5: open recall count, only read when the table has exactly six lines.
    recalls = 0
    if len(rows) == 6:
        recalls = int(rows[5].replace("Open Safety Recalls Reported Recalls: ", ""))

    return [accidents, owners, title_status, odometer_status, event_status, recalls]

In [31]:
# Time the script execution

lastrecord = 20  # resume from the checkpoint saved for index 20

start_time = timeit.default_timer()
try:
    result = get_car_specs(individual_links, classname_function_dict, lastrecord)
finally:
    # Record the duration even when the scrape aborts with an exception.
    elapsed = timeit.default_timer() - start_time

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//table/tbody"}
  (Session info: chrome=67.0.3396.99)
  (Driver info: chromedriver=2.40.565386 (45a059dc425e08165f9a10324bd1380cc13ca363),platform=Mac OS X 10.13.5 x86_64)


In [23]:
# Drop one listing from the work list.
# NOTE(review): presumably the car whose history page triggered the NoSuchElementException above -- confirm.
# list.remove raises ValueError when the URL is no longer in the list, so this cell cannot be re-run as-is.
individual_links.remove('https://www.carmax.com/car/16064373#ratingsReviews')

In [24]:
# Persist the pruned list.
# NOTE: this writes the "#ratingsReviews" URLs from `individual_links` over the same file
# that the earlier cell loads as `links`.
with open('./data/LA_URL_Links_2400.pkl', 'wb') as picklefile: # b means binary
    pickle.dump(individual_links, picklefile)

In [25]:
# Display the current list of car URLs
individual_links

['https://www.carmax.com/car/16147037#ratingsReviews',
 'https://www.carmax.com/car/15380328#ratingsReviews',
 'https://www.carmax.com/car/15690668#ratingsReviews',
 'https://www.carmax.com/car/15895597#ratingsReviews',
 'https://www.carmax.com/car/16035336#ratingsReviews',
 'https://www.carmax.com/car/15967989#ratingsReviews',
 'https://www.carmax.com/car/16055700#ratingsReviews',
 'https://www.carmax.com/car/15928036#ratingsReviews',
 'https://www.carmax.com/car/15833401#ratingsReviews',
 'https://www.carmax.com/car/15773557#ratingsReviews',
 'https://www.carmax.com/car/16056533#ratingsReviews',
 'https://www.carmax.com/car/15893366#ratingsReviews',
 'https://www.carmax.com/car/16042916#ratingsReviews',
 'https://www.carmax.com/car/15956930#ratingsReviews',
 'https://www.carmax.com/car/15895462#ratingsReviews',
 'https://www.carmax.com/car/16156390#ratingsReviews',
 'https://www.carmax.com/car/16154591#ratingsReviews',
 'https://www.carmax.com/car/16210252#ratingsReviews',
 'https://

In [77]:
# Load a saved scrape result and show it as a table.
# NOTE: `colnames` is defined in a later cell of this notebook; run that cell first.
with open('car_specs_2120.pkl', 'rb') as picklefile:
    data20 = pickle.load(picklefile)
pd.DataFrame(data20, columns=colnames)

Unnamed: 0,stock_id,vin_id,images,model,year,brand,city,highway,exterior_color,interior_color,...,rating_1,price,mileage,store,accident,owner,problem,odometer,event,recall
0,16061803,19UUA8F5XDA013730,23,TL,2013,Acura,20,29,White,Black,...,0,18998,42,"San Francisco, CA",0,2,OK,OK,NOT OK,0
1,15824438,19UUB1F33HA003761,22,TLX,2017,Acura,24,35,Red,Tan,...,0,24998,20,"San Francisco, CA",1,1,OK,OK,NOT OK,0
2,15165784,WAUACGFF8F1040956,21,A3 Premium,2015,Audi,23,33,Black,Black,...,0,17599,36,"San Francisco, CA",0,1,OK,OK,NOT OK,1
3,15210066,WAUACGFF7F1024750,21,A3 Premium,2015,Audi,,,Burgundy,Gray,...,0,16998,37,"San Francisco, CA",1,1,OK,OK,NOT OK,0
4,15824549,WAUE8GFF6G1004350,22,A3 Premium Plus,2016,Audi,24,33,Red,Black,...,0,24599,12,"San Francisco, CA",0,1,OK,OK,OK,0
5,15867329,WAUEFGFF8F1049522,21,A3 Premium Plus,2015,Audi,24,33,Brown,Brown,...,0,20599,29,"San Francisco, CA",0,2,OK,OK,NOT OK,0
6,15380380,WAUAFAFL0FA022347,25,A4 Premium,2015,Audi,24,32,Black,Black,...,0,20998,37,"San Francisco, CA",0,1,OK,OK,NOT OK,1
7,15867035,WAUAFAFL0CN012485,20,A4 Premium,2012,Audi,22,30,Black,Black,...,0,14998,66,"San Francisco, CA",0,3,OK,OK,NOT OK,0
8,15944508,WAUEFAFL7FN019312,21,A4 Premium Plus,2015,Audi,24,32,Black,Gray,...,0,20998,30,"San Francisco, CA",0,1,OK,OK,NOT OK,1
9,15086949,WAUCFAFC5DN062406,21,A6 Premium,2013,Audi,25,33,White,Black,...,1,19998,59,"San Francisco, CA",2,2,OK,OK,NOT OK,1


In [42]:
# Load a saved scrape result and show it as a table.
# NOTE: `colnames` is defined in a later cell of this notebook; run that cell first.
with open('car_specs_40.pkl', 'rb') as picklefile:
    data40 = pickle.load(picklefile)
pd.DataFrame(data40, columns=colnames)

Unnamed: 0,stock_id,vin_id,images,model,year,brand,city,highway,exterior_color,interior_color,...,rating_1,price,mileage,store,accident,owner,problem,odometer,event,recall
0,16061803,19UUA8F5XDA013730,23,TL,2013,Acura,20.0,29.0,White,Black,...,0,18998.0,42,"From San Francisco, CA",1,2,OK,OK,NOT OK,0
1,15824438,19UUB1F33HA003761,22,TLX,2017,Acura,24.0,35.0,Red,Tan,...,0,24998.0,20,"From San Francisco, CA",1,1,OK,OK,NOT OK,0
2,15165784,WAUACGFF8F1040956,21,A3 Premium,2015,Audi,23.0,33.0,Black,Black,...,0,17599.0,36,"From San Francisco, CA",1,1,OK,OK,NOT OK,1
3,15210066,WAUACGFF7F1024750,21,A3 Premium,2015,Audi,,,Burgundy,Gray,...,0,16998.0,37,"From San Francisco, CA",1,1,OK,OK,NOT OK,0
4,15824549,WAUE8GFF6G1004350,22,A3 Premium Plus,2016,Audi,24.0,33.0,Red,Black,...,0,24599.0,12,"San Francisco, CA",1,1,OK,OK,OK,0
5,15867329,WAUEFGFF8F1049522,21,A3 Premium Plus,2015,Audi,24.0,33.0,Brown,Brown,...,0,20599.0,29,"From San Francisco, CA",1,2,OK,OK,NOT OK,0
6,15380380,WAUAFAFL0FA022347,25,A4 Premium,2015,Audi,24.0,32.0,Black,Black,...,0,20998.0,37,"From San Francisco, CA",1,1,OK,OK,NOT OK,1
7,15867035,WAUAFAFL0CN012485,20,A4 Premium,2012,Audi,22.0,30.0,Black,Black,...,0,14998.0,66,"From San Francisco, CA",1,3,OK,OK,NOT OK,0
8,15944508,WAUEFAFL7FN019312,21,A4 Premium Plus,2015,Audi,24.0,32.0,Black,Gray,...,0,20998.0,30,"From San Francisco, CA",1,1,OK,OK,NOT OK,1
9,15086949,WAUCFAFC5DN062406,21,A6 Premium,2013,Audi,25.0,33.0,White,Black,...,1,19998.0,59,"From San Francisco, CA",2,2,OK,OK,NOT OK,1


In [43]:
# Display the current list of car URLs
individual_links

['https://www.carmax.com/car/15879910#ratingsReviews',
 'https://www.carmax.com/car/16061803#ratingsReviews',
 'https://www.carmax.com/car/15824438#ratingsReviews',
 'https://www.carmax.com/car/15165784#ratingsReviews',
 'https://www.carmax.com/car/15210066#ratingsReviews',
 'https://www.carmax.com/car/15824549#ratingsReviews',
 'https://www.carmax.com/car/15867329#ratingsReviews',
 'https://www.carmax.com/car/15380380#ratingsReviews',
 'https://www.carmax.com/car/15867035#ratingsReviews',
 'https://www.carmax.com/car/15944508#ratingsReviews',
 'https://www.carmax.com/car/15086949#ratingsReviews',
 'https://www.carmax.com/car/15585428#ratingsReviews',
 'https://www.carmax.com/car/16057238#ratingsReviews',
 'https://www.carmax.com/car/15867327#ratingsReviews',
 'https://www.carmax.com/car/16056848#ratingsReviews',
 'https://www.carmax.com/car/15867099#ratingsReviews',
 'https://www.carmax.com/car/15381244#ratingsReviews',
 'https://www.carmax.com/car/15982798#ratingsReviews',
 'https://

In [24]:
# Column names for one scraped record, in the order get_car_specs emits the fields:
# parser outputs in sorted class-name order, then the vehicle-history fields.
colnames = [
    # about-this-car--header__subtitle
    'stock_id', 'vin_id',
    # action-bar--image-counter
    'images',
    # car-page-header__car-title__model-trim
    'model',
    # car-page-header__car-title__year-make
    'year', 'brand',
    # gas-mileage-container
    'city', 'highway',
    # kmx-table
    'exterior_color', 'interior_color', 'transmission', 'drive',
    'engine', 'cylinder', 'horsepower', 'torque',
    # linear-rating
    'rating_5', 'rating_4', 'rating_3', 'rating_2', 'rating_1',
    # price-mileage--value
    'price', 'mileage',
    # store-information--location
    'store',
    # get_carhistory
    'accident', 'owner', 'problem', 'odometer', 'event', 'recall',
]

In [None]:
# Show the scraped records as a table.
# NOTE: `car_spec` is loaded by the cell that follows; run that cell first.
pd.DataFrame(car_spec, columns=colnames)

In [696]:
# Load a saved scrape result into `car_spec`.
# NOTE(review): get_car_specs writes its checkpoints as "LA_car_specs_<n>.pkl"; this file name
# lacks the "LA_" prefix -- presumably a renamed or older checkpoint; confirm.
with open('./car_specs_20.pkl', 'rb') as picklefile: 
    car_spec = pickle.load(picklefile)

### Scrape the recall page

URL = "https://www.nhtsa.gov/recalls?vin=4T1BK36B38U322693" <br>
These pages are guarded by a robot detector.

### Scrape the AutoCheck page

In [76]:
def get_carhistory(URL, driver):
    """Fetch a vehicle-history page with ``driver`` and parse its summary table.

    After loading ``URL`` and sleeping five seconds, the text of the first
    ``//table/tbody`` is split into lines, which are read positionally.

    Returns:
        ``[accident_check, cal_owner, problem_check, odometer_check,
        event_check, open_recall]``: three ints and three 'OK'/'NOT OK'
        strings, in that order.  ``open_recall`` stays 0 unless the table has
        exactly six lines.

    Raises:
        Whatever ``find_element_by_xpath`` raises when no ``//table/tbody``
        exists; IndexError/ValueError when a line is not in the expected form.
    """
    def verdict(line, label, clean_text):
        # 'OK' only when the line, minus its label, is exactly the clean wording.
        return 'OK' if line.replace(label, "") == clean_text else 'NOT OK'

    driver.get(URL)
    time.sleep(5)
    summary = driver.find_element_by_xpath("//table/tbody").text.split("\n")

    accident_text = summary[0].replace("Accident Check ", "")
    if accident_text != 'No Accidents Reported':
        accident_check = int(accident_text.split(": ")[1])
    else:
        accident_check = 0

    owner_text = summary[1].replace("Calculated Owners ", "")
    if owner_text != "Only One Owner":
        cal_owner = int(owner_text)
    else:
        cal_owner = 1

    problem_check = verdict(summary[2], "Title and ProblemCheck ", "Your vehicle checks out!")
    odometer_check = verdict(summary[3], "OdometerCheck ", "Your vehicle checks out!")
    event_check = verdict(summary[4], "Use/EventCheck ", "No vehicle use or event information reported")

    # The recall line is read only when the table has exactly six lines.
    if len(summary) == 6:
        open_recall = int(summary[5].replace("Open Safety Recalls Reported Recalls: ", ""))
    else:
        open_recall = 0

    return [accident_check, cal_owner, problem_check, odometer_check, event_check, open_recall]

## Data cleaning