In this project, we use web-scraping packages (`requests` and `BeautifulSoup`) to scrape car listing data from the cars.com website.

In [2]:
# import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [3]:
# HTTP request
# Search-results URL; the query string asks for Mercedes-Benz (makes[]=mercedes_benz),
# stock_type=cpo, with page_size=20.
url = "https://www.cars.com/shopping/results/?dealer_id=&keyword=&list_price_max=&list_price_min=&makes[]=mercedes_benz&maximum_distance=30&mileage_max=&page_size=20&sort=best_match_desc&stock_type=cpo&year_max=&year_min=&zip="

# A browser-like User-Agent header is sent along with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.26"
}

# Always pass a timeout: without one, requests can wait indefinitely on a
# stalled connection and hang the notebook kernel.
page = requests.get(url, headers=headers, timeout=30)

# checking status code (200 means the page was fetched successfully)
page.status_code

200

In [4]:
# soup object
# soup1 parses the raw response bytes with Python's built-in "html.parser".
soup1 = BeautifulSoup(page.content, "html.parser")
# soup2 re-parses soup1's prettified (re-indented) markup.
# NOTE(review): the second parse looks redundant since soup1 is already a parsed
# tree, but it may affect whitespace in extracted text — confirm before removing.
soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
# display the parsed document
soup2

<!DOCTYPE html>

<html class="ep-theme-cars" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title data-suffix=" | Cars.com">
   Certified Used Mercedes-Benz for Sale | Cars.com
  </title>
<meta content="Shop Mercedes-Benz vehicles for sale at Cars.com. Research, compare, and save listings, or contact sellers directly from 2,088 Mercedes-Benz models nationwide." name="description"/>
<meta content="noindex, nofollow" name="robots"/>
<meta content="Cars.com" property="og:site_name"/>
<meta content="website" property="og:type"/>
<meta content="Certified Used Mercedes-Benz for Sale | Cars.com" property="og:title"/>
<meta content="https://www.cars.com/shopping/results" property="og:url"/>
<meta content="Shop Mercedes-Benz vehicles for sale at Cars.com. Research, compare, and save listings, or contact sellers directly from 2,088 Mercedes-Benz models nationwide." property=

In [5]:
# gather every <div> carrying the "vehicle-card" CSS class, then count the matches
results = soup2.find_all("div", class_="vehicle-card")
len(results)

21

In [6]:
# display the raw HTML of the first vehicle card, to inspect which tags and
# classes hold the fields extracted in the following cells
results[0]

<div class="vehicle-card inventory-ad" data-inventory-ad="true" data-koddi-click-tracking-url="https://cars.koddi.io/event-collection/beacon?action=click&amp;trackingData={trackingData}&amp;rank={rank}&amp;clientName=Cars&amp;beaconIssued=2022-11-09T10:49:03Z" data-koddi-impression-tracking-url="https://cars.koddi.io/event-collection/beacon?action=impression&amp;trackingData={trackingData}&amp;rank={rank}&amp;ts={ts}&amp;clientName=Cars&amp;beaconIssued=2022-11-09T10:49:03Z" data-koddi-listing-id="61215756-d32d-420f-b367-d9ae67413a5e" data-koddi-tracking-data="H1y5cIzz9bwhT8fe0CAqXYvppbc7XZoQxhgWxgtwxgqPSSF0ufEYhEzgaWJ740RSnfmp7D3MlXfxLEXO8PGwehFoO3EwBC8MrWnimIxWGZOzrVFc2AAuhJW1mnGwia+WrBUCbY2YEZyK8St9FHAOM3FDFpSN/Peb62UIIz4anDRjqP2a1UYV1eCU2kQ027WxpBFusHpu7QlTLgasu8cOIrjyeIRtnTdhyNB4EdGNPvUeaN13R0209DBNYbRs4sb5CKFYxqhma8Ud+x79IsplxjASOp8Byl1uMYfPH/BBBmxNchyVpl9VkelkZ60LNY7VoE0FpJ2k6i2scI91U8cfFB+qnJJMjull4eyGpEulan73k1tvG+EczPPMA/I5/wjjucqxHL0gF1ENeqgwJx7qyHzI/rYCs5hcpx3HNHAnVCmr9zm+G

In [8]:
# extracting the name, mileage, rating, rating count, dealer name and price of the vehicle
# name: text of the first <h2> inside the card
results[0].find("h2").text.strip()


'2020 Mercedes-Benz CLA 250 Base 4MATIC'

In [9]:
# mileage: text of the <div class="mileage"> element of the first card
results[0].find("div", class_="mileage").text.strip()

'17,266 mi.'

In [10]:
# rating: text of the <span class="sds-rating__count"> element of the first card
results[0].find("span", class_="sds-rating__count").text.strip()

'4.4'

In [11]:
# rating count: text of the <span class="sds-rating__link"> element of the first card
results[0].find("span", class_="sds-rating__link").text.strip()

'(1,396 reviews)'

In [12]:
# dealer name: text of the <div class="dealer-name"> element of the first card
results[0].find("div", class_="dealer-name").text.strip()

'Mercedes-Benz of Temecula'

In [13]:
# price: text of the <span class="primary-price"> element of the first card
results[0].find("span", class_="primary-price").text.strip()

'$38,888'

In [14]:
# creating one empty list per output column; each is filled with one entry per vehicle card
name = []
mileage = []
rating = []
rating_count = []
dealer_name = []
price = []

# (target list, tag name, CSS class) for every field read from a vehicle card.
# A class of None means "match on the tag name only".
card_fields = [
    (name, "h2", None),
    (mileage, "div", "mileage"),
    (rating, "span", "sds-rating__count"),
    (rating_count, "span", "sds-rating__link"),
    (dealer_name, "div", "dealer-name"),
    (price, "span", "primary-price"),
]

# Walk the cards once and fill every column in the same pass, so all lists
# always end up with the same length.
for result in results:
    for column, tag, css_class in card_fields:
        attrs = {"class": css_class} if css_class else {}
        node = result.find(tag, attrs)
        # find() returns None when the card has no such element; record the
        # "n/a" placeholder for that case instead of hiding every possible
        # error behind a bare `except:`.
        column.append(node.get_text().strip() if node is not None else "n/a")

In [15]:
# assemble the six scraped lists into one table, pairing each column title with its list
car_dealer = pd.DataFrame(
    dict(
        zip(
            ("Name", "Mileage", "Rating", "Rating Count", "Dealer Name", "Price"),
            (name, mileage, rating, rating_count, dealer_name, price),
        )
    )
)

car_dealer

Unnamed: 0,Name,Mileage,Rating,Rating Count,Dealer Name,Price
0,2020 Mercedes-Benz CLA 250 Base 4MATIC,"17,266 mi.",4.4,"(1,396 reviews)",Mercedes-Benz of Temecula,"$38,888"
1,2021 Mercedes-Benz AMG A 35 Base,"9,401 mi.",4.9,"(1,815 reviews)",Mercedes-Benz of San Antonio,"$48,762"
2,2019 Mercedes-Benz G-Class G 550 4MATIC,"26,273 mi.",,(13 reviews),Mercedes-Benz of Buffalo,"$144,850"
3,2021 Mercedes-Benz AMG GLE 53 Base,"15,000 mi.",4.8,(597 reviews),Astorg Auto of Charleston,"$91,995"
4,2020 Mercedes-Benz S-Class S 560 4MATIC,"30,312 mi.",4.9,"(1,815 reviews)",Mercedes-Benz of San Antonio,"$75,887"
5,2017 Mercedes-Benz AMG GLC 43 Base 4MATIC,"49,137 mi.",4.9,"(1,815 reviews)",Mercedes-Benz of San Antonio,"$43,977"
6,2017 Mercedes-Benz AMG CLA 45 Base 4MATIC,"21,818 mi.",4.9,(828 reviews),Napleton Autowerks Loves Park,"$37,999"
7,2018 Mercedes-Benz G-Class G 550 4MATIC,"21,385 mi.",4.6,"(1,084 reviews)",Mercedes-Benz of Littleton,"$109,999"
8,2022 Mercedes-Benz GLB 250 4MATIC,"4,237 mi.",4.4,(488 reviews),Mercedes-Benz of Scarborough,"$50,715"
9,2021 Mercedes-Benz S-Class S 580 4MATIC,"3,046 mi.",3.2,(233 reviews),Mercedes-Benz of Arrowhead,"$109,988"


In [16]:
# data cleaning: reduce values such as "(1,396 reviews)" to "1,396" in the rating count column
# str.strip() takes a *set of characters*, not a prefix/suffix, so a chained
# strip("reviews)").strip("(") only works by accident and leaves a trailing
# space behind. The anchored pattern removes exactly the leading "(" and the
# trailing " reviews)" / " review)"; values without that wrapper (e.g. the
# "n/a" placeholder) pass through unchanged.
car_dealer["Rating Count"] = car_dealer["Rating Count"].str.replace(
    r"^\(\s*|\s*reviews?\)$", "", regex=True
)

In [17]:
# checking data types of the columns; the output below shows every column as
# "object", which is why the numeric columns are converted in the next step
car_dealer.dtypes

Name            object
Mileage         object
Rating          object
Rating Count    object
Dealer Name     object
Price           object
dtype: object

In [18]:
# correcting the data types
# Rating Count: drop thousands separators and convert to integers. Entries that
# are not numeric (the scraper's "n/a" placeholder) become 0 instead of raising
# ValueError, mirroring how a missing Rating is handled below. Casting to str
# first keeps the cell safe to re-run on an already converted column.
car_dealer["Rating Count"] = (
    pd.to_numeric(
        car_dealer["Rating Count"]
        .astype(str)
        .str.replace(",", "", regex=False)
        .str.strip(),
        errors="coerce",
    )
    .fillna(0)
    .astype("int64")
)

# Rating: a missing rating ("n/a") is recorded as 0, then the column is cast to float.
car_dealer["Rating"] = (
    car_dealer["Rating"].astype(str).str.replace("n/a", "0", regex=False).astype(float)
)

car_dealer

Unnamed: 0,Name,Mileage,Rating,Rating Count,Dealer Name,Price
0,2020 Mercedes-Benz CLA 250 Base 4MATIC,"17,266 mi.",4.4,1396,Mercedes-Benz of Temecula,"$38,888"
1,2021 Mercedes-Benz AMG A 35 Base,"9,401 mi.",4.9,1815,Mercedes-Benz of San Antonio,"$48,762"
2,2019 Mercedes-Benz G-Class G 550 4MATIC,"26,273 mi.",0.0,13,Mercedes-Benz of Buffalo,"$144,850"
3,2021 Mercedes-Benz AMG GLE 53 Base,"15,000 mi.",4.8,597,Astorg Auto of Charleston,"$91,995"
4,2020 Mercedes-Benz S-Class S 560 4MATIC,"30,312 mi.",4.9,1815,Mercedes-Benz of San Antonio,"$75,887"
5,2017 Mercedes-Benz AMG GLC 43 Base 4MATIC,"49,137 mi.",4.9,1815,Mercedes-Benz of San Antonio,"$43,977"
6,2017 Mercedes-Benz AMG CLA 45 Base 4MATIC,"21,818 mi.",4.9,828,Napleton Autowerks Loves Park,"$37,999"
7,2018 Mercedes-Benz G-Class G 550 4MATIC,"21,385 mi.",4.6,1084,Mercedes-Benz of Littleton,"$109,999"
8,2022 Mercedes-Benz GLB 250 4MATIC,"4,237 mi.",4.4,488,Mercedes-Benz of Scarborough,"$50,715"
9,2021 Mercedes-Benz S-Class S 580 4MATIC,"3,046 mi.",3.2,233,Mercedes-Benz of Arrowhead,"$109,988"


In [19]:
# export to excel; index=False keeps the DataFrame's row index out of the file
car_dealer.to_excel("single_page_car.xlsx", index=False)

The data scraped so far came from only a single results page of the URL. In the cells below, we scrape data from multiple results pages.

In [20]:
# creating one empty list per output column; they accumulate entries across all pages
name = []
mileage = []
rating = []
rating_count = []
dealer_name = []
price = []

# (target list, tag name, CSS class) for every field read from a vehicle card.
# A class of None means "match on the tag name only".
card_fields = [
    (name, "h2", None),
    (mileage, "div", "mileage"),
    (rating, "span", "sds-rating__count"),
    (rating_count, "span", "sds-rating__link"),
    (dealer_name, "div", "dealer-name"),
    (price, "span", "primary-price"),
]

# The headers do not depend on the page number, so they are built once
# instead of on every iteration.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.26"
}

# result pages 1 through 10
for i in range(1, 11):
    # url in a variable; only the page number changes between iterations
    url = (
        "https://www.cars.com/shopping/results/?page="
        + str(i)
        + "&page_size=20&dealer_id=&keyword=&list_price_max=&list_price_min=&makes[]=mercedes_benz&maximum_distance=30&mileage_max=&sort=best_match_desc&stock_type=cpo&year_max=&year_min=&zip="
    )

    # A timeout prevents a stalled connection from hanging the whole loop.
    page = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of silently producing a dataset
    # with a page of listings missing.
    page.raise_for_status()

    # soup object (parsed, prettified, and parsed again, as in the single-page cells)
    soup1 = BeautifulSoup(page.content, "html.parser")
    soup2 = BeautifulSoup(soup1.prettify(), "html.parser")

    # results: every vehicle card on this page
    results = soup2.find_all("div", {"class": "vehicle-card"})

    # Walk the cards once and fill every column in the same pass, so all
    # lists always end up with the same length.
    for result in results:
        for column, tag, css_class in card_fields:
            attrs = {"class": css_class} if css_class else {}
            node = result.find(tag, attrs)
            # find() returns None when the card has no such element; record
            # the "n/a" placeholder for that case instead of hiding every
            # possible error behind a bare `except:`.
            column.append(node.get_text().strip() if node is not None else "n/a")

In [21]:
# assemble the lists gathered from all pages into one table, pairing each column title with its list
car_dealer_combined = pd.DataFrame(
    dict(
        zip(
            ("Name", "Mileage", "Rating", "Rating Count", "Dealer Name", "Price"),
            (name, mileage, rating, rating_count, dealer_name, price),
        )
    )
)

car_dealer_combined

Unnamed: 0,Name,Mileage,Rating,Rating Count,Dealer Name,Price
0,2020 Mercedes-Benz CLA 250 Base 4MATIC,"17,266 mi.",4.4,"(1,396 reviews)",Mercedes-Benz of Temecula,"$38,888"
1,2021 Mercedes-Benz AMG A 35 Base,"9,401 mi.",4.9,"(1,815 reviews)",Mercedes-Benz of San Antonio,"$48,762"
2,2021 Mercedes-Benz AMG GLE 53 Base,"15,000 mi.",4.8,(597 reviews),Astorg Auto of Charleston,"$91,995"
3,2019 Mercedes-Benz G-Class G 550 4MATIC,"26,273 mi.",,(13 reviews),Mercedes-Benz of Buffalo,"$144,850"
4,2020 Mercedes-Benz S-Class S 560 4MATIC,"30,312 mi.",4.9,"(1,815 reviews)",Mercedes-Benz of San Antonio,"$75,887"
...,...,...,...,...,...,...
201,2020 Mercedes-Benz GLS 450 Base 4MATIC,"34,630 mi.",4.8,(70 reviews),Mercedes-Benz of West Houston,"$71,990"
202,2018 Mercedes-Benz AMG GLE 63 S Coupe 4MATIC,"15,886 mi.",4.1,"(1,017 reviews)",RBM of Atlanta,"$83,919"
203,2022 Mercedes-Benz GLC 300 Base 4MATIC,"6,874 mi.",4.8,"(1,238 reviews)",Mercedes-Benz of North Haven,"$52,375"
204,2019 Mercedes-Benz AMG GLC 43 Base 4MATIC,"29,391 mi.",4.6,"(1,084 reviews)",Mercedes-Benz of Littleton,"$49,999"


In [22]:
# check for duplicates: returns one boolean per row, True where the row
# repeats an earlier row in every column
car_dealer_combined.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
201    False
202    False
203    False
204    False
205    False
Length: 206, dtype: bool

In [23]:
# data cleaning: reduce values such as "(1,396 reviews)" to "1,396" in the rating count column
# str.strip() takes a *set of characters*, not a prefix/suffix, so a chained
# strip("reviews)").strip("(") only works by accident and leaves a trailing
# space behind. The anchored pattern removes exactly the leading "(" and the
# trailing " reviews)" / " review)"; values without that wrapper (e.g. the
# "n/a" placeholder) pass through unchanged.
car_dealer_combined["Rating Count"] = car_dealer_combined["Rating Count"].str.replace(
    r"^\(\s*|\s*reviews?\)$", "", regex=True
)

In [24]:
# checking data types; the output below shows every column as "object",
# which is why the numeric columns are converted in the next step
car_dealer_combined.dtypes

Name            object
Mileage         object
Rating          object
Rating Count    object
Dealer Name     object
Price           object
dtype: object

In [25]:
# correcting data types
# Rating Count: drop thousands separators and convert to integers. Entries that
# are not numeric (the scraper's "n/a" placeholder) become 0 instead of raising
# ValueError, mirroring how a missing Rating is handled below. Casting to str
# first keeps the cell safe to re-run on an already converted column.
car_dealer_combined["Rating Count"] = (
    pd.to_numeric(
        car_dealer_combined["Rating Count"]
        .astype(str)
        .str.replace(",", "", regex=False)
        .str.strip(),
        errors="coerce",
    )
    .fillna(0)
    .astype("int64")
)

# Rating: a missing rating ("n/a") is recorded as 0, then the column is cast to float.
car_dealer_combined["Rating"] = (
    car_dealer_combined["Rating"]
    .astype(str)
    .str.replace("n/a", "0", regex=False)
    .astype(float)
)

In [26]:
# display the combined table after the type conversion
car_dealer_combined

Unnamed: 0,Name,Mileage,Rating,Rating Count,Dealer Name,Price
0,2020 Mercedes-Benz CLA 250 Base 4MATIC,"17,266 mi.",4.4,1396,Mercedes-Benz of Temecula,"$38,888"
1,2021 Mercedes-Benz AMG A 35 Base,"9,401 mi.",4.9,1815,Mercedes-Benz of San Antonio,"$48,762"
2,2021 Mercedes-Benz AMG GLE 53 Base,"15,000 mi.",4.8,597,Astorg Auto of Charleston,"$91,995"
3,2019 Mercedes-Benz G-Class G 550 4MATIC,"26,273 mi.",0.0,13,Mercedes-Benz of Buffalo,"$144,850"
4,2020 Mercedes-Benz S-Class S 560 4MATIC,"30,312 mi.",4.9,1815,Mercedes-Benz of San Antonio,"$75,887"
...,...,...,...,...,...,...
201,2020 Mercedes-Benz GLS 450 Base 4MATIC,"34,630 mi.",4.8,70,Mercedes-Benz of West Houston,"$71,990"
202,2018 Mercedes-Benz AMG GLE 63 S Coupe 4MATIC,"15,886 mi.",4.1,1017,RBM of Atlanta,"$83,919"
203,2022 Mercedes-Benz GLC 300 Base 4MATIC,"6,874 mi.",4.8,1238,Mercedes-Benz of North Haven,"$52,375"
204,2019 Mercedes-Benz AMG GLC 43 Base 4MATIC,"29,391 mi.",4.6,1084,Mercedes-Benz of Littleton,"$49,999"


In [27]:
# export to excel; index=False keeps the DataFrame's row index out of the file
car_dealer_combined.to_excel("multiple_page_car.xlsx", index=False)