In [None]:
import requests
import json
from time import sleep
import time
from bs4 import BeautifulSoup
from pathlib import Path
import numpy as np
import datetime
import os
import re
import glob
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
import pandas as pd
import re
from collections import defaultdict
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Show complete DataFrames (no row/column truncation) when they are displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Date stamp (YYYYMMDD) used in output file names. Taken from a single now() call so that
# year, month and day can never come from different sides of midnight.
now = datetime.datetime.now().strftime('%Y%m%d')


In [None]:
# Step 1: Identify the scope of the dataset by data-listing-id (cars)
#
# For every make, read the total number of matches, work out how many result pages
# that is, and store the listing ids found on each page as a JSON list in
# "car lists/<make><YYYYMMDD>carlist<page>.txt".

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

make_all = ['Toyota','Subaru','Mitsubishi','Honda','Nissan','Mazda'] # mainstream mass-market Japanese brands
priceMx = 20000 # dollars
distance = 500 # miles
zipcode = 60126 # my zipcode 
cars_per_page = 100

# The output folder is the same for every make, so create it once up front.
os.makedirs('car lists', exist_ok=True)

for j, make in enumerate(make_all):

    # Search URL shared by the first request and by every result page of this make.
    search_url = ('https://www.cars.com/shopping/results/?stock_type=used&makes%5B%5D='+make.lower()
                  +'&models%5B%5D=&maximum_distance='+str(distance)+'&zip='+str(zipcode)
                  +'&list_price_max='+str(priceMx)+'&page_size='+str(cars_per_page))
    url = search_url
    print(url)

    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as err:
        # A network problem with one make should not abort the remaining makes.
        print('Request failed for '+make+', skipping this make: '+str(err))
        continue
    soup = BeautifulSoup(response.content, 'lxml')

    print(make)

    # The total is read from the first run of digits/commas in the 'total-entries' span,
    # so both "1,234 matches" and "1 match" parse; a page without the span is skipped.
    total_tag = soup.find('span', class_ = 'total-entries')
    total_match = re.search(r'\d[\d,]*', total_tag.text) if total_tag is not None else None
    if total_match is None:
        print('Could not read the number of matches for '+make+', skipping this make')
        continue
    cars_total = int(total_match.group().replace(',', ''))

    print(cars_total)

    # Ceiling division: a partially filled last page still counts as a page.
    no_of_pages = (cars_total + cars_per_page - 1) // cars_per_page

    for i in range(no_of_pages):

        print(str(j+1)+' '+make+' '+str(i+1)+'/'+str(no_of_pages))

        # Random pause of 6-20 seconds between page requests.
        sleep(np.random.randint(6, 21))

        url = search_url+'&page='+str(i+1)
        print(url)

        try:
            # Same timeout as the first request so a stalled connection cannot hang the cell.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as err:
            print('Request failed, skipping this page: '+str(err))
            continue
        soup = BeautifulSoup(response.content, 'lxml')

        # Cards without a data-listing-id attribute are ignored instead of discarding the page.
        card_ids = (div.get("data-listing-id") for div in soup.find_all("div", class_="vehicle-card"))
        data = [listing_id for listing_id in card_ids if listing_id]

        # Always write valid JSON (possibly an empty list) so Step 2 can json.load every file.
        with open("car lists/"+make+now+"carlist"+str(i)+".txt","w") as file:
            json.dump(data, file)


In [None]:
# Step 2: Links to all cars

def load_listing_links(file_paths):
    """Build vehicle-detail links from the listing-id files written in Step 1.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of text files that are each expected to hold a JSON list of listing ids.

    Returns
    -------
    list of str
        One "https://www.cars.com/vehicledetail/<id>/" link per id, in file order,
        duplicates included. Files that cannot be read, are not valid JSON, or do not
        contain a JSON list are reported and skipped instead of aborting the whole run.
    """
    links = []
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as err:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Skipping {file_path}: {err}")
            continue
        if not isinstance(data, list):
            # Iterating a JSON string/dict would yield characters/keys, not listing ids.
            print(f"Skipping {file_path}: expected a JSON list")
            continue
        links.extend(f"https://www.cars.com/vehicledetail/{item}/" for item in data)
    return links

# Sorted so the files are processed in the same order on every run.
list_of_files = sorted(glob.glob(os.path.join("car lists", "**", "*.txt"), recursive=True))

linkvector = load_listing_links(list_of_files)

# Sorted as well: plain set order changes between kernel sessions, which would make
# any positional index into this list meaningless after a restart.
unique_links = sorted(set(linkvector))

print(len(unique_links))


In [None]:
# Step 3: Grab info from each car

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# Date stamp (YYYYMMDD) used in the name of the raw dataset file; read from a single
# now() call so year, month and day always belong to the same instant.
now = datetime.datetime.now().strftime('%Y%m%d')

# Links produced in Step 2, one per car.
url = unique_links

# exist_ok avoids the separate "does it exist?" check.
os.makedirs('datasets', exist_ok=True)

data_folder = Path("datasets/")

# Column order of the raw dataset; every scraped row must supply one value per entry.
# NOTE(review): "fuel_type" is listed twice, so the frame (and the saved CSV) carries a
# duplicated column. Kept as-is because later steps may rely on this layout — confirm
# and de-duplicate together with the row-building code.
column_names = [
    "make", "year", "trim", "model", "stock_type", "fuel_type", "bodystyle", "photo_count",
    "exterior_color", "interior_color", "drivetrain", "fuel_type", "transmission", "engine",
    "mileage", "accidents_or_damage", "clean_title", "one_owner_vehicle", "personal_use_only",
    "open_recall", "comfort_rating", "interior_rating", "performance_rating", "value_rating",
    "exterior_rating", "reliability_rating", "singleurl", "price"
]

# Empty frame that the scraping loop fills row by row.
dataset = pd.DataFrame([], columns=column_names)

def try_get_text(soup, label):
    """Return the stripped text of the <dd> that follows the <dt> whose string is `label`.

    Parameters
    ----------
    soup : parsed page exposing a BeautifulSoup-style ``find`` method.
    label : str
        Exact text of the <dt> element to look for.

    Returns
    -------
    str
        The <dd> text, or "NA" when the <dt>/<dd> pair is not on the page.
    """
    try:
        return soup.find('dt', string=label).find_next_sibling('dd').get_text(strip=True)
    except Exception:
        # Best effort: a missing element (None has no .find_next_sibling/.get_text) yields "NA".
        # `Exception` instead of a bare except so Ctrl-C / kernel interrupt is not swallowed.
        return "NA"

def try_get_rating(soup, label):
    """Return the rating value shown next to the rating name `label`.

    Looks up the <span class="sds-definition-list__display-name"> whose string is
    `label`, then reads the <span class="sds-definition-list__value"> found under
    the same parent element.

    Parameters
    ----------
    soup : parsed page exposing a BeautifulSoup-style ``find`` method.
    label : str
        Exact display name of the rating (e.g. 'Comfort').

    Returns
    -------
    str
        The stripped text of the value span, or "NA" when any part of the lookup fails.
    """
    try:
        name_span = soup.find('span', class_='sds-definition-list__display-name', string=label)
        value_span = name_span.parent.find('span', class_='sds-definition-list__value')
        return value_span.get_text(strip=True)
    except Exception:
        # Best effort: missing elements yield "NA". `Exception` instead of a bare except
        # so Ctrl-C / kernel interrupt is not swallowed.
        return "NA"

def try_get_json_value(soup, label):
    """Return the value stored under `label` in the page's "initial-als-data" JSON script.

    Parameters
    ----------
    soup : parsed page exposing a BeautifulSoup-style ``find`` method.
    label : str
        Key to read from the parsed JSON.

    Returns
    -------
    object
        The value for `label`; None when the JSON parses but has no such key
        (``.get`` default); "NA" when the script tag is missing or cannot be
        parsed/read as a mapping.
    """
    try:
        script = soup.find("script", {"type": "application/json", "id": "initial-als-data"})
        return json.loads(script.string).get(label)
    except Exception:
        # Best effort: missing tag or malformed JSON yields "NA". `Exception` instead of a
        # bare except so Ctrl-C / kernel interrupt is not swallowed.
        return "NA"

def try_get_photo_count(soup, label='photo_count'):
    """Return a value from the page's "initial-activity-data" JSON script.

    Parameters
    ----------
    soup : parsed page exposing a BeautifulSoup-style ``find`` method.
    label : str, optional
        Key to read from the parsed JSON; defaults to 'photo_count'. The argument is
        now honoured instead of being overwritten inside the function.

    Returns
    -------
    object
        The value for `label`; None when the JSON parses but has no such key
        (``.get`` default); "NA" when the script tag is missing or cannot be
        parsed/read as a mapping.
    """
    try:
        script = soup.find("script", {"type": "application/json", "id": "initial-activity-data"})
        return json.loads(script.string).get(label)
    except Exception:
        # Best effort: missing tag or malformed JSON yields "NA". `Exception` instead of a
        # bare except so Ctrl-C / kernel interrupt is not swallowed.
        return "NA"
    
# Index of the first link to scrape; use 0 to scrape every link.
# NOTE(review): 10080 presumably resumes an earlier, interrupted run — confirm before a fresh run.
start_index = 10080

# Text of the <dt> label on the detail page -> dataset column.
text_fields = {
    'Exterior color': 'exterior_color',
    'Interior color': 'interior_color',
    'Drivetrain': 'drivetrain',
    'Fuel type': 'fuel_type',
    'Transmission': 'transmission',
    'Engine': 'engine',
    'Mileage': 'mileage',
    'Accidents or damage': 'accidents_or_damage',
    'Clean title': 'clean_title',
    '1-owner vehicle': 'one_owner_vehicle',
    'Personal use only': 'personal_use_only',
    'Open recall': 'open_recall',
}

# Rating display name -> dataset column.
rating_fields = {
    'Comfort': 'comfort_rating',
    'Interior': 'interior_rating',
    'Performance': 'performance_rating',
    'Value': 'value_rating',
    'Exterior': 'exterior_rating',
    'Reliability': 'reliability_rating',
}

# Key in the page's embedded listing JSON -> dataset column.
json_fields = {
    'make': 'make',
    'model_year': 'year',
    'trim': 'trim',
    'model': 'model',
    'stock_type': 'stock_type',
    'bodystyle': 'bodystyle',
    'price': 'price',
}

for i in range(start_index, len(url)):

    # Links are stored with a trailing slash; request them without it.
    singleurl = url[i][:-1]

    # Random pause of 4-10 seconds between detail-page requests.
    time.sleep(np.random.uniform(4, 10))

    print(str(i)+' of '+str(len(url)))
    print(singleurl)

    try:
        response = requests.get(singleurl, headers=headers, timeout=100)
    except requests.RequestException as err:
        # One unreachable page should not end a scrape of thousands of links.
        print('Request failed, skipping: '+str(err))
        continue

    # Name the parser explicitly (same one Step 1 uses) instead of the generic 'html'
    # feature, which makes BeautifulSoup guess a parser.
    soup = BeautifulSoup(response.content, 'lxml')

    row = {'singleurl': singleurl, 'photo_count': try_get_photo_count(soup, "photo_count")}
    for label, column in text_fields.items():
        row[column] = try_get_text(soup, label)
    for label, column in rating_fields.items():
        row[column] = try_get_rating(soup, label)
    for label, column in json_fields.items():
        row[column] = try_get_json_value(soup, label)

    # One value per entry of column_names, in the same order (a column listed twice
    # gets the same value twice). Appended immediately so that rows collected so far
    # survive an interrupted run.
    data_point = [row[column] for column in column_names]

    dataset.loc[len(dataset)] = data_point
    

In [None]:
# Persist the scraped rows as datasets/<YYYYMMDD>_dataset_raw.csv (no index column).
raw_csv_path = f"datasets/{now}_dataset_raw.csv"
dataset.to_csv(raw_csv_path, index=False)

In [None]:
# Sanity check: display the (rows, columns) shape of the scraped DataFrame.
dataset.shape