# Import libraries

In [8]:
from abc import ABC, abstractmethod
import requests
from bs4 import BeautifulSoup
import json
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple
import logging
from urllib.parse import urljoin
import os

from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep


# Set up the data model and the base scraper class

In [9]:
@dataclass
class CarInfo:
	"""One scraped car listing.

	Instances are serialized with ``vars()`` when written to JSON, so the
	field names below are also the keys of the saved records.
	"""
	# Make/brand label; scrapers may fill in a placeholder that the caller overwrites.
	brand: str
	# Listing or model title as shown on the page.
	name: str
	# Price text exactly as scraped (not parsed into a number); None when absent.
	price: Optional[str]
	# Specification label -> value, both as page text.
	specifications: Dict[str, str]
	# Absolute image URLs.
	images: List[str]
	# Numeric identifier assigned by the scraper.
	id: int
	# NOTE: there is intentionally no `description` field at the moment.

class BaseScraper(ABC):
	"""Common base for site scrapers: an HTTP session plus a class-level id counter.

	Subclasses implement ``extract_car_info`` and ``get_all_car_urls``. The id
	counter and the "seeded" flag are class attributes, so they are shared by
	all instances of the class they are set on.
	"""
	# Last id handed out by get_next_id().
	# NOTE(review): starts at -2 rather than 0; presumably this compensates for
	# extra get_next_id() calls made while a scraper seeds its first id -- confirm
	# before changing.
	numberOfCars = -2
	# True once a scraper has seeded numberOfCars (see pass_ID / check_state_id).
	idState = False
	# Seconds to wait for an HTTP response; without a timeout a stalled
	# connection would block the crawl indefinitely.
	request_timeout = 30

	def __init__(self, base_url: str):
		"""Store the site root and prepare a session with a browser-like User-Agent."""
		self.base_url = base_url
		self.session = requests.Session()
		self.headers = {
			'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
		}

	def get_soup(self, url: str) -> BeautifulSoup:
		"""Download ``url`` and return it parsed with the built-in HTML parser.

		Raises whatever ``requests`` raises on connection problems, including a
		timeout after ``request_timeout`` seconds. The HTTP status is not checked.
		"""
		response = self.session.get(url, headers=self.headers, timeout=self.request_timeout)
		return BeautifulSoup(response.content, 'html.parser')

	@classmethod
	def get_next_id(cls):
		"""Increment the counter and return the new value."""
		cls.numberOfCars += 1
		return cls.numberOfCars

	@classmethod
	def get_total_cars(cls) -> int:
		"""Return the current counter value (the last id handed out)."""
		return cls.numberOfCars

	@classmethod
	def set_numberOfCar(cls, value):
		"""Overwrite the counter, e.g. to continue after ids from an earlier run."""
		cls.numberOfCars = value

	@classmethod
	def check_state_id(cls) -> bool:
		"""Return True once the counter has been marked as seeded."""
		return cls.idState

	@classmethod
	def pass_ID(cls):
		"""Mark the counter as seeded."""
		cls.idState = True

	@abstractmethod
	def extract_car_info(self, url: str) -> CarInfo:
		"""Scrape one car page and return its data."""
		pass

	@abstractmethod
	def get_all_car_urls(self) -> List[Dict[str, str]]:
		"""Return the car pages to scrape, one ``{"brand": ..., "url": ...}`` dict each."""
		pass

## Toyota

Crawl a specific page

In [10]:
class Website1Scraper(BaseScraper):
	"""Scraper for the Toyota site: model pages are discovered from the home-page tabs."""

	# Brand recorded on every record and URL entry produced by this scraper.
	brand = "Toyota"

	def extract_car_info(self, url: str) -> CarInfo:
		"""Scrape one model page into a ``CarInfo``.

		Args:
			url: Absolute URL of the model page.

		Returns:
			The scraped record; ``price`` is None when no price element is found.

		Raises:
			Exception: any parsing error (for example a missing title element)
				is logged and re-raised.
		"""
		soup = self.get_soup(url)

		try:
			# Model name; a missing title raises AttributeError, handled below.
			name = soup.select_one('.text-title.mb-32.text-left').text.strip()

			# Price: the currency lives in a sub-element whose text is also part of
			# the parent's text, so strip it before re-appending it once.
			price = None
			price_element = soup.select_one('.col-7 .concept-car-info .concept-car-value')
			if price_element:
				price_text = price_element.text.strip()
				currency_element = price_element.select_one('.concept-car-value-sub')
				currency = currency_element.text.strip() if currency_element else 'VND'
				if currency in price_text:
					price_text = price_text.replace(currency, '').strip()
				price = f"{price_text} {currency}"

			# Specifications: every info row except the price row itself.
			specs = {}
			for row in soup.select('.concept-car-info'):
				try:
					key = row.select_one('.concept-car-name').text.strip()
					value = row.select_one('.concept-car-value').text.strip()
				except (AttributeError, IndexError):
					# Row lacks a name or value element; skip it.
					continue
				if "Giá từ" not in key and "VNĐ" not in value:
					specs[key] = value

			# Images: read from `data-src`, normalised to absolute URLs.
			images = []
			for img in soup.select('.product-detail-img img'):
				src = img.get('data-src')
				if not src:
					continue
				if src.startswith('//'):
					src = 'https:' + src
				elif not src.startswith('http'):
					src = f"{self.base_url.rstrip('/')}/{src.lstrip('/')}"
				images.append(src)

			# The shared counter starts below zero; clamp it so ids are positive.
			if self.get_total_cars() < 0:
				self.set_numberOfCar(0)

			# CarInfo has no `description` field, so the page description is not
			# collected; `brand` and `id` are required and must be supplied.
			return CarInfo(
				brand=self.brand,
				name=name,
				price=price,
				specifications=specs,
				images=images,
				id=self.get_next_id()
			)

		except Exception as e:
			logging.error(f"Error extracting car info from {url}: {str(e)}")
			raise

	def get_all_car_urls(self) -> List[Dict[str, str]]:
		"""Open the home page in Edge, click through every vehicle tab and collect model links.

		Returns:
			One ``{"brand": ..., "url": ...}`` dict per distinct model URL, in
			first-seen order (the shape ``CarDataCollector`` indexes into).
		"""
		driver = webdriver.Edge()
		found = []
		try:
			driver.get(self.base_url)

			# Each tab is a vehicle category; clicking it updates the carousel.
			tabs = driver.find_elements(By.CSS_SELECTOR, '.discovery-vehicles-tab-item')
			for tab in tabs:
				tab.click()
				soup = BeautifulSoup(driver.page_source, 'html.parser')
				for link in soup.select('.swiper-discovery-vehicles-item a'):
					href = link.get('href')
					if href:
						# urljoin handles relative and already-absolute hrefs alike.
						found.append(urljoin(self.base_url, href))
		finally:
			# Always close the browser, even if a click or page load failed.
			driver.quit()

		# The same model can show up under several tabs; keep the first occurrence.
		urls = [{"brand": self.brand, "url": url} for url in dict.fromkeys(found)]

		print(urls)
		return urls



## Cars.com
- /shopping <br>
- used cars


Load the checkpoint and set up the functions for crawling


In [None]:
class Website2Scraper(BaseScraper):
	"""Scraper for cars.com listings that crawls brand by brand and can resume.

	``get_all_car_urls`` does the actual crawl: for every brand it walks the
	result pages, extracts each listing, appends the page's records to
	``save_path`` and then records its progress in ``checkpoint_file``.

	Checkpoint layout (one entry per brand)::

		{brand: {"current_url": next page to crawl or None,
		         "collected_urls": [pages already crawled],
		         "id": last car id assigned for the brand}}
	"""
	checkpoint_file = "checkpoint.json"
	# JSON file that crawled records are appended to.
	save_path = "dataset/cars_used.json"
	# Seconds to wait for an HTTP response before giving up on a page.
	request_timeout = 30

	def __init__(self, base_url: str):
		super().__init__(base_url)
		# url -> CarInfo for every listing extracted during get_all_car_urls(), so a
		# later extract_car_info(url) call returns it without a second download.
		self._crawled = {}

	def save_to_json(self, filename: str, cars: List[CarInfo]):
		"""Append ``cars`` to the JSON list stored in ``filename``.

		An unreadable (invalid JSON) file is treated as empty; a missing parent
		directory is created.
		"""
		existing_data = []
		if os.path.exists(filename):
			with open(filename, 'r', encoding='utf-8') as f:
				try:
					existing_data = json.load(f)
				except json.JSONDecodeError:
					existing_data = []
		else:
			parent = os.path.dirname(filename)
			if parent:
				os.makedirs(parent, exist_ok=True)

		existing_data.extend(vars(car) for car in cars)

		with open(filename, 'w', encoding='utf-8') as f:
			json.dump(existing_data, f, ensure_ascii=False, indent=2)

	def save_checkpoint(self, data):
		"""Write the checkpoint atomically.

		The data goes to a temporary file that then replaces the real one, so an
		interrupted run cannot leave a truncated checkpoint behind.
		"""
		tmp_path = f"{self.checkpoint_file}.tmp"
		with open(tmp_path, "w") as f:
			json.dump(data, f, indent=4)
		os.replace(tmp_path, self.checkpoint_file)

	def load_checkpoint(self):
		"""Return the saved checkpoint dict, or ``{}`` when no checkpoint file exists."""
		if os.path.exists(self.checkpoint_file):
			with open(self.checkpoint_file, "r") as f:
				return json.load(f)
		return {}

	@staticmethod
	def _max_checkpoint_id(checkpoint: dict) -> int:
		"""Return the highest ``id`` stored in ``checkpoint`` (0 when there is none)."""
		return max([0] + [entry.get("id", 0) for entry in checkpoint.values()])

	def get_max_id_from_checkpoint(self):
		"""Return the highest car id recorded in the checkpoint file (0 when none)."""
		return self._max_checkpoint_id(self.load_checkpoint())

	def _sync_id_counter(self, checkpoint: dict) -> None:
		"""Raise the shared id counter to at least the checkpoint's highest id and mark it seeded."""
		self.set_numberOfCar(max(self.get_total_cars(), self._max_checkpoint_id(checkpoint)))
		self.pass_ID()

	def get_html_with_requests(self, url: str) -> str:
		"""Download ``url`` and return the response body as text."""
		response = self.session.get(url, headers=self.headers, timeout=self.request_timeout)
		return response.text

	def extract_car_info(self, url: str, checkpoint: Optional[dict] = None, car_brand: Optional[str] = None) -> CarInfo:
		"""Scrape one listing page into a ``CarInfo`` with a fresh id.

		Args:
			url: Absolute URL of the listing.
			checkpoint: Already-loaded checkpoint, used only to seed the id
				counter the first time; loaded from disk when omitted.
			car_brand: Brand to record; ``"Unknown"`` when omitted.

		Returns:
			The scraped record. When called with ``url`` only and the listing was
			already extracted by ``get_all_car_urls`` on this instance, that same
			record is returned instead of downloading the page again.

		Raises:
			Exception: any parsing error (for example a missing title element)
				is logged and re-raised.
		"""
		if checkpoint is None and car_brand is None:
			cached = self._crawled.get(url)
			if cached is not None:
				return cached

		soup = self.get_soup(url)

		try:
			# Listing title; a missing element raises AttributeError, handled below.
			name = soup.select_one('.title-section .listing-title').text.strip()

			# Price text as shown on the page, if any.
			price = None
			price_element = soup.select_one('span[data-qa="primary-price"]')
			if price_element:
				price = price_element.text.strip()

			# Specifications: pair each <dt> with the <dd> that follows it.
			specs = {}
			current_key = None
			for element in soup.select('dl.fancy-description-list dt, dl.fancy-description-list dd'):
				if element.name == 'dt':
					current_key = element.text.strip()
				elif element.name == 'dd' and current_key:
					specs[current_key] = element.text.strip()
					current_key = None

			# Images: use the first selector that matches anything.
			images = []
			gallery_images = (
				soup.select('.vdp-gallery img[modal-src]') or
				soup.select('img[modal-src]') or
				soup.select('img.row-pic')
			)
			for img in gallery_images:
				src = img.get('modal-src') or img.get('src')
				if not src:
					continue
				if src.startswith('//'):
					src = 'https:' + src
				elif not src.startswith('http'):
					src = f"{self.base_url.rstrip('/')}/{src.lstrip('/')}"
				if src not in images:
					images.append(src)

			# Id: assigned last so a listing that fails to parse does not consume one.
			# The counter is seeded once, from the highest id in the checkpoint.
			if not self.check_state_id():
				self._sync_id_counter(checkpoint if checkpoint is not None else self.load_checkpoint())
			car_id = self.get_next_id()

			return CarInfo(
				brand=car_brand or "Unknown",
				name=name,
				price=price,
				specifications=specs,
				images=images,
				id=car_id
			)

		except Exception as e:
			logging.error(f"Error extracting car info from {url}: {str(e)}")
			raise

	def _next_page_url(self, soup) -> Optional[str]:
		"""Return the URL of the next result page linked from ``soup``, or None on the last page."""
		next_link = soup.find("link", rel="next")
		if not (next_link and next_link.get("href")):
			return None
		next_url = urljoin(self.base_url, next_link["href"])
		# NOTE(review): appended with '&', so this assumes the href already carries a
		# query string -- confirm against the site.
		if "&maximum_distance=all" not in next_url:
			next_url += "&maximum_distance=all"
		return next_url

	def _crawl_brand(self, brand_url: str, checkpoint: dict, all_car_urls: List[Dict[str, str]]) -> None:
		"""Crawl every remaining result page of one brand, saving records and progress per page."""
		# Assumes hrefs shaped like '/shopping/<brand>/'; an IndexError for any
		# other shape propagates to the caller.
		car_brand = brand_url.split('/')[2]

		if car_brand in checkpoint:
			print(f"Continue brand: {car_brand}")
			state = checkpoint[car_brand]
			current_url = state["current_url"]
			collected_urls = state["collected_urls"]
			car_id = state.get("id", 0)
		else:
			print(f"New brand: {car_brand}")
			current_url = urljoin(self.base_url, brand_url)
			collected_urls = []
			car_id = self.get_total_cars()

		skip_urls = set(collected_urls)

		while current_url:
			try:
				html_content = self.get_html_with_requests(current_url)
				soup = BeautifulSoup(html_content, "html.parser")

				if current_url in skip_urls:
					# Already crawled: only follow this page's own next link.
					print(f"Page done: {current_url}, skip!")
					current_url = self._next_page_url(soup)
					if current_url is None:
						print("none")
					continue

				vehicle_links = soup.find_all("a", class_="vehicle-card-link")
				car_urls = [urljoin(self.base_url, link.get("href")) for link in vehicle_links if link.get("href")]

				page_cars = []
				for url in car_urls:
					car_info = self.extract_car_info(url, checkpoint, car_brand)
					car_info.brand = car_brand
					print(car_info.id)
					car_id = car_info.id
					page_cars.append(car_info)
					self._crawled[url] = car_info
					all_car_urls.append({"brand": car_brand, "url": url})

				self.save_to_json(self.save_path, page_cars)
				print(f"Save.")
				if current_url not in collected_urls:
					collected_urls.append(current_url)

				current_url = self._next_page_url(soup)

				# Record progress only after the page's records are on disk.
				checkpoint[car_brand] = {
					"current_url": current_url,
					"collected_urls": collected_urls,
					"id": car_id
				}
				self.save_checkpoint(checkpoint)

			except Exception as e:
				# Give up on this brand for now; the checkpoint still points at the
				# failing page, so a later run retries it.
				print(f"Loi trang{e}")
				current_url = None

	def get_all_car_urls(self) -> List[Dict[str, str]]:
		"""Crawl every brand linked from the base page, resuming from the checkpoint.

		Returns:
			One ``{"brand": ..., "url": ...}`` dict per listing extracted during
			this call. The records themselves are appended to ``save_path`` as
			the crawl proceeds.
		"""
		all_car_urls = []
		checkpoint = self.load_checkpoint()

		try:
			# Continue numbering after the ids already recorded in the checkpoint.
			self._sync_id_counter(checkpoint)

			html_content = self.get_html_with_requests(self.base_url)
			soup = BeautifulSoup(html_content, "html.parser")

			car_brands_section = soup.find("div", class_="sds-link-pack")
			if not car_brands_section:
				print("None Car brands")
				return all_car_urls

			car_links = car_brands_section.find_all("a")
			car_brand_urls = [link.get("href") for link in car_links if link.get("href")]

			for brand_url in car_brand_urls:
				self._crawl_brand(brand_url, checkpoint, all_car_urls)

		except Exception as e:
			print(f"Error: {e}")

		return all_car_urls



In [13]:
class CarDataCollector:
    """Run a list of scrapers and gather the car records they produce."""

    def __init__(self, scrapers: List[BaseScraper]):
        self.scrapers = scrapers

    @staticmethod
    def _split_entry(entry):
        """Return ``(url, brand)`` for a URL entry.

        An entry is either a plain URL string (brand unknown, returned as None)
        or a dict with a ``"url"`` key and an optional ``"brand"`` key.
        """
        if isinstance(entry, str):
            return entry, None
        return entry["url"], entry.get("brand")

    def collect_all_data(self, savepath: str) -> List[CarInfo]:
        """Scrape every URL reported by every scraper.

        Args:
            savepath: Currently unused; kept so existing callers keep working.

        Returns:
            All records that were scraped successfully. A failing URL or a
            failing scraper is logged and skipped, never raised.
        """
        all_cars = []
        for scraper in self.scrapers:
            try:
                for entry in scraper.get_all_car_urls():
                    try:
                        url, brand = self._split_entry(entry)
                        car_info = scraper.extract_car_info(url)
                        # Only override the scraper's own brand when one is given.
                        if brand is not None:
                            car_info.brand = brand
                        all_cars.append(car_info)
                    except Exception as e:
                        logging.error(f"Error scraping car data from {entry}: {str(e)}")
            except Exception as e:
                logging.error(f"Error with scraper {scraper.__class__.__name__}: {str(e)}")
        return all_cars

### Crawl data from the website

In [14]:
# Scrapers to run. Only the cars.com scraper is enabled; the Toyota one is kept
# here, disabled, for reference.
scrapers = []
# scrapers.append(Website1Scraper('https://www.toyota.com.vn/'))
scrapers.append(Website2Scraper('https://www.cars.com/shopping/'))

# Run the crawl; `cars` holds whatever records the collector managed to gather.
collector = CarDataCollector(scrapers)
cars = collector.collect_all_data("dataset/cars_used.json")
# collector.save_to_json('dataset/cars_used.json', cars)

Continue brand: acura
Continue brand: alfa_romeo
Continue brand: am_general
Continue brand: aston_martin
Continue brand: audi
Continue brand: austin_healey
Continue brand: bentley
Continue brand: bmw


KeyboardInterrupt: 