<a href="https://colab.research.google.com/github/ignius01/webscraper/blob/main/Web_Scraper_9000.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from PIL import Image
from io import BytesIO
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from requests.exceptions import RequestException
import logging
import time
import numpy as np
import json
import os


# Configuration
# Base of the exponential back-off delay fetch_page waits between attempts.
RETRY_BACKOFF_FACTOR = 3
# Default total number of attempts fetch_page makes before giving up.
RETRY_ATTEMPTS = 2
# Verbosity passed to logging.basicConfig below.
LOGGING_LEVEL = logging.DEBUG

# Setup logging configuration
logging.basicConfig(level=LOGGING_LEVEL, format='%(asctime)s %(levelname)s: %(message)s')
# Root logger (no name given); used by every function in this file.
logger = logging.getLogger()

# Cached UserAgent to avoid regenerating it multiple times
UA = UserAgent()

# CSS selectors, keyed by the vehicle-card field they locate.
CSS_SELECTORS = dict(
    vehicle_card='a.si-vehicle-box',
    vehicle_image_div='div.vehiclebox-image',
    vehicle_image='div.vehiclebox-image img',
    vehicle_title='div.vehiclebox-title h2',
    vehicle_price='div.srp-your-price div:last-child',
    vehicle_color='div.si-vehicle-details div:first-child div.ml-2',
    vehicle_stock='div#copy_stock',
    vehicle_location='div.font-weight-bold.ft-14',
    vehicle_vin='div.vin div.d-inline',
    vehicle_mileage='div.mileage',
)

# Variant selector set (presumably for used-vehicle listings -- confirm with
# the caller). It differs from CSS_SELECTORS only in how the colour cell is
# addressed, so it is derived from the base mapping.
CSS_SELECTORS_USED = {
    **CSS_SELECTORS,
    'vehicle_color': 'div.si-vehicle-details div:nth-child(1) div.ml-2',
}

# Known Colors for Matching
# Maps a paint name to the reference (R, G, B) triple that map_to_known_color
# compares a sampled image colour against; the nearest entry wins.
# NOTE(review): 'Lunar Silver Metallic' and 'Canyon River Blue Metallic' share
# the same RGB value, so the later entry can never be selected (ties keep the
# first match). The "calibrated based on ..." remarks below cannot be verified
# from this file -- confirm the intended reference values.
KNOWN_COLORS = {
    'Crystal Black Pearl': (85, 96, 110),
    'Lunar Silver Metallic': (189, 195, 208),  # Calibrated based on "Canyon River Blue Metallic"
    'Meteorite Gray Metallic': (131, 127, 122),
    'Rallye Red': (170, 72, 80),
    'Sonic Gray Pearl': (151, 158, 170),  # Calibrated based on "Crystal Black Pearl"
    'Platinum White Pearl': (217, 222, 226),
    'Aegean Blue Metallic': (25, 75, 125),
    'Radiant Red Metallic': (204, 138, 156),  # Calibrated based on "Radiant Red Metallic"
    'Urban Gray Pearl': (100, 100, 100),
    'Modern Steel Metallic': (80, 80, 80),
    'Nordic Forest Pearl': (25, 75, 25),
    'Obsidian Blue Pearl': (40, 40, 90),
    'Canyon River Blue Metallic': (189, 195, 208),  # Added based on the data
    'Solar Silver Metallic': (164, 177, 197),  # Added based on the data
    'Still Night Pearl': (81, 94, 192),  # Added based on the data
}

def fetch_page(url, retries=RETRY_ATTEMPTS, backoff_factor=RETRY_BACKOFF_FACTOR):
    """Download *url* and return the response body as text.

    The request is sent with a randomly chosen User-Agent header and a
    60 second timeout. On a ``RequestException`` (including HTTP error
    statuses raised by ``raise_for_status``) the request is retried.

    Args:
        url: Address to fetch.
        retries: Total number of attempts to make.
        backoff_factor: Base of the exponential delay between attempts;
            the wait before retry ``k`` (0-based) is
            ``backoff_factor ** k + k * 0.1`` seconds.

    Returns:
        The response text, or ``None`` if every attempt failed.
    """
    headers = {'User-Agent': UA.random}
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=60)
            response.raise_for_status()
            return response.text
        except RequestException as e:
            logger.error(f"Error fetching URL {url}: {e}. Attempt {attempt + 1} of {retries}")
            # Only back off when another attempt will follow; sleeping after
            # the final failure just delays the caller for nothing.
            if attempt < retries - 1:
                time.sleep(backoff_factor ** attempt + (attempt * 0.1))
    logger.error(f"Failed to fetch URL after {retries} attempts: {url}")
    return None

def parse_price(price_text):
    """Convert a displayed price string into a number.

    Only the text before the first ``-`` is considered (so a range such as
    ``"$25,000 - $30,000"`` yields its lower bound); ``$`` signs and
    thousands separators are stripped before conversion.

    Args:
        price_text: Price as scraped from the page, e.g. ``"$25,000"``.

    Returns:
        The price as a ``float``, or ``0`` when the text cannot be parsed
        (including ``None`` or other non-string input).
    """
    try:
        price = price_text.split('-')[0].strip()
        return float(price.replace('$', '').replace(',', ''))
    except (ValueError, AttributeError):
        # AttributeError covers None / non-string input, which previously
        # escaped the handler and crashed the caller.
        return 0

def calculate_lease_payment(cash_price, down_payment=0, residual_value_percentage=0.6, lease_term=36, money_factor=0.00208):
    """Estimate the monthly payment of a lease.

    The payment is the sum of two parts:
      * depreciation: (net capitalised cost - residual value) / lease_term
      * finance charge: (net capitalised cost + residual value) * money_factor

    Args:
        cash_price: Vehicle price the lease is based on.
        down_payment: Amount paid up front, subtracted from the price.
        residual_value_percentage: Fraction of ``cash_price`` the vehicle is
            taken to be worth at the end of the lease.
        lease_term: Number of monthly payments.
        money_factor: Multiplier applied to cost plus residual.

    Returns:
        The monthly payment as a number.
    """
    net_cap_cost = cash_price - down_payment
    end_of_lease_value = cash_price * residual_value_percentage

    monthly_depreciation = (net_cap_cost - end_of_lease_value) / lease_term
    monthly_finance_charge = (net_cap_cost + end_of_lease_value) * money_factor

    return monthly_depreciation + monthly_finance_charge

def calculate_finance_payment(cash_price, down_payment=0, loan_term=72, interest_rate=0.089):
    """Compute the fixed monthly payment of an amortised loan.

    Args:
        cash_price: Vehicle price.
        down_payment: Amount paid up front; the remainder is financed.
        loan_term: Number of monthly payments.
        interest_rate: Annual interest rate as a fraction (0.089 = 8.9 %).

    Returns:
        The monthly payment as a number.
    """
    loan_amount = cash_price - down_payment
    monthly_interest_rate = interest_rate / 12
    if monthly_interest_rate == 0:
        # The annuity formula divides by zero at 0 % interest; the payment
        # is then simply the principal spread evenly over the term.
        return loan_amount / loan_term
    return loan_amount * (monthly_interest_rate / (1 - (1 + monthly_interest_rate) ** -loan_term))

def extract_element_text(car_div, selector):
    """Return the stripped text of the first node matching *selector*.

    Args:
        car_div: Object exposing ``select_one(selector)``.
        selector: CSS selector handed to ``select_one``.

    Returns:
        The matched node's ``text`` with surrounding whitespace removed, or
        the string ``'Unknown'`` when nothing matches.
    """
    match = car_div.select_one(selector)
    if not match:
        return 'Unknown'
    return match.text.strip()

def get_average_color(colors):
    """Return the per-channel integer mean of a collection of pixels.

    Args:
        colors: Iterable of pixel tuples. Only the first three channels of
            each pixel are used, so RGBA pixels are accepted and their alpha
            value is ignored.

    Returns:
        An ``(r, g, b)`` tuple of floor-divided channel averages.

    Raises:
        ValueError: If *colors* is empty.
    """
    # Trim to three channels so a trailing alpha value does not break the
    # three-way unpacking below.
    pixels = [tuple(pixel[:3]) for pixel in colors]
    if not pixels:
        raise ValueError("colors must contain at least one pixel")
    r, g, b = zip(*pixels)
    count = len(pixels)
    return (sum(r) // count, sum(g) // count, sum(b) // count)

def get_dominant_colors(image_url):
    """Estimate a vehicle's paint colour from its listing image.

    Downloads the image, crops a fixed rectangle from it, samples seven
    pixels inside that rectangle and averages them.

    Args:
        image_url: URL of the image to analyse.

    Returns:
        A one-element list holding the averaged ``(r, g, b)`` tuple, or an
        empty list if anything fails (download error, undecodable image,
        sample point outside the image, ...). Failures are logged.
    """
    try:
        # A timeout keeps one stalled image host from hanging the whole run.
        response = requests.get(image_url, timeout=60)
        response.raise_for_status()
        # Normalise to RGB so getpixel always yields 3-tuples; palette,
        # grayscale and RGBA images would otherwise break the averaging.
        image = Image.open(BytesIO(response.content)).convert('RGB')

        width, height = image.size

        # Crop rectangle: 30% of the image width wide, with a height of
        # 4/23 of that width.
        crop_width = int(width * 0.30)
        crop_height = int(crop_width * (4 / 23))

        # Top-left corner sits 30% in from the left and 37% down from the top.
        left = int(width * 0.30)
        top = int(height * 0.37)
        crop_box = (left, top, left + crop_width, top + crop_height)
        cropped_image = image.crop(crop_box)

        # The saved crop is purely a debugging aid, so only write it when
        # debug logging is active.
        if logger.isEnabledFor(logging.DEBUG):
            cropped_image_path = os.path.join('/tmp', f'cropped_image_{time.time()}.png')
            cropped_image.save(cropped_image_path)
            logger.debug(f"Cropped image saved to: {cropped_image_path}")

        # Sample points as fractions of the crop: five across the vertical
        # middle, two slightly lower.
        cropped_width, cropped_height = cropped_image.size
        sample_fractions = [
            (0.2, 0.5),
            (0.35, 0.5),
            (0.5, 0.5),
            (0.65, 0.5),
            (0.8, 0.5),
            (0.4, 0.6),
            (0.6, 0.6),
        ]
        sample_points = [
            (int(cropped_width * fx), int(cropped_height * fy))
            for fx, fy in sample_fractions
        ]

        logger.debug(f"Sample points: {sample_points}")
        logger.debug(f"Cropped image size: {cropped_width}x{cropped_height}")

        sampled_colors = [cropped_image.getpixel(point) for point in sample_points]
        logger.debug(f"Sampled colors: {sampled_colors}")

        avg_color = get_average_color(sampled_colors)
        logger.debug(f"Average color: {avg_color}")

        return [avg_color]
    except Exception as e:
        # Best-effort analysis: any failure is logged and reported as "no colour".
        logger.error(f"Error analyzing image colors: {e}", exc_info=True)
        return []

def rgb_to_lab(rgb):
    """Convert 8-bit sRGB colours to CIE L*a*b* (D65 white point).

    Args:
        rgb: A single ``(r, g, b)`` triple or a sequence of such triples,
            with channel values in 0..255.

    Returns:
        A NumPy array of shape ``(N, 3)`` whose rows are ``[L, a, b]``
        (``N == 1`` for a single triple).
    """
    rgb = np.atleast_2d(np.asarray(rgb, dtype=float)) / 255.0

    # Undo the sRGB transfer curve; the XYZ matrix applies to linear light.
    linear = np.where(rgb > 0.04045, ((rgb + 0.055) / 1.055) ** 2.4, rgb / 12.92)

    # Rows give the X, Y and Z weights of (R, G, B). With colours stored as
    # rows, the product needs the transpose of this matrix.
    rgb_to_xyz = np.array([[0.412453, 0.357580, 0.180423],
                           [0.212671, 0.715160, 0.072169],
                           [0.019334, 0.119193, 0.950227]])
    xyz = linear @ rgb_to_xyz.T

    # Normalise by the reference white.
    xyz = xyz / np.array([0.95047, 1.00000, 1.08883])

    # CIE f(t): cube root above the threshold, linear segment below it.
    f_xyz = np.where(xyz > 0.008856, np.cbrt(xyz), 7.787 * xyz + 16 / 116)

    L = (116 * f_xyz[:, 1]) - 16
    a = 500 * (f_xyz[:, 0] - f_xyz[:, 1])
    b = 200 * (f_xyz[:, 1] - f_xyz[:, 2])
    return np.stack([L, a, b], axis=1)

def delta_e_ciede2000(lab1, lab2):
    """Return the CIEDE2000 colour difference between two L*a*b* colours.

    Args:
        lab1: First colour as an ``(L, a, b)`` sequence.
        lab2: Second colour as an ``(L, a, b)`` sequence.

    Returns:
        The colour difference as a non-negative float, computed with the
        parametric weights kL = kC = kH = 1.
    """
    L1, a1, b1 = lab1
    L2, a2, b2 = lab2
    kL, kC, kH = 1, 1, 1

    # Step 1: chroma-dependent rescaling of a*, then primed chroma and hue.
    C1 = np.sqrt(a1 ** 2 + b1 ** 2)
    C2 = np.sqrt(a2 ** 2 + b2 ** 2)
    C_ = (C1 + C2) / 2
    G = 0.5 * (1 - np.sqrt(C_ ** 7 / (C_ ** 7 + 25 ** 7)))
    a1_ = (1 + G) * a1
    a2_ = (1 + G) * a2
    C1_ = np.sqrt(a1_ ** 2 + b1 ** 2)
    C2_ = np.sqrt(a2_ ** 2 + b2 ** 2)
    h1_ = np.degrees(np.arctan2(b1, a1_)) % 360
    h2_ = np.degrees(np.arctan2(b2, a2_)) % 360

    # Step 2: lightness, chroma and hue differences.
    deltaL = L2 - L1
    deltaC = C2_ - C1_
    if C1_ * C2_ == 0:
        # Hue is undefined for an achromatic colour: no hue difference, and
        # the "mean" hue is just the sum of the two angles.
        deltah = 0.0
        H_ = h1_ + h2_
    else:
        deltah = h2_ - h1_
        if deltah > 180:
            deltah -= 360
        elif deltah < -180:
            deltah += 360
        H_ = (h1_ + h2_) / 2
        if abs(h1_ - h2_) > 180:
            # Average across the 0/360 seam, keeping the result in [0, 360).
            H_ += 180 if (h1_ + h2_) < 360 else -180
    deltaH = 2 * np.sqrt(C1_ * C2_) * np.sin(np.radians(deltah) / 2)

    # Step 3: weighting functions, based on the means of the primed values.
    L_ = (L1 + L2) / 2
    Cm_ = (C1_ + C2_) / 2
    T = (1
         - 0.17 * np.cos(np.radians(H_ - 30))
         + 0.24 * np.cos(np.radians(2 * H_))
         + 0.32 * np.cos(np.radians(3 * H_ + 6))
         - 0.20 * np.cos(np.radians(4 * H_ - 63)))
    SL = 1 + ((0.015 * (L_ - 50) ** 2) / np.sqrt(20 + (L_ - 50) ** 2))
    SC = 1 + 0.045 * Cm_
    SH = 1 + 0.015 * Cm_ * T
    deltaTheta = 30 * np.exp(-((H_ - 275) / 25) ** 2)
    RC = 2 * np.sqrt(Cm_ ** 7 / (Cm_ ** 7 + 25 ** 7))
    RT = -np.sin(2 * np.radians(deltaTheta)) * RC

    lightness_term = deltaL / (kL * SL)
    chroma_term = deltaC / (kC * SC)
    hue_term = deltaH / (kH * SH)
    return np.sqrt(lightness_term ** 2 + chroma_term ** 2 + hue_term ** 2
                   + RT * chroma_term * hue_term)

def map_to_known_color(avg_color):
    """Name the KNOWN_COLORS entry closest to *avg_color*.

    Both colours are converted with ``rgb_to_lab`` and compared with
    ``delta_e_ciede2000``; the entry with the smallest distance wins, and on
    a tie the entry listed first in ``KNOWN_COLORS`` is kept.

    Args:
        avg_color: ``(r, g, b)`` triple to classify.

    Returns:
        The matching colour name, or ``'Unknown'`` if no entry produced a
        distance below infinity.
    """
    target_lab = rgb_to_lab([avg_color])[0]

    best_name = 'Unknown'
    best_distance = float('inf')
    for name in KNOWN_COLORS:
        candidate_lab = rgb_to_lab([KNOWN_COLORS[name]])[0]
        candidate_distance = delta_e_ciede2000(target_lab, candidate_lab)
        if candidate_distance < best_distance:
            best_name, best_distance = name, candidate_distance
    return best_name

def extract_car_details(car_div, css_selectors, payment_type='cash', down_payment=0, interest_rate=0.089, loan_term=72, is_used=False):
    """Collect the details of one vehicle card into a dictionary.

    Text fields are read through ``extract_element_text`` using the given
    selectors. If the card has an image, its colour is estimated with
    ``get_dominant_colors`` and matched with ``map_to_known_color``; when the
    estimate disagrees with the listed colour, the keys ``listed_color``,
    ``image_color`` and ``average_rgb`` are added.

    Args:
        car_div: Parsed vehicle-card element; must provide ``get('href')``
            and ``select_one``.
        css_selectors: Mapping with the ``vehicle_*`` selector keys used below.
        payment_type: Currently unused by this function.
        down_payment: Currently unused by this function.
        interest_rate: Currently unused by this function.
        loan_term: Currently unused by this function.
        is_used: Currently unused by this function.

    Returns:
        The details dictionary. If an error occurs it is logged and whatever
        was gathered up to that point is returned (an empty dict if the
        failure happened before the basic fields were assembled).
    """
    car_details = {}
    try:
        # The id is taken from the second-to-last path segment of the link.
        car_id = car_div.get('href').split('/')[-2]
        title = extract_element_text(car_div, css_selectors['vehicle_title'])
        # Title is treated as "<year> <make> <model ...>".
        title_parts = title.split()
        car_make = title_parts[1] if len(title_parts) > 1 else 'Unknown'
        car_model = ' '.join(title_parts[2:]) if len(title_parts) > 2 else 'Unknown'
        car_year = title_parts[0] if len(title_parts) > 0 else 'Unknown'
        price_text = extract_element_text(car_div, css_selectors['vehicle_price'])
        car_price = parse_price(price_text)
        color = extract_element_text(car_div, css_selectors['vehicle_color'])
        stock = extract_element_text(car_div, css_selectors['vehicle_stock'])
        location = extract_element_text(car_div, css_selectors['vehicle_location'])
        vin = extract_element_text(car_div, css_selectors['vehicle_vin'])
        mileage = extract_element_text(car_div, css_selectors['vehicle_mileage'])
        car_details = {
            'id': car_id,
            'make': car_make,
            'model': car_model,
            'year': car_year,
            'price': car_price,
            'color': color,
            'stock': stock,
            'location': location,
            'vin': vin,
            'mileage': mileage
        }
        image_element = car_div.select_one(css_selectors['vehicle_image'])
        if image_element and 'src' in image_element.attrs:
            image_url = image_element['src']
            dominant_colors = get_dominant_colors(image_url)
            if dominant_colors:
                avg_color = dominant_colors[0]
                logger.debug(f"Car ID: {car_id}, Average color: {avg_color}")
                matched_color = map_to_known_color(avg_color)
                # Only record the image-derived colour when it contradicts
                # the colour stated in the listing.
                if matched_color != color:
                    car_details['listed_color'] = color
                    car_details['image_color'] = matched_color
                    car_details['average_rgb'] = avg_color
        else:
            # NOTE(review): this branch writes 'image_colors' (plural) while
            # the branch above writes 'image_color' -- confirm which key
            # consumers expect before unifying.
            car_details['image_colors'] = 'Unknown'
            car_details['average_rgb'] = (0, 0, 0)
    except Exception as e:
        logger.error(f"Error extracting car details: {e}", exc_info=True)
    # Previously the assembled dictionary was never returned, so callers
    # always received None.
    return car_details

ModuleNotFoundError: No module named 'fake_useragent'