In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm # type: ignore
import re
import os


def extract_car_company(url):
    """
    Derives the manufacturer name from a CarDekho URL.

    The first path segment after 'cardekho.com/' is taken as the brand slug;
    hyphens become spaces and the result is title-cased.
    e.g., 'https://www.cardekho.com/maruti/invicto/specs' -> 'Maruti'

    Returns 'Unknown Company' when the URL has no such segment followed by '/'.
    """
    brand_match = re.search(r'cardekho\.com/([^/]+)/', url)
    if not brand_match:
        return 'Unknown Company'

    brand_slug = brand_match.group(1)
    return ' '.join(brand_slug.split('-')).title()


def extract_car_name(url):
    """
    Extracts the car name from the URL.
    Handles both '/brand/car-name/specs' and '/brand/brand-car-name-specifications.htm'

    The URL is trimmed and lower-cased, and any query string, fragment or
    trailing slash is dropped before matching, so
    '.../maruti/swift/specs/' and '.../maruti/swift/specs?x=1' both resolve
    to 'Swift'.

    Returns the model name with hyphens replaced by spaces and title-cased,
    or 'Unknown Car' when the URL fits neither layout. Note that str.title()
    also capitalises letters that follow digits ('amaze-2nd-gen' ->
    'Amaze 2Nd Gen'); other cells in this notebook match on those exact
    names, so the casing is kept as is.
    """
    url = url.strip().lower()
    # Ignore anything after '?' or '#', and a trailing '/', so that they do
    # not defeat the end-of-string anchors used below.
    url = re.split(r'[?#]', url, maxsplit=1)[0].rstrip('/')

    # Case 1: Ends with '/specs'
    match_specs = re.match(r'.*/([^/]+)/specs$', url)
    if match_specs:
        return match_specs.group(1).replace('-', ' ').title()

    # Case 2: Ends with '-specifications.htm'
    match_htm = re.search(r'/([^/]+)-specifications\.htm$', url)
    if match_htm:
        car_name_part = match_htm.group(1)
        # Remove repeating brand prefix (like 'maruti-') if present.
        # A literal prefix comparison is used instead of building a regex
        # from the brand, so brand text is never interpreted as a pattern.
        brand_prefix = extract_car_company(url).lower().replace(' ', '-') + '-'
        if car_name_part.startswith(brand_prefix):
            car_name_part = car_name_part[len(brand_prefix):]
        return car_name_part.replace('-', ' ').title()

    return 'Unknown Car'



def normalize_price_nlp(price_str):
    """
    Normalises a scraped price string to plain numbers expressed in lakhs.

    'Rs.' and thousands separators are removed, the text is lower-cased and
    then split on '-' or '–' so that ranges are handled part by part. The
    first number in each part is taken; a part marked as crore is multiplied
    by 100 (1 crore = 100 lakh). A range part that carries no unit of its own
    inherits the unit of the nearest part to its right, so
    'Rs.2.31 - 2.41 Cr*' becomes '231.0 - 241.0' rather than '2.31 - 241.0'.

    Unit words are only recognised as standalone tokens ('cr', 'crs',
    'crore', 'crores', 'lakh', 'lakhs', 'lac', 'lacs'), so unrelated words
    that merely contain 'cr' do not trigger the crore conversion.

    Returns:
        A string such as '6.49' or '6.49 - 9.64' (each value rounded to two
        decimals), or None when price_str is not a string or contains no
        number.
    """
    if not isinstance(price_str, str):
        return None

    text = price_str.replace("Rs.", "").replace(",", "").strip().lower()

    # Look-arounds instead of \b: a digit directly before the unit
    # ('2.41cr') must still count as a boundary, while letters must not.
    crore_pattern = r'(?<![a-z])cr(?:ores?|s)?(?![a-z])'
    lakh_pattern = r'(?<![a-z])(?:lakhs?|lacs?)(?![a-z])'

    # Collect (value, factor) per range part; factor is None when the part
    # names no unit itself.
    parsed = []
    for part in re.split(r'[-–]', text):
        number = re.search(r'(\d+\.?\d*)', part)
        if not number:
            continue
        if re.search(crore_pattern, part):
            factor = 100  # Convert crore to lakh
        elif re.search(lakh_pattern, part):
            factor = 1
        else:
            factor = None
        parsed.append((float(number.group(1)), factor))

    if not parsed:
        return None

    # Walk right-to-left so a unit-less part picks up the unit that follows
    # it ('2.31 - 2.41 cr'). With no unit anywhere the value is kept as is.
    inherited_factor = 1
    normalized_parts = []
    for value, factor in reversed(parsed):
        if factor is None:
            factor = inherited_factor
        else:
            inherited_factor = factor
        normalized_parts.append(f"{round(value * factor, 2)}")
    normalized_parts.reverse()

    return ' - '.join(normalized_parts)




def scrape_car_specs(url, serial_number):
    """
    Scrapes one specification page and returns its key fields as a dict.

    A fresh headless Chrome session is started for the call. The page is
    loaded, the function waits up to 20 seconds for a <table> element to be
    present (plus a fixed 2 second pause), and the rendered HTML is parsed
    with BeautifulSoup.

    Args:
        url: Address of the specification page.
        serial_number: Value stored under 'S.no' in the returned record.

    Returns:
        A dict holding 'S.no', 'Car Name', 'Car Company', 'Car Image', the
        spec fields and 'Price'; fields that are not found stay ''. Returns
        None if anything raises while loading or parsing (the error is
        printed). The browser is always closed before returning.
    """
    options = Options()
    for flag in (
        "--headless=new",
        "--disable-blink-features=AutomationControlled",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        options.add_argument(flag)

    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        table_locator = (By.CSS_SELECTOR, "table")
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located(table_locator)
        )
        # Fixed extra pause after the first table is present.
        time.sleep(2)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        car_name = extract_car_name(url)
        car_company = extract_car_company(url)

        # Key order here determines the column order of the resulting rows.
        blank_fields = (
            'Car Image',
            'ARAI Mileage',
            'Fuel Type',
            'Engine Displacement',
            'No. of Cylinders',
            'Max Power',
            'Max Torque',
            'Seating Capacity',
            'Transmission Type',
            'Fuel Tank Capacity',
            'Body Type',
            'Price',
        )
        key_specs = {
            'S.no': serial_number,
            'Car Name': car_name,
            'Car Company': car_company,
            **dict.fromkeys(blank_fields, ''),
        }

        # Spec rows: any two-cell table row whose first cell is a known key.
        for table in soup.find_all('table'):
            for row in table.find_all('tr'):
                cells = row.find_all(['td', 'th'])
                if len(cells) != 2:
                    continue
                label, value = (cell.text.strip() for cell in cells)
                if label in key_specs:
                    key_specs[label] = value

        # Image: first <img> (lazy-loaded 'data-src' preferred over 'src')
        # whose alt text or address looks car-related.
        # NOTE(review): 'car' is also a substring of 'cardekho', so an image
        # served from a cardekho host passes the address test regardless of
        # what it shows - confirm this is intended.
        image_url = ''
        for img in soup.find_all('img'):
            candidate = img.get('data-src') or img.get('src') or ''
            alt_text = img.get('alt', '').lower()
            candidate = candidate.strip()
            if not candidate:
                continue
            looks_relevant = (
                'car' in alt_text
                or 'car' in candidate
                or car_name.lower() in alt_text
                or car_company.lower() in alt_text
            )
            if looks_relevant:
                image_url = candidate
                break

        # Fallback: first image inside one of a few known containers.
        if not image_url:
            containers = (
                soup.find('div', class_='imgSection'),
                soup.find('div', class_='overviewImg'),
                soup.find('figure'),
            )
            for container in containers:
                inner_img = container.find('img') if container else None
                if not inner_img:
                    continue
                image_url = inner_img.get('data-src') or inner_img.get('src') or ''
                if image_url:
                    break

        key_specs['Car Image'] = image_url

        # Price: 'price' div first, 'priceSection' div as the alternative.
        price_node = soup.find('div', class_='price') or soup.find('div', class_='priceSection')
        price = price_node.get_text(strip=True) if price_node else ''

        # Keep the raw text (minus trailing '*') when it cannot be normalised.
        key_specs['Price'] = normalize_price_nlp(price) or price.rstrip('*').strip()
        return key_specs

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None
    finally:
        driver.quit()

        
# Load rows saved by earlier runs (if any) so that only new cars are scraped.
output_filename = 'CarsDataset.csv'
existing_data = pd.read_csv(output_filename) if os.path.exists(output_filename) else pd.DataFrame()

# Lower-cased (car name, company) pairs identify cars that are already stored.
has_key_columns = {'Car Name', 'Car Company'}.issubset(existing_data.columns)
if existing_data.empty or not has_key_columns:
    existing_entries = set()
else:
    known_names = existing_data['Car Name'].str.lower()
    known_companies = existing_data['Car Company'].str.lower()
    existing_entries = set(zip(known_names, known_companies))


# Specification pages to scrape, grouped by manufacturer. Two address layouts
# appear: '/<brand>/<model>/specs' and
# '/<brand>/<brand>-<model>-specifications.htm'.
urls = [
    # Maruti Cars URLs
    'https://www.cardekho.com/maruti/swift/specs',
    'https://www.cardekho.com/maruti/maruti-ertiga-specifications.htm',
    'https://www.cardekho.com/maruti/fronx/specs',
    'https://www.cardekho.com/maruti/brezza/specs',
    'https://www.cardekho.com/maruti/dzire/specs',
    'https://www.cardekho.com/maruti/maruti-grand-vitara-specifications.htm',
    'https://www.cardekho.com/maruti/maruti-baleno-specifications.htm',
    'https://www.cardekho.com/maruti/wagon-r/specs',
    'https://www.cardekho.com/maruti/alto-k10/specs',
    'https://www.cardekho.com/maruti/jimny/specs',
    'https://www.cardekho.com/maruti/maruti-celerio-specifications.htm',
    'https://www.cardekho.com/maruti/xl6/specs',
    'https://www.cardekho.com/maruti/maruti-ignis-specifications.htm',
    'https://www.cardekho.com/maruti/maruti-eeco-specifications.htm',
    'https://www.cardekho.com/maruti/s-presso/specs',
    'https://www.cardekho.com/maruti/maruti-ciaz-specifications.htm',
    'https://www.cardekho.com/maruti/invicto/specs',
    'https://www.cardekho.com/maruti/dzire-tour-s/specs',
    'https://www.cardekho.com/maruti/ertiga-tour/specs',
    'https://www.cardekho.com/maruti/alto-tour-h1/specs',
    'https://www.cardekho.com/maruti/eeco-cargo/specs',
    'https://www.cardekho.com/maruti/wagon-r-tour/specs',
    # Toyota Cars URLs
    'https://www.cardekho.com/toyota/toyota-fortuner-specifications.htm',
    'https://www.cardekho.com/toyota/innova-crysta/specs',
    'https://www.cardekho.com/toyota/hyryder/specs',
    'https://www.cardekho.com/toyota/land-cruiser-300/specs',
    'https://www.cardekho.com/toyota/hilux/specs',
    'https://www.cardekho.com/toyota/camry/specs',
    'https://www.cardekho.com/toyota/glanza/specs',
    'https://www.cardekho.com/toyota/innova-hycross/specs',
    'https://www.cardekho.com/toyota/vellfire/specs',
    'https://www.cardekho.com/toyota/taisor/specs',
    'https://www.cardekho.com/toyota/toyota-rumion-specifications.htm',
    'https://www.cardekho.com/toyota/fortuner-legender/specs',
    # Honda Cars URLs
    'https://www.cardekho.com/honda/city/specs',
    'https://www.cardekho.com/honda/amaze/specs',
    'https://www.cardekho.com/honda/elevate/specs',
    'https://www.cardekho.com/honda/city-hybrid/specs',
    'https://www.cardekho.com/honda/amaze-2nd-gen/specs'
]

# Scrape every URL whose (name, company) pair is not stored yet, then append
# the new rows to the existing dataset and write the file once at the end.
all_data = []
print(f"Checking {len(urls)} URLs for new cars...\n")
serial_number = len(existing_data) + 1  # continue numbering after stored rows

for url in tqdm(urls, desc="Scraping Progress"):
    entry_key = (extract_car_name(url).lower(), extract_car_company(url).lower())
    if entry_key in existing_entries:
        continue  # already scraped

    result = scrape_car_specs(url, serial_number)
    if not result:
        continue  # failed scrape: the serial number is reused for the next car

    all_data.append(result)
    existing_entries.add(entry_key)
    serial_number += 1

if not all_data:
    print("No new cars to scrape. Dataset is already up-to-date.")
else:
    new_df = pd.DataFrame(all_data)
    final_df = pd.concat([existing_data, new_df], ignore_index=True)
    try:
        final_df.to_csv(output_filename, index=False)
    except PermissionError:
        print(f"Permission denied: Close '{output_filename}' and retry.")
    else:
        print(f"\nScraped {len(new_df)} new entries. Total: {len(final_df)} rows in '{output_filename}'.")


Checking 39 URLs for new cars...



Scraping Progress: 100%|██████████| 39/39 [00:00<00:00, 11949.58it/s]

No new cars to scrape. Dataset is already up-to-date.





# Missing Values of ARAI Mileage

In [4]:
# Reload the saved dataset; the cells below edit this frame in place.
df = pd.read_csv("CarsDataset.csv")

# A value counts as missing when it is null or only whitespace.
mileage_column = df['ARAI Mileage']
mileage_is_missing = mileage_column.isnull() | mileage_column.str.strip().eq('')
missing_mileage = df[mileage_is_missing]
print("Rows with missing ARAI Mileage:\n")
print(missing_mileage[['S.no', 'Car Name', 'Car Company']])

Rows with missing ARAI Mileage:

Empty DataFrame
Columns: [S.no, Car Name, Car Company]
Index: []


In [5]:
# Hand-entered ARAI Mileage values, keyed by the exact 'Car Name' in the data.
mileage_fixes = {
    'Fortuner': '14.4 kmpl',
    'Innova Crysta': '14 kmpl',
    'Hilux': '10.4 kmpl',
    'Vellfire': '19.28 kmpl',
    'Fortuner Legender': '14.4 kmpl',
}
for model_name, mileage_value in mileage_fixes.items():
    df.loc[df['Car Name'] == model_name, 'ARAI Mileage'] = mileage_value

In [6]:
# Write the edited frame back over the dataset file (row index not saved).
dataset_path = "CarsDataset.csv"
df.to_csv(dataset_path, index=False)
print("Updated ARAI Mileage values saved.")

Updated ARAI Mileage values saved.


# Missing Values of Fuel Tank Capacity

In [7]:
# List the cars whose Fuel Tank Capacity is null or only whitespace.
tank_column = df['Fuel Tank Capacity']
tank_is_missing = tank_column.isnull() | tank_column.str.strip().eq('')
missing_fuel_tank_capacity = df[tank_is_missing]
print("Rows with missing Fuel Tank Capacity:\n")
print(missing_fuel_tank_capacity[['S.no', 'Car Name', 'Car Company']])

Rows with missing Fuel Tank Capacity:

    S.no     Car Name Car Company
34    35         City       Honda
37    38  City Hybrid       Honda


In [8]:
# Hand-entered Fuel Tank Capacity values, keyed by the exact 'Car Name'.
# NOTE(review): the listing printed just before this cell reports City and
# City Hybrid as the rows without a value, yet neither is set here - confirm
# whether these two entries are the intended corrections.
fuel_tank_fixes = {
    'Dzire Tour S': '55 liters',
    'Fortuner Legender': '80 liters',
}
for model_name, tank_value in fuel_tank_fixes.items():
    df.loc[df['Car Name'] == model_name, 'Fuel Tank Capacity'] = tank_value

In [9]:
# Write the edited frame back over the dataset file (row index not saved).
dataset_path = "CarsDataset.csv"
df.to_csv(dataset_path, index=False)
print("Updated Fuel Tank Capacity values saved.")

Updated Fuel Tank Capacity values saved.


# Updating the Car Images

In [10]:
# Hand-picked image address for every model, keyed by the exact 'Car Name'
# stored in the dataset. Each entry overwrites the scraped 'Car Image' value.
car_image_fixes = {
    'Swift': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Maruti/Swift/9226/1751527616116/front-left-side-47.jpg",
    'Ertiga': "https://foujiadda.in/admin/uploads/vehicle/image1701185191.jpg",
    'Fronx': "https://images.timesdrive.in/photo/msid-151016779,thumbsize-364787,width-560,height-250,false/151016779.jpg",
    'Brezza': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Maruti/Brezza/10387/1753880871331/front-left-side-47.jpg",
    'Dzire': "https://images.autox.com/uploads/cars/2024/11/maruti-suzuki-dzire-500x261.jpg",
    'Grand Vitara': "https://www.varunmaruti.com/uploads/products/colors/grandvitara-midnight-black1.png",
    'Baleno': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Maruti/Baleno/10497/1697697558001/front-left-side-47.jpg",
    'Wagon R': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Maruti/Wagon-R/10363/1741236373749/exterior-image-166.jpg",
    'Alto K10': "https://www.seyieauto.com/wp-content/uploads/2023/02/alto-k10-white.jpg",
    'Jimny': "https://www.evoindia.com/h-upload/uid/BEiXA1vEBLzh75XwuZCOGnS2EJ1RksHK.jpg",
    'Celerio': "https://financialexpresswpcontent.s3.amazonaws.com/uploads/2021/11/Celerio-2.jpg",
    'Xl6': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Maruti/XL6/10378/1741241176726/front-left-side-47.jpg",
    'Ignis': "https://stimg.cardekho.com/images/carexteriorimages/630x420/Maruti/Ignis/10318/1738298075884/front-left-side-47.jpg",
    'Eeco': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Maruti/Eeco/10376/1708671417179/front-left-side-47.jpg",
    'S Presso': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Maruti/S-Presso/10348/Maruti-S-Presso-LXi/1687519307943/front-left-side-47.jpg",
    'Ciaz': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Maruti/Ciaz/10346/1738211238794/front-left-side-47.jpg",
    'Invicto': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Maruti/Invicto/9483/1688547077223/front-left-side-47.jpg",
    'Dzire Tour S': "https://stimg.cardekho.com/images/car-images/930x620/Maruti/Dzire-Tour-S/12461/1742387359920/Arctic-White_d6d9de.jpg",
    'Ertiga Tour': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Maruti/Ertiga-Tour/9617/1741068787046/front-left-side-47.jpg",
    'Alto Tour H1': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Maruti/Alto-Tour-H1/12517/1745058103119/front-left-side-47.jpg",
    'Eeco Cargo': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Maruti/Eeco-Cargo/9449/1675860460463/front-left-side-47.jpg",
    'Wagon R Tour': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Maruti/Wagon-R-tour/9442/1675922710720/front-left-side-47.jpg",
    'Fortuner': "https://spn-sta.spinny.com/blog/20221202123514/Toyota-Fortuner.jpg",
    'Innova Crysta': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Toyota/Innova-Crysta/9612/1697698611076/front-left-side-47.jpg",
    'Hyryder': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Toyota/Hyryder/10910/1744114238786/front-left-side-47.jpg",
    'Land Cruiser 300': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Toyota/Land-Cruiser-300/8448/1750750759371/front-left-side-47.jpg",
    'Hilux': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Toyota/Hilux/10924/1691990326111/front-left-side-47.jpg",
    'Camry': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Toyota/Camry/11344/1733916451269/front-left-side-47.jpg",
    'Glanza': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Toyota/Glanza/10231/1686812796183/front-left-side-47.jpg",
    'Innova Hycross': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Toyota/Innova-Hycross/10929/1749732632021/front-left-side-47.jpg",
    'Vellfire': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Toyota/Vellfire/10337/1749879912109/front-left-side-47.jpg",
    'Taisor': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Toyota/Taisor/11606/1741756199300/front-left-side-47.jpg",
    'Rumion': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Toyota/Rumion/8650/1741755317177/front-left-side-47.jpg",
    'Fortuner Legender': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Toyota/Fortuner-Legender/10229/1749726924621/front-left-side-47.jpg",
    'City': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Honda/City/12667/1750410975226/front-left-side-47.jpg",
    'Amaze': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Honda/Amaze/12185/1751289124905/front-left-side-47.jpg",
    'Elevate': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Honda/Elevate/9533/1751533499774/front-left-side-47.jpg",
    'City Hybrid': "https://media.zigcdn.com/media/model/2023/Mar/city-e-hev_360x240.jpg",
    'Amaze 2Nd Gen': "https://stimg.cardekho.com/images/carexteriorimages/930x620/Honda/Amaze-2nd-Gen/10519/1754477167669/front-left-side-47.jpg",
}
for model_name, image_link in car_image_fixes.items():
    df.loc[df['Car Name'] == model_name, 'Car Image'] = image_link


In [11]:
# Write the edited frame back over the dataset file (row index not saved).
dataset_path = "CarsDataset.csv"
df.to_csv(dataset_path, index=False)
print("Updated Car Images are saved.")

Updated Car Images are saved.


# Fixing Values of Car Price

In [12]:
# Hand-entered price ranges, keyed by the exact 'Car Name' in the data.
# NOTE(review): the scraper in this notebook stores prices under 'Price',
# while these rows are written to 'Price in Lakhs'. If that column does not
# already exist in the file it is created here with values for these three
# cars only - confirm the column name, and double-check the figures.
price_fixes = {
    'Hilux': '304.0 - 379.0',
    'Vellfire': '122.0 - 132.0',
    'Land Cruiser 300': '231.0 - 241.0',
}
for model_name, price_range in price_fixes.items():
    df.loc[df['Car Name'] == model_name, 'Price in Lakhs'] = price_range

In [13]:
# Write the edited frame back over the dataset file (row index not saved).
dataset_path = "CarsDataset.csv"
df.to_csv(dataset_path, index=False)
print("Updated Prices are saved.")

Updated Prices are saved.
