# Creating a web scraper that extracts price and image data from the carsales website

## Extract relevant fields from the raw HTML documents for all Ferrari models

In [3]:
!pip3 install pandas

[0m

In [4]:
import os
import re
import json
from bs4 import BeautifulSoup
import pandas as pd

# Directory containing the saved listing pages (one .html file per car)
data_dir = 'ferrari_360_html_data'

# Fields to extract; every record carries all of these keys
target_fields = [
    "Vehicle description", "Powerplant type", "Cost to insure", "Exterior colour", "Interior colour",
    "Body type", "Transmission", "Engine", "Registration plate", "Registration expiry",
    "Roadworthy certificate", "Last Modified",
    "carsales Network ID", "Odometer", "Fuel", "Model date", "Seller description",
    "Image URLs", "Price"
]

# Fields filled in by dedicated logic below instead of the generic label lookup
special_fields = {"Model date", "Seller description", "Image URLs", "Price"}

# Patterns are compiled once here rather than on every file
desc_pattern = re.compile(r'"contentText"\s*:\s*{[^}]*"value"\s*:\s*"((?:\\.|[^"\\])*)"')
year_pattern = re.compile(r"\b(19|20)\d{2}\b")
# Require a digit right after "$" and only allow commas between digit groups,
# so "$," never matches and a trailing comma is not captured
price_pattern = re.compile(r"\$\d+(?:,\d+)*")

all_data = []

# Loop through all HTML files in a deterministic (sorted) order
for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith('.html'):
        continue

    filepath = os.path.join(data_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Start with a placeholder for every field so all rows share the same columns
    car_data = {field: "Not found" for field in target_fields}

    # --- Seller Description ---
    desc_match = desc_pattern.search(html)
    if desc_match:
        raw = desc_match.group(1)
        # The captured text is the body of a JSON string literal, so decode it
        # as JSON. Decoding via "unicode_escape" would reinterpret UTF-8 bytes
        # as Latin-1 and corrupt any non-ASCII characters.
        try:
            car_data["Seller description"] = json.loads(f'"{raw}"', strict=False)
        except json.JSONDecodeError:
            # Keep the undecoded text rather than losing the description
            car_data["Seller description"] = raw

    # --- Car Attribute Fields ---
    for field in target_fields:
        if field in special_fields:
            continue  # handled separately
        # Find the text node that is exactly the label, then read the next tag as its value
        label_elem = soup.find(string=field)
        if not label_elem:
            continue
        value_elem = label_elem.find_next()
        if value_elem:
            text = value_elem.get_text(strip=True)
            if text:
                car_data[field] = text

    # --- Model Year from Title ---
    title_tag = soup.find("title")
    if title_tag:
        match = year_pattern.search(title_tag.text)
        if match:
            car_data["Model date"] = match.group(0)

    # --- Image URLs ---
    # Keep only <img> sources whose URL contains the "carsales/cars/" path segment
    sources = (img.get("src") for img in soup.find_all("img"))
    car_data["Image URLs"] = [src for src in sources if src and "carsales/cars/" in src]

    # --- Price ---
    # First dollar amount appearing in the page text
    price_match = price_pattern.search(soup.text)
    car_data["Price"] = price_match.group() if price_match else "Not found"

    all_data.append(car_data)

# --- Create DataFrame ---
df = pd.DataFrame(all_data)
df.head(5)


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## Extract additional features from descriptions using LLMs

In [None]:
!pip3 install openai langchain langchain-community


In [100]:
import os
import json
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import SystemMessage, HumanMessage

# ✅ Make sure the API key is available before any OpenAI call is made.
# Assigning os.getenv(...) back into os.environ is a no-op when the variable is
# set and raises an opaque "str expected, not NoneType" TypeError when it is
# not, so check explicitly and fail with an actionable message instead.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment "
        "(e.g. `export OPENAI_API_KEY=...`) before running this cell."
    )

# ✅ Valid options dictionary
VALID_OPTIONS = {
    "Seat Style": ["Standard", "Daytona", "Racing (carbon)"],
    "Seat Adjustment": ["Manual", "Electric"],
    "Headrest Cavallino Logo": ["Yes", "No"],
    "Seat Contrast Stitching": ["Yes", "No"],
    "Seat Piping": ["Yes", "No"],
    "Body Colour": [
        "Rosso Corsa", "Giallo Modena", "Nero", "Grigio Titanio", "Argento Nurburgring",
        "Blu Pozzi", "Blu Tour de France", "Azzurro California", "Verde Zeltweg", "Custom"
    ],
    "Paint Finish": ["Solid", "Metallic", "Special"],
    "Front Grille": ["Standard", "Mesh"],
    "Rear Grille": ["Standard", "Challenge (Mesh)"],
    "Scuderia Shields": ["Yes", "No"],
    "Side Skirts Finish": ["Body Colour", "Black"],
    "Wheel Type": ["Standard 5-Spoke", "Modular Split-Rim"],
    "Brake Caliper Colour": ["Black", "Red", "Yellow", "Silver", "Gold"],
    "Dashboard & Console Trim": ["Aluminum", "Carbon Fibre"],
    "Carpet Colour": ["Nero", "Rosso", "Blu", "Bordeaux", "Tan", "Other"],
    "Leather Rear Shelf (Coupe only)": ["Yes", "No"],
    "Leather Roll Bar Hoops (Spider only)": ["Yes", "No"],
    "Leather Luggage Straps (Coupe only)": ["Yes", "No"],
    "Fire Extinguisher": ["Yes", "No"],
    "Hi-Fi Stereo": ["Yes", "No"],
    "6-CD Changer": ["Yes", "No"],
    "Subwoofer": ["Yes", "No"],
    "Satellite Navigation (Becker)": ["Yes", "No"],
    "Dedication Plate": ["Yes", "No"],
    "Leather Door Sills": ["Yes", "No"],
    "Leather Headliner": ["Yes", "No"],
    "Coloured Upper Dashboard": ["Yes", "No"],
    "Coloured Steering Wheel": ["Yes", "No"],
    "Special Stitching Colour": ["Yes", "No"],
    "Transmission": ["Manual", "F1"],
    "Suspension Package": ["Standard", "Fiorano"],
    "Exhaust System": ["Standard", "Sport", "Racing"],
    "Spider Soft Top Colour (Spider only)": ["Black", "Blu", "Bordeaux", "Beige"],
    "Tonneau Cover Finish (Spider only)": ["Body Colour", "Custom Painted"],
    "Luggage Set": ["None", "Modena Leather (3 pc)", "Matching Interior Leather (3 pc)"],
    "Spare Wheel Kit": ["Yes", "No"],
    "Battery Charger Pre-Wiring Kit": ["Yes", "No"]
}


def validate_response(response: dict, valid_options: dict) -> dict:
    """Keep only answers that exactly match an allowed option.

    Args:
        response: Parsed model output, expected to map feature name to the
            chosen value. Anything that is not a dict (for example a JSON
            array, string or null) is treated as containing no answers.
        valid_options: Mapping of feature name to the collection of values
            accepted for that feature.

    Returns:
        A dict with one entry per key of ``valid_options``, in the same order.
        Each value is the answer from ``response`` when it is one of the
        allowed values, otherwise ``"Unknown"``. Keys present in ``response``
        but absent from ``valid_options`` are dropped.
    """
    # Guard against non-object JSON so the .get() lookups below cannot fail
    answers = response if isinstance(response, dict) else {}

    validated = {}
    for feature, allowed_values in valid_options.items():
        value = answers.get(feature, "Unknown")
        is_allowed = bool(allowed_values) and value in allowed_values
        validated[feature] = value if is_allowed else "Unknown"
    return validated


def extract_features_from_description(description: str) -> dict:
    """Ask the LLM which configuration options a listing description mentions.

    The prompt lists every feature in ``VALID_OPTIONS`` together with its
    allowed values, so the model can answer with the exact labels that
    ``validate_response`` accepts.

    Args:
        description: Free-text seller description of one listing.

    Returns:
        A dict with one key per feature in ``VALID_OPTIONS``; each value is an
        allowed option or ``"Unknown"``. A blank or non-string description
        yields an all-``"Unknown"`` dict without calling the model. If the
        model output cannot be parsed as a JSON object, a warning is printed
        and an empty dict is returned.
    """
    # Nothing to extract from: skip the API call entirely
    if not isinstance(description, str) or not description.strip():
        return validate_response({}, VALID_OPTIONS)

    llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

    system_message = SystemMessage(
        content=(
            "You are a meticulous Ferrari 360 factory expert. "
            "Your task is to extract factory-correct configuration options from vehicle descriptions. "
            "You must only select values that are explicitly mentioned and clearly match a valid option. "
            "Do not infer or assume anything beyond what is stated. "
            "If a feature is not mentioned or unclear, respond with 'Unknown'."
        )
    )

    # List each feature with its allowed values; the instructions below tell
    # the model to answer with "the provided options", so they must be shown.
    fields_formatted = "\n".join(
        f"- {field}: {' | '.join(options)}" for field, options in VALID_OPTIONS.items()
    )

    user_message = HumanMessage(
        content=(
            f"""
    Extract the following Ferrari 360 features from the description provided.

    Return the result as a valid JSON object using all the listed feature names as keys. 
    For each feature:
    - Use the exact label/value from the provided options (no abbreviations or alternatives).
    - If a feature is not mentioned or cannot be confidently identified, return "Unknown".

    ### Features to extract (allowed values follow each name, separated by "|"):
    {fields_formatted}

    ### Description:
    \"\"\"{description}\"\"\"

    ### Example output:
    {{
    "Scuderia Shields": "Yes",
    "6-CD Changer": "Unknown",
    "Seat Style": "Daytona",
    ...
    }}

    Remember:
    - Output all features, even if most are "Unknown".
    - Return only a JSON object — no prose, markdown, or explanation.
    """
        )
    )

    response = llm([system_message, user_message])
    raw_output = response.content.strip()

    # Models sometimes wrap JSON in a Markdown code fence despite instructions
    if raw_output.startswith("```"):
        raw_output = re.sub(r"^```[a-zA-Z]*\n?", "", raw_output)
        raw_output = re.sub(r"\n?```$", "", raw_output)

    try:
        parsed = json.loads(raw_output)
    except json.JSONDecodeError:
        print("⚠️ LLM output was not valid JSON. Output was:")
        print(raw_output)
        return {}

    # Valid JSON that is not an object (array, string, null) has no feature keys
    if not isinstance(parsed, dict):
        print("⚠️ LLM output was not a JSON object. Output was:")
        print(raw_output)
        return {}

    return validate_response(parsed, VALID_OPTIONS)


# ✅ Example usage: run one sample description through the extractor and
# pretty-print the validated feature dict as JSON.
if __name__ == "__main__":
    description = """
    In keeping with the vast majority of new 360 buyers, I prefer the second generation F1 automated manual gearbox...
    Options include front and rear Challenge style grilles for improved engine ventilation, Logo Scuderia Ferrari Shields on fenders, 6 CD changer, hi fi system and xenon headlights.
    The new Nouvalari Sport muffler is a little louder than OEM but offers a very well-tuned sound...
    There is no spare wheel.
    """

    features = extract_features_from_description(description)
    formatted = json.dumps(features, indent=2)
    print(formatted)


{
  "Seat Style": "Unknown",
  "Seat Adjustment": "Unknown",
  "Headrest Cavallino Logo": "Unknown",
  "Seat Contrast Stitching": "Unknown",
  "Seat Piping": "Unknown",
  "Body Colour": "Unknown",
  "Paint Finish": "Unknown",
  "Front Grille": "Unknown",
  "Rear Grille": "Unknown",
  "Scuderia Shields": "Yes",
  "Side Skirts Finish": "Unknown",
  "Wheel Type": "Unknown",
  "Brake Caliper Colour": "Unknown",
  "Dashboard & Console Trim": "Unknown",
  "Carpet Colour": "Unknown",
  "Leather Rear Shelf (Coupe only)": "Unknown",
  "Leather Roll Bar Hoops (Spider only)": "Unknown",
  "Leather Luggage Straps (Coupe only)": "Unknown",
  "Fire Extinguisher": "Unknown",
  "Hi-Fi Stereo": "Yes",
  "6-CD Changer": "Yes",
  "Subwoofer": "Unknown",
  "Satellite Navigation (Becker)": "Unknown",
  "Dedication Plate": "Unknown",
  "Leather Door Sills": "Unknown",
  "Leather Headliner": "Unknown",
  "Coloured Upper Dashboard": "Unknown",
  "Coloured Steering Wheel": "Unknown",
  "Special Stitching Colou

In [101]:
# Run the LLM extraction on every seller description in the listings DataFrame
results = df["Seller description"].apply(extract_features_from_description)

# Expand the Series of dicts into one column per feature. Reuse df's index so
# the column-wise concat below pairs each feature row with its listing
# regardless of how df is indexed.
features_df = pd.DataFrame(results.tolist(), index=df.index)

# Some feature names (e.g. "Transmission") are also scraped columns in df.
# Suffix the LLM-derived ones so the combined frame and the CSV do not end up
# with duplicate column names.
overlapping = [col for col in features_df.columns if col in df.columns]
features_df = features_df.rename(
    columns={col: f"{col} (from description)" for col in overlapping}
)

# Combine scraped fields and extracted features side by side
full_df = pd.concat([df, features_df], axis=1)

# Save to CSV
full_df.to_csv("ferrari_360_features_extracted.csv", index=False)