In [None]:
import logging
import os
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Add project root to path
# NOTE(review): assumes the kernel's working directory sits two levels below
# the project root — confirm against how this notebook is launched.
project_root = Path.cwd().parent.parent
if str(project_root) not in sys.path:
    # Index 0 puts the project root ahead of every other sys.path entry.
    sys.path.insert(0, str(project_root))

# Project-local modules; presumably resolved through the sys.path entry added
# above, so keep these imports after it.
from config import DATA_DIR_BRONZE, SCRAPING_CONFIG, ALL_LISTING_URLS_FILE
from elferspot_listings.utils.helpers import setup_logging, save_data

# Setup logging
# `logger` is the notebook-wide logger used by the cells and functions below.
logger = setup_logging(level='INFO')
logger.info("Data Gathering Notebook initialized")

## Configuration

Set up scraping parameters and user agent.

In [None]:
# Notebook-level scraping constants, read once from the shared project config
# so the functions below can refer to them by name.
BASE_URL = SCRAPING_CONFIG['base_url']
USER_AGENT = SCRAPING_CONFIG['user_agent']
TIMEOUT = SCRAPING_CONFIG['request_timeout']
DELAY = SCRAPING_CONFIG['delay_between_requests']

# Echo the active settings (the user agent is not printed).
print(
    f"Base URL: {BASE_URL}",
    f"Timeout: {TIMEOUT}s",
    f"Delay between requests: {DELAY}s",
    sep="\n",
)

## Step 1: Load or Define Listing URLs

Load existing listing URLs or define scraping targets.

In [None]:
# Build `listing_urls`: reuse the saved URL file when there is one, otherwise
# start from a short built-in list.
if not ALL_LISTING_URLS_FILE.exists():
    # Define initial URLs to scrape
    listing_urls = [
        f"{BASE_URL}/en/porsche-911-for-sale",
        f"{BASE_URL}/en/porsche-boxster-for-sale",
        f"{BASE_URL}/en/porsche-cayman-for-sale",
        # Add more URLs as needed
    ]
    logger.info(f"Using {len(listing_urls)} predefined URLs")
else:
    # The saved file is read as CSV and must contain a 'URL' column.
    urls_df = pd.read_csv(ALL_LISTING_URLS_FILE)
    listing_urls = urls_df['URL'].to_list()
    logger.info(f"Loaded {len(listing_urls)} URLs from {ALL_LISTING_URLS_FILE}")

print(f"Total URLs to process: {len(listing_urls)}")

## Step 2: Scraping Functions

Define functions to fetch and parse listing data.

In [None]:
def fetch_page(url: str, headers: Optional[dict] = None) -> Optional[str]:
    """
    Fetch the HTML content of ``url`` with a single GET request.

    Args:
        url: Address of the page to download.
        headers: HTTP headers to send. When omitted, only a ``User-Agent``
            header built from the notebook-level ``USER_AGENT`` is sent.

    Returns:
        The response body as text, or ``None`` when the request fails.
        Any ``requests.RequestException`` (including the one raised by
        ``raise_for_status()`` for an HTTP error status) is logged and
        swallowed, so callers must check for ``None``.
    """
    if headers is None:
        headers = {'User-Agent': USER_AGENT}

    try:
        response = requests.get(url, headers=headers, timeout=TIMEOUT)
        # Route HTTP error statuses through the same except-branch as
        # connection-level failures.
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None


def parse_listing(html: str, url: str) -> dict:
    """
    Build the record for one listing page.

    This is a template: the HTML is parsed, but no field is read from it yet,
    so every entry except ``URL`` and ``Scraped_At`` is returned as ``None``.

    Args:
        html: Raw HTML of the listing page.
        url: Address the HTML was fetched from; stored under ``'URL'``.

    Returns:
        Dictionary with ``'URL'``, ``'Scraped_At'`` (ISO-format local
        timestamp taken at call time) and one ``None`` placeholder per
        listing attribute.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Attributes still to be extracted; the tuple order is the key order of
    # the returned dict.
    unfilled_fields = (
        'Title',
        'Model',
        'Series',
        'Year of construction',
        'Mileage',
        'price',
        'currency',
        'Transmission',
        'Drive',
        'Exterior color',
        'Interior color',
        'Condition',
        'Car location',
        'Matching numbers',
        'Number of vehicle owners',
        'Paint-to-Sample (PTS)',
        'Ready to drive',
    )

    # This is a template - adjust selectors based on actual website structure
    listing_data = {
        'URL': url,
        'Scraped_At': datetime.now().isoformat(),
        **dict.fromkeys(unfilled_fields),
    }

    # Example parsing (adjust selectors for actual site); select_one() gives
    # None when nothing matches, so guard before reading .text:
    # title_tag = soup.select_one('.listing-title')
    # listing_data['Title'] = title_tag.text.strip() if title_tag else None
    # price_tag = soup.select_one('.price')
    # listing_data['price'] = price_tag.text.strip() if price_tag else None
    # ... etc

    return listing_data


def scrape_listings(urls: list, delay: float = 1.0) -> pd.DataFrame:
    """
    Fetch and parse each URL in turn and collect the results in a table.

    URLs whose fetch yields nothing (``fetch_page`` returned ``None`` or an
    empty string) are left out of the result.

    Args:
        urls: List of URLs to scrape
        delay: Seconds to sleep after each URL except the last one

    Returns:
        DataFrame with one row per successfully fetched listing
    """
    total = len(urls)
    records = []

    for position, url in enumerate(urls, start=1):
        logger.info(f"Scraping {position}/{total}: {url}")

        page_html = fetch_page(url)
        if page_html:
            records.append(parse_listing(page_html, url))

        # Pause between requests; nothing to wait for after the final URL.
        if position != total:
            time.sleep(delay)

    result = pd.DataFrame(records)
    logger.info(f"Successfully scraped {len(result)} listings")

    return result

## Step 3: Execute Scraping

**Note:** This is a template. Update the `parse_listing()` function with actual CSS selectors for your target website.

In [None]:
# Execute scraping (uncomment when ready)
# scraped_df = scrape_listings(listing_urls[:10], delay=DELAY)  # Start with first 10
# display(scraped_df.head())

# Stand-in rows used while the real scrape above is disabled: two listings,
# one list of values per column, with both rows sharing one timestamp.
sample_data = {
    'URL': ['https://example.com/listing1', 'https://example.com/listing2'],
    'Title': ['Porsche 911 Carrera', 'Porsche Boxster S'],
    'Model': ['911', 'Boxster'],
    'Series': ['991', '987'],
    'Year of construction': [2015, 2008],
    'Mileage': ['45,000 km', '72,000 km'],
    'price': [85000, 35000],
    'currency': ['EUR', 'EUR'],
    'Transmission': ['Manual', 'Manual'],
    'Scraped_At': [datetime.now().isoformat()] * 2
}

# Column-oriented dict -> DataFrame (columns follow the dict's key order).
scraped_df = pd.DataFrame.from_dict(sample_data)
print(f"Scraped {len(scraped_df)} listings")
scraped_df.head()

## Step 4: Save to Bronze Layer

Save raw scraped data to the Bronze layer (raw data zone).

In [None]:
# Define output path
# The file name carries a second-resolution timestamp taken when this cell runs.
bronze_file = DATA_DIR_BRONZE / f"listings_bronze_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"

# Save using utility function
save_data(scraped_df, bronze_file)

# The check mark below was previously stored as mis-decoded UTF-8 bytes.
print(f"✓ Saved {len(scraped_df)} listings to Bronze layer")
print(f"  Location: {bronze_file}")

## Summary

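- **Listings scraped:** the value of `len(scraped_df)`, printed by the save cell above
- **Bronze file:** the value of `bronze_file.name`; the full path is printed by the save cell above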

**Next Step:** Run `02_bronze_to_silver.ipynb` to clean and standardize this data.