In [1]:
import os
import pandas as pd
from datetime import datetime
import re
from bs4 import BeautifulSoup
import requests

# Function to save scraped data to a CSV file
def save_data(scraped_data_list, filename='cochesdotcom_detail_pages_2024-05-13.csv'):
    """Append scraped rows to a CSV file, keeping values under the right header.

    The scraped dictionaries do not all share the same keys, so a plain
    header-less append would write values under the wrong columns. Each batch
    is therefore aligned with the header already present in the file; when a
    batch introduces columns the file does not have yet, the file is rewritten
    once with the widened header.

    Args:
        scraped_data_list: List of dictionaries, one per scraped page. An
            empty list (or None) makes the call a no-op.
        filename: Path of the CSV file to create or extend.

    Side effects:
        Creates or updates ``filename`` and prints a confirmation message.
        If a module-level variable ``last_scraped_row`` exists, its value is
        written to ``last_scraped_row.txt``; if it does not exist, that step
        is skipped instead of raising ``NameError``.
    """
    if not scraped_data_list:
        return

    new_rows = pd.DataFrame(scraped_data_list)
    # Evaluate the file state once so mode and header cannot disagree.
    file_has_content = os.path.exists(filename) and os.path.getsize(filename) > 0

    if not file_has_content:
        new_rows.to_csv(filename, index=False, mode='w', header=True)
    else:
        # nrows=0 reads only the header line, which is cheap even for big files.
        existing_columns = list(pd.read_csv(filename, nrows=0).columns)
        unseen_columns = [
            column for column in new_rows.columns if column not in existing_columns
        ]
        if not unseen_columns:
            # Same (or fewer) columns: reorder to the header and append.
            new_rows.reindex(columns=existing_columns).to_csv(
                filename, index=False, mode='a', header=False
            )
        else:
            # New columns appeared: the header must grow, so rewrite the file.
            # Reading everything as text keeps values such as "15.900" from
            # being re-interpreted as floats on the round trip.
            existing_rows = pd.read_csv(filename, dtype=str, keep_default_na=False)
            combined = pd.concat([existing_rows, new_rows], ignore_index=True, sort=False)
            temp_filename = f"{filename}.tmp"
            combined.to_csv(temp_filename, index=False)
            # Swap the file in one step so an interruption cannot truncate it.
            os.replace(temp_filename, filename)
    print(f"Data saved to {filename}.")

    # The resume position lives in a module-level variable maintained by the
    # calling script; only persist it when that variable actually exists.
    row_marker = globals().get('last_scraped_row')
    if row_marker is not None:
        with open('last_scraped_row.txt', 'w') as file:
            file.write(str(row_marker))

# Function to scrape data from detail_page URL
def _first_price_number(text):
    """Return the first run of digits and dots in *text*, or None if absent."""
    match = re.search(r'([\d\.]+)', text or '')
    return match.group(1) if match else None


def scrape_detail_page(url, timeout=30):
    """Scrape one detail page into a flat dictionary.

    Args:
        url: Address of the detail page to download.
        timeout: Seconds to wait for the server before giving up, so a single
            stalled connection cannot block the whole run.

    Returns:
        A dictionary that may contain ``make``, ``model``, ``cash`` and
        ``fin`` plus one entry per feature and per specification-table row
        found on the page (keys are the titles shown on the page). Sections
        that are missing from the page are simply left out. Returns None if
        the download or parsing fails; the error is printed.
    """
    try:
        # Set up HTTP request with user-agent header
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'}
        res = requests.get(url, headers=headers, timeout=timeout)
        # Treat HTTP error statuses as failures instead of parsing an error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'lxml')

        scraped_data = {}

        # Make and model are the last two entries of the breadcrumbs navigation.
        breadcrumbs = soup.find('nav', class_='Breadcrumbs-module_breadcrumbs__ktfbh font-body-s')
        if breadcrumbs:
            crumbs = breadcrumbs.find_all(class_='TextButton-module_content__eBnG1')
            if len(crumbs) >= 2:
                scraped_data['make'] = crumbs[-2].text.strip()
                scraped_data['model'] = crumbs[-1].text.strip()

        # Cash price is the first price label, financing price the second (if any).
        price_block = soup.find('div', class_='ClassifiedPriceBlock_classified-price-block__SnJtL Container-module_container__JMoiT Container-module_is-full-width__oqQVQ')
        if price_block:
            prices = price_block.find_all('strong', class_='PriceLabel_copy__4E1Gm font-heading-m font-bold')
            if prices:
                financing_price_text = prices[1].text.strip() if len(prices) > 1 else ""
                # A label without any digits yields None rather than an
                # exception that would discard the whole page.
                scraped_data['cash'] = _first_price_number(prices[0].text.strip())
                scraped_data['fin'] = _first_price_number(financing_price_text)

        # Car features: each list item holds a small title and a value span.
        feature_list = soup.find('ul', class_='Grid-module_grid__h49fk CarOverview-module_car-overview__ysD5f CarOverview-module_background-color-neutral-5__GrRlH CarOverview-module_has-padding__N9hjr CarProfileCarDetails_car-over-view-details__DywRB')
        if feature_list:
            for item in feature_list.find_all('li', class_='Grid-module_grid__h49fk'):
                paragraph = item.find('p', class_='FeatureText-module_feature-text__BpFHF font-body-m FeatureText-module_align-left__bm1gj FeatureText-module_vertical-align-center__kK9Mq color-inherit FeatureText-module_is-center-on-mobile__HHn4A font-bold FeatureText-module_has-icon-on-mobile__3bhQE CarOverview-module_feature-text__DHOtg')
                if paragraph is None:
                    continue
                wrapper = paragraph.find('span', class_='FeatureText-module_wrapper__nGlYu')
                if wrapper is None:
                    continue
                title_tag = wrapper.find('small', class_='FeatureText-module_eyebrow__UCQaZ font-regular font-body-xs color-inherit FeatureText-module_is-eyebrow-upper-case__L8oNT')
                value_tag = wrapper.find('span')
                # Skip a malformed item instead of losing the entire page.
                if title_tag is None or value_tag is None:
                    continue
                scraped_data[title_tag.text.strip()] = value_tag.text.strip()

        # Technical data: every two-cell table row becomes a title/value pair.
        for table in soup.find_all('table', class_='SpecsDetailTable-module_table__G60yw'):
            for spec in table.find_all('tr', class_='SpecsDetailTableRow-module_tr__MbBui'):
                cells = spec.find_all('td', class_='SpecsDetailTableCell-module_td__b-Z4y font-body-s')
                if len(cells) == 2:
                    scraped_data[cells[0].text.strip()] = cells[1].text.strip()

        return scraped_data
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller move on.
        print(f"Error scraping URL {url}: {str(e)}")
        return None
        
# Upper bound on the number of URL rows processed in a single run.
MAX_ROWS_PER_RUN = 30000
# Number of processed rows between intermediate saves.
CHECKPOINT_INTERVAL = 500

# Defined before the try block so the exception handlers can always use them.
scraped_data_list = []
last_scraped_row = None  # stays None until the resume position has been read


def _persist_progress():
    """Flush pending rows to the CSV and record the row to resume from."""
    if scraped_data_list:
        save_data(scraped_data_list)
        scraped_data_list.clear()
    # Never overwrite an existing checkpoint before it has been read.
    if last_scraped_row is not None:
        with open('last_scraped_row.txt', 'w') as file:
            file.write(str(last_scraped_row))


try:
    df_urls = pd.read_csv('cochesdotcom_urls.csv')
    try:
        with open('last_scraped_row.txt', 'r') as file:
            last_scraped_row = int(file.read().strip())
    except FileNotFoundError:
        last_scraped_row = 0

    start_row = last_scraped_row
    end_row = min(start_row + MAX_ROWS_PER_RUN, len(df_urls))
    urls = df_urls['url']

    for row in range(start_row, end_row):
        url = urls.iloc[row]
        if pd.isna(url):
            # Empty rows still count as processed; otherwise the resume
            # position would drift and rows would be scraped twice.
            last_scraped_row = row + 1
            continue

        print(f"Scraping URL: {url}")
        scraped_data = scrape_detail_page(url)
        # Advance only after the page was attempted, so an interrupted row
        # is retried on the next run.
        last_scraped_row = row + 1
        print(last_scraped_row)
        if scraped_data:
            scraped_data['Date'] = datetime.now().strftime('%Y-%m-%d')
            scraped_data['URL'] = url
            scraped_data_list.append(scraped_data)

        # Checkpoint every CHECKPOINT_INTERVAL rows processed in this run
        if (row + 1 - start_row) % CHECKPOINT_INTERVAL == 0 and scraped_data_list:
            save_data(scraped_data_list)
            scraped_data_list.clear()  # Clear the list after saving
            print(f"{CHECKPOINT_INTERVAL} rows saved")

    _persist_progress()  # Final save
    print("final save")
    print(f"Scraping complete. Last_scraped_row: {last_scraped_row}")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    _persist_progress()  # Save any remaining data before exiting

except KeyboardInterrupt:
    print("KeyboardInterrupt error occurred")
    _persist_progress()  # Save any remaining data before exiting

final save
Scraping complete. Last_scraped_row: 77742
