In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import re

# Pattern used to pull the first run of digits and dots out of a price label.
_PRICE_PATTERN = re.compile(r'([\d\.]+)')

# Seconds to wait on the server before giving up on a single detail page;
# without a timeout a stalled connection would block the whole run.
_REQUEST_TIMEOUT = 30


def _extract_price(text):
    """Return the first digits-and-dots run in *text*, or None if there is none."""
    match = _PRICE_PATTERN.search(text) if text else None
    return match.group(1) if match else None


# Function to scrape data from detail_page URL
def scrape_detail_page(url):
    """Download one detail page and collect its fields into a dictionary.

    The page is fetched with a browser-like user-agent and parsed with
    BeautifulSoup ('lxml'). Four sections are looked up by their CSS classes;
    any section that is missing is simply skipped:

    * breadcrumbs -> keys 'make' and 'model' (the last two breadcrumb entries)
    * price block -> keys 'cash' and 'fin' (numeric part of each label as a
      string, or None when the label is absent or contains no digits)
    * overview list -> one key per item, named after the item's small title
    * specification tables -> one key per two-cell row, named after the first cell

    Args:
        url: Address of the detail page to fetch.

    Returns:
        A dict with the fields that were found (possibly empty), or None when
        the request or the parsing raised an exception. The error is printed.
    """
    try:
        # Send a request with a valid user-agent header
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'}
        res = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
        # Treat HTTP error responses as failures instead of parsing an error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'lxml')

        scraped_data = {}

        # Make and model: taken from the last two entries of the breadcrumb trail
        make_and_model = soup.find('nav', class_='Breadcrumbs-module_breadcrumbs__ktfbh font-body-s')
        if make_and_model is not None:
            crumbs = make_and_model.find_all(class_='TextButton-module_content__eBnG1')
            if len(crumbs) >= 2:
                scraped_data['make'] = crumbs[-2].text.strip()
                scraped_data['model'] = crumbs[-1].text.strip()

        # Prices: first label is stored as 'cash', second (if any) as 'fin'
        price_block = soup.find('div', class_='ClassifiedPriceBlock_classified-price-block__SnJtL Container-module_container__JMoiT Container-module_is-full-width__oqQVQ')
        if price_block is not None:
            prices = price_block.find_all('strong', class_='PriceLabel_copy__4E1Gm font-heading-m font-bold')
            if prices:
                cash_price_text = prices[0].text.strip()
                financing_price_text = prices[1].text.strip() if len(prices) > 1 else ""
                # A label without digits yields None rather than raising and
                # discarding every other field of the page.
                scraped_data['cash'] = _extract_price(cash_price_text)
                scraped_data['fin'] = _extract_price(financing_price_text)

        # Overview list (año, kms, combustible, cambio, potencia, etc.)
        attribute_list = soup.find('ul', class_='Grid-module_grid__h49fk CarOverview-module_car-overview__ysD5f CarOverview-module_background-color-neutral-5__GrRlH CarOverview-module_has-padding__N9hjr CarProfileCarDetails_car-over-view-details__DywRB')
        if attribute_list is not None:
            for attribute in attribute_list.find_all('li', class_='Grid-module_grid__h49fk'):
                paragraph = attribute.find('p', class_='FeatureText-module_feature-text__BpFHF font-body-m FeatureText-module_align-left__bm1gj FeatureText-module_vertical-align-center__kK9Mq color-inherit FeatureText-module_is-center-on-mobile__HHn4A font-bold FeatureText-module_has-icon-on-mobile__3bhQE CarOverview-module_feature-text__DHOtg')
                if paragraph is None:
                    continue
                wrapper = paragraph.find('span', class_='FeatureText-module_wrapper__nGlYu')
                if wrapper is None:
                    continue
                title_tag = wrapper.find('small', class_='FeatureText-module_eyebrow__UCQaZ font-regular font-body-xs color-inherit FeatureText-module_is-eyebrow-upper-case__L8oNT')
                value_tag = wrapper.find('span')
                # Skip a malformed item instead of losing the whole page.
                if title_tag is None or value_tag is None:
                    continue
                scraped_data[title_tag.text.strip()] = value_tag.text.strip()

        # Datos técnicos: every two-cell row of every specification table
        for table in soup.find_all('table', class_='SpecsDetailTable-module_table__G60yw'):
            for spec in table.find_all('tr', class_='SpecsDetailTableRow-module_tr__MbBui'):
                cells = spec.find_all('td', class_='SpecsDetailTableCell-module_td__b-Z4y font-body-s')
                if len(cells) == 2:
                    scraped_data[cells[0].text.strip()] = cells[1].text.strip()

        return scraped_data
    except Exception as e:
        print(f"Error scraping URL {url}: {str(e)}")
        return None

# Files read and written by a run; named once so every exit path agrees.
URLS_FILE = 'cochesdotcom_urls.csv'
OUTPUT_FILE = 'cochesdotcom_detail_pages_2024-05-13.csv'
CHECKPOINT_FILE = 'last_scraped_row.txt'

# Maximum number of rows handled per run, to not overload the website and
# avoid being banned.
MAX_URLS_PER_RUN = 5000


def _append_rows_to_csv(rows, path):
    """Add the dictionaries in *rows* to the CSV at *path*, keeping columns aligned.

    Different pages yield different keys, so a new batch may not share the
    column set or order of the rows already on disk. When the batch fits the
    existing header it is reordered and appended; when it brings new columns
    the file is rewritten with the union of columns.
    """
    batch = pd.DataFrame(rows)

    if not os.path.exists(path) or os.path.getsize(path) == 0:
        batch.to_csv(path, index=False)
        return

    header = list(pd.read_csv(path, nrows=0).columns)
    if all(column in header for column in batch.columns):
        batch.reindex(columns=header).to_csv(path, index=False, mode='a', header=False)
        return

    # New columns appeared: reload as plain text (so stored values are not
    # reinterpreted as numbers or NaN), merge, and swap the file in one step.
    existing = pd.read_csv(path, dtype=str, keep_default_na=False)
    combined = pd.concat([existing, batch], ignore_index=True)
    temp_path = path + '.tmp'
    combined.to_csv(temp_path, index=False)
    os.replace(temp_path, path)


def _save_progress(rows, next_row):
    """Persist *rows* to OUTPUT_FILE, then store *next_row* in CHECKPOINT_FILE.

    The checkpoint is written after the data, so a failed save leaves the old
    checkpoint in place and the affected rows are scraped again next run.
    """
    _append_rows_to_csv(rows, OUTPUT_FILE)
    with open(CHECKPOINT_FILE, 'w') as file:
        file.write(str(next_row))


# Defined before the try block so the cleanup below can always rely on them,
# even when reading the URL list or the checkpoint fails.
scraped_data_list = []
last_scraped_row = 0
outcome = "after exception"

try:
    # Read the CSV file containing the URLs
    df_urls = pd.read_csv(URLS_FILE)

    # Read the last scraped row number or set it to 0 if it doesn't exist
    try:
        with open(CHECKPOINT_FILE, 'r') as file:
            last_scraped_row = int(file.read().strip())
    except FileNotFoundError:
        last_scraped_row = 0

    # The run covers a fixed window of rows starting at the checkpoint; the
    # window is computed once so that advancing the checkpoint cannot move it.
    start_row = last_scraped_row
    urls_this_run = df_urls['url'].iloc[start_row:start_row + MAX_URLS_PER_RUN]

    for i, url in enumerate(urls_this_run, start=start_row):
        # Skip empty URLs, but still count the row so the checkpoint keeps
        # matching the row index in the URL file.
        if pd.isna(url):
            last_scraped_row = i + 1
            continue

        try:
            print(f"Scraping URL: {url}")
            scraped_data = scrape_detail_page(url)
            if scraped_data:
                scraped_data['Date'] = datetime.now().strftime('%Y-%m-%d')
                scraped_data['URL'] = url
                scraped_data_list.append(scraped_data)
        except Exception as e:
            print(f"Error scraping URL {url}: {str(e)}")

        # Row i is done (successfully or not); resume from the next one.
        last_scraped_row = i + 1
        print(last_scraped_row)

    outcome = "without interruption"
except KeyboardInterrupt:
    print('KeyboardInterrupt occurred')
    outcome = "after KeyboardInterrupt"
except Exception as e:
    print(f"An error occurred: {str(e)}")
finally:
    # Whatever ended the run, keep what was collected. The checkpoint only
    # advances when data was saved, so a run that collected nothing (for
    # example because every request failed) is retried from the same row.
    if scraped_data_list:
        _save_progress(scraped_data_list, last_scraped_row)
    print(f"{len(scraped_data_list)} rows saved to {OUTPUT_FILE} {outcome}")
    

Scraping URL: https://www.coches.com/coches-segunda-mano/ocasion-mercedes-clase-glc-258-glc-coupe-350d-4matic-aut-en-alicante.htm?id=kOEAUj6wuWBQ
39824
Scraping URL: https://www.coches.com/coches-segunda-mano/ocasion-mercedes-clase-a-163-a-200-en-madrid.htm?id=6x3Dz7daCG5X
39825
Data saved to cochesdotcom_detail_pages_2024-05-13.csv.
100 rows saved
Scraping URL: https://www.coches.com/coches-segunda-mano/ocasion-mercedes-clase-b-136-b-180-7g-dct-en-madrid.htm?id=D40hNffVfpzV
39826
Scraping URL: https://www.coches.com/coches-segunda-mano/ocasion-mercedes-clase-a-190-a-220d-progressive-line-advanced-8g-dct-en-barcelona.htm?id=fkD_pVPF8cEC
39827
Scraping URL: https://www.coches.com/coches-segunda-mano/ocasion-mercedes-clase-gla-177-gla-220d-amg-line-7g-dct-en-barcelona.htm?id=yD9592VPUe4c
39828
Scraping URL: https://www.coches.com/coches-segunda-mano/ocasion-mercedes-clase-a-163-a-200-progressive-line-advanced-7g-dct-en-barcelona.htm?id=fmznAASVQWFY
39829
Scraping URL: https://www.coches.

KeyboardInterrupt: 