In [18]:
import requests 
from bs4 import BeautifulSoup
import csv
import re
import time
import random

def extract_car_info(url):
    """
    Extract the listing ID and a display name from a Moteur.ma detail URL.

    Example URL: https://www.moteur.ma/fr/voiture/achat-voiture-occasion/detail-annonce/455791/mercedes-benz-220-205.html

    Args:
        url: Listing URL expected to contain
            ``/detail-annonce/<digits>/<slug>.html`` (matched case-insensitively).

    Returns:
        ``{'id': <digits as str>, 'name': <title-cased slug>}`` on success, or
        ``{'error': <message>, 'url': url}`` when the URL does not match the
        expected format or is not a string.
    """
    pattern = r'/detail-annonce/(\d+)/([a-z0-9\-]+)\.html'
    try:
        match = re.search(pattern, url, re.IGNORECASE)
    except TypeError as e:
        # Non-string input (e.g. None): report it the same way as a bad URL.
        return {'error': str(e), 'url': url}

    if not match:
        return {
            'error': "URL format is incorrect or missing expected data.",
            'url': url
        }

    car_id, slug = match.groups()
    # Slugs can carry trailing or doubled hyphens (e.g. "audi-a3-"); dropping
    # the empty segments keeps stray spaces out of the display name.
    car_name = ' '.join(part for part in slug.split('-') if part).title()

    return {
        'id': car_id,
        'name': car_name
    }

def scrape_car_details(car_url, headers, timeout=30):
    """
    Fetch a car listing page and collect its details into a flat dict.

    Args:
        car_url: URL of the listing's detail page.
        headers: HTTP headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole crawl.

    Returns:
        - A dict with ``'id'``, ``'name'`` and ``'url'`` plus one entry per
          ``div.detail_line`` (first span's text -> second span's text), and
          ``'Prix'`` / ``'Localisation'`` when ``span.price`` /
          ``span.location`` are present.
        - An empty dict when the page responds with a non-200 status.
        - ``{'id': 'error', 'name': 'error', 'url': car_url, 'error': <msg>}``
          when any exception (including a timeout) is raised.
    """
    try:
        car_page_response = requests.get(car_url, headers=headers, timeout=timeout)
        if car_page_response.status_code != 200:
            print(f"Failed to access car page: {car_url}")
            return {}

        soup_car_page = BeautifulSoup(car_page_response.text, 'html.parser')

        # Seed the record with what can be read from the URL itself; fall back
        # to 'Unknown' when the URL does not match the expected format.
        basic_info = extract_car_info(car_url)
        car_data = {
            'id': basic_info.get('id', 'Unknown'),
            'name': basic_info.get('name', 'Unknown'),
            'url': car_url
        }

        # Each detail line is treated as a label/value pair held in its first
        # two <span> elements; lines with fewer spans are skipped.
        for item in soup_car_page.find_all('div', class_='detail_line'):
            spans = item.find_all('span')
            if len(spans) >= 2:
                key = spans[0].get_text(strip=True)
                value = spans[1].get_text(strip=True)
                car_data[key] = value

        # Get price if available
        price_elem = soup_car_page.find('span', class_='price')
        if price_elem:
            car_data['Prix'] = price_elem.get_text(strip=True)

        # Get location if available
        location_elem = soup_car_page.find('span', class_='location')
        if location_elem:
            car_data['Localisation'] = location_elem.get_text(strip=True)

        return car_data
    except Exception as e:
        # Best-effort boundary: one bad listing must not abort the crawl, so
        # the failure is reported and returned as a placeholder record.
        print(f"Error extracting car info from {car_url}: {e}")
        return {'id': 'error', 'name': 'error', 'url': car_url, 'error': str(e)}

def write_to_csv(car_data_list, filename):
    """
    Write car records to a UTF-8 CSV file.

    The header is ``id``, ``name``, ``url`` followed by every other key seen
    in any record, sorted alphabetically. The ``error`` key is never written.
    Records missing a column get an empty cell.

    The input dicts are not modified, so callers keep any ``error`` details
    after saving (this matters when the same list is saved more than once).

    Args:
        car_data_list: List of dicts, one per car. When empty, nothing is
            written and a message is printed instead.
        filename: Path of the CSV file to create or overwrite.
    """
    if not car_data_list:
        print("No data to write to CSV")
        return

    # id, name, url always lead the header; the rest is the sorted union of
    # all other keys across the records, minus the internal 'error' marker.
    priority_fields = ['id', 'name', 'url']
    other_fields = set()
    for car in car_data_list:
        other_fields.update(car)
    other_fields.difference_update(priority_fields)
    other_fields.discard('error')
    fieldnames = priority_fields + sorted(other_fields)

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        # restval fills columns a record lacks; extrasaction='ignore' drops
        # 'error' on output without deleting it from the caller's dicts.
        writer = csv.DictWriter(
            file,
            fieldnames=fieldnames,
            restval='',
            extrasaction='ignore',
        )
        writer.writeheader()
        writer.writerows(car_data_list)

    print(f"Successfully wrote {len(car_data_list)} car listings to {filename}")

def main():
    """
    Crawl the Moteur.ma used-car listing pages and save the results as CSV.

    Starting from the listing index, each page's car links are followed and
    scraped with ``scrape_car_details``. Pagination follows the last link
    inside the ``li.page.pagination`` element. The crawl stops when there is
    no next page, a page was already visited, a page fails to load or has no
    listings, or ``max_pages`` pages have been requested. Collected records
    are saved periodically to ``car_data_partial_*.csv`` and finally to
    ``car_data_final.csv``.
    """
    # Local import keeps this function self-contained; urljoin resolves
    # relative hrefs against the page they were found on.
    from urllib.parse import urljoin

    url = "https://www.moteur.ma/fr/voiture/achat-voiture-occasion"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    request_timeout = 30  # seconds; avoids hanging forever on a stalled server

    all_car_data = []  # Store car data to write at once after determining all fields
    visited_urls = set()  # Listing pages already requested
    seen_car_urls = set()  # Car detail URLs already scraped

    current_page = url
    page_num = 1
    pages_processed = 0  # Pages whose listings were actually scraped
    max_pages = 50  # Limit to prevent infinite loops, adjust as needed

    while page_num <= max_pages:
        print(f"Scraping page {page_num}: {current_page}")

        # Check if we've already visited this page
        if current_page in visited_urls:
            print(f"Already visited {current_page}, stopping to avoid duplicates")
            break

        visited_urls.add(current_page)

        try:
            response = requests.get(current_page, headers=headers, timeout=request_timeout)

            if response.status_code != 200:
                print(f"Failed to retrieve page {page_num}. Status code: {response.status_code}")
                break

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all car listings on the page
            products = soup.find_all('div', class_='row bloc-info')

            if not products:
                print(f"No car listings found on page {page_num}")
                break

            print(f"Found {len(products)} car listings on page {page_num}")

            # Process each car listing
            for product in products:
                url_tag = product.find('a', class_='slide')
                if not (url_tag and url_tag.has_attr('href')):
                    continue

                car_url = urljoin(current_page, url_tag['href'])
                # A listing that shows up again (same or later page) is
                # scraped only once, so the CSV has no duplicate rows.
                if car_url in seen_car_urls:
                    continue
                seen_car_urls.add(car_url)

                print(f"Scraping car: {car_url}")

                # Extract car information
                car_data = scrape_car_details(car_url, headers)

                if car_data:
                    all_car_data.append(car_data)

            pages_processed += 1

            # Pagination: the last link in the pagination element is taken
            # to be the "next" button.
            next_page = None
            pagination = soup.find('li', class_='page pagination')

            if pagination:
                pagination_links = pagination.find_all('a')

                if pagination_links:
                    next_button = pagination_links[-1]

                    # Make sure it's not disabled and has href
                    parent_li = next_button.parent
                    if next_button.has_attr('href') and not (parent_li and 'disabled' in parent_li.get('class', [])):
                        next_page = urljoin(current_page, next_button['href'])

                        # Verify this is not a page we've already seen
                        if next_page in visited_urls:
                            print("Next page already visited, stopping pagination")
                            next_page = None

            if not next_page:
                print("No next page found or all pages visited. Ending scraping.")
                break

            current_page = next_page
            page_num += 1

            # Intermediate save to avoid losing data. page_num has already
            # been advanced, so this fires after pages 1, 3, 5, ...; each
            # file holds every record collected so far, not just two pages.
            if page_num % 2 == 0 and all_car_data:
                write_to_csv(all_car_data, f'car_data_partial_{page_num-2}_to_{page_num-1}.csv')
                print(f"Saved {len(all_car_data)} cars to intermediate file")

        except Exception as e:
            # Top-level boundary: report the failure, stop crawling, and
            # still write whatever was collected below.
            print(f"Error on page {page_num}: {e}")
            break

    # Write final results to CSV
    if all_car_data:
        write_to_csv(all_car_data, 'car_data_final.csv')

    # Report pages actually scraped; page_num would over-count when the
    # max_pages cap is reached or when the last requested page failed.
    print(f"Scraping completed. Processed {pages_processed} pages and {len(all_car_data)} car listings.")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Scraping page 1: https://www.moteur.ma/fr/voiture/achat-voiture-occasion
Found 30 car listings on page 1
Scraping car: https://www.moteur.ma/fr/voiture/achat-voiture-occasion/detail-annonce/448014/audi-a3-.html
Scraping car: https://www.moteur.ma/fr/voiture/achat-voiture-occasion/detail-annonce/416090/audi-q8-.html
Scraping car: https://www.moteur.ma/fr/voiture/achat-voiture-occasion/detail-annonce/416133/porsche-cayenne-.html
Scraping car: https://www.moteur.ma/fr/voiture/achat-voiture-occasion/detail-annonce/452570/porsche-cayenne-.html
Scraping car: https://www.moteur.ma/fr/voiture/achat-voiture-occasion/detail-annonce/452571/audi-q8-.html
Scraping car: https://www.moteur.ma/fr/voiture/achat-voiture-occasion/detail-annonce/457295/land-rover-range-rover-vogue-.html
Scraping car: https://www.moteur.ma/fr/voiture/achat-voiture-occasion/detail-annonce/457296/mercedes-benz-classe-gle-.html
Scraping car: https://www.moteur.ma/fr/voiture/achat-voiture-occasion/detail-annonce/460113/mercede