In [None]:
import time
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import pandas as pd

def _validate_scrape_args(start, end, base_url):
    '''
    Validates the arguments of extract_vehicle_data before any browser is launched.

    Parameters:
        start (int):The first page to scrape.
        end (int):The last page to scrape.
        base_url (str):The base URL to which the page number is appended.

    Raises:
        TypeError:If "start" or "end" is not an integer, or "base_url" is not a string.
        ValueError:If "start" is not positive, "end" is smaller than "start", or "base_url" does not match the expected pattern.
    '''
    if not isinstance(start, int):
        raise TypeError('The "start" parameter is not an integer.')
    if not isinstance(end, int):
        raise TypeError('The "end" parameter is not an integer.')
    if not isinstance(base_url, str):
        raise TypeError('The "base_url" parameter is not a string.')
    if start <= 0:
        raise ValueError('The "start" parameter has to be greater than 0.')
    # start == end is accepted so that a single page can be scraped.
    if end < start:
        raise ValueError('The "end" parameter cannot be smaller than the "start" parameter.')
    # Raw string so that "\." reaches the regex engine as an escaped dot.
    if not re.match(r'https://www\.milanuncios\.com/.+pagina=$', base_url):
        raise ValueError('The "base_url" parameter does not match the expected regex.')


def _parse_vehicle_card(html):
    '''
    Extracts the vehicle features from the inner HTML of a single ad card.

    Parameters:
        html (str):Inner HTML of one "ma-AdCard-body" element.

    Returns:
        vehicle(dict):The extracted features, or None when any of them is missing (a message is printed in that case).
    '''
    # The parser is named explicitly so the result does not depend on which optional parsers are installed.
    soup = BeautifulSoup(html, 'html.parser')

    # Certain features are not listed in a structured format; they only appear as free-form tag labels.
    tags = [x.text for x in soup.find_all('span', {'class': 'ma-AdTag-label'})]

    title_elem = soup.find('h2')

    # Any missing feature makes the row unusable, so the whole card is skipped instead of aborting the scrape.
    try:
        return {
            'title': title_elem.text,
            'location': soup.find('a', {'class': 'ma-AdCard-subtitleLink'}).text.split(' en ')[1],
            'url': soup.find('a', {'class': 'ma-AdCard-titleLink'}, href=True)['href'],
            'desc': soup.find('p', {'class': 'ma-AdCardDescription-text'}).text,
            'price': soup.find('span', {'class': 'ma-AdPrice-value'}).text,
            # The first tag label is stored as the seller. NOTE(review): assumes the seller type is always the first tag - confirm.
            'seller': tags[0],
            'mileage': next(x for x in tags if x.endswith(('kms', 'km'))),
            'year': next(x for x in tags if x.isdigit()),
            'transmission': next(x for x in tags if x in ('Manual', 'Automático')),
            'doors': next(x for x in tags if x.endswith('puertas')),
            'hp': next(x for x in tags if x.endswith('CV'))
        }
    except (AttributeError, IndexError, StopIteration, TypeError):
        # AttributeError/TypeError: an expected element was not found (None).
        # IndexError: no tags at all, or no " en " separator in the subtitle.
        # StopIteration: no tag matched one of the expected features.
        title = title_elem.text if title_elem is not None else 'unknown title'
        print('Found a problem when gathering information for: [', title, '] Skipping...')
        return None


def extract_vehicle_data(start=1, end=50, base_url='https://www.milanuncios.com/coches-de-segunda-mano/?pagina='):
    '''
    Connects to Milanuncios and extracts a list of predefined features from each vehicle listed.

    Parameters:
        start (int):The first page over which we want to start scraping.
        end (int):The last page on which we want to stop scraping (inclusive).
        base_url (str):The base URL to which the page number is appended.

    Returns:
        vehicle_list(list):List of dictionaries containing information about all the vehicles listed within the execution.

    Raises:
        TypeError:If "start" or "end" is not an integer, or "base_url" is not a string.
        ValueError:If the page range is invalid or "base_url" does not match the expected pattern.
    '''
    # Fail fast: reject bad arguments before a browser window is opened.
    _validate_scrape_args(start, end, base_url)

    # Imported here because the file-level imports do not provide the locator constants.
    from selenium.webdriver.common.by import By

    # Create an instance of the Chrome web driver
    browser = webdriver.Chrome()

    vehicle_list = []
    try:
        # Iterate over each requested page, "end" included
        for page in range(start, end + 1):
            # Form the real URL appending the string value of the current page
            browser.get(base_url + str(page))

            # Wait a little for the website to load
            time.sleep(1)

            # If we're on the first page, dismiss the cookies pop-up
            if page == start:
                browser.find_element(
                    By.CSS_SELECTOR,
                    'button.sui-AtomButton.sui-AtomButton--primary.sui-AtomButton--solid.sui-AtomButton--center'
                ).click()

            # Calculate the height of the website
            total_height = int(browser.execute_script("return document.body.scrollHeight"))

            # Scroll down in 50px steps until the bottom (presumably so that lazily loaded cards get rendered)
            for offset in range(1, total_height, 50):
                browser.execute_script("window.scrollTo(0, {});".format(offset))

            # Iterate over each of the ad cards found on the page
            for post in browser.find_elements(By.CLASS_NAME, "ma-AdCard-body"):

                # Sometimes you'll also capture sponsored ads. Skip them.
                if 'OFERTA PATROCINADA' in post.find_element(By.XPATH, '..').text:
                    print('Skipping ad...')
                    continue

                # Parse the card with BS4; incomplete cards come back as None
                vehicle = _parse_vehicle_card(post.get_attribute('innerHTML'))
                if vehicle is not None:
                    vehicle_list.append(vehicle)
    finally:
        # Always release the browser process, even when the scrape fails midway.
        browser.quit()

    return vehicle_list

def write_csv_output(vehicle_list, output_name):
    '''
    Takes a list of dictionaries containing vehicle information and writes it in csv format.

    Parameters:
        vehicle_list (list):List of dictionaries containing vehicle information.
        output_name (str):The name of the output csv file, without the ".csv" extension (it is appended here).

    Returns:
        None

    Raises:
        TypeError:If "vehicle_list" is not a list or "output_name" is not a string.
    '''
    # Raise instead of printing: with invalid arguments nothing should be written to disk.
    if not isinstance(vehicle_list, list):
        raise TypeError('The "vehicle_list" parameter is not a list.')
    if not isinstance(output_name, str):
        raise TypeError('The "output_name" parameter is not a string.')

    # Define a Pandas DataFrame containing the vehicle information
    vehicle_list_df = pd.DataFrame(vehicle_list)

    # Create the output csv file with the previous data (the DataFrame index is written as the first column)
    vehicle_list_df.to_csv(f'{output_name}.csv')

In [None]:
# Scrape result pages 1 (the default "start") through 200 and keep the rows in memory.
vehicle_list = extract_vehicle_data(end=200)

In [None]:
# Persist the scraped rows as "vehicle_listing.csv".
write_csv_output(vehicle_list, output_name='vehicle_listing')