In [1]:
import requests
from bs4 import BeautifulSoup
import re
import random 
import time
import csv

In [2]:

def scrap_request(url, user_agents, timeout=30):
    """
    Makes a GET request to 'url' using a randomly chosen User-Agent header.

    A new requests.Session is opened for the call and closed before returning.
    Once the request has completed, the function sleeps for a random interval
    between 1 and 3 seconds to space out consecutive requests.

    Parameters
    ----------
    url : str
        Address to fetch.
    user_agents : sequence of str
        Pool of User-Agent values; one of them is picked at random per call.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so that
        an unresponsive server cannot stall the caller indefinitely.

    Returns
    -------
    requests.Response or None
        The response as returned by session.get (the HTTP status is not
        checked), or None if any error arises (empty 'user_agents' pool,
        connection error, timeout...).
    """
    try:
        random_user_agent = random.choice(user_agents)

        # The context manager closes the session once the response is received
        with requests.Session() as session:
            response = session.get(
                url,
                headers={"User-Agent": random_user_agent},
                timeout=timeout,
            )

        # Random pause before handing the response back to the caller
        time.sleep(random.uniform(1, 3))

        return response

    # 'Exception' instead of a bare 'except' so that KeyboardInterrupt and
    # SystemExit still propagate and a long scraping run can be stopped
    except Exception:
        return None

In [3]:
def scrap_car_urls(url, user_agents):
    """
    Scraps the URL of every car announcement listed in an autocasion.com page.

    The page is downloaded with scrap_request and parsed with BeautifulSoup.
    Every <article> whose class matches "anuncio" contributes the href of its
    first <a> element that has one; articles without such a link are skipped.

    Parameters
    ----------
    url : str
        Address of the listing page.
    user_agents : sequence of str
        Pool of User-Agent values forwarded to scrap_request.

    Returns
    -------
    list of str or None
        Absolute links, one per car announcement, or None if some exception
        arises (for instance when scrap_request returned None).
    """
    # Local import: keeps this function self-contained within the notebook
    from urllib.parse import urljoin

    starting_url = "https://www.autocasion.com"

    try:
        page_bs = BeautifulSoup(scrap_request(url, user_agents).text, "lxml")

        cars_list = page_bs.find_all("article", {'class': re.compile("anuncio")})

        links_list = []
        for article in cars_list:
            anchor = article.find("a", href=True)
            if anchor is None:
                # An article without a link must not discard the rest of the page
                continue
            # urljoin copes with both site-relative and already absolute hrefs
            links_list.append(urljoin(starting_url, anchor["href"]))

        return links_list

    # 'Exception' instead of a bare 'except' so that KeyboardInterrupt still
    # stops the run
    except Exception:
        return None


In [4]:
def _digits_to_int(text):
    """Returns the integer formed by all the digit characters found in 'text'."""
    return int("".join(re.findall(r"\d", text)))


def _basic_data_text(basic_data, label):
    """Returns the raw text of the element that follows the 'label' text inside 'basic_data'."""
    return basic_data.find(text=re.compile(label)).find_next().text


def scrap_car_details(url, user_agents):
    """
    Scraps a car announcement url from autocasion.com.

    The page is downloaded with scrap_request and parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the car announcement.
    user_agents : sequence of str
        Pool of User-Agent values forwarded to scrap_request.

    Returns
    -------
    dict or None
        Dictionary with the keys, in this order: "link", "marca", "provincia",
        "modelo", "precio" (int), "matriculacion" (last 4 characters of the
        registration date text), "combustible", "kilometros" (int),
        "carroceria", "cambio", "potencia" (raw text), "garantia" (first 2
        characters of the guarantee text), "color" and "distintivo" ('NULL'
        when the page does not show it).
        None is returned if any other field cannot be extracted.
    """
    try:
        car_info_bs = BeautifulSoup(scrap_request(url, user_agents).text, "lxml")

        # Both containers are located once and reused by every field below
        name_items = car_info_bs.find(
            "div", class_="bloque paginacion-ficha").find_all(itemprop="name")
        basic_data = car_info_bs.find("ul", class_="datos-basicos-ficha")

        car_dict = {}

        #car announcement url
        car_dict["link"] = url

        #car brand
        car_dict["marca"] = name_items[1].text

        #province where the car is being sold
        car_dict["provincia"] = name_items[2].text

        #car model
        car_dict["modelo"] = name_items[3].text

        #car price, keeping only its digits
        car_dict["precio"] = _digits_to_int(
            car_info_bs.find("div", class_="precio").find("span").text)

        #year in which the car was registered (last 4 characters of the date)
        car_dict["matriculacion"] = _basic_data_text(
            basic_data, "Fecha de matriculación")[-4:]

        #fuel type
        car_dict["combustible"] = _basic_data_text(basic_data, "Combustible").strip()

        #car km, keeping only its digits
        car_dict["kilometros"] = _digits_to_int(
            _basic_data_text(basic_data, "Kilómetros"))

        #car body type
        car_dict["carroceria"] = _basic_data_text(basic_data, "Carrocería").strip()

        #transmission type
        car_dict["cambio"] = _basic_data_text(basic_data, "Cambio").strip()

        #car power (text kept as it appears in the page)
        car_dict["potencia"] = _basic_data_text(basic_data, "Potencia")

        #guarantee months (first 2 characters of the text)
        car_dict["garantia"] = _basic_data_text(basic_data, "Garantía")[0:2]

        #color, without line breaks nor double spaces
        car_dict["color"] = _basic_data_text(
            basic_data, "Color").strip().replace("\n","").replace("  ","")

        #Car environmental labeling: optional, 'NULL' when it is not in the page
        try:
            car_dict["distintivo"] = car_info_bs.find(
                "span", class_="icon icon-info").find_next().text
        except AttributeError:
            # find / find_next returned None because the element is missing
            car_dict["distintivo"] = 'NULL'

        return car_dict

    # 'Exception' instead of a bare 'except' so that KeyboardInterrupt still
    # stops the run
    except Exception:
        return None

In [5]:
# Number of listing pages to scrap
PAGES = 625

# File in which the CSV is saved
path = "aocars.csv"

# Listing URL to which the page number is appended
start = "https://www.autocasion.com/coches-ocasion?page="

# Pool of User-Agent values handed to the scraping functions
user_agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
]

In [6]:
#Columns of the CSV file; each car's values are written in this same order
csv_fields = ["link",
              "marca",
              "provincia",
              "modelo",
              "precio",
              "matriculacion",
              "combustible",
              "kilometros",
              "carroceria",
              "cambio",
              "potencia",
              "garantia",
              "color",
              "distintivo"
             ]

#Open the CSV file; the encoding is explicit so that accented characters are
#written the same way on every platform
with open(path, 'w', newline='', encoding='utf-8') as outfile:

    writer = csv.writer(outfile, delimiter=';')

    #Write header for each characteristic
    writer.writerow(csv_fields)

    print("CSV File created, starting scrapping")

    #Iterate through website pages
    for page in range(1, PAGES + 1):

        if page%10 == 0:
            print("Currently scrapping page", page)

        #Build the page URL without modifying 'start': appending to 'start'
        #itself would accumulate the numbers ("page=1", "page=12", "page=123"...)
        page_url = start + str(page)

        #Scrap all car announcements urls from the current page
        car_urls = scrap_car_urls(page_url, user_agents)

        if car_urls is None:
            #scrap_car_urls returns None on failure; skip the page, keep the run alive
            print("Error in page", page, ": no car announcements could be retrieved")
            continue

        for car_url in car_urls:
            try:
                detalles = scrap_car_details(car_url, user_agents)
                #When scrap_car_details returned None the lookup below raises and
                #the announcement is reported and skipped
                writer.writerow([detalles[field] for field in csv_fields])

            #'Exception' instead of a bare 'except' so that the run can be interrupted
            except Exception:
                print("Error in page",page, " and link", car_url)
                continue

print("Web Scrapping has ended. Check the generated CSV file")

CSV File created, starting scrapping
Currently scrapping page 10
Currently scrapping page 20
Currently scrapping page 30
Currently scrapping page 40
Currently scrapping page 50
Currently scrapping page 60
Currently scrapping page 70
Currently scrapping page 80
Currently scrapping page 90
Currently scrapping page 100
Currently scrapping page 110
Currently scrapping page 120
Currently scrapping page 130
Currently scrapping page 140
Currently scrapping page 150
Currently scrapping page 160
Currently scrapping page 170
Currently scrapping page 180
Currently scrapping page 190
Currently scrapping page 200
Currently scrapping page 210
Currently scrapping page 220
Currently scrapping page 230
Currently scrapping page 240
Currently scrapping page 250
Currently scrapping page 260
Currently scrapping page 270
Currently scrapping page 280
Currently scrapping page 290
Error in page 291  and link https://www.autocasion.com/coches-segunda-mano/dacia-duster-ocasion/duster-1-2-tce-laureate-4x2-125-ref