In [21]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import math
# Raise pandas' display limits (up to 500 columns, 1000 characters of width)
# so the wide scraped tables are shown without column truncation.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [8]:
def getCarLinks(main_soup):
    """Collect the listing URLs found on every results page of a search.

    The result pages come from ``getPageList(main_soup)``. Each page is
    downloaded and parsed with ``html.parser``, and the ``href`` of every
    ``<a>`` matched by the ``ui-search-result__content ui-search-link``
    class filter is gathered.

    Returns:
        A flat list of hrefs, in page order and then document order.
    """
    hrefs = []
    for page_url in getPageList(main_soup):
        page_html = requests.get(page_url).content
        page_soup = BeautifulSoup(page_html, "html.parser")
        anchors = page_soup.find_all("a", "ui-search-result__content ui-search-link")
        hrefs.extend(anchor.get('href') for anchor in anchors)
    return hrefs

def getPageList(soup, results_per_page=48):
    """Build the URL of every results page for the search shown in ``soup``.

    The base URL is the ``href`` of the first pagination link, and the total
    number of results is read from the "quantity results" label (for example
    ``"1.373 resultados"``). Page ``k`` (counting from 0) starts at item
    ``k * results_per_page + 1``; every page after the first is addressed by
    appending ``_Desde_<first item>`` to the base URL.

    Args:
        soup: Parsed first results page.
        results_per_page: Number of listings per page (48 by default).

    Returns:
        A list of page URLs. It always holds at least the base URL and never
        includes a page that would start past the last result.

    Raises:
        IndexError: If the pagination link or the results label is missing.
        ValueError: If the results label contains no digits.
    """
    page_link = soup.find_all("a", "andes-pagination__link ui-search-link")[0].get("href")
    quantity_text = soup.find_all("span", "ui-search-search-result__quantity-results")[0].text
    # Keep only the digits: this drops the thousands separator and the trailing
    # word, whether it is "resultados" or the singular "resultado".
    total_results = int("".join(ch for ch in quantity_text if ch.isdecimal()))
    # Round up so a partially filled last page is included, but no empty page
    # is added when the total is an exact multiple of the page size or fits in
    # a single page.
    n_pages = max(1, math.ceil(total_results / results_per_page))
    pages = [page_link]
    for page_index in range(1, n_pages):
        first_item = page_index * results_per_page + 1
        pages.append("{}_Desde_{}".format(page_link, first_item))
    return pages

In [5]:
def cleanText(text):
    """Strip scraping noise from ``text`` and return the cleaned string.

    Removed, in this order: newlines and tabs, the literal prefix
    ``Publicado el ``, and finally double quotes and parentheses.
    """
    # Newlines/tabs go first so a prefix broken up by them is still recognised.
    without_whitespace = text.translate(str.maketrans("", "", "\n\t"))
    without_prefix = without_whitespace.replace("Publicado el ", "")
    return without_prefix.translate(str.maketrans("", "", '"()'))

def getRep(soup):
    """Return the seller reputation text of a listing page, if it has one.

    The reputation lives in the ``div`` matched by the
    ``reputation-info block`` class filter. When that block has a ``<span>``
    its cleaned text is returned; otherwise the text of its ``<p>`` is
    returned unmodified.

    Args:
        soup: Parsed listing page.

    Returns:
        The reputation string, or ``None`` when the page has no reputation
        block or the block has neither a ``<span>`` nor a ``<p>``.
    """
    rep_block = soup.find("div", "reputation-info block")
    if rep_block is None:
        # Seller without reputation info: nothing to report.
        return None
    if rep_block.span is not None:
        # The helper is named cleanText; a misspelled name here would make
        # this branch unusable.
        return cleanText(rep_block.span.text)
    if rep_block.p is not None:
        return rep_block.p.text
    return None

def getSpecList(soup):
    """Read the technical specs list of a listing page into a dict.

    Every ``<li>`` of the ``specs-list`` ``<ul>`` contributes one entry: the
    ``<strong>`` text is the key and the ``<span>`` text is the value.
    ``Kilómetros`` (with the `` km`` suffix and dots removed) and ``Año`` are
    converted to ``int``; all other values stay as strings.

    Raises:
        KeyError: If ``Kilómetros`` or ``Año`` is not among the specs.
    """
    items = soup.find('ul', 'specs-list').find_all("li")
    specs = {item.strong.text: item.span.text for item in items}

    km_text = specs["Kilómetros"]
    for noise in (" km", "."):
        km_text = km_text.replace(noise, "")
    specs["Kilómetros"] = int(km_text)

    year_text = specs["Año"]
    specs["Año"] = int(year_text)
    return specs

def extraAtrib(link):
    """Extract extra listing attributes from the page's analytics script.

    Despite its name, ``link`` is the parsed listing page, not a URL. The
    second ``<script>`` tag is turned into text, the ``meli_ga("set", ``
    prefix is removed and the text is split on ``;``. The statements that
    mention a known ``dimensionNN`` code are then mapped to readable keys:
    ``Seller Type``, ``Reputation Code``, ``Brand``, ``Model``, ``Codigo``,
    ``Loc1``, ``Loc2`` and ``Loc3``.

    NOTE(review): the script index (1) and the statement window ``[6:-17]``
    are tied to the page layout at the time of writing; pages with a
    different layout presumably yield an empty or partial result. Confirm
    against the live site.

    Args:
        link: Parsed listing page exposing ``find_all("script")``.

    Returns:
        A dict with whichever of the keys above could be read. Statements
        with fewer comma-separated fields than expected are skipped, and a
        page with fewer than two scripts yields ``{}``.
    """
    scripts = link.find_all("script")
    if len(scripts) < 2:
        # No analytics script to read.
        return {}
    statements = str(scripts[1]).replace('meli_ga("set", ', "").split(";")[6:-17]
    dims = {"dimension35": "Seller Type", "dimension72": "Reputation Code",
            "dimension111": ["Brand", "Model"], "dimension49": "Codigo",
            "dimension139": "Loc1", "dimension140": "Loc2", "dimension141": "Loc3"}
    atr = {}
    for statement in statements:
        for dim, label in dims.items():
            if dim not in statement:
                continue
            # Split into a separate variable: the statement itself must stay a
            # string while the dimension codes are being looked up.
            fields = statement.split(",")
            if dim == "dimension111":
                # Brand and model travel together in a single statement.
                if len(fields) >= 3:
                    atr[label[0]] = cleanText(fields[1].replace("BRAND:", "").strip())
                    atr[label[1]] = cleanText(fields[2].replace("MODEL:", "").strip())
            elif len(fields) >= 2:
                atr[label] = cleanText(fields[1]).strip()
            # One statement sets one dimension; move on to the next statement.
            break
    return atr

In [6]:
def getCarData(link, timeout=30):
    """Download one listing page and return its attributes as a flat dict.

    Keys are added in this order:
        - Desc: cleaned text of the ``item-title`` header
        - Publish Date: cleaned text of the published-date article
        - Currency: price symbol
        - Price: integer amount (thousands dots removed)
        - every entry returned by ``getSpecList`` (in the notebook output:
          Año, Color, Tipo de combustible, Puertas, Transmisión, Motor,
          Tipo de carrocería, Kilómetros)
        - every entry returned by ``extraAtrib`` (Seller Type, Reputation
          Code, Brand, Model, Codigo, Loc1, Loc2, Loc3, when available)

    The seller reputation text, the listing link and the pulling date are
    not part of the result.

    Args:
        link: URL of the listing page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the whole scraping run.

    Returns:
        A dict describing the car.

    Raises:
        requests.exceptions.RequestException: On network errors or timeout.
        IndexError, AttributeError: If an expected page element is missing.
    """
    link_req = requests.get(link, timeout=timeout).content
    # Name the parser explicitly, as the other cells of this notebook do;
    # otherwise BeautifulSoup picks whichever parser happens to be installed.
    temp_soup = BeautifulSoup(link_req, "html.parser")
    # Currency and amount live in the same price fieldset; look it up once.
    price_box = temp_soup.find_all("fieldset", "item-price")[0]

    car_specs = {}
    car_specs["Desc"] = cleanText(temp_soup.find_all("header", "item-title")[0].text)
    car_specs["Publish Date"] = cleanText(
        temp_soup.find("article", "vip-classified-info item-published-date").text)
    car_specs["Currency"] = price_box.find_all("span", "price-tag-symbol")[0].text
    car_specs["Price"] = int(
        price_box.find_all("span", "price-tag-fraction")[0].text.replace(".", ""))
    car_specs.update(getSpecList(temp_soup))
    car_specs.update(extraAtrib(temp_soup))
    return car_specs

In [9]:
# Download the first results page of the Renault Clio search, parse it and
# gather the URL of every listing across all result pages.
search_url = "https://autos.mercadolibre.com.ar/renault/clio/"
car_c = requests.get(search_url).content
main_soup = BeautifulSoup(car_c, "html.parser")
car_list = getCarLinks(main_soup)

In [14]:
%%time
# Scrape every listing sequentially, one HTTP request per car, and keep one
# dict per listing. Results are appended as they arrive, so whatever was
# gathered so far remains in car_specs_list if a later listing raises.
car_specs_list = []
for link in car_list:
    car = getCarData(link)
    car_specs_list.append(car)

CPU times: user 4min 19s, sys: 3.64 s, total: 4min 23s
Wall time: 30min 41s


In [15]:
# Sanity check: number of listings scraped (should equal len(car_list)).
len(car_specs_list)

1373

In [13]:
# Sanity check: number of listing URLs collected from the result pages.
len(car_list)

1373

In [16]:
# One row per scraped listing; a key missing from a listing's dict becomes a
# missing value in that row.
cars_df = pd.DataFrame(car_specs_list)

In [19]:
# Persist the scraped listings as a semicolon-separated text file, without
# the DataFrame index.
# NOTE(review): this is an absolute, user-specific path; the cell only runs
# on a machine where that directory exists.
output_path = r"/Users/leito/ML_cars_data_pulling/clio_dataset.txt"
cars_df.to_csv(output_path, sep=";", index=False)

In [22]:
# Display the resulting table (pandas elides the middle rows in the output).
cars_df

Unnamed: 0,Desc,Publish Date,Currency,Price,Año,Color,Tipo de combustible,Puertas,Transmisión,Motor,Tipo de carrocería,Kilómetros,Codigo,Seller Type,Reputation Code,Brand,Model,Loc1,Loc2,Loc3
0,Renault Clio 1.2 Authentique Pack Ii 75cv,11/12/2020,$,499000,2011,Azul,Nafta,3,Manual,1.2,Hatchback,90000,MLA900720374,car_dealer,5_green,Renault,Clio,Capital Federal,Capital Federal,Villa Ortúzar
1,Renault Clio 1.2 Mio Confort Plus Abs Abcp,26/11/2020,$,610000,2014,Gris,Nafta,5,Manual,1.2,Hatchback,119000,MLA898875512,car_dealer,5_green,Renault,Clio,Bs.As. G.B.A. Norte,Vicente López,Munro
2,Renault Clio 1.2 Mio Confort Abs Abcp,10/12/2020,$,649999,2014,Gris,Nafta,3,Manual,1.2,Hatchback,56000,,,,,,,,
3,Renault Clio 1.5 Privilege,18/11/2020,$,250000,2006,,Diésel,4,Manual,1.5,Sedán,185000,MLA897683496,normal,none,Renault,Clio,Capital Federal,Capital Federal,none
4,Renault Clio 1.2 Pack Plus 5 p,05/12/2020,$,465000,2009,Negro,Nafta,5,Manual,1.2,Hatchback,79000,MLA900053214,normal,4_light_green,Renault,Clio,Bs.As. G.B.A. Norte,Vicente López,Olivos
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,Renault Clio 1.2 Mio Confort Pack,20/10/2020,$,570000,2016,,Nafta,3,Manual,1.2,Hatchback,30000,MLA884696952,normal,none,Renault,Clio,Córdoba,Córdoba,none
1369,Renault Clio 1.5 Dci Fairway,14/11/2020,$,250000,2007,,Diésel,5,Manual,1.5,Hatchback,193000,,,,,,,,
1370,Renault Clio 1.6 Rt Symbol,28/11/2020,$,240000,2001,Plateado,Nafta,5,Manual,1.6,Hatchback,183000,MLA899156885,normal,none,Renault,Clio,Bs.As. G.B.A. Oeste,Hurlingham,Hurlingham
1371,Renault Clio 1.2 Mio Dynamique,19/10/2020,$,590000,2016,,Nafta,3,Manual,1.2,Hatchback,41000,MLA884529623,normal,none,Renault,Clio,Córdoba,Tercero Arriba,none
