In [87]:
import requests
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import re

In [121]:
def scrap(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Absolute URL fetched with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection, which would hang the whole notebook run.

    Returns:
        The ``BeautifulSoup`` object built from the response body using the
        ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: on connection failure or when
            the timeout expires.
    """
    response = requests.get(url, timeout=timeout)

    # The HTTP status is not inspected: an error page is parsed like any other
    # page. The BeautifulSoup constructor always returns an object, so there
    # is no None case to guard against here.
    return BeautifulSoup(response.text, "lxml")

def extractProduct(product_div):
    """Turn one product card from the search results into a flat dict.

    Optional pieces of the card (brand, title, price, image, link) that are
    absent are stored as ``None`` instead of raising, so a single incomplete
    card does not abort a multi-page scrape.

    Args:
        product_div: The card element to read, or ``None``. It must support
            ``product_div[attr]`` and ``product_div.find(name, attrs)``.

    Returns:
        ``None`` when ``product_div`` is ``None``; otherwise a dict with the
        keys ``sku``, ``marca``, ``nombre``, ``precio``, ``imagen``, ``url``
        and ``time`` (in that order). ``precio`` is an ``int`` or ``None``;
        ``time`` is the local timestamp at which the card was processed.

    Raises:
        KeyError: if the card has no ``data-key`` attribute (the SKU is
            required; a record without it is not useful).
    """
    if product_div is None:
        return None

    def _text_of(tag_name, css_class):
        # Text of the first matching descendant, or None when it is missing.
        node = product_div.find(tag_name, {"class": css_class})
        return node.text if node is not None else None

    # Every digit in the price text is concatenated, which drops currency
    # symbols and thousands separators ("$ 129.990" -> 129990).
    price_text = _text_of("div", "price")
    digits = "".join(re.findall(r'\d', price_text)) if price_text is not None else ""

    # Look the link up once; both the image and the product URL hang off it.
    link = product_div.find("a", {"class": "link-primary"})

    image_src = None
    href = None
    if link is not None:
        href = link.get("href")
        holder = link.div
        image = holder.img if holder is not None else None
        if image is not None:
            image_src = image.get("src")

    product = dict()
    product['sku'] = product_div["data-key"]
    product['marca'] = _text_of("div", "product-brand")
    product['nombre'] = _text_of("h2", "product-title")
    product['precio'] = int(digits) if digits else None
    product['imagen'] = image_src
    # NOTE(review): the stored URL has no scheme ("https://"); it is kept
    # unchanged so records stay comparable with previously scraped data.
    product['url'] = "www.sodimac.cl" + href if href is not None else None
    product['time'] = datetime.datetime.now()

    return product

In [148]:
# Search configuration: the term is appended verbatim to the query string.
search_term = "comoda"
base_url = "https://www.sodimac.cl/sodimac-homy/search?Ntt=" + search_term

# Hard upper bound on the number of result pages requested, so the loop still
# terminates if the site never answers with an empty page.
MAX_PAGES = 999

url = base_url

products = list()

for page in range(1, MAX_PAGES + 1):
    # The first page is the bare search URL; later pages add `currentpage`.
    if page > 1:
        url = base_url + "&currentpage=" + str(page)

    print("Scrapping: " + url)

    soup = scrap(url)

    # `find_all` is the supported spelling; `findAll` is a deprecated alias.
    elements = soup.find_all("div", {"class": "product-wrapper"})

    # Stop at the first page that contains no product cards.
    if not elements:
        break

    products.extend([extractProduct(element) for element in elements])

print("Total scraped products: ", len(products))

Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda
28
Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=2
28
Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=3
28
Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=4
28
Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=5
28
Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=6
28
Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=7
27
Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=8
0


195

In [149]:
# Build one row per scraped product dict; the dict keys become the columns.
df = pd.DataFrame(products)
# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,sku,marca,nombre,precio,imagen,url,time
0,4025695,Meya Muebles,Cómoda taiwan,400990,https://sodimac.scene7.com/is/image//SodimacCL...,www.sodimac.cl/sodimac-homy/product/4025695/Co...,2020-03-14 21:07:31.415617
1,4025660,Meya Muebles,Cómoda bali,680990,https://sodimac.scene7.com/is/image//SodimacCL...,www.sodimac.cl/sodimac-homy/product/4025660/Co...,2020-03-14 21:07:31.416615
2,363888X,Medular,Comoda cherry,215990,https://sodimac.scene7.com/is/image//SodimacCL...,www.sodimac.cl/sodimac-homy/product/363888X/Co...,2020-03-14 21:07:31.417615
3,4227204,Kidscool,"Cómoda 40x61c85,5 cm",69990,https://sodimac.scene7.com/is/image//SodimacCL...,www.sodimac.cl/sodimac-homy/product/4227204/Co...,2020-03-14 21:07:31.418615
4,537653X,Kidscool,Cómoda 77x93x46 cm menta,109990,https://sodimac.scene7.com/is/image//SodimacCL...,www.sodimac.cl/sodimac-homy/product/537653X/Co...,2020-03-14 21:07:31.420614
...,...,...,...,...,...,...,...
190,3048357,Homy,Tocador 42x90x142 cm Blanco,139990,https://sodimac.scene7.com/is/image//SodimacCL...,www.sodimac.cl/sodimac-homy/product/3048357/To...,2020-03-14 21:07:36.421677
191,3855589,Tecnomobili,Tocador 2 cajones 48x36x175 cm Blanco,109990,https://sodimac.scene7.com/is/image//SodimacCL...,www.sodimac.cl/sodimac-homy/product/3855589/To...,2020-03-14 21:07:36.421677
192,3048365,Homy,Tocador triple 40x90x145 cm blanco,139990,https://sodimac.scene7.com/is/image//SodimacCL...,www.sodimac.cl/sodimac-homy/product/3048365/To...,2020-03-14 21:07:36.422676
193,2733951,Tvilum,Zapatera 5 cajones 71x20x181 oak,99990,https://sodimac.scene7.com/is/image//SodimacCL...,www.sodimac.cl/sodimac-homy/product/2733951/Za...,2020-03-14 21:07:36.423676


In [150]:
# Summary statistics for the numeric columns of the scraped products.
df.describe()

Unnamed: 0,precio
count,195.0
mean,159235.128205
std,101600.74075
min,10990.0
25%,87990.0
50%,129990.0
75%,212490.0
max,680990.0


In [151]:
# List products that were scraped more than once. The comparison is limited to
# `sku`: comparing whole rows would also compare the `time` stamp, which
# almost always differs between rows, so repeated products would go unnoticed.
df[df.duplicated(subset=["sku"])]

Unnamed: 0,sku,marca,nombre,precio,imagen,url,time
