# Scraping Retail Wrapper

## Retail Driver

### Homy

In [39]:
import datetime
import os
import re
from urllib.parse import quote_plus, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrap(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled connection.

    Returns:
        BeautifulSoup: The document parsed with the ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # The HTTP status is deliberately not checked: callers decide whether a
    # page is useful by looking for the elements they need in the parsed tree.
    return BeautifulSoup(response.text, "lxml")

def getPrices(product_div):
    """Extract list price, offer price and discount percentage from a product tile.

    The tile is searched for ``div`` elements with class ``price``. With a
    single price node, list and offer price are the same value. With two or
    more, the first node is read as the offer price and the second as the
    list price.

    Args:
        product_div: Parsed product element exposing ``findAll``, or None.

    Returns:
        tuple: ``(precio_lista, precio_oferta, descuento)``. All three are None
        when ``product_div`` is None, and all three are 0 when the tile has no
        price node. ``descuento`` is the truncated integer percentage
        saved relative to the list price.

    Raises:
        ValueError: If a price node contains no digits.
    """
    # Always return three values: the caller unpacks the result into three fields.
    if product_div is None:
        return None, None, None

    def parse_price(node):
        # Keep only the digits so currency symbols and thousands separators are ignored.
        return int("".join(re.findall(r'\d', node.text)))

    prices = product_div.findAll("div", {"class": "price"})

    precio_lista, precio_oferta = 0, 0

    if len(prices) == 1:
        precio_lista = precio_oferta = parse_price(prices[0])
    elif len(prices) > 1:
        precio_lista = parse_price(prices[1])
        precio_oferta = parse_price(prices[0])

    # A list price of 0 (no price found) would otherwise divide by zero.
    if precio_lista:
        descuento = int((precio_lista - precio_oferta) / precio_lista * 100)
    else:
        descuento = 0

    return precio_lista, precio_oferta, descuento

def extractProduct(product_div):
    """Build a flat product record from one Homy search-result tile.

    Args:
        product_div: Parsed product element (supports ``find`` and item access
            for the ``data-key`` attribute), or None.

    Returns:
        dict: Record with keys ``tienda``, ``sku``, ``marca``, ``nombre``,
        ``precio-lista``, ``precio-oferta``, ``descuento``, ``imagen``, ``url``
        and ``time`` (the moment the record was built). None when
        ``product_div`` is None.
    """
    if product_div is None:
        return None

    # The primary link holds both the thumbnail and the product href; look it up once.
    link = product_div.find("a", {"class": "link-primary"})

    product = dict()

    product['tienda'] = 'Homy'
    product['sku'] = product_div["data-key"]
    product['marca'] = product_div.find("div", {"class": "product-brand"}).text
    product['nombre'] = product_div.find("h2", {"class": "product-title"}).text

    product['precio-lista'], product['precio-oferta'], product['descuento'] = getPrices(product_div)

    product['imagen'] = link.div.img["src"]
    # urljoin yields a proper absolute URL (with scheme) whether the href is
    # site-relative, lacks a leading slash, or is already absolute.
    product['url'] = urljoin("https://www.sodimac.cl", link['href'])
    product['time'] = datetime.datetime.now()

    return product

def searchProductHomy(search_term, max_pages=999):
    """Scrape every result page of a Homy search and collect the products.

    Pages are requested in order, starting at the first, until a page contains
    no ``product-wrapper`` elements or ``max_pages`` pages have been fetched.
    Each requested URL is printed as a progress trace.

    Args:
        search_term: Text to search for. It is URL-encoded before being placed
            in the query string, so spaces and characters such as ``&`` or
            ``#`` do not corrupt the request.
        max_pages: Upper bound on the number of result pages to request.

    Returns:
        list: One record per product tile, as produced by ``extractProduct``.
    """
    base_url = "https://www.sodimac.cl/sodimac-homy/search?Ntt=" + quote_plus(search_term)
    products = list()

    for page in range(1, max_pages + 1):
        # The first page is addressed without the paging parameter.
        if page == 1:
            url = base_url
        else:
            url = base_url + "&currentpage=" + str(page)

        print("Homy Scrapping: " + url)

        soup = scrap(url)
        elements = soup.findAll("div", {"class": "product-wrapper"})

        # An empty page marks the end of the result set.
        if not elements:
            break

        products.extend(extractProduct(element) for element in elements)

    return products

### Ripley

## Main Program

In [36]:
# Term sent to the Homy search endpoint.
search_term = "comoda"

# Run the paginated scrape; `products` is consumed by the data-analysis cell.
products = searchProductHomy(search_term)
    
print("Total scraped products: ", len(products))

Homy Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda
Homy Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=2
Homy Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=3
Homy Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=4
Homy Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=5
Homy Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=6
Homy Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=7
Homy Scrapping: https://www.sodimac.cl/sodimac-homy/search?Ntt=comoda&currentpage=8
Total scraped products:  189


## Data Analysis

In [37]:

# Export the scraped products to an Excel workbook in the working directory
# and show summary statistics for the numeric columns.
# os.path.join avoids the invalid '\p' escape and the Windows-only separator.
output_path = os.path.join(os.getcwd(), 'pandas_simple.xlsx')

df = pd.DataFrame(products)
print("Directory", os.getcwd())
# 'time' is stamped per record when it is built, so it is left out of the
# comparison; otherwise otherwise-identical listings would never match.
print("Duplicated", df.drop(columns=["time"], errors="ignore").duplicated().sum())

# Create a Pandas Excel writer using XlsxWriter as the engine. The context
# manager saves and closes the workbook (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    df.to_excel(writer)

df.describe()

Directory C:\Users\jquin\Desktop\WORKSPACE\scrap-retail_notebook\notebooks
Duplicated 0


Unnamed: 0,precio-lista,precio-oferta,descuento
count,189.0,189.0,189.0
mean,165688.730159,155734.444444,4.936508
std,103964.43624,95569.559241,8.206522
min,10990.0,10990.0,0.0
25%,89990.0,87990.0,0.0
50%,139990.0,129990.0,0.0
75%,215990.0,199990.0,12.0
max,680990.0,680990.0,26.0
