In [126]:
import datetime
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

In [127]:
def scrap(url):
    html = requests.get(url).text
    
    soup = BeautifulSoup(html, "lxml")
    
    script = soup.find("script",{"type": "application/ld+json"})

    try:
        items = json.loads(script.text.rstrip('\n'))
    except:
        return dict()
    
    return items

def extractProduct(items):
    
    item_list = list()

    for item in items:
        d = dict()
        d['tienda'] = 'Ripey'
        d['sku'] = item['item']['sku']
        d['nombre'] = item['item']['name']
        d['marca'] = item['item']['brand']
        d['precio-oferta'] = int(item['item']['offers']['price'])
        d['imagen'] = item['item']['image']
        d['time'] = datetime.datetime.now()
        
        item_list.append(d)
    
    return item_list

def processScrapResults(items):    
    if isinstance(items, type(dict)):
        return list(), ""
    if not bool(items):
        return list(), ""
    if len(items.keys()) <= 0:
        return list(), ""
        
    url_redirect = items['url']
    
    item_list = extractProduct(items["itemListElement"])
    
    #print(len(item_list))
        
    return item_list, url_redirect

def getNextUrl(url, value):
    urlToken = url.split('?')
    params = urlToken[1].split('&')
    params[0] = params[0][:-1]
    return urlToken[0] + '?' + str(params[0]) + str(value+1) + "".join(["&"+param for param in params[1:]])

def searchProductRipley(search_term):
    url = "https://simple.ripley.cl/search/" + search_term

    items = list()
    urls = list()

    for value in range (1,1000):     
        
        print("Search: ", url)
        urls.append(url)

        scraped_items, url = (processScrapResults(scrap(url)))

        if len(url) <= 0:
            break

        if value == 1:
            url = url + "&page=2"
        elif value > 1:
            url = getNextUrl(url, value)

        items.extend(scraped_items)
    
    return items, urls

In [128]:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def getRipleyProductsOfUrl(driver):
    
    events = driver.find_elements_by_xpath('//a[contains(@class, "catalog-product-item")]')

    products = []
    
    for event in events:
        product = dict()

        product['sku'] = event.get_attribute("id")
        product['url'] = str(event.get_attribute('href'))

        prices = [x.text for x in event.find_elements_by_tag_name('li')]

        if len(prices) == 3:
            product['precio-lista'] = prices[0]
            product['precio-oferta'] = prices[2]
        elif len(prices) == 2:
            product['precio-lista'] = prices[0]
            product['precio-oferta'] = prices[1]
        elif len(prices) == 1:
            product['precio-lista'] = prices[0]
            product['precio-oferta'] = prices[0]

        products.append(product)
        
    return products

def getRipleyProductPricesOfUrls(urls):
    
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(firefox_options=options, 
                              executable_path="C:\\Users\\jquin\\Desktop\\WORKSPACE\\scrap-retail_notebook\\geckodriver.exe")
    
    products = []
    
    for url in urls:
        print("Search: ", url)
        driver.get(url)
        products.extend(getRipleyProductsOfUrl(driver))
    
    
    driver.quit()
    
    return products


In [129]:
search_term = "comoda"
   
items, urls = searchProductRipley(search_term)

items_price = getRipleyProductPricesOfUrls(urls)

Search:  https://simple.ripley.cl/search/comoda
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?source=search&term=comoda&page=2
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?page=3&source=search&term=comoda
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?page=4&source=search&term=comoda
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?page=5&source=search&term=comoda
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?page=6&source=search&term=comoda
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?page=7&source=search&term=comoda
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?page=8&source=search&term=comoda




Search:  https://simple.ripley.cl/search/comoda
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?source=search&term=comoda&page=2
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?page=3&source=search&term=comoda
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?page=4&source=search&term=comoda
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?page=5&source=search&term=comoda
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?page=6&source=search&term=comoda
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?page=7&source=search&term=comoda
Search:  http://simple.ripley.cl/dormitorio/muebles-de-dormitorio/comoda-y-tocadores?page=8&source=search&term=comoda


In [130]:
df_p = pd.DataFrame(items)

In [131]:
import re
df_pp = pd.DataFrame(items_price)
#df_pp['precio-lista'] = df_pp['precio-lista'].fillna('0').astype('int32')
df_pp['precio-lista'] = df_pp['precio-lista'].fillna('0').astype('str').apply(lambda x : "".join(re.findall(r'\d', x))).astype('int32')
df_pp['precio-oferta'] = df_pp['precio-oferta'].fillna('0').astype('str').apply(lambda x : "".join(re.findall(r'\d', x))).astype('int32')
#df_pp['precio-lista'].tolist()

In [132]:
result = df_p.merge(df_pp, how='inner', left_on='sku', right_on='sku')
result.dtypes
result['descuento'] = (result['precio-lista']-result['precio-oferta_x'])/result['precio-lista']*100

In [133]:
result.describe()

Unnamed: 0,precio-oferta_x,precio-lista,precio-oferta_y,descuento
count,156.0,156.0,156.0,156.0
mean,147380.641026,267598.653846,142701.346154,-inf
std,87036.297313,178741.070913,87879.67913,
min,30990.0,0.0,0.0,-inf
25%,79990.0,119990.0,79990.0,33.337037
50%,110990.0,209990.0,99990.0,42.30932
75%,202490.0,379922.5,199990.0,52.108006
max,399990.0,699990.0,399990.0,76.669222


In [134]:
result[result['precio-oferta_y'] != result['precio-oferta_x'] ][['url','nombre', 'precio-lista', 'precio-oferta_x', 'precio-oferta_y']]

Unnamed: 0,url,nombre,precio-lista,precio-oferta_x,precio-oferta_y
118,https://simple.ripley.cl/comoda-davos-cafe-gri...,CÓMODA DAVOS CAFÉ/ GRIS | WHITE MARKET,0,319990,0
119,https://simple.ripley.cl/comoda-lucerna-cafe-o...,CÓMODA LUCERNA CAFÉ | WHITE MARKET,0,239990,0
143,https://simple.ripley.cl/comoda-armario-inval-...,CÓMODA ARMARIO INVAL CROSS,0,169990,0


In [135]:
import os
writer = pd.ExcelWriter(os.getcwd() + '\pandas_simple.xlsx', engine='xlsxwriter')

result.to_excel(writer)
writer.save()
result.describe()

Unnamed: 0,precio-oferta_x,precio-lista,precio-oferta_y,descuento
count,156.0,156.0,156.0,156.0
mean,147380.641026,267598.653846,142701.346154,-inf
std,87036.297313,178741.070913,87879.67913,
min,30990.0,0.0,0.0,-inf
25%,79990.0,119990.0,79990.0,33.337037
50%,110990.0,209990.0,99990.0,42.30932
75%,202490.0,379922.5,199990.0,52.108006
max,399990.0,699990.0,399990.0,76.669222
