In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from os.path  import basename
import time
import csv
import requests
import os
import glob
import pandas as pd

## Constantes

In [4]:
# Base delay, in seconds, from which every wait in this notebook is derived:
# used as-is for the pause after each page change and as the starting delay
# between image downloads, and scaled for the Selenium implicit wait (x60),
# the pause after the first page load (x4) and the maximum download delay (x20).
SLEEP_TIME_BASE = 0.1

## Funciones

In [5]:
# Prepare the browser and accept the cookie policy
def prepare_environtment(driver, url):
    """Open ``url`` in ``driver`` and dismiss the cookie banner if one is shown.

    Args:
        driver: Selenium WebDriver used for the whole scraping session. Its
            implicit wait is set to ``SLEEP_TIME_BASE * 60`` seconds.
        url: Address of the first page to load.

    Returns:
        None. The driver is left on ``url`` with the banner (if any) accepted.
    """
    driver.implicitly_wait(SLEEP_TIME_BASE * 60)
    driver.get(url)
    time.sleep(SLEEP_TIME_BASE * 4)
    # "class name" is the locator-strategy string behind By.CLASS_NAME. Passing
    # it to find_elements works on both Selenium 3 and Selenium 4, whereas the
    # find_elements_by_class_name shortcut no longer exists in Selenium 4.
    cookie_buttons = driver.find_elements("class name", "accept-cookie")
    # No banner (e.g. cookies already accepted in this session) is not an
    # error: there is simply nothing to click.
    if cookie_buttons:
        cookie_buttons[0].click()

In [6]:
# Point the browser at a new URL and return the resulting page, parsed
def change_url(driver, url):
    """Load ``url`` in ``driver`` and return its HTML parsed with html5lib.

    Args:
        driver: Selenium WebDriver to navigate.
        url: Address to load.

    Returns:
        A ``BeautifulSoup`` document built from ``driver.page_source``.
    """
    driver.get(url)
    # Fixed short pause before reading the source (presumably to let the page
    # finish rendering and to avoid hammering the site).
    time.sleep(SLEEP_TIME_BASE)
    html = driver.page_source
    parsed_page = BeautifulSoup(html, "html5lib")
    return parsed_page

In [57]:
def _optional_field(extract, default):
    """Return ``extract()``, or ``default`` when the lookup or conversion fails."""
    try:
        return extract()
    except Exception:
        # Deliberately broad: a missing tag, a missing attribute and an
        # unparseable number all mean "field not available". Unlike a bare
        # ``except:`` this still lets KeyboardInterrupt / SystemExit through,
        # so a long scrape can be stopped.
        return default


# Given a product page -> extract every attribute we consider relevant
def get_product_detail_info(page):
    """Build the dataset record for one parsed product page.

    Args:
        page: Parsed product page (a BeautifulSoup document, as returned by
            ``change_url``).

    Returns:
        dict with the keys ``timestamp`` (``time.time()`` at extraction),
        ``company_name``, ``name``, ``brand_name``, ``category``,
        ``product_number``, ``price`` (float), ``score`` (float),
        ``image_url`` and ``reviews`` (int).

        Optional fields fall back to a default when they cannot be read:
        ``brand_name`` and ``image_url`` -> ``'undefined'``, ``score`` and
        ``reviews`` -> ``0``.

    Raises:
        AttributeError, TypeError, KeyError, ValueError, IndexError: when a
            mandatory element (name, category, product number or price) is
            missing or malformed.
    """
    product = {}
    product['timestamp'] = time.time()
    product['company_name'] = 'pccomponentes'

    # The header block holds the name, the price and the review counter, so
    # look it up once instead of once per field.
    header = page.find("div", class_="ficha-producto__encabezado")

    product['name'] = header.find("div", class_="articulo").h1.strong.text
    product['brand_name'] = _optional_field(
        lambda: page.find("div", class_="ficha-producto__datos-de-compra").a.text,
        'undefined',
    )
    # The last breadcrumb link is taken as the product's category.
    product['category'] = page.find("div", class_="navegacion-secundaria__migas-de-pan").find_all("a")[-1].text
    # First <span> inside the parent of the #codigo-articulo-pc element.
    product['product_number'] = page.find("span", id="codigo-articulo-pc").parent.find("span").text
    product['price'] = float(header.find("div", class_="priceBlock")['data-baseprice'])
    product['score'] = _optional_field(
        lambda: float(page.find("div", id="ficha-producto-opinones").find("div", class_="percentage").text),
        0,
    )
    # The src attribute is prefixed with the scheme to form the stored URL.
    product['image_url'] = _optional_field(
        lambda: 'https:' + page.find("img", class_="pc-com-zoom")['src'],
        'undefined',
    )
    product['reviews'] = _optional_field(
        lambda: int(header.find("span", class_="acciones").a.text.replace(' Opiniones', '').replace('\n', '')),
        0,
    )
    return product

In [58]:
# Visit every product URL in turn and collect each product's attributes
def obtain_all_products(driver, urls):
    """Scrape every product page in ``urls``.

    Args:
        driver: Selenium WebDriver passed on to ``change_url``.
        urls: Iterable of product page addresses.

    Returns:
        List with one record (as built by ``get_product_detail_info``) per
        URL, in the same order as ``urls``.
    """
    collected = []
    for position, product_url in enumerate(urls):
        collected.append(get_product_detail_info(change_url(driver, product_url)))
        # Progress trace: print the index of every 50th URL.
        if not position % 50:
            print(position)
    return collected

In [56]:
# Keep only the products whose category belongs to the dataset
def filter_products_by_categories(products, selected_categories=None):
    """Return the products whose ``'category'`` is one of the wanted ones.

    Args:
        products: Iterable of product dicts; each must have a ``'category'`` key.
        selected_categories: Optional collection of category names to keep.
            When omitted, the dataset's default component categories are used.

    Returns:
        New list holding the matching product dicts (the same objects, not
        copies), in their original order.

    Raises:
        KeyError: if a product has no ``'category'`` key.
    """
    if selected_categories is None:
        selected_categories = ['Procesadores', 'Discos Duros', 'Fuentes Alimentación', 'Memoria RAM', 'Placas Base', 'Tarjetas de Sonido', 'Tarjetas Gráficas', 'Torres']
    return [product for product in products if product['category'] in selected_categories]

In [9]:
# Download a single image
def download_image(category, image_url, timeout=30):
    """Download ``image_url`` into ``./images/<category>/`` and return its path.

    Args:
        category: Name of the sub-folder of ``./images`` to store the file in;
            created if it does not exist yet.
        image_url: Address of the image. Its basename is used as file name.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Path of the file that was written.

    Raises:
        requests.RequestException: on connection problems, timeouts or an HTTP
            error status. Nothing is written to disk in that case.
        OSError: if the folder or the file cannot be created.
    """
    target_dir = os.path.join('.', 'images', category)
    os.makedirs(target_dir, exist_ok=True)
    # Fetch before opening the file so a failed request leaves no empty file
    # behind, and turn HTTP error statuses into exceptions so an error page is
    # never saved as if it were an image.
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()
    file_path = os.path.join(target_dir, basename(image_url))
    with open(file_path, "wb") as f:
        f.write(response.content)
    return file_path

In [31]:
# Walk over every product and download its image, adapting the request rate
def download_product_images(products):
    """Download each product's image, backing off when downloads fail.

    Every product gets an ``'image_url_dataset'`` key: the local path returned
    by ``download_image`` on success, or the original ``'image_url'`` when the
    download fails.

    The delay between downloads starts at ``SLEEP_TIME_BASE``. Each failure
    raises it by 0.1 s (capped at ``SLEEP_TIME_BASE * 20``); once more than 10
    downloads in a row have succeeded, each further success lowers it by
    0.1 s, never going below ``SLEEP_TIME_BASE``.

    Args:
        products: List of product dicts with ``'category'`` and ``'image_url'``.

    Returns:
        The same ``products`` list, with its dicts updated in place.
    """
    sleep_time = SLEEP_TIME_BASE
    sleep_time_inc = 0.1
    sleep_time_max = SLEEP_TIME_BASE * 20
    count_success = 0
    for i, product in enumerate(products):
        try:
            product['image_url_dataset'] = download_image(product['category'], product['image_url'])
        except Exception:
            # Not a bare ``except:`` so that Ctrl-C still stops the run.
            product['image_url_dataset'] = product['image_url']
            # Clamp before reporting, so the printed value is the one used.
            sleep_time = min(sleep_time + sleep_time_inc, sleep_time_max)
            print('slowing down to: ' + str(sleep_time))
            count_success = 0
        else:
            count_success += 1
            if sleep_time > SLEEP_TIME_BASE and count_success > 10:
                # Floating-point drift (e.g. 0.1 + 0.1 + 0.1 - 0.1 - 0.1 ends
                # up slightly above 0.1) would otherwise allow one extra
                # decrement and drop the delay to practically zero.
                sleep_time = max(sleep_time - sleep_time_inc, SLEEP_TIME_BASE)
                print('accelerating speed to: ' + str(sleep_time))
        # Progress trace: print the index of every 50th product.
        if i % 50 == 0:
            print(i)
        time.sleep(sleep_time)
    return products

In [66]:
# Given a list of products and their attributes -> write them all to a new CSV file
def to_csv(products, name_csv):
    """Write ``products`` to ``name_csv`` as a comma-separated, UTF-8 file.

    Args:
        products: Iterable of product dicts. Keys missing from a dict are
            written as empty cells.
        name_csv: Path of the file to create (overwritten if it exists).

    Raises:
        ValueError: if a product has a key that is not one of the columns
            (raised by ``csv.DictWriter``).
    """
    fieldnames = ['timestamp','company_name','name', 'brand_name', 'category','product_number', 'price', 'score', 'image_url','reviews', 'image_url_dataset']
    # Explicit UTF-8: the default is the platform's locale encoding, which on
    # some systems differs from the UTF-8 that pandas assumes when the partial
    # files are read back, and category names contain accented characters.
    with open(name_csv, 'w', newline='', encoding='utf-8') as csvfile:
        productwriter = csv.DictWriter(csvfile, delimiter=',', fieldnames=fieldnames)
        productwriter.writeheader()
        productwriter.writerows(products)

## Obtención del dataset de productos de PCComponentes por partes

In [24]:
# Set up the environment needed to crawl the shop: start a Chrome session,
# open the home page and accept its cookie policy.
driver = webdriver.Chrome()
prepare_environtment(driver, 'https://www.pccomponentes.com/')

In [13]:
# Collect every product URL listed in the sitemap: one entry per <loc>
# element that has text.
sitemap_products = change_url(driver, 'https://www.pccomponentes.com/sitemap_articles_components.xml')
urls_clean = [loc.text for loc in sitemap_products.findAll("loc", text=True)]

In [None]:
# Number of product URLs handled per chunk; each chunk ends up in its own CSV.
n = 500
for i in range(0,len(urls_clean), n):
    # Scrape the attributes of every product in this chunk of URLs
    products_chunk = obtain_all_products(driver, urls_clean[i:i+n])
    # Keep only the products that are relevant for the dataset
    products_chunk_filtered = filter_products_by_categories(products_chunk)
    # Download the main image of every remaining product in the chunk
    products_chunk_images = download_product_images(products_chunk_filtered)
    # Write the partial dataset. The file name carries the chunk's nominal URL
    # range (i to i+n); for the last chunk i+n may exceed the number of URLs.
    to_csv(products_chunk_images, 'pccomponentes_products_' + str(i) + '_' + str(i+n) + '.csv')

## Fusión de todos los csv parciales en uno completo

In [2]:
# Merge every partial CSV found in the working directory into one dataset.
output_name = "dataset_pccomponentes.csv"
# Leave out the merged file itself: on a second run it already exists and
# would otherwise be concatenated with the partials, duplicating every row.
# Sorting makes the row order independent of the OS's directory listing order.
all_filenames = sorted(f for f in glob.glob('*.csv') if f != output_name)
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
combined_csv.to_csv(output_name, index=False, encoding='utf-8')