In [1]:
"""
Web scraping price information from www.mediamarkt.es
Based on:
https://github.com/Brinkhuis/Mediamarkt/blob/master/code/mediamarkt.py
"""

import ast
import requests
import urllib

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import time

from bs4 import BeautifulSoup
from tqdm import tqdm

%matplotlib inline

In [2]:
# URL dónde está la lista de productos
URL = "https://www.mediamarkt.es/sitemap/sitemap-productlist.xml"

In [3]:
# Defino la función para leer el xml y extraer los links que pasaremos
# Referencia:
# https://stackoverflow.com/questions/18966368/python-beautifulsoup-scrape-tables
# Le paso la URL como parámetro, así podrá valer para los diferentes sitios.
def get_products (URL):
    soup = BeautifulSoup(requests.get(URL).text, 'lxml')
    #links = soup.find_all('loc')
    links = []
    #links = soup.find_all("loc")
    for link in soup.find_all("loc"):
        links.append(str(link).replace("<loc>","").replace("</loc>",""))
    return links

In [4]:
pagelinks = get_products(URL)

In [5]:
pagelinks[1:10]

['https://www.mediamarkt.es/es/category/_afeitado-y-depilaci%C3%B3n-701138.html',
 'https://www.mediamarkt.es/es/category/_afeitadoras-faciales-701281.html',
 'https://www.mediamarkt.es/es/category/_afeitadoras-corporales-701282.html',
 'https://www.mediamarkt.es/es/category/_afeitadoras-multifunci%C3%B3n-701283.html',
 'https://www.mediamarkt.es/es/category/_cortapelos-701284.html',
 'https://www.mediamarkt.es/es/category/_barberos-701285.html',
 'https://www.mediamarkt.es/es/category/_depiladoras-ipl-701286.html',
 'https://www.mediamarkt.es/es/category/_rasuradoras-701287.html',
 'https://www.mediamarkt.es/es/category/_depiladoras-de-arranque-701288.html']

In [6]:
#Compruebo los pagelinks
len(pagelinks)

1012

In [7]:
#URL = "https://www.mediamarkt.es/es/category/_afeitadoras-faciales-701281.html"
#URL = 'http://www.mediamarkt.nl/nl/category/_laptops-482723.html'

In [20]:
def npages(mysoup):
    pagination = list()
    npages = 0
    for page_number in mysoup.find_all('div', {'class': 'pagination-wrapper cf'}):
        pagination.append(page_number.find_all('a'))
        npages = int(str(pagination[0]).split(', ')[-2].strip('</a>').split('>')[-1])
        print (npages)
    return npages        


def get_data(URL, output_file = None):
    item_list = list()
    #npages = npages(BeautifulSoup(requests.get(URL).text, 'html.parser')
    for page in tqdm(range(1, npages(BeautifulSoup(requests.get(URL).text, 'html.parser')) + 1)):
        soup = BeautifulSoup(requests.get(URL + '?page=' + str(page)).text, 'html.parser')
        all_scripts = soup.find_all('script')
        for script in all_scripts:
            if script.text.startswith('var product'):
                item_list.append(ast.literal_eval(script.text.split(' = ')[1].strip(';')))

    productinfo = pd.DataFrame()
    
    productinfo = pd.DataFrame(item_list)
    
    #productinfo['price'] = pd.to_numeric(productinfo['price'], errors='coerce')
    
    #productinfo.to_csv(output_file, index=False)
    
    #print(f'{productinfo.shape[0]} records saved to {output_file}')
    
    return productinfo

In [21]:
#pro_info = get_data()

In [22]:
#output_file = './belleza_y_salud.csv'
#pro_info.to_csv(output_file)

In [23]:
#print(f'{pro_info.shape[0]} records saved to {output_file}')

In [24]:
def viz_data(input_file, output_file):
    productinfo = pd.read_csv(input_file)

    x_pixels, y_pixels, dpi = 1500, 1000, 150
    x_inch, y_inch = x_pixels / dpi, y_pixels / dpi

    plt.figure(figsize=(x_inch, y_inch), dpi=dpi)
    sns.boxplot(x='price',
                y='brand',
                data=productinfo.groupby('brand').filter(lambda x: len(x) > 3),
                order=list(productinfo.groupby('brand').filter(lambda x: len(x) > 3)
                           .groupby('brand').price.median().sort_values(ascending=False).index),
                palette='PRGn',
                width=0.75).set_title('Price Distribution per Brand')
    sns.despine(offset=10, trim=True)
    plt.savefig(output_file)
    plt.show()
    plt.close()
    
    print(f'Visualization saved to {output_file}')




In [25]:
#def main():
dat = './data/productinfo.csv'
viz = './plots/price_distribution_brand.png'

# Limito a los 10 primeras líneas
products = pd.DataFrame()
total_products = pd.DataFrame()
for link in pagelinks[2:10]:
    products = get_data(link, dat)
    total_products = total_products.append(products)
        
print("Done")
    #except:
    
    #time.sleep(2)
    #viz_data(dat, viz)

3


100%|██████████| 3/3 [00:02<00:00,  1.27it/s]
0it [00:00, ?it/s]


3


100%|██████████| 3/3 [00:03<00:00,  1.11s/it]


5


100%|██████████| 5/5 [00:05<00:00,  1.00s/it]


3


100%|██████████| 3/3 [00:02<00:00,  1.03it/s]


2


100%|██████████| 2/2 [00:02<00:00,  1.15s/it]
0it [00:00, ?it/s]


3


100%|██████████| 3/3 [00:03<00:00,  1.10s/it]


Done


In [47]:
#if __name__ == "__main__":
#    main()

In [26]:
total_products

Unnamed: 0,brand,category,dimension10,dimension24,dimension25,dimension26,dimension9,ean,id,name,price
0,PHILIPS,Belleza y salud,Afeitadoras faciales,21.0,OutOfStock,1.99,Afeitado y depilación,8710103738350,1293941,"Afeitadora - Philips S 5110/06 Cabezales Flex,...",57.99
1,PHILIPS,Belleza y salud,Afeitadoras faciales,21.0,InStock,1.99,Afeitado y depilación,8710103764588,1317305,"Afeitadora - Philips Shaver S1510/04, Recargab...",41.99
2,PHILIPS,Belleza y salud,Afeitadoras faciales,21.0,InStock,1.99,Afeitado y depilación,8710103819219,1360370,"Afeitadora - Philips S5550/06, Cuchillas Multi...",89.90
3,PHILIPS,Belleza y salud,Afeitadoras faciales,21.0,InStock,1.99,Afeitado y depilación,8710103764489,1317303,"Afeitadora - Philips S3510/06, sistema Comfort...",58.99
4,BRAUN,Belleza y salud,Afeitadoras faciales,21.0,InStock,1.99,Afeitado y depilación,4210201147022,1294131,"Afeitadora - Braun 5030s Series 5 + EN10, Reco...",87.99
5,PHILIPS,Belleza y salud,Afeitadoras faciales,21.0,InStock,1.99,Afeitado y depilación,8710103738121,1293945,Afeitadora - Philips S 5420/06 Sistema de cuch...,76.99
6,PHILIPS,Belleza y salud,Afeitadoras faciales,21.0,InStock,1.99,Afeitado y depilación,8710103686934,1252110,"Afeitadora - Philips S9031/12 V-Track, Autonom...",166.00
7,PHILIPS,Belleza y salud,Afeitadoras faciales,21.0,InStock,1.99,Afeitado y depilación,8710103609803,1187436,"Afeitadora - Philips AT750/26, Recargable, Cab...",55.99
8,PHILIPS,Belleza y salud,Afeitadoras faciales,21.0,InStock,1.99,Afeitado y depilación,8710103800958,1385320,"Afeitadora - Philips S7522/50, Anillos SkinGli...",149.00
9,PHILIPS,Belleza y salud,Afeitadoras faciales,21.0,InStock,1.99,Afeitado y depilación,8710103764458,1317304,"Afeitadora - Philips S3110/06, sistema Comfort...",49.99
