# Módulo 4 - Webscraping com Beautiful Soup

# aula 23

In [1]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [1]:
from bs4 import BeautifulSoup

In [4]:
soup = BeautifulSoup(html_doc, 'html.parser')

In [5]:
soup


<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [6]:
soup.title

<title>The Dormouse's story</title>

In [7]:
soup.body

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>

In [8]:
soup.head

<head><title>The Dormouse's story</title></head>

In [9]:
soup.body.p

<p class="title"><b>The Dormouse's story</b></p>

In [10]:
soup.find_all('p','title')

[<p class="title"><b>The Dormouse's story</b></p>]

# aula 24

In [1]:
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import numpy as np

In [2]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

In [3]:
#para fazer requisição na API para puxar os dados html
#headers é um dicionário que vai dizer pra API da H&M que quem está fazendo a requisição
#é um browser e não um código python; é padrão

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
           
page = requests.get(url, headers=headers)

In [4]:
page.text



In [5]:
soup = BeautifulSoup(page.text, 'html.parser')

In [6]:
#olhei no html e vi onde os produtos estavam, no caso era na parte:
# <ul class="products-listing small">...</ul>

products = soup.find('ul', class_='products-listing small')
products

#o find só retorna o primeiro elemento, o find_all retorna uma lista com tudo; como só tem uma ul da para usar find


<ul class="products-listing small">
<li class="product-item">
<article class="hm-product-item" data-articlecode="1024256001" data-brand="H&amp;M" data-category="men_jeans_slim" data-energy-interval="" data-pre-access-end-date="" data-pre-access-groups="" data-pre-access-start-date="" data-style-with-articlecodes="" onclick="setOsaParameters(utag_data.category_id,'SMALL','1024256001'); setNotificationTicket('Oy9wbHAvcHJvZHVjdC1saXN0LXdpdGgtY291bnQvcHJvZHVjdC1saXN0OyM7cHJvZHVjdF9rZXk7MTAyNDI1Nl9ncm91cF8wMDFfZW5fdXM7MTAyNDI1NjAwMV9lbl91cztPQkpFQ1RJVkUkO05PTkU6Tk9ORTs0NDs','1024256001');">
<div class="image-container">
<a class="item-link" href="/en_us/productpage.1024256001.html" title="Slim Jeans">
<img alt="Slim JeansModel" class="item-image" data-altimage="//lp2.hm.com/hmgoepprod?set=source[/77/b5/77b566ce3845791ecb47a6252b6c1e16fd93ea68.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[m],hmver[2]&amp;call=url[file:/product/style]" data-alttext="Slim Jeans" data-src="//lp2.hm

In [7]:
#tem que olhar no html para encontrar onde está o que você quer
products_list = products.find_all('article', class_='hm-product-item')
products_list

[<article class="hm-product-item" data-articlecode="1024256001" data-brand="H&amp;M" data-category="men_jeans_slim" data-energy-interval="" data-pre-access-end-date="" data-pre-access-groups="" data-pre-access-start-date="" data-style-with-articlecodes="" onclick="setOsaParameters(utag_data.category_id,'SMALL','1024256001'); setNotificationTicket('Oy9wbHAvcHJvZHVjdC1saXN0LXdpdGgtY291bnQvcHJvZHVjdC1saXN0OyM7cHJvZHVjdF9rZXk7MTAyNDI1Nl9ncm91cF8wMDFfZW5fdXM7MTAyNDI1NjAwMV9lbl91cztPQkpFQ1RJVkUkO05PTkU6Tk9ORTs0NDs','1024256001');">
 <div class="image-container">
 <a class="item-link" href="/en_us/productpage.1024256001.html" title="Slim Jeans">
 <img alt="Slim JeansModel" class="item-image" data-altimage="//lp2.hm.com/hmgoepprod?set=source[/77/b5/77b566ce3845791ecb47a6252b6c1e16fd93ea68.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[m],hmver[2]&amp;call=url[file:/product/style]" data-alttext="Slim Jeans" data-src="//lp2.hm.com/hmgoepprod?set=source[/28/55/2855b986c1b682dcbf559fc1

In [8]:
print(f'Tem {len(products_list)} produtos')

Tem 36 produtos


In [9]:
#data-article code é o ID do produto
products_list[1].get('data-articlecode')

'0985159001'

### product id

In [10]:
#product id
product_id = [p.get('data-articlecode') for p in products_list]
product_id

['1024256001',
 '0985159001',
 '0875105024',
 '1024256002',
 '1004199004',
 '0985159007',
 '1024256003',
 '0690449036',
 '1024256005',
 '0690449022',
 '0985159008',
 '0971061002',
 '0690449051',
 '1024256007',
 '0985159005',
 '0690449043',
 '0875105018',
 '1008549001',
 '0690449056',
 '1024256004',
 '0971061005',
 '0971061004',
 '1008549006',
 '1008110001',
 '0938875007',
 '0985159006',
 '1004199001',
 '0938875013',
 '1004199002',
 '1008549002',
 '1008110002',
 '0971061006',
 '1074475001',
 '1024256008',
 '1008549003',
 '1013317012']

### product category

In [11]:
#product category
product_category = [p.get('data-category') for p in products_list]
product_category

['men_jeans_slim',
 'men_jeans_skinny',
 'men_jeans_relaxed',
 'men_jeans_slim',
 'men_jeans_skinny',
 'men_jeans_skinny',
 'men_jeans_slim',
 'men_jeans_ripped',
 'men_jeans_slim',
 'men_jeans_ripped',
 'men_jeans_skinny',
 'men_jeans_slim',
 'men_jeans_ripped',
 'men_jeans_slim',
 'men_jeans_skinny',
 'men_jeans_ripped',
 'men_jeans_relaxed',
 'men_jeans_regular',
 'men_jeans_ripped',
 'men_jeans_slim',
 'men_jeans_slim',
 'men_jeans_slim',
 'men_jeans_regular',
 'men_jeans_slim',
 'men_jeans_slim',
 'men_jeans_skinny',
 'men_jeans_skinny',
 'men_jeans_slim',
 'men_jeans_skinny',
 'men_jeans_regular',
 'men_jeans_slim',
 'men_jeans_slim',
 'men_jeans_loose',
 'men_jeans_slim',
 'men_jeans_regular',
 'men_jeans_joggers']

### product name

In [12]:
#product name
product_list = products.find_all('a', class_='link')

product_list[0].get_text()

'Slim Jeans'

In [13]:
product_name = [p.get_text() for p in product_list]
product_name

['Slim Jeans',
 'Skinny Jeans',
 'Relaxed Jeans',
 'Slim Jeans',
 'Skinny Cropped Jeans',
 'Skinny Jeans',
 'Slim Jeans',
 'Skinny Jeans',
 'Slim Jeans',
 'Skinny Jeans',
 'Skinny Jeans',
 'Slim Tapered Cropped Jeans',
 'Skinny Jeans',
 'Slim Jeans',
 'Skinny Jeans',
 'Skinny Jeans',
 'Relaxed Jeans',
 'Regular Jeans',
 'Skinny Jeans',
 'Slim Jeans',
 'Slim Tapered Cropped Jeans',
 'Slim Tapered Cropped Jeans',
 'Regular Jeans',
 'Freefit® Slim Jeans',
 'Slim Tapered Jeans',
 'Skinny Jeans',
 'Skinny Cropped Jeans',
 'Slim Tapered Jeans',
 'Skinny Cropped Jeans',
 'Regular Jeans',
 'Freefit® Slim Jeans',
 'Slim Tapered Cropped Jeans',
 'Loose Jeans',
 'Slim Jeans',
 'Regular Jeans',
 'Hybrid Regular Tapered Joggers']

### product price

In [14]:
#product price
# <span class="price regular">$ 19.99</span>

product_list = products.find_all('span',class_='price regular')
product_price = [p.get_text() for p in product_list]
product_price

['$ 19.99',
 '$ 19.99',
 '$ 29.99',
 '$ 19.99',
 '$ 29.99',
 '$ 19.99',
 '$ 19.99',
 '$ 39.99',
 '$ 19.99',
 '$ 39.99',
 '$ 19.99',
 '$ 29.99',
 '$ 39.99',
 '$ 19.99',
 '$ 19.99',
 '$ 39.99',
 '$ 29.99',
 '$ 19.99',
 '$ 39.99',
 '$ 19.99',
 '$ 29.99',
 '$ 29.99',
 '$ 19.99',
 '$ 49.99',
 '$ 39.99',
 '$ 19.99',
 '$ 29.99',
 '$ 39.99',
 '$ 29.99',
 '$ 19.99',
 '$ 49.99',
 '$ 29.99',
 '$ 39.99',
 '$ 19.99',
 '$ 19.99',
 '$ 44.99']

### criação do dataframe

In [15]:
data = pd.DataFrame( [product_id, product_category, product_name,product_price]).T

data.columns = ['product_id', 'product_category', 'product_name','product_price']

data.head()

Unnamed: 0,product_id,product_category,product_name,product_price
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99
1,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99
2,875105024,men_jeans_relaxed,Relaxed Jeans,$ 29.99
3,1024256002,men_jeans_slim,Slim Jeans,$ 19.99
4,1004199004,men_jeans_skinny,Skinny Cropped Jeans,$ 29.99


In [16]:
# scrapy datetime
data['scrapy_datetime'] = datetime.now().strftime( '%Y-%m-%d %H:%M:%S' )

In [17]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 16:26:02
1,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2022-06-21 16:26:02
2,875105024,men_jeans_relaxed,Relaxed Jeans,$ 29.99,2022-06-21 16:26:02
3,1024256002,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 16:26:02
4,1004199004,men_jeans_skinny,Skinny Cropped Jeans,$ 29.99,2022-06-21 16:26:02


# aula 25

In [18]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

#requisição para mostrar para a API que quem está fazendo a requisição dos dados é um 
#browser e não um código python

page = requests.get (url, headers = headers)

In [19]:
#para pegar o html da página
#page.text

In [20]:
soup = BeautifulSoup(page.text, 'html.parser')

### Paginação

- Nas páginas html são feitas paginações, que consiste em dividir os produtos em páginas para não sobrecarregar a API. Normalmente na url tem os parâmetros e é possível ver como funciona a divisão na página (offset=0&page-size=72)
    - offset é o valor que começa a mostrar o produto
    - page-size é até qual produto é mostrando na página

In [21]:
#paginação: para saber quantos produtos tem por página 
#pela inspeção (inspecionar) na página vi que está na tag h2 e na class 'load-more-heading'
soup.find_all('h2' , class_='load-more-heading')

[<h2 class="load-more-heading" data-items-shown="36" data-total="84">SHOWING 36 of 84 Items</h2>]

In [22]:
total_item = soup.find_all('h2' , class_='load-more-heading')[0].get('data-total')
total_item

'84'

In [23]:
#transformar em inteiro
int(total_item) 

84

In [24]:
#no caso pelo site vimos que tem 36 produtos por página, então para saber quantas páginas vamos precisar:
page_number = np.round(int(total_item)/36)
page_number

2.0

In [25]:
#montando a url
#em uma url depois do ? fica os parâmetros 

url02 = url + '?page-size' + str(int(page_number*36))
url02

'https://www2.hm.com/en_us/men/products/jeans.html?page-size72'

# aula 26 e 27

### pegando as informações de um produto específico

In [26]:
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import numpy as np

In [27]:
#primeiro faz o webscraping de um produto sozinho, se funcionar faz com todos os outros

url = 'https://www2.hm.com/en_us/productpage.0636207010.html'

#para fazer requisição na API para puxar os dados html
#headers é um dicionário que vai dizer pra API da H&M que quem está fazendo a requisição
#é um browser e não um código python; é padrão

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
           
page = requests.get(url, headers=headers)

page.text

soup = BeautifulSoup(page.text, 'html.parser')

### color name

In [28]:
#ir na página, inspecionar e procurar onde na html está a informação de cor
soup.find_all('a',class_='filter-option miniature')[0].get('data-color')

'Dark denim blue'

In [29]:
product_list= soup.find_all('a',class_='filter-option miniature')
product_list

[<a aria-checked="false" class="filter-option miniature" data-articlecode="0636207001" data-color="Dark denim blue" data-sizes="" href="/en_us/productpage.0636207001.html" id="filter-colour-0636207001" role="radio" title="Dark denim blue">
 <noscript data-alt="Dark denim blue" data-src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2F36%2F49%2F3649b9506787743b7d4b0e5f8014d13946bf3632.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5Bmen_jeans_slim%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B2%5D&amp;call=url[file:/product/miniature]">
 <img alt="Dark denim blue" src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2F36%2F49%2F3649b9506787743b7d4b0e5f8014d13946bf3632.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5Bmen_jeans_slim%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B2%5D&amp;call=url[file:/product/miniature]"/>
 </noscript>
 <span></span>
 </a>,
 <a aria-checked="false" class="filter-option miniature" data-articlecode="0636207002" data-color="Dark gray de

In [30]:
#percorrendo todas as cores do produto
color_name = [p.get('data-color') for p in product_list]
color_name

['Dark denim blue',
 'Dark gray denim',
 'Denim blue',
 'Gray',
 'Black',
 'Midnight blue',
 'Dark gray',
 'Denim blue',
 'White',
 'Pale denim blue']

### product id

In [31]:
product_id = [p.get('data-articlecode') for p in product_list]
product_id

['0636207001',
 '0636207002',
 '0636207004',
 '0636207005',
 '0636207006',
 '0636207011',
 '0636207014',
 '0636207015',
 '0636207017',
 '0636207019']

In [32]:
#criando o df
df_color = pd.DataFrame([product_id, color_name]).T
df_color.columns = ['product_id','product_color']
df_color

Unnamed: 0,product_id,product_color
0,636207001,Dark denim blue
1,636207002,Dark gray denim
2,636207004,Denim blue
3,636207005,Gray
4,636207006,Black
5,636207011,Midnight blue
6,636207014,Dark gray
7,636207015,Denim blue
8,636207017,White
9,636207019,Pale denim blue


In [33]:
#criando o style id + color id
df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

In [34]:
df_color

Unnamed: 0,product_id,product_color,style_id,color_id
0,636207001,Dark denim blue,636207,1
1,636207002,Dark gray denim,636207,2
2,636207004,Denim blue,636207,4
3,636207005,Gray,636207,5
4,636207006,Black,636207,6
5,636207011,Midnight blue,636207,11
6,636207014,Dark gray,636207,14
7,636207015,Denim blue,636207,15
8,636207017,White,636207,17
9,636207019,Pale denim blue,636207,19


Vai na html, procura qual estrutura da html seu dado está armazenado, vê se tem algum nome de estrutura (seja classe, seja div, seja ID etc) que é única  ou se repete para todos os pontos onde os dados estão armazenados e vamos usar isso para poder extrair aquele estrutura html.
Depois que extrair a estrutura html a gente extrai o dado de dentro da estrutura. 

### compositon 

In [35]:
soup.find_all('div', class_='details-attributes-list-item')

[<div class="details-attributes-list-item">
 <dt class="details-headline">messages.garmentLength</dt>
 <dd class="details-list-item">Long</dd>
 </div>,
 <div class="details-attributes-list-item">
 <dt class="details-headline">messages.waistRise</dt>
 <dd class="details-list-item">Regular waist</dd>
 </div>,
 <div class="details-attributes-list-item">
 <dt class="details-headline">Fit</dt>
 <dd class="details-list-item">Slim fit</dd>
 </div>,
 <div class="details-attributes-list-item">
 <dt class="details-headline">Composition</dt>
 <dd class="details-list-item">Cotton 88%, Polyester 10%, Spandex 2%</dd>
 <dd class="details-list-item">Pocket lining: Cotton 100%</dd>
 </div>,
 <div class="details-attributes-list-item">
 <dt class="details-headline">Care instructions</dt>
 <dd class="details-list-item">Only non-chlorine bleach when needed</dd>
 <dd class="details-list-item">Medium iron</dd>
 <dd class="details-list-item">Machine wash cool</dd>
 <dd class="details-list-item">Wash with like

In [36]:
#parte fit e slim 
#get_text retorna os elementos entre as estruturas html

soup.find_all('div', class_='details-attributes-list-item')[2].get_text()

'\nFit\nSlim fit\n'

In [37]:
#quebra do texto em listas segundo um parâmetro, no caso '\n'
soup.find_all( 'div',class_='details-attributes-list-item' )[2].get_text().split('\n')

['', 'Fit', 'Slim fit', '']

In [38]:
#filtra os valores vazios
list(filter(None,soup.find_all( 'div',class_='details-attributes-list-item' )[2].get_text().split('\n')))

['Fit', 'Slim fit']

In [39]:
#composição

product_composition_list = soup.find_all('div', class_='details-attributes-list-item')

product_composition_full = [list(filter(None,p.get_text().split('\n'))) for p in product_composition_list]

#rename dataframe
df_aux_full = pd.DataFrame(product_composition_full).T
df_aux_full.columns = df_aux_full.iloc[0]

#delete first row
df_aux_full = df_aux_full.iloc[1:].fillna(method='ffill')

#Para arrumar o df que tem informações a mais que eu preciso
ex = pd.DataFrame(columns=['Fit','Composition','Art. No.'])
df_composition = pd.merge(ex, df_aux_full[['Fit','Composition','Art. No.']], how = 'right', on=['Fit','Composition','Art. No.'])
df_composition = df_composition.drop_duplicates()

In [40]:
#gerando style id + color id
df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])
df_composition

Unnamed: 0,Fit,Composition,Art. No.,style_id,color_id
0,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%",636207010,636207,10
1,Slim fit,Pocket lining: Cotton 100%,636207010,636207,10


In [41]:
#juntando data color + composition
data_sku = pd.merge(df_color,df_composition[['style_id','Fit','Composition']], how='left', on='style_id')
data_sku

Unnamed: 0,product_id,product_color,style_id,color_id,Fit,Composition
0,636207001,Dark denim blue,636207,1,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
1,636207001,Dark denim blue,636207,1,Slim fit,Pocket lining: Cotton 100%
2,636207002,Dark gray denim,636207,2,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
3,636207002,Dark gray denim,636207,2,Slim fit,Pocket lining: Cotton 100%
4,636207004,Denim blue,636207,4,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
5,636207004,Denim blue,636207,4,Slim fit,Pocket lining: Cotton 100%
6,636207005,Gray,636207,5,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
7,636207005,Gray,636207,5,Slim fit,Pocket lining: Cotton 100%
8,636207006,Black,636207,6,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
9,636207006,Black,636207,6,Slim fit,Pocket lining: Cotton 100%


### Resultado para um produto:

# One product

In [153]:
#API requests
#primeiro faz o webscraping de um produto sozinho, se funcionar faz com todos os outros
url = 'https://www2.hm.com/en_us/productpage.0636207010.html'

#para fazer requisição na API para puxar os dados html
#headers é um dicionário que vai dizer pra API da H&M que quem está fazendo a requisição
#é um browser e não um código python; é padrão

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
           
page = requests.get(url, headers=headers)

page.text

#BeautifulSoup objects
soup = BeautifulSoup(page.text, 'html.parser')


########################### color name ###################################
product_list= soup.find_all('a',class_='filter-option miniature')

#percorrendo todas as cores do produto
color_name = [p.get('data-color') for p in product_list]

#procuct id
product_id = [p.get('data-articlecode') for p in product_list]

#criando o df
df_color = pd.DataFrame([product_id, color_name]).T
df_color.columns = ['product_id','color_name']

#criando o style id + color id
df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

########################### composition ###################################
product_composition_list = soup.find_all('div', class_='details-attributes-list-item')

product_composition_full = [list(filter(None,p.get_text().split('\n'))) for p in product_composition_list]

#rename dataframe
df_aux_full = pd.DataFrame(product_composition_full).T
df_aux_full.columns = df_aux_full.iloc[0]

#delete first row
df_aux_full = df_aux_full.iloc[1:].fillna(method='ffill')

#Para arrumar o df que tem informações a mais que eu preciso
ex = pd.DataFrame(columns=['Fit','Composition','Art. No.'])
df_composition = pd.merge(ex, df_aux_full[['Fit','Composition','Art. No.']], how = 'right', on=['Fit','Composition','Art. No.'])
df_composition = df_composition.drop_duplicates()

#gerando style id + color id
df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

#juntando data color + composition
data_sku = pd.merge(df_color,df_composition[['style_id','Fit','Composition']], how='left', on='style_id')
data_sku

Unnamed: 0,product_id,color_name,style_id,color_id,Fit,Composition
0,636207001,Dark denim blue,636207,1,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
1,636207001,Dark denim blue,636207,1,Slim fit,Pocket lining: Cotton 100%
2,636207002,Dark gray denim,636207,2,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
3,636207002,Dark gray denim,636207,2,Slim fit,Pocket lining: Cotton 100%
4,636207004,Denim blue,636207,4,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
5,636207004,Denim blue,636207,4,Slim fit,Pocket lining: Cotton 100%
6,636207005,Gray,636207,5,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
7,636207005,Gray,636207,5,Slim fit,Pocket lining: Cotton 100%
8,636207006,Black,636207,6,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
9,636207006,Black,636207,6,Slim fit,Pocket lining: Cotton 100%


### Multi Products

In [42]:
#cada linha é um produto na vitrine
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 16:26:02
1,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2022-06-21 16:26:02
2,875105024,men_jeans_relaxed,Relaxed Jeans,$ 29.99,2022-06-21 16:26:02
3,1024256002,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 16:26:02
4,1004199004,men_jeans_skinny,Skinny Cropped Jeans,$ 29.99,2022-06-21 16:26:02


In [21]:
#gerando todas as urls

for i in range(len(data)):
    #API requests
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i,'product_id'] + '.html'
    print(url)

https://www2.hm.com/en_us/productpage.0985159001.html
https://www2.hm.com/en_us/productpage.1024256001.html
https://www2.hm.com/en_us/productpage.0875105024.html
https://www2.hm.com/en_us/productpage.0690449056.html
https://www2.hm.com/en_us/productpage.1004199004.html
https://www2.hm.com/en_us/productpage.1024256002.html
https://www2.hm.com/en_us/productpage.0985159007.html
https://www2.hm.com/en_us/productpage.1024256003.html
https://www2.hm.com/en_us/productpage.1024256004.html
https://www2.hm.com/en_us/productpage.1024256005.html
https://www2.hm.com/en_us/productpage.1008110001.html
https://www2.hm.com/en_us/productpage.0971061002.html
https://www2.hm.com/en_us/productpage.0690449036.html
https://www2.hm.com/en_us/productpage.0971061005.html
https://www2.hm.com/en_us/productpage.0690449051.html
https://www2.hm.com/en_us/productpage.0690449022.html
https://www2.hm.com/en_us/productpage.1008549001.html
https://www2.hm.com/en_us/productpage.1024256007.html
https://www2.hm.com/en_us/pr

In [43]:
#A diferença principal é que para multiplos produtos a url muda

#criando um df vazio para juntar todos os produtos nele
df_details = pd.DataFrame()

#lista vazia para colocar todos os nomes das colunas para que o df seja padronizado, se algum produto
#não tiver alguma coluna vai ficar vazio
aux = []

cols = ['Art. No.', 'Composition', 'Fit', 'color_id', 'style_id']
df_pattern = pd.DataFrame(columns=cols)

#parametros
#headers é um dicionário que vai dizer pra API da H&M que quem está fazendo a requisição
#é um browser e não um código python; é padrão
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
  
#gerando todas as urls
for i in range(len(data)):
    #API requests
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i,'product_id'] + '.html'
    #print(url)
    
    #para fazer requisição na API para puxar os dados html       
    page = requests.get(url, headers=headers)

    #BeautifulSoup objects
    soup = BeautifulSoup(page.text, 'html.parser')

    ########################### color name ###################################
    product_list= soup.find_all('a',class_='filter-option miniature')

    #percorrendo todas as cores do produto
    color_name = [p.get('data-color') for p in product_list]

    #procuct id
    product_id = [p.get('data-articlecode') for p in product_list]

    #criando o df
    df_color = pd.DataFrame([product_id, color_name]).T
    df_color.columns = ['product_id','color_name']

    #criando o style id + color id
    df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

    ########################### composition ###################################
    product_composition_list = soup.find_all('div', class_='details-attributes-list-item')

    product_composition_full = [list(filter(None,p.get_text().split('\n'))) for p in product_composition_list]

    #rename dataframe
    df_aux_full = pd.DataFrame(product_composition_full).T
    df_aux_full.columns = df_aux_full.iloc[0]

    #delete first row
    df_aux_full = df_aux_full.iloc[1:].fillna(method='ffill')
    
    #garantia que tenha a mesma quantidade de colunas
    df_composition = pd.concat( [df_pattern, df_composition], axis=0 )

    #Para arrumar o df que tem informações a mais que eu preciso
    ex = pd.DataFrame(columns=['Fit','Composition','Art. No.'])
    df_composition = pd.merge(ex, df_aux_full[['Fit','Composition','Art. No.']], how = 'right', on=['Fit','Composition','Art. No.'])
    df_composition = df_composition.drop_duplicates()

    #gerando style id + color id
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

    #lista vazia para colocar todos os nomes das colunas para que o df seja padronizado, se algum produto
    #não tiver alguma coluna vai ficar vazio
    aux = aux + df_composition.columns.tolist()
    
    #juntando data color + composition
    data_sku = pd.merge(df_color,df_composition[['style_id', 'Fit','Composition']], how='left', on='style_id')
    
    #juntando todos os dataframes individuais em um único df com todos os produtos
    df_details = pd.concat([df_details, data_sku], axis = 0)
    
# Join Showroom data + details
data['style_id'] = data['product_id'].apply(lambda x: x[:-3])
data['color_id'] = data['product_id'].apply(lambda x: x[-3:])

data_raw = pd.merge(data, df_details[['style_id', 'color_name', 'Fit','Composition']], how='left',on='style_id')

In [31]:
#set mostra os dados sem duplicação
#usado para saber quais dados tem os produtos
set(aux)

{'Art. No.', 'Composition', 'Fit', 'color_id', 'style_id'}

In [44]:
df_details.head()

Unnamed: 0,product_id,color_name,style_id,color_id,Fit,Composition
0,1024256002,Light denim blue,1024256,2,Slim fit,"Shell: Cotton 99%, Spandex 1%"
1,1024256002,Light denim blue,1024256,2,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
2,1024256003,Light denim blue,1024256,3,Slim fit,"Shell: Cotton 99%, Spandex 1%"
3,1024256003,Light denim blue,1024256,3,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
4,1024256004,Denim blue,1024256,4,Slim fit,"Shell: Cotton 99%, Spandex 1%"


In [51]:
#exportando para csv
data_raw.to_csv(r'C:\Users\laais\CDS_Python_do_DS_ao_DEV\df_products_hm.csv', index=0)

In [52]:
data_raw.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,Composition
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 16:26:02,1024256,1,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%"
1,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 16:26:02,1024256,1,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
2,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 16:26:02,1024256,1,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%"
3,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 16:26:02,1024256,1,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
4,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 16:26:02,1024256,1,Denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%"
