# Extração de Dados em HTML

ID, product_name, product_type, product_color, composition, price

In [1]:
import requests
import pandas as pd
import numpy as np
import re
from datetime import datetime
from bs4 import BeautifulSoup


In [2]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

page = requests.get(url, headers=headers)

In [3]:
soup = BeautifulSoup(page.text, 'html.parser')

In [4]:
products = soup.find('ul', class_='products-listing small')

In [5]:
product_list = products.find_all('article', class_='hm-product-item')
product_list

[<article class="hm-product-item" data-articlecode="0985159001" data-category="men_jeans_skinny" data-energy-interval="" data-pre-access-end-date="" data-pre-access-groups="" data-pre-access-start-date="" onclick="setOsaParameters(utag_data.category_id,'SMALL','0985159001'); setNotificationTicket('Oy9wbHAvcHJvZHVjdC1saXN0LXdpdGgtY291bnQvcHJvZHVjdC1saXN0OyM7cHJvZHVjdF9rZXk7MDk4NTE1OV9ncm91cF8wMDFfZW5fdXM7MDk4NTE1OTAwMV9lbl91cztPQkpFQ1RJVkUkO05PTkU6Tk9ORTs3NTs','0985159001');">
 <div class="image-container">
 <a class="item-link" href="/en_us/productpage.0985159001.html" title="Skinny Jeans">
 <img alt="Skinny JeansModel" class="item-image" data-altimage="//lp2.hm.com/hmgoepprod?set=source[/28/f0/28f0649eb24faedfb7baf756d9cc7b86f31e37de.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[m],hmver[2]&amp;call=url[file:/product/style]" data-alttext="Skinny Jeans" data-src="//lp2.hm.com/hmgoepprod?set=source[/0a/29/0a29e9a927238a518e8d0a4194bbd1d48a3eff2a.jpg],origin[dam],category[],

In [6]:
# product_id

product_id = [p.get('data-articlecode') for p in product_list]

# product_category

product_category = [p.get('data-category') for p in product_list]


In [7]:
# product_name

product_list = products.find_all('a', class_='link')

product_name = [p.get_text() for p in product_list]

In [8]:
# price

product_list = products.find_all('span', class_='price regular')

product_price = [p.get_text() for p in product_list]

In [9]:
data = pd.DataFrame([product_id, product_category, product_name, product_price]).T

In [10]:
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']

# scrapy datetime

data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [11]:
data

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-07-11 12:14:42
1,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-07-11 12:14:42
2,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-07-11 12:14:42
3,985197002,men_jeans_slim,Slim Jeans,$ 19.99,2021-07-11 12:14:42
4,985159003,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-07-11 12:14:42
5,427159006,men_jeans_ripped,Trashed Skinny Jeans,$ 39.99,2021-07-11 12:14:42
6,985197003,men_jeans_slim,Slim Jeans,$ 19.99,2021-07-11 12:14:42
7,971061001,,Slim Tapered Cropped Jeans,$ 29.99,2021-07-11 12:14:42
8,1004199001,,Skinny Cropped Jeans,$ 29.99,2021-07-11 12:14:42
9,971061002,,Slim Tapered Cropped Jeans,$ 29.99,2021-07-11 12:14:42


# Coletando os Dados II

In [12]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

page = requests.get(url, headers=headers)

In [13]:
soup = BeautifulSoup(page.text, 'html.parser')

In [14]:
# identificando o numero total de produtos

total_item = soup.find_all('h2', class_='load-more-heading')[0].get('data-total')

In [15]:
# transformando em inteiro

page_number = np.round(int(total_item) / 36)
page_number

2.0

In [16]:
url02 = url + '?page-size=' + str(int(page_number*36))
url02

'https://www2.hm.com/en_us/men/products/jeans.html?page-size=72'

In [17]:
int(total_item) / 36

2.0555555555555554

# Coletando os dados III

## Um produto

In [18]:
# API requests

url = 'https://www2.hm.com/en_us/productpage.0636207019.html'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

page = requests.get(url, headers=headers)

# beautiful soup object

soup = BeautifulSoup(page.text, 'html.parser')

In [19]:
# ================================ color name ================================

product_list = soup.find_all('a', class_='filter-option miniature')

color_name = [p.get('data-color') for p in product_list]

# product id

product_id = [p.get('data-articlecode') for p in product_list]

df_color = pd.DataFrame([product_id, color_name]).T
df_color.columns = ['product_id', 'color_name']

# novas colunas de 'style_id' com os 7 primeiros digitos, e 'color_id' com os 3 últimos digitos

df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

In [20]:
# =============================== composition ================================


product_composition_list = soup.find_all('div', class_='pdp-description-list-item')

product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]


# transformando em dataframe

df_composition = pd.DataFrame(product_composition).T


# primeira linha como cabeçalho

df_composition.columns = df_composition.iloc[0]

# deletando a primeira linha do df

df_composition = df_composition.iloc[1:].fillna(method='ffill')

# novas colunas de 'style_id' com os 7 primeiros digitos, e 'color_id' com os 3 últimos digitos

df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

In [21]:
# ============================== unindo os dados ===============================

# join color e composition

data_sku = pd.merge(df_color, df_composition[['style_id','Fit','Composition']], how='left', on='style_id')

## Múltiplos produtos

In [22]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}


# dataframe vazio

df_details = pd.DataFrame()

# colunas únicas para todos os produtos

aux = []

cols = ['Art. No.', 'Composition', 'Fit', 'Product safety','Size']
df_pattern = pd.DataFrame(columns= cols)

In [23]:
for i in range(len(data)):

    # API requests

    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id'] + '.html'

    page = requests.get(url, headers=headers)

    # beautiful soup object

    soup = BeautifulSoup(page.text, 'html.parser')
    

    # ================================ color name ================================

    product_list = soup.find_all('a', class_='filter-option miniature')

    color_name = [p.get('data-color') for p in product_list]

    # product id

    product_id = [p.get('data-articlecode') for p in product_list]

    df_color = pd.DataFrame([product_id, color_name]).T
    df_color.columns = ['product_id', 'color_name']

    # novas colunas de 'style_id' com os 7 primeiros digitos, e 'color_id' com os 3 últimos digitos

    df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])


    # =============================== composition ================================


    product_composition_list = soup.find_all('div', class_='pdp-description-list-item')

    product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]


    # transformando em dataframe

    df_composition = pd.DataFrame(product_composition).T


    # primeira linha como cabeçalho

    df_composition.columns = df_composition.iloc[0]

    # deletando a primeira linha do df

    df_composition = df_composition.iloc[1:].fillna(method='ffill')
    
    # garantindo o mesmo número de colunas
    
    df_composition = pd.concat([df_pattern, df_composition], axis=0)

    # novas colunas de 'style_id' com os 7 primeiros digitos, e 'color_id' com os 3 últimos digitos

    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])
    
    aux = aux + df_composition.columns.tolist()


    # ============================== unindo os dados ===============================

    # join color e composition

    data_sku = pd.merge(df_color, df_composition[['style_id','Fit','Size', 'Product safety','Composition']], how='left', on='style_id')
    
    df_details = pd.concat([df_details, data_sku], axis=0)

In [24]:
# join vitrine e details

data['style_id'] = data['product_id'].apply(lambda x: x[:-3])
data['color_id'] = data['product_id'].apply(lambda x: x[-3:])

data_raw = pd.merge(data, df_details[['style_id', 'color_name', 'Fit', 'Composition', 'Size', 'Product safety']],
                    how='left', on='style_id')

In [25]:
data_raw.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,Composition,Size,Product safety
0,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-07-11 12:14:42,985159,1,Denim blue,Skinny fit,"Shell: Cotton 99%, Elastane 1%","The model is 187cm/6'2"" and wears a size 31/32",
1,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-07-11 12:14:42,985159,1,Denim blue,Skinny fit,Pocket lining: Cotton 100%,"The model is 187cm/6'2"" and wears a size 31/32",
2,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-07-11 12:14:42,985159,1,Dark gray,Skinny fit,"Shell: Cotton 99%, Elastane 1%","The model is 187cm/6'2"" and wears a size 31/32",
3,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-07-11 12:14:42,985159,1,Dark gray,Skinny fit,Pocket lining: Cotton 100%,"The model is 187cm/6'2"" and wears a size 31/32",
4,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-07-11 12:14:42,985159,1,Light denim blue,Skinny fit,"Shell: Cotton 99%, Elastane 1%","The model is 187cm/6'2"" and wears a size 31/32",


In [26]:
# salvando o df

data_raw.to_csv('products_hm.csv',index_label=False)



## Tratamento de dados

In [27]:
data = pd.read_csv('products_hm.csv')

In [28]:
data[['Composition','product_id']].groupby('Composition').count().reset_index()

Unnamed: 0,Composition,product_id
0,Cotton 100%,26
1,"Cotton 73%, Polyester 26%, Elastane 1%",30
2,"Cotton 80%, Polyester 19%, Elastane 1%",18
3,"Cotton 88%, Polyester 10%, Elastane 2%",20
4,"Cotton 90%, Elasterell-P 8%, Elastane 2%",18
5,"Cotton 93%, Polyester 6%, Elastane 1%",14
6,"Cotton 98%, Elastane 2%",193
7,"Cotton 99%, Elastane 1%",52
8,Lining: Polyester 100%,15
9,Pocket lining: Cotton 100%,156


In [29]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,Composition,Size,Product safety
0,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-07-11 12:14:42,985159,1,Denim blue,Skinny fit,"Shell: Cotton 99%, Elastane 1%","The model is 187cm/6'2"" and wears a size 31/32",
1,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-07-11 12:14:42,985159,1,Denim blue,Skinny fit,Pocket lining: Cotton 100%,"The model is 187cm/6'2"" and wears a size 31/32",
2,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-07-11 12:14:42,985159,1,Dark gray,Skinny fit,"Shell: Cotton 99%, Elastane 1%","The model is 187cm/6'2"" and wears a size 31/32",
3,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-07-11 12:14:42,985159,1,Dark gray,Skinny fit,Pocket lining: Cotton 100%,"The model is 187cm/6'2"" and wears a size 31/32",
4,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-07-11 12:14:42,985159,1,Light denim blue,Skinny fit,"Shell: Cotton 99%, Elastane 1%","The model is 187cm/6'2"" and wears a size 31/32",


In [30]:
data.isna().sum()

product_id            0
product_category    124
product_name          0
product_price         0
scrapy_datetime       0
style_id              0
color_id              0
color_name            0
Fit                   0
Composition           0
Size                456
Product safety      632
dtype: int64

In [31]:
data = pd.read_csv('products_hm.csv')


# product id


data['product_id'] = data['product_id'].astype(int)

# product name

data['product_name'] = data['product_name'].apply(lambda x: x.replace(' ', '_').lower() )


# product price

data['product_price'] = data['product_price'].apply(lambda x: x.replace('$ ', '') if pd.notnull(x) else x )

# scrapy datetime

data['scrapy_datetime'] = pd.to_datetime(data['scrapy_datetime'], format='%Y-%m-%d %H:%M:%S')

# style id

data['style_id'] = data['style_id'].astype(int)

# color id

data['color_id'] = data['color_id'].astype(int)

# color name

data['color_name'] = data['color_name'].apply(lambda x: x.replace(' ', '_').replace('/', '_').lower() if pd.notnull(x) else x )

# fit

data['Fit'] = data['Fit'].apply(lambda x: x.replace(' ', '_').lower() if pd.notnull(x) else x )

# size model

data['size_model'] = data['Size'].apply(lambda x: re.search('\d{3}cm', x).group(0) if pd.notnull(x) else x )
data['size_model'] = data['size_model'].apply(lambda x: re.search('\d+', x).group(0) if pd.notnull(x) else x )

# size number

data['size_number'] = data['Size'].str.extract('(\d+/\\d+)')



# product safety

# composition

#data['Composition'] = data['Composition'].apply(lambda x: x.replace('Pocket lining: ', '').replace('Lining: ', '').replace('Shell: ', ''))

data = data[~data['Composition'].str.contains('Pocket lining:', na=False)] # removendo a composição do forro do bolso
data = data[~data['Composition'].str.contains('Lining:', na=False)]
data = data[~data['Composition'].str.contains('Shell:', na=False)]

# drop duplicate

data = data.drop_duplicates(subset=['product_id','product_category','product_name', 'product_price',
                                   'scrapy_datetime', 'style_id', 'color_id', 'color_name','Fit'], keep='last')


# reset index

data = data.reset_index(drop=True)

# breack composition by comma

df1 = data['Composition'].str.split(',', expand=True)


In [32]:
data['Composition'].unique()

array(['Cotton 98%, Elastane 2%', 'Cotton 93%, Polyester 6%, Elastane 1%',
       'Cotton 99%, Elastane 1%',
       'Cotton 90%, Elasterell-P 8%, Elastane 2%', 'Cotton 100%',
       'Cotton 88%, Polyester 10%, Elastane 2%',
       'Cotton 80%, Polyester 19%, Elastane 1%',
       'Cotton 73%, Polyester 26%, Elastane 1%'], dtype=object)

In [33]:
# cotton / polyester / elastano / elasterell

df_ref = pd.DataFrame( index=np.arange(len(data)), columns=['cotton', 'polyester', 'elastano', 'elasterell'])

###################################

# cotton

df_cotton = df1.loc[df1[0].str.contains('Cotton', na=True),0] # selecionando os 'cotton' da primeira coluna
df_cotton.name = 'cotton1'

df_ref = pd.concat([df_ref, df_cotton], axis=1) # concatenando com o df de referencia
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]

df_cotton = df1.loc[df1[1].str.contains('Cotton', na=True),1] # selecionando os 'cotton' da segunda coluna
df_cotton.name = 'cotton2'

df_ref = pd.concat([df_ref, df_cotton], axis=1) # concatenando com o df de referencia
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]

df_ref['cotton1'] = df_ref['cotton1'].fillna(df_ref['cotton2']) # juntando os 'cotton' em uma coluna
df_ref['cotton1'] = df_ref['cotton1'].fillna('Cotton 0%')

df_ref.drop(columns=['cotton2'], inplace=True) # excluindo a coluna cotton2

###################################

# polyester

df_polyester = df1.loc[df1[0].str.contains('Polyester', na=True),0] # selecionando os 'polyester' da primeira coluna
df_polyester.name = 'polyester1'

df_ref = pd.concat([df_ref, df_polyester], axis=1) # concatenando com o df de referencia
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]

df_polyester = df1.loc[df1[1].str.contains('Polyester', na=True),1] # selecionando os 'polyester' da segunda coluna
df_polyester.name = 'polyester2'

df_ref = pd.concat([df_ref, df_polyester], axis=1) # concatenando com o df de referencia
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]

df_ref['polyester1'] = df_ref['polyester1'].fillna(df_ref['polyester2']) # juntando os 'polyester' em uma coluna
df_ref['polyester1'] = df_ref['polyester1'].fillna('Polyester 0%')

df_ref.drop(columns=['polyester2'], inplace=True) # excluindo a coluna polyester2

###################################

# elastano

df_elastane = df1.loc[df1[1].str.contains('Elastane', na=True),1] # selecionando os 'elastane' da segunda coluna
df_elastane.name = 'elastane1'

df_ref = pd.concat([df_ref, df_elastane], axis=1) # concatenando com o df de referencia
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]

df_elastane = df1.loc[df1[2].str.contains('Elastane', na=True),2] # selecionando os 'elastane' da terceira coluna
df_elastane.name = 'elastane2'

df_ref = pd.concat([df_ref, df_elastane], axis=1) # concatenando com o df de referencia
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]

df_ref['elastane1'] = df_ref['elastane1'].fillna(df_ref['elastane2']) # juntando os 'elastane' em uma coluna
df_ref['elastane1'] = df_ref['elastane1'].fillna('Elastane 0%')

df_ref.drop(columns=['elastane2'], inplace=True) # excluindo a coluna elastane2


###################################

# elasterell

df_elasterell = df1.loc[df1[1].str.contains('Elasterell', na=True),1] # selecionando os 'elasterell' da segunda coluna
df_elasterell.name = 'elasterell'

df_ref = pd.concat([df_ref, df_elasterell], axis=1) # concatenando com o df de referencia

df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')] # mantendo a últina coluna elasterell

df_ref['elasterell'] = df_ref['elasterell'].fillna('Elasterell 0%')

In [34]:
df_ref

Unnamed: 0,cotton,polyester,elastano,cotton1,polyester1,elastane1,elasterell
0,,,,Cotton 98%,Polyester 0%,Elastane 2%,Elasterell 0%
1,,,,Cotton 98%,Polyester 0%,Elastane 2%,Elasterell 0%
2,,,,Cotton 98%,Polyester 0%,Elastane 2%,Elasterell 0%
3,,,,Cotton 98%,Polyester 0%,Elastane 2%,Elasterell 0%
4,,,,Cotton 98%,Polyester 0%,Elastane 2%,Elasterell 0%
...,...,...,...,...,...,...,...
158,,,,Cotton 73%,Polyester 26%,Elastane 1%,Elasterell 0%
159,,,,Cotton 73%,Polyester 26%,Elastane 1%,Elasterell 0%
160,,,,Cotton 73%,Polyester 26%,Elastane 1%,Elasterell 0%
161,,,,Cotton 73%,Polyester 26%,Elastane 1%,Elasterell 0%


In [35]:
# excluindo as colunas referenciais

df_ref.drop(columns=['cotton', 'polyester', 'elastano'], inplace=True)

df_ref = df_ref.rename(columns={'cotton1': 'cotton', 'polyester1':'polyester', 'elastane1': 'elastane'})

df_ref= df_ref.fillna(value=np.nan)

In [36]:
df_ref.head()

Unnamed: 0,cotton,polyester,elastane,elasterell
0,Cotton 98%,Polyester 0%,Elastane 2%,Elasterell 0%
1,Cotton 98%,Polyester 0%,Elastane 2%,Elasterell 0%
2,Cotton 98%,Polyester 0%,Elastane 2%,Elasterell 0%
3,Cotton 98%,Polyester 0%,Elastane 2%,Elasterell 0%
4,Cotton 98%,Polyester 0%,Elastane 2%,Elasterell 0%


In [37]:
# final join

data = pd.concat([data, df_ref], axis=1)

# format composition data

data['cotton'] = data['cotton'].apply(lambda x: int(re.search('\d+', x).group(0)) / 100 if pd.notnull(x) else x)
data['polyester'] = data['polyester'].apply(lambda x: int(re.search('\d+', x).group(0)) / 100 if pd.notnull(x) else x)
data['elastane'] = data['elastane'].apply(lambda x: int(re.search('\d+', x).group(0)) / 100 if pd.notnull(x) else x)
data['elasterell'] = data['elasterell'].apply(lambda x: int(re.search('\d+', x).group(0)) / 100 if pd.notnull(x) else x)


# drop columns

data = data.drop(columns=['Size', 'Product safety','Composition'], axis=1)

# drop duplicates

data = data.drop_duplicates()

data.to_csv('products_hm_cleaned.csv', index_label=False)