# Verificação do Script de coleta dos dados

## Carregando as bibliotecas

In [1]:
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import numpy as np
import re
import time

In [42]:
# Pause execution for two seconds.
time.sleep(2)

## Data collection

In [3]:
# Request parameters: a browser-like User-Agent sent with the showroom request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Showroom (listing) page for men's jeans.
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# Fetch the listing. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() reports an HTTP error here
# instead of as a confusing parsing failure further down.
page = requests.get( url, headers=headers, timeout=30 )
page.raise_for_status()

# Parse the returned HTML.
soup = BeautifulSoup( page.text, 'html.parser' )

########################### Product Data ###################################
# Container that holds every product card of the listing.
products = soup.find( 'ul', class_='products-listing small' )
if products is None:
    # soup.find() returns None when nothing matches; fail with a clear message
    # rather than with "'NoneType' object has no attribute 'find_all'".
    raise ValueError( 'Product listing not found in ' + url )

# One <article> per product card.
product_list = products.find_all( 'article', class_='hm-product-item' )

# product id: the data-articlecode attribute is the product ID.
product_id = [p.get( 'data-articlecode' ) for p in product_list]

# product category
product_category = [p.get( 'data-category' ) for p in product_list]

# product name: text of the links with class "link".
name_links = products.find_all( 'a', class_='link' )
product_name = [link.get_text() for link in name_links]

# price: text of the regular-price spans.
price_spans = products.find_all( 'span', class_='price regular' )
product_price = [span.get_text() for span in price_spans]

# Build the showroom DataFrame: one row per product, one column per list.
data = pd.DataFrame( [product_id, product_category, product_name, product_price] ).T
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']

# Timestamp of this scrape, stored as text.
data['scrapy_datetime'] = datetime.now().strftime( '%Y-%m-%d %H:%M:%S' )

In [4]:
# Number of rows and columns in the showroom table `data`.
data.shape

(36, 5)

In [17]:
# Display the showroom table `data`.
data

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,1024256001,men_jeans_slim,Slim Jeans,$ 24.99,2022-08-15 18:58:33
1,690449036,men_jeans_ripped,Skinny Jeans,$ 39.99,2022-08-15 18:58:33
2,985159005,men_jeans_skinny,Skinny Jeans,$ 24.99,2022-08-15 18:58:33
3,875105024,men_jeans_relaxed,Relaxed Jeans,$ 29.99,2022-08-15 18:58:33
4,979945002,men_jeans_loose,Loose Jeans,$ 39.99,2022-08-15 18:58:33
5,1024711001,men_jeans_slim,Slim Jeans,$ 34.99,2022-08-15 18:58:33
6,690449056,men_jeans_ripped,Skinny Jeans,$ 39.99,2022-08-15 18:58:33
7,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2022-08-15 18:58:33
8,1008549001,men_jeans_regular,Regular Jeans,$ 24.99,2022-08-15 18:58:33
9,1024256002,men_jeans_slim,Slim Jeans,$ 24.99,2022-08-15 18:58:33


## Data collection by product

In [6]:
# Result frame: one row per (article, composition line). It is defined up front
# so the name always exists, and rebuilt from `compositions` after the loops.
df_compositions = pd.DataFrame()

# Column names of every per-color frame collected so far.
aux = []

cols = ['Art. No.', 'Composition', 'Fit', 'color_id', 'style_id']
df_pattern = pd.DataFrame(columns=cols)

# Parameters.
# The User-Agent header makes the request look like it comes from a browser
# rather than from a Python script.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

product_url = 'https://www2.hm.com/en_us/productpage.'

# Attributes of the product page that are kept.
detail_columns = ['Fit', 'Composition', 'Art. No.']

# Per-color frames are collected in a list and concatenated once at the end;
# concatenating inside the loop copies the accumulated frame on every pass.
compositions = []

# Article codes already scraped. Several showroom rows can belong to the same
# style and therefore list the same colors; fetching them again only repeats
# rows that are exact duplicates, at the cost of a 10 second pause each.
scraped_codes = set()

# Visit the product page of every showroom item.
for i in range(len(data)):
    url = product_url + data.loc[i,'product_id'] + '.html'

    # Download and parse the product page (timeout avoids hanging forever).
    page = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    ########################### color name ###################################
    # Color swatches: the inactive ones plus the currently active one.
    product_list = soup.find_all('a',class_='filter-option miniature') + soup.find_all('a',class_='filter-option miniature active')

    # Color name and article code of every swatch.
    color_name = [p.get('data-color') for p in product_list]
    product_id = [p.get('data-articlecode') for p in product_list]

    df_color = pd.DataFrame([product_id, color_name]).T
    df_color.columns = ['product_id','color_name']

    # Visit every color of the product and collect its composition.
    for j in range(len(df_color)):
        article_code = df_color.loc[j,'product_id']

        # Skip swatches without an article code and colors already scraped.
        if pd.isnull(article_code) or article_code in scraped_codes:
            continue
        scraped_codes.add(article_code)

        url = product_url + article_code + '.html'

        page = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(page.text, 'html.parser')

        # Pause between product requests.
        time.sleep(10)

        ########################### Product Name / Price ######################
        name_tags = soup.find_all('hm-product-name', {'id':'js-product-name' })
        price_tags = soup.find_all('div', class_='primary-row product-item-price')

        # First number in the price block, e.g. "24.99".
        price_match = re.findall(r'\d+\.?\d+', price_tags[0].get_text()) if price_tags else []

        # Some pages come back without these elements. Indexing [0] on the
        # empty result used to raise IndexError and abort the whole scrape;
        # report the page and move on instead.
        if not name_tags or not price_match:
            print('Skipping ' + url + ': product name or price not found')
            continue

        product_name = name_tags[0].get_text()
        product_price = price_match[0]

        ########################### composition ###################################
        product_composition_list = soup.find_all('div', class_='details-attributes-list-item')

        # Each attribute becomes [header, value, value, ...] (empty lines removed).
        product_composition_full = [list(filter(None,p.get_text().split('\n'))) for p in product_composition_list]

        if not product_composition_full:
            print('Skipping ' + url + ': product attributes not found')
            continue

        # One column per attribute; the first row holds the headers.
        df_composition = pd.DataFrame(product_composition_full).T
        df_composition.columns = df_composition.iloc[0]

        # Drop the header row and forward-fill attributes that have fewer
        # values than the longest one.
        df_composition = df_composition.iloc[1:].ffill()

        # Keep only the needed attributes. reindex() also creates (as NaN) any
        # attribute the page does not have, so every color yields the same
        # columns. Repeated headers are dropped first because reindex() needs
        # unique labels, and the object dtype keeps the later merge on
        # product_id from mixing float and string keys.
        df_composition = df_composition.loc[:, ~df_composition.columns.duplicated()]
        df_composition = df_composition.reindex(columns=detail_columns).astype(object)

        # Remove the "Pocket lining: ", "Shell: " and "Lining: " prefixes.
        for prefix in ('Pocket lining: ', 'Shell: ', 'Lining: '):
            df_composition['Composition'] = df_composition['Composition'].replace(prefix, '', regex=True)

        # Forward-filling repeats rows; keep one of each.
        df_composition = df_composition.drop_duplicates()

        # Rename the columns and attach name and price.
        df_composition.columns = ['fit','composition','product_id']
        df_composition['product_name'] = product_name
        df_composition['product_price'] = product_price

        aux = aux + df_composition.columns.tolist()

        # Attach the color name of the article.
        df_composition = pd.merge(df_composition, df_color, how='left', on='product_id')

        compositions.append(df_composition)

# All products in a single frame. When nothing was scraped the frame still
# carries the expected columns so the steps below keep working.
if compositions:
    df_compositions = pd.concat(compositions, axis=0)
else:
    df_compositions = pd.DataFrame(columns=['fit','composition','product_id','product_name','product_price','color_name'])

# Style id = article code without its last three characters; color id = the
# last three characters. The .str accessor leaves missing codes as NaN instead
# of raising like x[:-3] does on a float NaN.
df_compositions['style_id'] = df_compositions['product_id'].str[:-3]
df_compositions['color_id'] = df_compositions['product_id'].str[-3:]

# Timestamp of this scrape, stored as text.
df_compositions['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
          

## Data cleaning

In [None]:
def _snake_case(value):
    """Lower-case `value` and replace spaces with underscores; nulls pass through."""
    if pd.isnull(value):
        return value
    return value.replace(' ', '_').lower()


def _material_share(composition, material):
    """Return the fraction (0-1) of `material` in a composition string.

    The string is split on commas and the first part that contains `material`
    (case-sensitive) supplies the number: "Cotton 99%, Spandex 1%" gives 0.99
    for "Cotton". Returns NaN when the composition is missing, the material is
    absent, or the matching part has no digits.
    """
    if pd.isnull(composition):
        return np.nan
    for part in composition.split(','):
        if material in part:
            digits = re.search(r'\d+', part)
            if digits:
                return int(digits.group(0)) / 100
    return np.nan


# product_id: keep only rows that have an article code. reset_index() gives
# every row a unique label (the scraped frame repeats 0, 1, ... per product)
# and returns a new frame, so the assignments below do not write into a slice
# of df_compositions.
df_data = df_compositions.dropna(subset=['product_id']).reset_index(drop=True)

# product_name: trim the surrounding newlines/tabs/spaces, then lower-case and
# join the words with "_" like product_category. The previous version removed
# every single space first, so "Slim Jeans" became "slimjeans" and the
# underscore step never had anything to replace.
df_data['product_name'] = (
    df_data['product_name']
    .str.strip()
    .str.replace(r'\s+', '_', regex=True)
    .str.lower()
)

# product_price: strip a leading "$ " if present and convert to float.
df_data['product_price'] = df_data['product_price'].str.replace('$ ', '', regex=False).astype(float)

# scrapy_datetime: left as text.
#data['scrapy_datetime'] = pd.to_datetime(data['scrapy_datetime'], format = '%Y-%m-%d %H:%M:%S')

# color_name and fit: lower-case with "_" between words, nulls untouched.
# The scraped column is named 'fit' (lower-case); 'Fit' does not exist here.
for column in ('color_name', 'fit'):
    df_data[column] = df_data[column].apply(_snake_case)

# size number / size model: only computable when a 'size' attribute was
# collected. Otherwise the columns are created empty so that later cells that
# select them still find them.
if 'size' in df_data.columns:
    # Three digits followed by "cm", e.g. "185cm" -> "185".
    df_data['size_number'] = df_data['size'].str.extract(r'(\d{3})cm', expand=False)
    # Size written as "number/number", e.g. "31/32".
    df_data['size_model'] = df_data['size'].str.extract(r'(\d+/\d+)', expand=False)
else:
    df_data['size_number'] = np.nan
    df_data['size_model'] = np.nan

################################# Composition ###########################################
# One column per material with its share as a fraction. Every comma-separated
# part is inspected, so compositions with a single material or with three or
# more materials are handled as well. 'elastane' and 'elasterell' were declared
# before but never filled; they now follow the same rule as the others.
materials = ['cotton', 'polyester', 'elastane', 'elasterell', 'spandex']

df_aux = df_data[['product_id']].copy()
for material in materials:
    df_aux[material] = df_data['composition'].apply(_material_share, args=(material.capitalize(),))

# An article can have several composition lines: keep the largest share per
# article and use 0 where a material never appears.
df_aux = df_aux.groupby( 'product_id' ).max().reset_index().fillna( 0 )

# Join the material shares back to the product rows.
df_data = pd.merge( df_data, df_aux, on='product_id', how='left' )

# Drop the raw columns that are no longer needed; absent ones are ignored.
df_data = df_data.drop( columns=['size', 'product_safety', 'composition'], errors='ignore' )

# Drop duplicates
df_data = df_data.drop_duplicates()

## Data Insert

In [1]:
import sqlite3
from sqlalchemy import create_engine

In [None]:
#mudando a posição das colunas do df
data_insert = df_data[['product_id',
                        'style_id',
                        'color_id',
                        'product_name',
                        'color_name',
                        'fit',
                        'product_price',
                        'size_number',
                        'size_model',
                        'cotton',
                        'polyester',
                        'spandex',
                        'scrapy_datetime'
]]


In [2]:
#query para criar a tabela
query = """
 'product_id',
                            'style_id',
                            'color_id',
                            'product_name',
                            'color_name',
                            'fit',
                            'product_price',
                            'cotton',
                            'polyester',
                            'spandex',
                            'scrapy_datetime'
"""


In [3]:
# criando a tabela
conn = sqlite3.connect( 'database_hm.sqlite' )
cursor = conn.execute( query)
conn.commit()

In [4]:
#query para criar a tabela
query_showroom_schema = """
 CREATE TABLE vitrine (product_id TEXT,
                        style_id TEXT,
                        color_id TEXT,
                        product_name TEXT,
                        color_name TEXT,
                        fit TEXT,
                        product_price REAL,
                        cotton REAL,
                        polyester REAL,
                        spandex REAL,
                        scrapy_datetime TEXT)
"""


In [5]:
# criando a tabela
conn = sqlite3.connect( 'database_hm.sqlite' )
cursor = conn.execute( query_showroom_schema )
conn.commit()

In [None]:
#criando a conexão com o banco
conn = create_engine( 'sqlite:///database_hm.sqlite', echo=False )

#data insert
data_insert.to_sql( 'vitrine', con=conn, if_exists='append', index=False )

In [47]:
# Print the product-page URL of every color currently listed in df_color.
page_prefix = 'https://www2.hm.com/en_us/productpage.'
for j in range(len(df_color)):
    article_code = df_color.loc[j,'product_id']
    url = page_prefix + article_code + '.html'
    print(url)

https://www2.hm.com/en_us/productpage.1024256002.html
https://www2.hm.com/en_us/productpage.1024256003.html
https://www2.hm.com/en_us/productpage.1024256004.html
https://www2.hm.com/en_us/productpage.1024256005.html
https://www2.hm.com/en_us/productpage.1024256006.html
https://www2.hm.com/en_us/productpage.1024256007.html
https://www2.hm.com/en_us/productpage.1024256008.html
https://www2.hm.com/en_us/productpage.1024256001.html


In [46]:
        # Debug cell: fetch one hard-coded product page and print what the
        # product-name selector returns. The line below is the loop version
        # of the URL, kept commented out.
        #url = 'https://www2.hm.com/en_us/productpage.' + df_color.loc[j,'product_id'] + '.html'
        url = 'https://www2.hm.com/en_us/productpage.1024256001.html'
        print(url)

        # Request the page HTML.
        page = requests.get(url, headers=headers)

        # Parse the HTML with BeautifulSoup.
        soup = BeautifulSoup(page.text, 'html.parser')
        
        ########################### Product Name #############################
        # Wait ten seconds before reading the parsed page.
        time.sleep(10)
        # All <hm-product-name id="js-product-name"> tags; the commented lines
        # are alternative selectors that are not in use.
        product_name = soup.find_all('hm-product-name', {'id':'js-product-name' })
        #product_name = soup.find_all('h1', {'class':'Heading-module--general__3HQET ProductName-module--productTitle__1T9f0 Heading-module--small__SFfSh' })
        #product_name = soup.find_all('h1', class_='class="Heading-module--general__3HQET ProductName-module--productTitle__1T9f0 Heading-module--small__SFfSh')
        
        # Show the raw matches, then the text of the first one.
        # Indexing [0] raises IndexError when nothing matched.
        print(product_name)
        product_name = product_name[0].get_text()
        print(product_name)
        

https://www2.hm.com/en_us/productpage.1024256001.html
[<hm-product-name id="js-product-name">
<div>
<h1>Slim Jeans</h1>
<h2>
</h2>
</div>
</hm-product-name>]


Slim Jeans






In [35]:
# Keep only the df_color rows of article 1024256002 and display them.
target_code = '1024256002'
is_target = df_color['product_id'].eq(target_code)
df_color2 = df_color.loc[is_target]
df_color2

Unnamed: 0,product_id,color_name
0,1024256002,Light denim blue


In [33]:
# Attach the scraped product name and price to every row of df_composition.
# NOTE(review): the recorded output is a ValueError about a length-0 value,
# which suggests one of the two variables held an empty list when this ran -
# confirm.
df_composition['product_name'] = product_name
df_composition['product_price'] = product_price

ValueError: Length of values (0) does not match length of index (2)

In [32]:
# Display the accumulated composition table.
df_compositions

Unnamed: 0,fit,composition,product_id,color_name
0,Slim fit,"Cotton 99%, Spandex 1%",1024256002,Light denim blue
1,Slim fit,"Polyester 65%, Cotton 35%",1024256002,Light denim blue
0,Slim fit,"Cotton 99%, Spandex 1%",1024256003,Light denim blue
1,Slim fit,"Polyester 65%, Cotton 35%",1024256003,Light denim blue
0,Slim fit,"Cotton 99%, Spandex 1%",1024256004,Denim blue
1,Slim fit,"Polyester 65%, Cotton 35%",1024256004,Denim blue
0,Slim fit,"Cotton 99%, Spandex 1%",1024256005,Dark blue
1,Slim fit,"Polyester 65%, Cotton 35%",1024256005,Dark blue
0,Slim fit,"Cotton 99%, Spandex 1%",1024256006,Dark denim blue
1,Slim fit,Cotton 100%,1024256006,Dark denim blue


In [13]:
    # Debug cell: visit the page of every color in df_color and read its
    # product name, printing each URL first.
    for j in range(len(df_color)):
        # Build the product-page URL from the article code.
        url = 'https://www2.hm.com/en_us/productpage.' + df_color.loc[j,'product_id'] + '.html'
        print(url)

        # Request the page HTML.
        page = requests.get(url, headers=headers)

        # Parse the HTML with BeautifulSoup.
        soup = BeautifulSoup(page.text, 'html.parser')
        
        ########################### Product Name #############################
        # Indexing [0] raises IndexError when the page has no matching tag;
        # the recorded output shows that error after the first URL.
        product_name = soup.find_all( 'hm-product-name', {'id':'js-product-name' })
        product_name = product_name[0].get_text()

https://www2.hm.com/en_us/productpage.0690449001.html


IndexError: list index out of range

In [18]:
# Display the color table of the product being inspected.
df_color

Unnamed: 0,product_id,color_name
0,690449001,Light denim blue/trashed
1,690449002,Denim blue
2,690449006,Black/washed
3,690449007,Light denim blue
4,690449009,Black washed out
5,690449011,White
6,690449013,Black/washed
7,690449021,Dark denim blue/trashed
8,690449022,Black/trashed
9,690449024,Dark blue/Trashed


In [21]:
# Display the showroom table `data`.
data

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,1024256001,men_jeans_slim,Slim Jeans,$ 24.99,2022-08-13 19:20:18
1,690449036,men_jeans_ripped,Skinny Jeans,$ 39.99,2022-08-13 19:20:18
2,985159005,men_jeans_skinny,Skinny Jeans,$ 24.99,2022-08-13 19:20:18
3,979945002,men_jeans_loose,Loose Jeans,$ 39.99,2022-08-13 19:20:18
4,875105024,men_jeans_relaxed,Relaxed Jeans,$ 29.99,2022-08-13 19:20:18
5,1024711001,men_jeans_slim,Slim Jeans,$ 34.99,2022-08-13 19:20:18
6,690449056,men_jeans_ripped,Skinny Jeans,$ 39.99,2022-08-13 19:20:18
7,1008549001,men_jeans_regular,Regular Jeans,$ 24.99,2022-08-13 19:20:18
8,985159001,men_jeans_skinny,Skinny Jeans,$ 24.99,2022-08-13 19:20:18
9,1024256002,men_jeans_slim,Slim Jeans,$ 24.99,2022-08-13 19:20:18


In [25]:
# Display the accumulated composition table.
df_compositions

Unnamed: 0,fit,composition,product_id,color_name
0,Slim fit,"Cotton 99%, Spandex 1%",1024256002,Light denim blue
1,Slim fit,"Polyester 65%, Cotton 35%",1024256002,Light denim blue
0,Slim fit,"Cotton 99%, Spandex 1%",1024256003,Light denim blue
1,Slim fit,"Polyester 65%, Cotton 35%",1024256003,Light denim blue
0,Slim fit,"Cotton 99%, Spandex 1%",1024256004,Denim blue
1,Slim fit,"Polyester 65%, Cotton 35%",1024256004,Denim blue
0,Slim fit,"Cotton 99%, Spandex 1%",1024256005,Dark blue
1,Slim fit,"Polyester 65%, Cotton 35%",1024256005,Dark blue
0,Slim fit,"Cotton 99%, Spandex 1%",1024256006,Dark denim blue
1,Slim fit,Cotton 100%,1024256006,Dark denim blue


In [30]:
# Display the showroom table `data`.
data

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,985159007,men_jeans_skinny,Skinny Jeans,$ 24.99,2022-08-10 19:36:28
1,971061002,men_jeans_slim,Slim Tapered Cropped Jeans,$ 29.99,2022-08-10 19:36:28
2,1024256001,men_jeans_slim,Slim Jeans,$ 24.99,2022-08-10 19:36:28
3,1008549001,men_jeans_regular,Regular Jeans,$ 24.99,2022-08-10 19:36:28
4,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2022-08-10 19:36:28
5,1024256002,men_jeans_slim,Slim Jeans,$ 24.99,2022-08-10 19:36:28
6,1008549006,men_jeans_regular,Regular Jeans,$ 24.99,2022-08-10 19:36:28
7,690449043,men_jeans_ripped,Skinny Jeans,$ 39.99,2022-08-10 19:36:28
8,985159001,men_jeans_skinny,Skinny Jeans,$ 24.99,2022-08-10 19:36:28
9,1013317002,men_jeans_joggers,Hybrid Regular Tapered Joggers,$ 44.99,2022-08-10 19:36:28


In [26]:
# Display the composition frame of the last color processed.
df_composition

Unnamed: 0,fit,composition,product_id,color_name
0,Skinny fit,"Cotton 99%, Spandex 1%",1004199001,Light denim blue


In [5]:
# Display the color table of the product being inspected.
df_color

Unnamed: 0,product_id,color_name
0,1004199002,Black
1,1004199003,Denim blue
2,1004199004,White
3,1004199005,Light gray
4,1004199001,Light denim blue


In [29]:
# Display the accumulated composition table.
df_compositions

Unnamed: 0,fit,composition,product_id,color_name
0,Slim fit,"Cotton 99%, Spandex 1%",1024256002,Light denim blue
1,Slim fit,"Polyester 65%, Cotton 35%",1024256002,Light denim blue
0,Slim fit,"Cotton 99%, Spandex 1%",1024256003,Light denim blue
1,Slim fit,"Polyester 65%, Cotton 35%",1024256003,Light denim blue
0,Slim fit,"Cotton 99%, Spandex 1%",1024256004,Denim blue
1,Slim fit,"Polyester 65%, Cotton 35%",1024256004,Denim blue
0,Slim fit,"Cotton 99%, Spandex 1%",1024256005,Dark blue
1,Slim fit,"Polyester 65%, Cotton 35%",1024256005,Dark blue
0,Slim fit,"Cotton 99%, Spandex 1%",1024256006,Dark denim blue
1,Slim fit,Cotton 100%,1024256006,Dark denim blue


In [16]:
# Empty frame that only carries the three attribute columns of interest.
wanted_columns = ['Fit', 'Composition', 'Art. No.']
ex = pd.DataFrame(columns=wanted_columns)
ex

Unnamed: 0,Fit,Composition,Art. No.


In [17]:
# Right-join the empty template with the scraped attributes: every scraped row
# is kept, restricted to the three attribute columns.
join_keys = ['Fit', 'Composition', 'Art. No.']
scraped_details = df_composition[join_keys]
df_composition = ex.merge(scraped_details, on=join_keys, how='right')
df_composition

Unnamed: 0,Fit,Composition,Art. No.
0,Skinny fit,"Cotton 99%, Spandex 1%",1004199001
1,Skinny fit,"Cotton 99%, Spandex 1%",1004199001
2,Skinny fit,"Cotton 99%, Spandex 1%",1004199001
3,Skinny fit,"Cotton 99%, Spandex 1%",1004199001
4,Skinny fit,"Cotton 99%, Spandex 1%",1004199001
5,Skinny fit,"Cotton 99%, Spandex 1%",1004199001
6,Skinny fit,"Cotton 99%, Spandex 1%",1004199001
7,Skinny fit,"Cotton 99%, Spandex 1%",1004199001


In [18]:
# Collapse repeated rows, keeping the first occurrence of each.
df_composition = df_composition.drop_duplicates(keep='first')
df_composition

Unnamed: 0,Fit,Composition,Art. No.
0,Skinny fit,"Cotton 99%, Spandex 1%",1004199001


In [19]:
# Display the color table of the product being inspected.
df_color

Unnamed: 0,product_id,color_name
0,1004199002,Black
1,1004199003,Denim blue
2,1004199004,White
3,1004199005,Light gray
4,1004199001,Light denim blue


## Data cleaning