In [4]:
################################ imports ###############################
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import numpy as np
import re
import time
import sqlite3
from sqlalchemy import create_engine
import os
import logging

############################## Data Colletion ##########################
def data_colletion(url, headers):
    # Request to URL
    page = requests.get( url, headers=headers )

    # Beautiful soup object
    soup = BeautifulSoup( page.text, 'html.parser' )

    ######## Product Data #########
    products = soup.find( 'ul', class_='products-listing small' )
    product_list = products.find_all( 'article', class_='hm-product-item')

    # product id
    #data-article code é o ID do produto
    product_id = [p.get( 'data-articlecode' ) for p in product_list]

    # product category
    product_category = [p.get( 'data-category' ) for p in product_list]

    # product name
    product_list = products.find_all( 'a', class_='link' )
    product_name = [p.get_text() for p in product_list]

    # price
    product_list = products.find_all( 'span', class_='price regular' )
    product_price = [p.get_text() for p in product_list]

    #criação do dataframe
    data = pd.DataFrame( [product_id, product_category, product_name,product_price] ).T
    data.columns = ['product_id', 'product_category', 'product_name','product_price']

    # scrapy datetime
    data['scrapy_datetime'] = datetime.now().strftime( '%Y-%m-%d %H:%M:%S' )
    
    return data

############################## Data Colletion by Product ##########################
def data_colletion_by_product(data, headers):
    #criando um df vazio 
    df_compositions = pd.DataFrame()

    #lista vazia para colocar todos os nomes das colunas para que o df seja padronizado, se algum produto
    #não tiver alguma coluna vai ficar vazio
    aux = []

    cols = ['Art. No.', 'Composition', 'Fit', 'color_id', 'style_id']
    df_pattern = pd.DataFrame(columns=cols)

    #parametros
    #headers é um dicionário que vai dizer pra API da H&M que quem está fazendo a requisição
    #é um browser e não um código python; é padrão
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

    #gerando todas as urls
    for i in range(len(data)):
        #API requests
        url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i,'product_id'] + '.html'
        logger.debug('Product: %s ', url)

        #para fazer requisição na API para puxar os dados html       
        page = requests.get(url, headers=headers)

        #BeautifulSoup objects
        soup = BeautifulSoup(page.text, 'html.parser')

        ######## color name ########
        product_list= soup.find_all('a',class_='filter-option miniature') + soup.find_all('a',class_='filter-option miniature active')

        #percorrendo todas as cores do produto
        color_name = [p.get('data-color') for p in product_list]

        #procuct id
        product_id = [p.get('data-articlecode') for p in product_list]

        #criando o df
        df_color = pd.DataFrame([product_id, color_name]).T
        df_color.columns = ['product_id','color_name']

        #loop que percorre todas as cores e coleta a composição de cada cor
        for j in range(len(df_color)):
            #API requests
            url = 'https://www2.hm.com/en_us/productpage.' + df_color.loc[j,'product_id'] + '.html'
            #url = 'https://www2.hm.com/en_us/productpage.1024256001.html'
            logger.debug('Color: %s ', url)

            #para fazer requisição na API para puxar os dados html       
            page = requests.get(url, headers=headers)

            #BeautifulSoup objects
            soup = BeautifulSoup(page.text, 'html.parser')

            ########Product Name ########
            time.sleep(10)
            product_name = soup.find_all('hm-product-name', {'id':'js-product-name' })
            #product_name = soup.find_all('h1', {'class':'Heading-module--general__3HQET ProductName-module--productTitle__1T9f0 Heading-module--small__SFfSh' })
            #product_name = soup.find_all('h1', class_='class="Heading-module--general__3HQET ProductName-module--productTitle__1T9f0 Heading-module--small__SFfSh')

            product_name = product_name[0].get_text()
            #print(product_name)

            #########Product Price ########
            product_price = soup.find_all('div', class_='primary-row product-item-price')
            #regex para pegar o preço
            product_price = re.findall(r'\d+\.?\d+',product_price[0].get_text())[0]

            ######## composition ########
            product_composition_list = soup.find_all('div', class_='details-attributes-list-item')

            product_composition_full = [list(filter(None,p.get_text().split('\n'))) for p in product_composition_list]

            #rename dataframe
            df_composition = pd.DataFrame(product_composition_full).T
            df_composition.columns = df_composition.iloc[0]

            #add o product name e o price no df
            df_composition['product_name'] = product_name
            df_composition['product_price'] = product_price

            #delete first row
            df_composition = df_composition.iloc[1:].fillna(method='ffill')

            #removendo pocket lining, shell e lining
            df_composition['Composition'] = df_composition['Composition'].replace('Pocket lining: ','',regex=True)
            df_composition['Composition'] = df_composition['Composition'].replace('Shell: ','',regex=True)
            df_composition['Composition'] = df_composition['Composition'].replace('Lining: ','',regex=True)

            #garantia que tenha a mesma quantidade de colunas
            df_composition = pd.concat( [df_pattern, df_composition], axis=0 )

            #Para arrumar o df que tem informações a mais que eu preciso
            ex = pd.DataFrame(columns=['Fit','Composition','Art. No.'])
            df_composition = pd.merge(ex, df_composition[['Fit','Composition','Art. No.']], how = 'right', on=['Fit','Composition','Art. No.'])
            df_composition = df_composition.drop_duplicates()

            # gereando o style id + color id
            #df_composition['style_id'] = df_composition['Art. No.'].apply( lambda x: x[:-3] )
            #df_composition['color_id'] = df_composition['Art. No.'].apply( lambda x: x[-3:] )

            #renomeando as colunas
            df_composition.columns = ['fit','composition','product_id']
            df_composition['product_name'] = product_name
            df_composition['product_price'] = product_price

            #lista vazia para colocar todos os nomes das colunas para que o df seja padronizado, se algum produto
            #não tiver alguma coluna vai ficar vazio
            aux = aux + df_composition.columns.tolist()

            #juntando data color + composition
            df_composition = pd.merge(df_composition, df_color,how='left', on='product_id')

            #todos os produtos
            df_compositions = pd.concat([df_compositions, df_composition], axis = 0)


    #Join Showroom data + details
    df_compositions['style_id'] = df_compositions['product_id'].apply(lambda x: x[:-3])
    df_compositions['color_id'] = df_compositions['product_id'].apply(lambda x: x[-3:])

    #scrapy datetime
    df_compositions['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    return df_compositions

############################## Data Cleaning ##########################

def data_cleaning(data_products):
    #product_id
    df_data = data_products.dropna(subset=['product_id'])

    #product_name
    df_data['product_name'] = df_data['product_name'].str.replace( '\n', '' )
    df_data['product_name'] = df_data['product_name'].str.replace( '\t', '' )
    df_data['product_name'] = df_data['product_name'].str.replace( ' ', '' )

    #deixar minusculo e separado por _ igual o product_category
    df_data['product_name'] = df_data['product_name'].apply( lambda x: x.replace(' ','_').lower())

    #product_price
    #substituir o $ por nada e colocar o tipo como float
    df_data['product_price'] = df_data['product_price'].apply( lambda x: x.replace('$ ','')).astype(float)

    #scrapy_datetime
    #tranformar para o tipo data
    #data['scrapy_datetime'] = pd.to_datetime(data['scrapy_datetime'], format = '%Y-%m-%d %H:%M:%S')

    #color_name    
    #deixar minusculo e separado por _ quando não for nulo
    df_data['color_name'] = df_data['color_name'].apply(lambda x: x.replace(' ','_').lower() if pd.notnull(x) else x)

    #Fit 
    #deixar minusculo e separado por _ quando não for nulo
    df_data['fit'] = df_data['fit'].apply(lambda x: x.replace(' ','_').lower() if pd.notnull(x) else x)

    # size number
    #df_data['size_number'] = df_data['size'].apply( lambda x: re.search( '\d{3}cm',x ).group(0) if pd.notnull( x ) else x )
    #df_data['size_number'] = df_data['size_number'].apply( lambda x: re.search('\d+', x ).group(0) if pd.notnull( x ) else x )

    # size model
    #df_data['size_model'] = df_data['size'].str.extract( '(\d+/\\d+)' )

    ####### Composition #########
    #quebrando a Composition na virgula e resetando o index
    df1 = df_data['composition'].str.split(',', expand=True).reset_index(drop=True)

    # cotton | polyester | spandex | 
    df_ref = pd.DataFrame( index=np.arange( len( df_data ) ),columns=['cotton','polyester', 'elastane', 'elasterell'] )

    #criando as colunas com cada tipo de material
    # ------ cotton -------
    df_cotton_0 = df1.loc[df1[0].str.contains( 'Cotton', na=True ), 0]
    df_cotton_0.name = 'cotton'

    df_cotton_1 = df1.loc[df1[1].str.contains( 'Cotton', na=True ), 1]
    df_cotton_1.name = 'cotton'

    # combinando as colunas
    df_cotton = df_cotton_0.combine_first( df_cotton_1 )
    df_ref = pd.concat( [df_ref, df_cotton ], axis=1 )
    df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated( keep='last')]

    # ------ polyester -------
    df_polyester_0 = df1.loc[df1[0].str.contains( 'Polyester', na=True ), 0]
    df_polyester_0.name = 'polyester'

    df_polyester_1 = df1.loc[df1[1].str.contains( 'Polyester', na=True ), 1]
    df_polyester_1.name = 'polyester'

    # combine
    df_polyester = df_polyester_0.combine_first( df_polyester_1 )
    df_ref = pd.concat( [df_ref, df_polyester], axis=1 )
    df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated( keep='last') ]

    # ------ spandex -------
    df_spandex_0 = df1.loc[df1[0].str.contains( 'Spandex', na=True ), 0]
    df_spandex_0.name = 'spandex'

    df_spandex_1 = df1.loc[df1[1].str.contains( 'Spandex', na=True ), 1]
    df_spandex_1.name = 'spandex'

    # combine
    df_spandex = df_spandex_0.combine_first( df_spandex_1 )
    df_ref = pd.concat( [df_ref, df_spandex], axis=1 )
    df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated( keep='last') ]

    # join que combina com o product_id
    df_aux = pd.concat( [df_data['product_id'].reset_index(drop=True), df_ref],axis=1 )

    #format composition data
    #quero extrair só os numeros das colunas de composição
    df_aux['cotton'] = df_aux['cotton'].apply(lambda x: int(re.search('\d+',x).group(0))/100 if pd.notnull(x) else x)
    df_aux['polyester'] = df_aux['polyester'].apply(lambda x: int(re.search('\d+',x).group(0))/100 if pd.notnull(x) else x)
    df_aux['spandex'] = df_aux['spandex'].apply(lambda x: int(re.search('\d+',x).group(0))/100 if pd.notnull(x) else x)

    # final join
    #pegando o valor máximo no agrupamento e depois colocando 0 quando estiver vazio
    df_aux = df_aux.groupby( 'product_id' ).max().reset_index().fillna( 0 )

    #juntando os dfs
    df_data = pd.merge( df_data, df_aux, on='product_id', how='left' )

    # Drop columns
    #df_data = df_data.drop(columns=['size', 'product_safety', 'composition'], axis=1 )
    df_data = df_data.drop(columns=['composition'], axis=1 )

    # Drop duplicates
    df_data = df_data.drop_duplicates()
    
    return df_data

############################## Data Insert ##########################
def  data_insert(data_product_cleaned):
#mudando a posição das colunas do df
    data_insert = data_product_cleaned[['product_id',
                            'style_id',
                            'color_id',
                            'product_name',
                            'color_name',
                            'fit',
                            'product_price',
                            'cotton',
                            'polyester',
                            'spandex',
                            'scrapy_datetime'
    ]]

    #criando a conexão com o banco
    conn = create_engine( 'sqlite:///database_hm.sqlite', echo=False )

    #data insert
    data_insert.to_sql( 'vitrine', con=conn, if_exists='append', index=False )
    
    return None

if __name__ == '__main__':
    #logging
    path = 'C:/Users/laais/CDS_Python_do_DS_ao_DEV/'
    if not os.path.exists(path + 'Logs'):
        os.makedirs(path + 'Logs')
        
    logging.basicConfig(
        filename = path + 'Logs/webscraping_hm.log',
        level = logging.DEBUG,
        format= '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt = '%Y-%m-%d %H:%M:%S')

    # logger é o agente que vai fazer o logging, se deixar padrão vai aparecer root, mas colocar o nome da aplicação
    logger = logging.getLogger('webscraping_hm')
    
    #parameters and constants
    # parameters
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

    # URL
    url = 'https://www2.hm.com/en_us/men/products/jeans.html'
    
    # data colletion
    data = data_colletion(url, headers)
    logger.info('data colletion done')
    
    # data colletion by product
    data_product = data_colletion_by_product(data, headers)
    logger.info('data colletion by product done')
    
    # data cleaning
    data_product_cleaned = data_cleaning(data_product)
    logger.info('data cleaning done')
    
    #data insertion
    data_insert(data_product_cleaned)
    logger.info('data insertion done')

-----

In [4]:
 conn = create_engine( 'sqlite:///database_hm.sqlite', echo=False )

In [5]:
query = """
SELECT * FROM vitrine
"""

df1 = pd.read_sql_query(query, conn)
df1.head()

Unnamed: 0,product_id,style_id,color_id,product_name,color_name,fit,product_price,cotton,polyester,spandex,scrapy_datetime
0,1024256002,1024256,2,slimjeans,light_denim_blue,slim_fit,19.99,0.99,0.65,0.01,2022-09-14 10:03:28
1,1024256003,1024256,3,slimjeans,light_denim_blue,slim_fit,19.99,0.99,0.65,0.01,2022-09-14 10:03:28
2,1024256004,1024256,4,slimjeans,denim_blue,slim_fit,19.99,0.99,0.65,0.01,2022-09-14 10:03:28
3,1024256005,1024256,5,slimjeans,dark_blue,slim_fit,19.99,0.99,0.65,0.01,2022-09-14 10:03:28
4,1024256006,1024256,6,slimjeans,dark_denim_blue,slim_fit,19.99,1.0,0.0,0.01,2022-09-14 10:03:28
