# **1.0 Data Scraping**

In [6]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import numpy as np
import re
# Scrape the H&M men's jeans listing page into `data`, one row per product
# with its article code, category, name and price text.
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status surfaces HTTP errors instead of parsing an error page.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

# Locate the HTML of the product grid
soup = BeautifulSoup(page.text, 'html.parser')
products = soup.find('ul', class_='products-listing small')
if products is None:
    # Fail with a clear message rather than an AttributeError further down.
    raise ValueError('Product listing <ul class="products-listing small"> not found in the page.')
products_list = products.find_all('article', class_='hm-product-item')

# Product id
product_id = [p.get('data-articlecode') for p in products_list]

# Product category
product_category = [p.get('data-category') for p in products_list]

# Product name (read from the anchors inside the grid, not from the articles)
products_list = products.find_all('a', class_='link')
product_name = [p.get_text() for p in products_list]

# Price
price_list = products.find_all('span', class_='price regular')
product_price = [p.get_text() for p in price_list]

# One list per row, then transpose: lists of different lengths are padded
# with missing values instead of raising.
data = pd.DataFrame([product_id, product_category, product_name, product_price]).T
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']

# **2.0 Data Collection by Product**

In [7]:
# Visit every product page listed in `data` and collect, per colour variant,
# the product details (composition, fit, product safety, size) together with
# the colour name and the style/colour ids derived from the article code.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# empty dataframe
df_details = pd.DataFrame()
# raw detail labels seen on the product pages; inspect it to spot new attributes
aux = []
# detail labels kept for every product
cols = ['Art. No.', 'Composition', 'Fit', 'Product safety', 'Size']
# Per-product frames, concatenated once after the loop. The list is seeded
# with an empty frame carrying the raw labels so the result keeps the same
# column set the step-by-step accumulation produced.
composition_frames = [pd.DataFrame(columns=cols)]
for article_code in data['product_id']:

    # Listing rows without an article code cannot be turned into a URL.
    if pd.isnull(article_code):
        continue

    # API Requests
    url = 'https://www2.hm.com/en_us/productpage.' + article_code + '.html'
    page = requests.get(url, headers=headers, timeout=30)

    # Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ==================== color name =================================
    # Both swatch lists are concatenated (the original line was missing the `+`).
    product_list = (
        soup.find_all('a', class_='filter-option miniature')
        + soup.find_all('a', class_='filter-option miniature-active')
    )
    color_name = [p.get('data-color') for p in product_list]

    # product id
    product_id = [p.get('data-articlecode') for p in product_list]
    # Building from a dict keeps both columns even when no swatch is found.
    df_color = pd.DataFrame({'product_id': product_id, 'color_name': color_name})

    # generate style id + color id (the last three characters are the color)
    df_color['style_id'] = df_color['product_id'].str[:-3]
    df_color['color_id'] = df_color['product_id'].str[-3:]

    # ==================== composition =================================
    product_composition_list = soup.find_all('div', class_='pdp-description-list-item')
    # One list per detail item: [label, value, value, ...]
    product_composition = [
        list(filter(None, p.get_text().split('\n')))
        for p in product_composition_list
    ]

    # Items become columns; the first row holds their labels
    df_composition = pd.DataFrame(product_composition).T
    df_composition.columns = df_composition.iloc[0]

    # delete the label row and carry single-valued details down to every row
    df_composition = df_composition.iloc[1:].ffill()

    # A repeated label would make the column selection below ambiguous.
    df_composition = df_composition.loc[:, ~df_composition.columns.duplicated()]

    # Keep new columns if they show up (recorded before unknown labels are dropped)
    aux = aux + df_composition.columns.tolist()

    # guarantee the same columns, in the same order, for every product:
    # missing labels become empty columns, unexpected labels are dropped
    df_composition = df_composition.reindex(columns=cols)

    # remove the pocket lining, shell and lining prefixes
    df_composition['Composition'] = df_composition['Composition'].replace(
        r'Pocket lining:|Shell:|Lining:', '', regex=True
    )

    # Rename columns
    df_composition.columns = ['product_id', 'composition', 'fit', 'product_safety', 'size']

    # merge data color + composition
    df_composition = pd.merge(df_composition, df_color, how='left', on='product_id')

    # all details products
    composition_frames.append(df_composition)

df_compositions = pd.concat(composition_frames, axis=0)

# Join Showroom data + details
# data['style_id'] = data['product_id'].apply( lambda x: x[:-3] )
# data['color_id'] = data['product_id'].apply( lambda x: x[-3:] )
# data_raw = pd.merge( data, df_details[['style_id', 'color_name', 'Fit','Composition', 'Size', 'Product safety']],
#                     how='left', on='style_id' )

# **3.0 Data Cleaning**    

In [8]:
# Load the scraped products and normalise ids, price, fit and size, then drop
# lining/shell composition rows and duplicated records.
data = pd.read_csv('product_hm.csv')
data = data.drop(columns=['Unnamed: 0'])

# product id
data = data.dropna(subset=['product_id'])
data['product_id'] = data['product_id'].astype(int)

# product name
# data['product_name'] = data['product_name'].apply( lambda x: x.replace( ' ', '_' ).lower() )

# price: strip the currency prefix; the .str accessor leaves missing prices as NaN
data['Price'] = data['Price'].str.replace('$ ', '', regex=False).astype(float)

# style id
data['style_id'] = data['style_id'].astype(int)

# color id
data['color_id'] = data['color_id'].astype(int)

# color name
# data['color_name'] = data['color_name'].apply( lambda x: x.replace( ' ', '_' ).replace( '/', '_' ).lower() if pd.notnull( x ) else x )

# fit
data['Fit'] = data['Fit'].apply(lambda x: x.replace(' ', '_').lower() if pd.notnull(x) else x)

# size number: the three digits in front of "cm"; rows without such a
# measurement get NaN instead of raising on a failed match
data['size_number'] = data['Size'].str.extract(r'(\d{3})cm', expand=False)

# size model, e.g. "31/32"
data['size_model'] = data['Size'].str.extract(r'(\d+/\d+)', expand=False)

# composition: keep only the main fabric rows
secondary_fabric = data['Composition'].str.contains(r'Pocket lining:|Lining:|Shell:', na=False)
data = data[~secondary_fabric]

# drop duplicates
data = data.drop_duplicates(
    subset=['product_id', 'product_category', 'Price', 'scrapy_datetime',
            'style_id', 'color_id', 'color_name', 'Fit'],
    keep='last',
)

# reset index
data = data.reset_index(drop=True)
# Break the free-text composition (e.g. "Cotton 99%, Elastane 1%") into one
# numeric column per material, holding its share as a fraction between 0 and 1.
#
# Each material is located by its name rather than by its position in the
# comma-separated list, so a share is never attributed to the wrong material
# when the components come in a different order or a component is missing.
# cotton | polyester | elastane | elasterell
material_patterns = {
    'cotton': r'cotton[^,\d]*(\d+)',
    'polyester': r'polyester[^,\d]*(\d+)',
    'elastane': r'elastane[^,\d]*(\d+)',
    # also matches the "Elasterell-P" spelling
    'elasterell': r'elasterell[^,\d]*(\d+)',
}

df_ref = pd.DataFrame(index=data.index)
for material, pattern in material_patterns.items():
    # First number that follows the material name within the same component.
    percent = data['Composition'].str.extract(pattern, flags=re.IGNORECASE, expand=False)
    # A material that is not listed counts as 0%.
    df_ref[material] = percent.astype(float).fillna(0) / 100

# final join
data = pd.concat([data, df_ref], axis=1)

# Drop columns
data = data.drop(columns=['Size', 'Product safety', 'Composition'])

# Drop duplicates
data = data.drop_duplicates()
data.shape

(134, 14)