# Imports

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import re
from sqlalchemy import create_engine
import sqlite3
import lxml

In [2]:
# functions

def request_soup(url_link):    
    headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'}    
    page = requests.get( url, headers = headers)
    soup_obj = BeautifulSoup(page.text, 'html.parser')
    return( soup_obj )

# Data Requesting

##  Home Page Scraping

In [3]:
# all products url
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# headers for request
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'}

# requesting
page = requests.get(url=url, headers=headers)

# instatiating bs4 object
soup = BeautifulSoup(page.text, 'html.parser')

In [4]:
# finding load more products element
p = soup.find('div', class_='load-more-products')

# all products
all_products = int(p.find('h2').get('data-total'))

# products per page
products_per_page = int(p.find('h2').get('data-items-shown'))

# rounding up numer of pages needed for web scraping
total_pages = np.ceil(all_products/products_per_page)


##  All products in Home Page Scraping

In [5]:
# creating a page with all products
url_all_prods = url + '?&offset=0&page-size={}'.format(int(total_pages*products_per_page))

all_prods = requests.get(url = url_all_prods, headers=headers)

In [6]:
soup = BeautifulSoup(all_prods.text, 'html.parser')#.get('li', class_='product-item')

# soup.find('li', class_ = 'product-item').find('a').get('href') #.get('item-link')  #.get('item-link') #, class_ = 'item-link')
# all find all products listed in homepage
products = soup.find_all('li', class_='product-item')

# get link to all projects
home_links = ['https://www2.hm.com' + link.find('a').get('href') for link in products ]

##  All products in Each Product Page

In [7]:
# resulting list of all products to scrap
links = []

for link in home_links:
    # scrap each product in home page list
    single_product = requests.get(link, headers = headers)
    soup = BeautifulSoup(single_product.text, 'html.parser')

    # gets the links to all products listed in a page
    products_ul = soup.find('ul', class_='inputlist clearfix')
    products = products_ul.find_all('a')

    links_ul = []
    links_ul = [ 'https://www2.hm.com' + item.get('href') for item in products]
    links.extend(links_ul)

In [38]:
# getting all unique products listed

# converting to a set and then back to list
links = list(set(links))
links.sort()

In [355]:
# defining base dataframe
df_prods = pd.DataFrame(columns=['product_id', 'color', 'style_id', 'color_id', 'link'])

for link in home_links:
    
    # scrap each product in home page list
    single_product = requests.get(link, headers = headers)
    soup = BeautifulSoup(single_product.text, 'html.parser')

    # scrap all products listed in a page
    products_ul = soup.find('ul', class_='inputlist clearfix')
    products = products_ul.find_all('a')

    for product in products:
        
        #product it
        product_id = product.get('data-articlecode')

        # color
        color = product.get('data-color')
        
        # style id
        style_id = product_id[:-3]
        
        # style id
        color_id = product_id[-3:]

        # link
        link = 'https://www2.hm.com/en_us/productpage.{}.html'.format(product_id)

        df_temp = pd.DataFrame( {'product_id': product_id, 'color': color, 'style_id' :style_id, 'color_id' : color_id, 'link': link}, index = [0] )
        
        df_prods = pd.concat([df_prods, df_temp], axis = 0)


df_prods.reset_index(inplace = True, drop = True)

df_prods.drop_duplicates('product_id',inplace = True)

In [187]:
# defining base dataframe
# df_prods = pd.DataFrame(columns=['product_id', 'color', 'style_id', 'color_id', 'link'])
df_prods = pd.DataFrame()

for link in links[:5]:
    
    # scrap each product in home page list
    single_product = requests.get(link, headers = headers)
    soup = BeautifulSoup(single_product.text, 'html.parser')
    
    # scrap all products listed in a page
    products_ul = soup.find('ul', class_='inputlist clearfix')
    products = products_ul.find_all('a')

    # product headline
    headline = soup.find('h1', class_='primary product-item-headline').text


    for product in products:
        
        #product it
        sku = product.get('data-articlecode')
       
        # color
        color = product.get('data-color')
        
        # product id
        product_id = sku[:-3]
        
        # style id
        color_id = sku[-3:]

        # link
        link = 'https://www2.hm.com/en_us/productpage.{}.html'.format(product_id)

        df_temp = pd.DataFrame( {'sku': sku, 'product_id' :product_id, 'color_id' : color_id, 'color': color, 'headline' : headline, 'link': link}, index = [0] )
        
        df_prods = pd.concat([df_prods, df_temp], axis = 0)


df_prods.reset_index(inplace = True, drop = True)

df_prods.drop_duplicates('sku',inplace = True)

In [196]:
soup.find('div', class_='price parbase')
soup.find('div', class_='primary-row product-item-price')
soup.find('div', class_='ProductPrice-module--productItemPrice__2i2Hc')
soup.find('section', class_='name-price')

<section class="name-price">
<h1 class="primary product-item-headline">
							  Trashed Skinny Jeans</h1>
<div class="price parbase"><div class="primary-row product-item-price">
<span class="price-value">
                $39.99</span>
</div>
</div>
<hm-comparative-price id="comparative-price" price-per-unit="" unit=""></hm-comparative-price>
</section>

In [188]:
df_prods

Unnamed: 0,sku,product_id,color_id,color,headline,link
0,427159001,427159,1,Black denim,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.0427159....
1,427159002,427159,2,Blue washed out,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.0427159....
2,427159003,427159,3,Denim blue,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.0427159....
3,427159004,427159,4,Light denim blue,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.0427159....
4,427159005,427159,5,Dark denim blue,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.0427159....
5,427159006,427159,6,Light denim blue,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.0427159....
6,427159007,427159,7,Black washed out,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.0427159....
7,427159008,427159,8,Denim blue,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.0427159....
8,427159010,427159,10,White denim,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.0427159....
9,427159011,427159,11,Light blue washed out,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.0427159....


## Individual Scraping Selenium


### Output Format

In [154]:
df_b = pd.read_csv('./backups/df_backup-2021-12-14_16_43_22.csv')

In [164]:
df_b.Fit.unique()
df_b.Composition[0]
# df_b.texts[0]
df_b.columns

Index(['Art. No.', 'style_id', 'color_id', 'color', 'Fit', 'Price',
       'Composition', 'texts', 'Cotton', 'Polyester', 'Elastane',
       'Elasterell-P', 'Modal', 'Viscose', 'link', 'date'],
      dtype='object')

sku, product_id, color_id, color, fit, price, head_line, text, 'cotton', 'polyester', 'elastane', 'elasterell_p', 'spandex' 'modal', 'viscose'
pocket_lining, 

## Individual Scraping

### Scraping Data

In [198]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By

options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)

In [200]:
links[0]

'https://www2.hm.com/en_us/productpage.0427159001.html'

### Text

In [236]:
# driver.get(links[0])
driver.get("https://www2.hm.com/en_us/productpage.1024256004.html")

In [237]:
elements = driver.find_elements(by=By.CLASS_NAME, value="ProductAttributesList-module--descriptionListItem__3vUL2")
for e in elements:
    print(e.text)


FitSlim fit
CompositionShell: Cotton 99%, Spandex 1%Pocket lining: Polyester 65%, Cotton 35%
More sustainable materialsLining: Recycled polyester 65%, Recycled cotton 10%Shell: Recycled cotton 20%
Art. No.1024256004


In [260]:
text = str()

text = [text + line.text  for line  in elements]
text

['FitSlim fit',
 'CompositionShell: Cotton 99%, Spandex 1%Pocket lining: Polyester 65%, Cotton 35%',
 'More sustainable materialsLining: Recycled polyester 65%, Recycled cotton 10%Shell: Recycled cotton 20%',
 'Art. No.1024256004']

In [272]:
# searching for words fit and composition in all text retrieved from products web page

for element in text:
    if 'fit' in element:
        fit = element
    if 'Composition' in element:
        Composition = element

In [289]:
driver.quit()

In [285]:
# positive lookbehind + words I'm searching + positive lookahead
regex = "(?<=Fit)(.*)(?= fit)"
fit_result = re.findall(regex, fit)[0]
fit_result

'Slim'

In [None]:
texts = []
for text in df['Composition']:
    if 'Pocket' in text:
        regex = '(Shell: .* Pocket|Shell.*%|Cotton.* Pocket|% Cotton.*)'
        texts.append(re.findall( regex, text)[0])
    else:
        regex = '(Cotton.*Lining|Cotton.*%)'
        texts.append(re.findall( regex, text)[0])

df = pd.DataFrame()
df['texts'] = texts
df_comp = df[['Composition', 'texts']].copy()
df_comp[[ 'Cotton', 'Polyester', 'Elastane', 'Elasterell-P', 'Modal', 'Viscose' ]] = 0

for row in range(len( df_comp) ):
    text = str( df_comp.loc[row,'texts'])
    cotton_res = re.findall( 'Cotton [0-9]*%' , text)
    poly_res = re.findall( 'Polyester [0-9]*%' , text)
    elas_res = re.findall( 'Elastane [0-9]*%' , text)
    elasrell_res = re.findall( 'Elasterell-P [0-9]*%' , text)
    modal_res = re.findall( 'Modal [0-9]*%', text)
    visc_res = re.findall( 'Viscose [0-9]*%', text)

In [234]:
elements = driver.find_element(by=By.CLASS_NAME, value = "ProductDescription-module--descriptionText__1zy9P")
elements.text


'5-pocket jeans in stretch cotton denim. Regular waist, zip fly with button, and slim legs.'

In [110]:
links[]

'https://www2.hm.com/en_us/productpage.0427159010.html'

In [144]:
driver.get("https://www2.hm.com/en_us/productpage.1024256001.html")

In [150]:
elements = driver.find_elements(by=By.CLASS_NAME, value="ProductAttributesList-module--descriptionListItem__3vUL2")
for e in elements:
    print(e.text)


FitSkinny fit
CompositionCotton 91%, Polyester 7%, Spandex 2%
Art. No.0427159001


In [151]:
elements = driver.find_element(by=By.CLASS_NAME, value = "ProductDescription-module--descriptionText__1zy9P")
elements.text


'5-pocket jeans in washed stretch denim. Heavily distressed details, regular waist, and button fly. Skinny legs.'

In [152]:
driver.quit()

In [217]:
driver.get("https://www2.hm.com/en_us/productpage.0427159022.html")
# driver.get("https://www2.hm.com/en_us/productpage.1008110003.html")

In [220]:
from selenium.webdriver.support.ui import WebDriverWait

In [229]:
driver.get("https://www2.hm.com/en_us/productpage.0427159022.html")
# driver.get("https://www2.hm.com/en_us/productpage.1008110003.html")


content = WebDriverWait(driver, timeout=10).until(lambda x: x.find_element(by=By.CLASS_NAME, value='ProductPrice-module--productItemPrice__2i2Hc'))
# content = driver.find_element(by=By.CLASS_NAME, value='ProductPrice-module--productItemPrice__2i2Hc')
content.text

'$39.99$9.99'

# TODO

- separate compositions
- create feature ispromo
- create final df

In [326]:
# SAVING THIS FOR REFERENCE IN CASE NEEDED LATER

# soup.find_all('a', class_='swatch')#[0].get('title')
# elements = soup.find_all('a', class_='swatch')

# df = pd.DataFrame(columns=['product_id', 'color', 'style_id', 'color_id', 'link'])

# for element in elements:
#     # article number
#     product_id = element.get('href').split('.')[1]
    
#     # color
#     color = element.get('title')

#     # style id
#     style_id = product_id[:-3]
    
#     # style id
#     color_id = product_id[-3:]

#     # link
#     link = 'https://www2.hm.com/en_us/productpage.{}.html'.format(product_id)

#     df_temp = pd.DataFrame( {'product_id': product_id, 'color': color, 'style_id' :style_id, 'color_id' : color_id, 'link': link}, index = [0] )
    
#     df = pd.concat([df, df_temp], axis = 0)

# df.reset_index(inplace = True, drop = True)


# Data Parsing

# Data Saving