# Imports

In [1]:
%%time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import re
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import json
import time

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import gc

CPU times: user 1.04 s, sys: 344 ms, total: 1.39 s
Wall time: 6.79 s


In [88]:
# functions

def request_soup(url_link):
    """Download a page and return it parsed as a BeautifulSoup object.

    Parameters
    ----------
    url_link : str
        Address of the page to request.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the 'html.parser' backend.
    """
    # browser-like User-Agent sent with the request
    headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'}
    # request the address that was passed in (the previous version read the
    # notebook-level `url` variable and ignored its own argument)
    page = requests.get(url_link, headers = headers)
    soup_obj = BeautifulSoup(page.text, 'html.parser')
    return soup_obj

def composition_to_df(list_of_comp):
    """Build a single-row dataframe from an alternating name/value list.

    Items at even positions become the column names; items at odd positions
    become the values, with any leading/trailing '%', ',' and newline
    characters removed. The row is labelled 0.
    """
    tokens = list(list_of_comp)

    # even positions -> column names, odd positions -> cleaned values
    names = tokens[0::2]
    amounts = [token.strip('%,\n') for token in tokens[1::2]]

    # pair names with values; an unpaired trailing name is dropped by zip
    return pd.DataFrame(dict(zip(names, amounts)), index = [0])

# Data Requesting

##  Home Page Scraping

In [3]:
# all products url
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# headers for request (browser-like User-Agent)
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'}

# requesting; the timeout keeps a stalled connection from hanging the notebook
page = requests.get(url=url, headers=headers, timeout=30)

# fail here on an HTTP error instead of parsing an error page further down
page.raise_for_status()

# instantiating bs4 object
soup = BeautifulSoup(page.text, 'html.parser')

print('Starting Web Scraping')

Starting Web Scraping


In [4]:
# the "load more products" block and its <h2>, which holds both counters
p = soup.find('div', class_='load-more-products')
counters = p.find('h2')

# total number of products (data-total attribute)
all_products = int(counters.get('data-total'))

# number of products shown per page (data-items-shown attribute)
products_per_page = int(counters.get('data-items-shown'))

# number of pages needed to list every product, rounded up
total_pages = np.ceil(all_products / products_per_page)


##  All products in Home Page Scraping

In [5]:
# creating a page with all products: ask for a page size large enough to hold
# every product (number of pages * products per page)
url_all_prods = url + '?&offset=0&page-size={}'.format(int(total_pages*products_per_page))

# the timeout keeps a stalled connection from hanging the notebook
all_prods = requests.get(url = url_all_prods, headers=headers, timeout=30)

# fail here on an HTTP error instead of parsing an error page in the next cell
all_prods.raise_for_status()

In [6]:
# parse the response that lists every product
soup = BeautifulSoup(all_prods.text, 'html.parser')

# every product is an <li class="product-item"> element
products = soup.find_all('li', class_='product-item')

# absolute link of each product: site root + href of the first anchor in the item
home_links = []
for tile in products:
    home_links.append('https://www2.hm.com' + tile.find('a').get('href'))

##  All products in Each Product Page

In [7]:
%%time

# resulting list of all products to scrape
links = []

for link in home_links:
    # request each product page found on the home page
    single_product = requests.get(link, headers = headers)
    soup = BeautifulSoup(single_product.text, 'html.parser')

    # the 'inputlist clearfix' list holds one anchor per product listed on the page
    products_ul = soup.find('ul', class_='inputlist clearfix')
    if products_ul is None:
        # no such list on this page (layout change, error page, ...):
        # report it and keep going instead of failing on None
        print('no product list found in {}'.format(link))
        continue
    products = products_ul.find_all('a')

    # absolute links of every product listed on this page
    links_ul = [ 'https://www2.hm.com' + item.get('href') for item in products]
    links.extend(links_ul)

CPU times: user 22.5 s, sys: 272 ms, total: 22.8 s
Wall time: 1min 11s


In [8]:
# keep each product link once, in ascending order
# (the set removes repeats, sorted() returns a new ordered list)
links = sorted(set(links))

In [9]:
# test bench - in case the website devs change some class names or anything
# Scratch cell: it inspects whatever page is currently held in `soup`
# (the last page parsed by the loop above).

# selectors tried previously, kept for reference
# soup.find('h1', class_='primary product-item-headline').text
# soup.find('h1', class_="ProductName-module--container__3e-gi").text
# soup.find('h1', class_="web-component-placeholder breadcrumbs-placeholder").text
# NOTE: the first assignment is immediately overwritten by the second one,
# and `class_` is only referenced by the commented-out lookup below
class_ = "product-name-price"
class_ = "Heading-module--general__3HQET ProductName-module--productTitle__1T9f0 Heading-module--small__SFfSh"
# soup.find('h1', class_=class_)
# result of this lookup is discarded (only the last expression of a cell is displayed)
soup.find_all(id='js-product-name')

# the lookup used for the headline: <hm-product-name id="js-product-name">
soup.find('hm-product-name', id = 'js-product-name').text

'\n\nSlim Tapered Cropped Jeans\n\n\n\n'

In [10]:
# links[0]

In [125]:
# Build the base dataframe: one row per sku found on the product pages.
# Rows are collected as dicts and converted to a dataframe once after the loop,
# instead of concatenating a one-row dataframe per product.
prod_columns = ['sku', 'product_id', 'color_id', 'color', 'headline', 'link']
prod_records = []

for page_link in links:

    # request each product page
    single_product = requests.get(page_link, headers = headers)
    soup = BeautifulSoup(single_product.text, 'html.parser')

    # all products listed in the page
    products_ul = soup.find('ul', class_='inputlist clearfix')
    products = products_ul.find_all('a')

    # product headline (raw text; whitespace is stripped later in the notebook)
    headline = soup.find('hm-product-name', id = 'js-product-name').text

    for product in products:

        # sku (article code)
        sku = product.get('data-articlecode')
        if not sku:
            # anchor without an article code: nothing to slice or link to
            continue

        # color
        color = product.get('data-color')

        # product id: sku without its last three characters
        product_id = sku[:-3]

        # color id: last three characters of the sku
        color_id = sku[-3:]

        # link to the page of this sku
        link = 'https://www2.hm.com/en_us/productpage.{}.html'.format(sku)

        prod_records.append( {'sku': sku, 'product_id' :product_id, 'color_id' : color_id, 'color': color, 'headline' : headline, 'link': link} )

# explicit columns keep the frame well-formed even when nothing was collected
df_prods = pd.DataFrame(prod_records, columns = prod_columns)

# the same sku shows up on several pages: keep its first occurrence only
df_prods = df_prods.drop_duplicates('sku').reset_index(drop = True)

In [126]:
df_prods.head()

Unnamed: 0,sku,product_id,color_id,color,headline,link
0,427159001,427159,1,Black denim,\n\nTrashed Skinny Jeans\n\n\n\n,https://www2.hm.com/en_us/productpage.04271590...
1,427159002,427159,2,Blue washed out,\n\nTrashed Skinny Jeans\n\n\n\n,https://www2.hm.com/en_us/productpage.04271590...
2,427159003,427159,3,Denim blue,\n\nTrashed Skinny Jeans\n\n\n\n,https://www2.hm.com/en_us/productpage.04271590...
3,427159004,427159,4,Light denim blue,\n\nTrashed Skinny Jeans\n\n\n\n,https://www2.hm.com/en_us/productpage.04271590...
4,427159005,427159,5,Dark denim blue,\n\nTrashed Skinny Jeans\n\n\n\n,https://www2.hm.com/en_us/productpage.04271590...


In [127]:
gc.collect()

85329

## Individual Scraping

### Instantiating a Web Driver

In [14]:
# starting drive

#options = Options()
#options.headless = True
#driver = webdriver.Firefox(options=options)

# set implicit wait
#driver.implicitly_wait(20) # seconds

In [15]:
# print(driver.timeouts._implicit_wait/1000)
# print(driver.timeouts._page_load/1000)
# print(driver.timeouts._script/1000)

### Scraping Everything

In [128]:
%%time

# Scrape price, description and attribute text of every product page with Selenium.
# A new headless Firefox is started for each page and is always shut down again,
# even when one of the waits raises.

# defining driver options
options = Options()
# '-headless' is the argument the deprecated `options.headless = True` setter used to add
options.add_argument('-headless')

# wait at max 120s
time_out = 120

# one dict per scraped page; converted to a dataframe once, after the loop
comp_columns = ['sku', 'price', 'fit', 'composition', 'description', 'text']
comp_records = []

try:
    for idx, link in enumerate(df_prods['link']):

        # sku: fourth dot-separated piece of the product link
        sku = link.split('.')[3]
        print('scraping page {}/{}: {}'.format( idx+1, len(df_prods), link))

        # instantiating the driver
        driver = webdriver.Firefox(options=options)
        try:
            # set implicit wait
            driver.implicitly_wait(20) # seconds

            # load web page
            driver.get(link)

            # sleep
            time.sleep(3)

            # get price
            # try this class (for no promo days)
            class_price = "ProductPrice-module--productItemPrice__2i2Hc"
            element = WebDriverWait(driver, timeout=time_out).until( EC.visibility_of_element_located( (By.CLASS_NAME, class_price) ) )
            price = element.text

            # if element returns empty, try this other class
            if price == '':
                class_price = "price.parbase"
                element = WebDriverWait(driver, timeout=time_out).until( EC.visibility_of_element_located( (By.CLASS_NAME, class_price) ) )
                price = element.text

                if price == '':
                    price = 'NA'

            # get product description
            class_desc = "ProductDescription-module--descriptionText__1zy9P"
            # the description is optional: fall back to 'NA' when the wait fails
            # (Exception, not a bare except, so Ctrl-C still interrupts the run)
            try:
                content = WebDriverWait(driver, timeout=time_out).until(EC.visibility_of_element_located( (By.CLASS_NAME, class_desc) ))
                desc = content.text
            except Exception:
                desc = 'NA'

            # get text of every attribute list item
            class_text = 'ProductAttributesList-module--descriptionListItem__3vUL2'
            contents = WebDriverWait(driver, timeout=time_out).until( EC.visibility_of_all_elements_located( (By.CLASS_NAME, class_text) ) )

            # list with the text of each item
            text = [line.text for line in contents]

            # separate fit and composition from text
            # if fit or composition is not informed they'll return NA
            fit = 'NA'
            composition = 'NA'
            for attribute in text:
                if 'fit' in attribute:
                    fit = attribute
                if 'Composition' in attribute:
                    composition = attribute

            # saving raw text
            text_raw =' /'.join(text)

            # saving results
            comp_records.append( {'sku' : sku, 'price' : price, 'fit' : fit, 'composition' : composition, 'description' : desc ,'text' : text_raw} )
        finally:
            # always close the browser, also when a wait above raised
            driver.quit()

        # collecting garbage to reduce memory consumption
        gc.collect()

        # pause between pages
        time.sleep(10)
finally:
    # built in a finally block so pages scraped before a failure are kept
    df_comp = pd.DataFrame(comp_records, columns = comp_columns)

scraping page 1/174: https://www2.hm.com/en_us/productpage.0427159001.html
scraping page 2/174: https://www2.hm.com/en_us/productpage.0427159002.html
scraping page 3/174: https://www2.hm.com/en_us/productpage.0427159003.html
scraping page 4/174: https://www2.hm.com/en_us/productpage.0427159004.html
scraping page 5/174: https://www2.hm.com/en_us/productpage.0427159005.html
scraping page 6/174: https://www2.hm.com/en_us/productpage.0427159006.html
scraping page 7/174: https://www2.hm.com/en_us/productpage.0427159007.html
scraping page 8/174: https://www2.hm.com/en_us/productpage.0427159008.html
scraping page 9/174: https://www2.hm.com/en_us/productpage.0427159010.html
scraping page 10/174: https://www2.hm.com/en_us/productpage.0427159011.html
scraping page 11/174: https://www2.hm.com/en_us/productpage.0427159017.html
scraping page 12/174: https://www2.hm.com/en_us/productpage.0427159022.html
scraping page 13/174: https://www2.hm.com/en_us/productpage.0427159023.html
scraping page 14/174:

In [21]:
# quitting driver

driver.quit()

In [130]:
gc.collect()

1694

In [131]:
df_comp.shape

(174, 6)

In [132]:
df_comp.head()

Unnamed: 0,sku,price,fit,composition,description,text
0,427159001,$39.99,Fit\nSkinny fit,"Composition\nCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 91%, Poly..."
1,427159002,$39.99,Fit\nSkinny fit,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 93%, Poly..."
2,427159003,$39.99,Fit\nSkinny fit,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 93%, Poly..."
3,427159004,$39.99,Fit\nSkinny fit,"Composition\nCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 99%, Span..."
4,427159005,$39.99,Fit\nSkinny fit,"Composition\nCotton 72%, Polyester 20%, Modal ...",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 72%, Poly..."


# Data Parsing

### Composition

In [133]:
# Extract, with regex, the main composition ('comp') and the lining
# composition ('lining') from the raw composition text of each product.

def _first_match(pattern, text, default):
    """Return the first match of `pattern` in `text` (DOTALL), or `default` if there is none."""
    match = re.search(pattern, text, flags=re.DOTALL)
    return default if match is None else match.group(0)

df_comp_aux = df_comp.copy()

comps = []
linings = []

for text in df_comp_aux['composition']:
    if 'Pocket' in text:
        # case 1 pocket lining present: everything from 'Cotton' up to 'Pocket'
        comp = _first_match('Cotton.*(?=Pocket)', text, 'NA')
    else:
        # case 2 pocket lining not present: stop before 'Lining'/'lining',
        # otherwise at the last '%'
        comp = _first_match('(Cotton.*(?=Lining)|Cotton.*(?=lining)|Cotton.*%)', text, 'NA')

    # lining composition: whatever follows 'lining: '
    lining = _first_match('(?<=lining: ).*', text, 'Not Informed')

    comps.append(comp)
    linings.append(lining)

df_comp_aux['comp'] = comps
df_comp_aux['lining'] = linings

In [134]:
# result
df_comp_aux.head()

Unnamed: 0,sku,price,fit,composition,description,text,comp,lining
0,427159001,$39.99,Fit\nSkinny fit,"Composition\nCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 91%, Poly...","Cotton 91%, Polyester 7%, Spandex 2%",Not Informed
1,427159002,$39.99,Fit\nSkinny fit,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 93%, Poly...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed
2,427159003,$39.99,Fit\nSkinny fit,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 93%, Poly...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed
3,427159004,$39.99,Fit\nSkinny fit,"Composition\nCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 99%, Span...","Cotton 99%, Spandex 1%",Not Informed
4,427159005,$39.99,Fit\nSkinny fit,"Composition\nCotton 72%, Polyester 20%, Modal ...",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 72%, Poly...","Cotton 72%, Polyester 20%, Modal 7%, Spandex 1%",Not Informed


In [135]:
# creating a dataframe for all compositions: one single-row frame per product,
# concatenated in one call instead of growing a dataframe inside the loop
comp_frames = [composition_to_df(composition.split(' ')) for composition in df_comp_aux['comp']]

# pd.concat raises on an empty list, so fall back to an empty dataframe
df_comp_split = pd.concat( comp_frames, axis = 0 ) if comp_frames else pd.DataFrame()

df_comp_split = df_comp_split.reset_index(drop = True)

In [136]:
# result: place the per-material columns next to the parsed columns (axis = 1)
# NOTE: df_comp_aux is overwritten with the widened frame, so running this
# cell again appends the same material columns a second time
df_comp_aux = pd.concat( [df_comp_aux, df_comp_split], axis = 1 )
df_comp_aux.head()


Unnamed: 0,sku,price,fit,composition,description,text,comp,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester
0,427159001,$39.99,Fit\nSkinny fit,"Composition\nCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 91%, Poly...","Cotton 91%, Polyester 7%, Spandex 2%",Not Informed,91,7.0,2,,
1,427159002,$39.99,Fit\nSkinny fit,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 93%, Poly...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,
2,427159003,$39.99,Fit\nSkinny fit,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 93%, Poly...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,
3,427159004,$39.99,Fit\nSkinny fit,"Composition\nCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 99%, Span...","Cotton 99%, Spandex 1%",Not Informed,99,,1,,
4,427159005,$39.99,Fit\nSkinny fit,"Composition\nCotton 72%, Polyester 20%, Modal ...",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 72%, Poly...","Cotton 72%, Polyester 20%, Modal 7%, Spandex 1%",Not Informed,72,20.0,1,7.0,


In [137]:
df_comp_aux.tail()

Unnamed: 0,sku,price,fit,composition,description,text,comp,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester
169,1063426001,$34.99,,Composition\nCotton 100%,Edition by is an uncompromising collection of ...,"Size\nThe model is 188cm/6'2"" and wears a size...",Cotton 100%,Not Informed,100,,,,
170,1071707001,$29.99,Fit\nRelaxed fit,Composition\nShell: Cotton 100%\nPocket lining...,5-pocket jeans in sturdy cotton denim with a r...,Fit\nRelaxed fit /Composition\nShell: Cotton 1...,Cotton 100%\n,Cotton 100%,100,,,,
171,1071707002,$29.99,Fit\nRelaxed fit,Composition\nShell: Cotton 100%\nPocket lining...,5-pocket jeans in sturdy cotton denim with a r...,"Size\nThe model is 184cm/6'0"" and wears a size...",Cotton 100%\n,"Polyester 65%, Cotton 35%",100,,,,
172,1071707008,$29.99,Fit\nRelaxed fit,Composition\nShell: Cotton 100%\nPocket lining...,5-pocket jeans in sturdy cotton denim with a r...,"Size\nThe model is 182cm/6'0"" and wears a size...",Cotton 100%\n,"Polyester 65%, Cotton 35%",100,,,,
173,1074475001,$39.99,Fit\nLoose fit,Composition\nShell: Cotton 100%\nPocket lining...,5-pocket jeans in sturdy cotton denim with har...,"Size\nThe model is 187cm/6'2"" and wears a size...",Cotton 100%\n,"Polyester 65%, Cotton 35%",100,,,,


### Fit

In [138]:
# positive lookbehind + words I'm searching + positive lookahead
regex = "((?<=Fit).*(?= fit)|NA)"

def _parse_fit(raw_fit, pattern = re.compile(regex, flags=re.DOTALL)):
    """Return the text between 'Fit' and ' fit' (or 'NA'), without surrounding newlines.

    Values without that pattern are returned unchanged, so a value that was
    already parsed (e.g. when this cell is run twice) no longer raises IndexError.
    """
    match = pattern.search(raw_fit)
    if match is None:
        return raw_fit
    return match.group(0).strip('\n')

df_comp_aux['fit'] = df_comp_aux['fit'].apply(_parse_fit)
df_comp_aux.head()

Unnamed: 0,sku,price,fit,composition,description,text,comp,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester
0,427159001,$39.99,Skinny,"Composition\nCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 91%, Poly...","Cotton 91%, Polyester 7%, Spandex 2%",Not Informed,91,7.0,2,,
1,427159002,$39.99,Skinny,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 93%, Poly...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,
2,427159003,$39.99,Skinny,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 93%, Poly...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,
3,427159004,$39.99,Skinny,"Composition\nCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 99%, Span...","Cotton 99%, Spandex 1%",Not Informed,99,,1,,
4,427159005,$39.99,Skinny,"Composition\nCotton 72%, Polyester 20%, Modal ...",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 72%, Poly...","Cotton 72%, Polyester 20%, Modal 7%, Spandex 1%",Not Informed,72,20.0,1,7.0,


### Price

In [139]:
# Parse the raw price text into isPromo / finalPrice / originalPrice.
# Prices that do not match (e.g. the 'NA' placeholder written by the scraping
# cell) become NaN instead of raising IndexError.
price_text = df_comp_aux['price']

# if there are 2 prices then there is a discount/promo
# (raw string; optional whitespace is tolerated between the two prices)
regex = r"\$\d+\.\d+\s*\$\d+\.\d+"
df_comp_aux['isPromo'] = price_text.str.match(regex, na = False).astype(int)

# first price: the one at the start of the text
first_price = price_text.str.extract(r"^\$(\d+\.\d+)", expand = False).astype(float)

# second price: the one at the end of the text (same as the first when there is only one)
second_price = price_text.str.extract(r"\$(\d+\.\d+)$", expand = False).astype(float)

# the lower value is what is charged, the higher one is the price before discount
df_comp_aux['finalPrice'] = np.minimum(first_price, second_price)
df_comp_aux['originalPrice'] = np.maximum(first_price, second_price)

In [140]:
# df_comp_aux[df_comp_aux.firstPrice == df_comp_aux.secondPrice]
df_comp_aux.head()

Unnamed: 0,sku,price,fit,composition,description,text,comp,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester,isPromo,finalPrice,originalPrice
0,427159001,$39.99,Skinny,"Composition\nCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 91%, Poly...","Cotton 91%, Polyester 7%, Spandex 2%",Not Informed,91,7.0,2,,,0,39.99,39.99
1,427159002,$39.99,Skinny,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 93%, Poly...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,,0,39.99,39.99
2,427159003,$39.99,Skinny,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 93%, Poly...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,,0,39.99,39.99
3,427159004,$39.99,Skinny,"Composition\nCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 99%, Span...","Cotton 99%, Spandex 1%",Not Informed,99,,1,,,0,39.99,39.99
4,427159005,$39.99,Skinny,"Composition\nCotton 72%, Polyester 20%, Modal ...",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 72%, Poly...","Cotton 72%, Polyester 20%, Modal 7%, Spandex 1%",Not Informed,72,20.0,1,7.0,,0,39.99,39.99


In [141]:
print('Found {} promos'.format(df_comp_aux[df_comp_aux['isPromo']== True].shape[0]))
df_comp_aux[df_comp_aux['isPromo'] == True].tail()

Found 0 promos


Unnamed: 0,sku,price,fit,composition,description,text,comp,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester,isPromo,finalPrice,originalPrice


### Headline

In [142]:
# removing whitespace characters (newlines, tabs, spaces) around the headline;
# the vectorized .str accessor leaves missing values as NaN instead of raising

df_prods['headline'] = df_prods['headline'].str.strip('\n\t ')
df_prods.headline.value_counts()

Slim Jeans                        34
Skinny Jeans                      26
Regular Jeans                     23
Relaxed Jeans                     17
Trashed Skinny Jeans              15
Slim Tapered Jeans                14
Hybrid Regular Tapered Joggers     8
Regular Tapered Crop Jeans         6
Skinny Cropped Jeans               5
Freefit® Slim Jeans                5
Loose Jeans                        5
Slim Tapered Cropped Jeans         5
Hybrid Regular Denim Joggers       3
Relaxed Pull-on Jeans              3
Relaxed Denim Joggers              2
Regular Bootcut Jeans              2
Cotton Denim Jeans                 1
Name: headline, dtype: int64

In [143]:
df_comp_aux.head()


Unnamed: 0,sku,price,fit,composition,description,text,comp,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester,isPromo,finalPrice,originalPrice
0,427159001,$39.99,Skinny,"Composition\nCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 91%, Poly...","Cotton 91%, Polyester 7%, Spandex 2%",Not Informed,91,7.0,2,,,0,39.99,39.99
1,427159002,$39.99,Skinny,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 93%, Poly...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,,0,39.99,39.99
2,427159003,$39.99,Skinny,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 93%, Poly...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,,0,39.99,39.99
3,427159004,$39.99,Skinny,"Composition\nCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 99%, Span...","Cotton 99%, Spandex 1%",Not Informed,99,,1,,,0,39.99,39.99
4,427159005,$39.99,Skinny,"Composition\nCotton 72%, Polyester 20%, Modal ...",5-pocket jeans in washed stretch denim. Heavil...,"Fit\nSkinny fit /Composition\nCotton 72%, Poly...","Cotton 72%, Polyester 20%, Modal 7%, Spandex 1%",Not Informed,72,20.0,1,7.0,,0,39.99,39.99


In [144]:
# join product info and parsed details on the sku key rather than by row
# position, so rows cannot be paired with the wrong product if the two frames
# ever differ in order or length; validate raises if a sku is repeated
df_final = df_prods.merge(df_comp_aux, on = 'sku', how = 'left', validate = 'one_to_one')

# adding date time
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
df_final['date'] = now
df_final.head()

Unnamed: 0,sku,product_id,color_id,color,headline,link,price,fit,composition,description,...,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester,isPromo,finalPrice,originalPrice,date
0,427159001,427159,1,Black denim,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"Composition\nCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,91,7.0,2,,,0,39.99,39.99,2022-05-31 20:25:14
1,427159002,427159,2,Blue washed out,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,93,6.0,1,,,0,39.99,39.99,2022-05-31 20:25:14
2,427159003,427159,3,Denim blue,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,93,6.0,1,,,0,39.99,39.99,2022-05-31 20:25:14
3,427159004,427159,4,Light denim blue,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"Composition\nCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,99,,1,,,0,39.99,39.99,2022-05-31 20:25:14
4,427159005,427159,5,Dark denim blue,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"Composition\nCotton 72%, Polyester 20%, Modal ...",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,72,20.0,1,7.0,,0,39.99,39.99,2022-05-31 20:25:14


In [145]:
df_final.columns

Index(['sku', 'product_id', 'color_id', 'color', 'headline', 'link', 'price',
       'fit', 'composition', 'description', 'text', 'comp', 'lining', 'Cotton',
       'Polyester', 'Spandex', 'Modal', 'Elastomultiester', 'isPromo',
       'finalPrice', 'originalPrice', 'date'],
      dtype='object')

In [146]:
# Normalise every column name to lower snake_case.

# the price/promo columns are camelCase, so they need explicit snake_case names
snake_case_names = {
    'finalPrice': 'final_price',
    'originalPrice': 'original_price',
    'isPromo': 'is_promo',
}
df_final.rename(columns = snake_case_names, inplace = True)

# everything else (e.g. 'Cotton', 'Polyester') only has to be lower-cased
df_final.rename(columns = str.lower, inplace = True)

df_final.head()

Unnamed: 0,sku,product_id,color_id,color,headline,link,price,fit,composition,description,...,lining,cotton,polyester,spandex,modal,elastomultiester,is_promo,final_price,original_price,date
0,427159001,427159,1,Black denim,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"Composition\nCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,91,7.0,2,,,0,39.99,39.99,2022-05-31 20:25:14
1,427159002,427159,2,Blue washed out,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,93,6.0,1,,,0,39.99,39.99,2022-05-31 20:25:14
2,427159003,427159,3,Denim blue,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"Composition\nCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,93,6.0,1,,,0,39.99,39.99,2022-05-31 20:25:14
3,427159004,427159,4,Light denim blue,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"Composition\nCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,99,,1,,,0,39.99,39.99,2022-05-31 20:25:14
4,427159005,427159,5,Dark denim blue,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"Composition\nCotton 72%, Polyester 20%, Modal ...",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,72,20.0,1,7.0,,0,39.99,39.99,2022-05-31 20:25:14


### Converting Data Types

In [147]:
# inspecting the current dtypes before the material columns are converted to numeric
df_final.dtypes

sku                  object
product_id           object
color_id             object
color                object
headline             object
link                 object
price                object
fit                  object
composition          object
description          object
text                 object
comp                 object
lining               object
cotton               object
polyester            object
spandex              object
modal                object
elastomultiester     object
is_promo              int64
final_price         float64
original_price      float64
date                 object
dtype: object

In [148]:
# count missing values per column (nothing is filled here, the fill happens in the next cell)
df_final.isna().sum()


sku                   0
product_id            0
color_id              0
color                 0
headline              0
link                  0
price                 0
fit                   0
composition           0
description           0
text                  0
comp                  0
lining                0
cotton                0
polyester           152
spandex              35
modal               173
elastomultiester    169
is_promo              0
final_price           0
original_price        0
date                  0
dtype: int64

In [149]:
# fill NAs as 0 on rows that do not have the above material
# (fillna is applied to the whole frame, so an NA in any column becomes 0)
df_final.fillna(0, inplace = True)

In [150]:
# Cast the material-percentage columns and the promo flag to integers,
# then parse the scrape timestamp into a real datetime column.

# columns that keep their current dtype (text, float prices, timestamp)
str_cols = ['sku','product_id','color_id', 'color', 'fit', 'price', 'final_price', 'original_price', 'headline', 'description', 'composition', 'comp', 'lining', 'text', 'link', 'date']

# every remaining column is cast to integer; filtering (instead of list.remove)
# does not raise when one of str_cols is missing from the frame, e.g. when this
# cell is re-run after 'price' / 'composition' / 'comp' have been dropped
cols_to_num = [col for col in df_final.columns if col not in str_cols]
print(cols_to_num)

for col in cols_to_num:
    # parse to a numeric dtype first, so string values such as '7.0' are accepted
    # (unparseable values still raise, they are not silently discarded)
    numeric = pd.to_numeric(df_final[col])
    if numeric.isna().any():
        # plain int64 cannot hold NA: fall back to pandas' nullable integer dtype
        df_final[col] = numeric.astype('Int64')
    else:
        df_final[col] = numeric.astype(int)

# converting date to datetime (values that cannot be parsed become NaT)
df_final['date'] = pd.to_datetime( df_final['date'], errors = 'coerce')

['cotton', 'polyester', 'spandex', 'modal', 'elastomultiester', 'is_promo']


In [151]:
# verify the conversion: material columns and is_promo should now be integers, date a datetime64
df_final.dtypes

sku                         object
product_id                  object
color_id                    object
color                       object
headline                    object
link                        object
price                       object
fit                         object
composition                 object
description                 object
text                        object
comp                        object
lining                      object
cotton                       int64
polyester                    int64
spandex                      int64
modal                        int64
elastomultiester             int64
is_promo                     int64
final_price                float64
original_price             float64
date                datetime64[ns]
dtype: object

# Data Saving

## Saving Locally

In [None]:
# remove the columns that are not persisted: 'price', 'composition' and 'comp'
df_final.drop(columns = ['price', 'composition', 'comp'], inplace = True)

In [None]:
# Local CSV backup of the cleaned frame, named after the moment of saving
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# relative path (can't be used on bash script)
# backup_path = '../backups/df_backup-{}.csv'.format(now)

# absolute path
backup_path = '/home/ubuntu/project/Web-Scraping-Jeans/backups/df_backup-{}.csv'.format(now)
df_final.to_csv(backup_path, index = False)

print('saved: df_backup-{}.csv'.format(now))

saved: df_backup-2022-04-19 10:56:51.csv


In [152]:
# re-check the dtypes right before the database insert
# NOTE(review): the saved output still lists 'price', 'composition' and 'comp',
# so the drop cell above was apparently not executed in that session - confirm the intended order
df_final.dtypes

sku                         object
product_id                  object
color_id                    object
color                       object
headline                    object
link                        object
price                       object
fit                         object
composition                 object
description                 object
text                        object
comp                        object
lining                      object
cotton                       int64
polyester                    int64
spandex                      int64
modal                        int64
elastomultiester             int64
is_promo                     int64
final_price                float64
original_price             float64
date                datetime64[ns]
dtype: object

## Inserting data to MySQL on AWS

In [None]:
# reading credentials

# relative path (can't be used on bash script)
# secrets_path = './secrets/secrets.json'

# absolute path
secrets_path = '/home/ubuntu/project/Web-Scraping-Jeans/src/secrets/secrets.json'

# the context manager closes the file as soon as the JSON has been parsed
# (a bare open() left the handle open for the rest of the kernel session)
with open(secrets_path) as secrets_json:
    secrets = json.load(secrets_json)

dialect =   secrets["dialect"]
driver =    secrets["driver"]
host =      secrets["host"]
username =  secrets["username"]
password =  secrets["password"]
port =      secrets["port"]
database =  secrets["database"]

# connection string in the form dialect+driver://username:password@host:port/database
# NOTE: this rebinds the global `url` that held the scraping URL earlier in the notebook
# NOTE(review): a password containing characters such as '@' or '/' presumably has to be
# URL-escaped before being placed in this string - confirm against the stored secret
url = "{}+{}://{}:{}@{}:{}/{}".format(dialect, driver, username, password, host, port, database)

In [None]:
# instantiating the engine and a session

# creating sqlalchemy engine for connection
# (echo=True: the engine logs the SQL it emits, see the output of the insert cell)
engine = create_engine(url, echo=True)

# creating a Session class bound to that engine
Session = sessionmaker(bind=engine)

# creating a session
session = Session()

In [None]:
# Append the scraped rows to `hm_showroom`. If that fails (e.g. the scrape
# returned a column the table does not have) fall back to a dated backup table.
# NOTE(review): to_sql is given `engine`, not `session`, so the session
# commit/rollback calls below presumably do not govern the insert - confirm.

try:
    # adding data
    df_final.to_sql('hm_showroom', con = engine, if_exists='append', index = False)

    # committing changes
    session.commit()
except Exception as main_error:
    # `Exception` instead of a bare except: Ctrl-C / SystemExit still stop the job,
    # and the reason for the failure is reported instead of being thrown away
    print('insert into hm_showroom failed: {!r}'.format(main_error))
    try:
        # in case scraped data returns with a new column, it will be added to a new table
        table_name = "hm_showroom_backup-{}".format(datetime.now().strftime("%Y-%m-%d"))
        df_final.to_sql( table_name, con = engine, if_exists='append', index = False)

        session.commit()
        print('data written to fallback table: {}'.format(table_name))
    except Exception as backup_error:
        # if even this fails, undo everything
        session.rollback()
        # still best-effort (nothing is re-raised), but the loss is now visible in the job log
        print('insert into fallback table failed, data NOT stored: {!r}'.format(backup_error))

finally:
    session.close()

# USE THIS SCRIPT TO PREVENT FAILS

2022-04-19 11:14:47,131 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-04-19 11:14:47,133 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-04-19 11:14:47,176 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2022-04-19 11:14:47,178 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-04-19 11:14:47,201 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2022-04-19 11:14:47,202 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-04-19 11:14:47,267 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2022-04-19 11:14:47,269 INFO sqlalchemy.engine.Engine [generated in 0.00163s] {'table_schema': 'humberto_personal_projects', 'table_name': 'hm_showroom'}
2022-04-19 11:14:47,316 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-04-19 11:14:47,351 INFO sqlalchemy.engine.Engine INSERT INTO hm_showroom (sku, product_id, color_id, color, headline, link, fit, description, text, lining, cotton, poly

In [None]:
# NOTE(review): the insert cell above already closes the session in its `finally` block, so this call looks redundant
session.close()

In [None]:
# end-of-run marker; NOTE(review): it is printed even when the insert failed, because the insert cell catches its own exceptions
print('Job done')

**Brainstorm of solutions**
- increase sleep time
- change EC.wait from located to visible
- add implicit wait
- open and close a new driver
- add explicit wait after loading page
- clear driver cache at the end of each execution

Solution Attempts

- remove sleep and use visibility_of_element_located 
    - 5/170
- use visibility_of_element_located
    - 26/170
- add an implicit wait and use `time.sleep(10)` (v6)
    - 51/176
- calling driver.close on every loop iteration doesn't work
    - driver.close and driver.quit both kill the process if using a single page
- use an implicit wait, visibility_of_element_located, an explicit sleep, and open and close a driver for each page (safest way)
    - worked well so far

Conclusions

Due to some error, the Mozilla (Firefox) driver wasn't working properly when used continuously. For this reason I am opening and closing a new driver for each page scraped;
I also added a 20 s implicit wait to make sure the page was properly loaded.