# Imports

In [3]:
import re
import sqlite3
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from sqlalchemy import create_engine

In [4]:
# functions

def request_soup(url_link):
    """Download a page and parse it with BeautifulSoup.

    Parameters
    ----------
    url_link : str
        Address of the page to request.

    Returns
    -------
    bs4.BeautifulSoup
        Document parsed from the response text with the 'html.parser' backend.
    """
    # a browser-like User-Agent is sent with the request
    headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'}
    # request the address given by the caller, not a notebook-level `url` variable
    page = requests.get(url_link, headers=headers)
    soup_obj = BeautifulSoup(page.text, 'html.parser')
    return soup_obj

##  Desired Output Format

In [5]:
df_b = pd.read_csv('./backups/df_backup-2021-12-14_16_43_22.csv')

In [6]:
# exploratory look at the loaded backup; the notebook renders only the last
# expression of a cell, so just the column index is shown
df_b.Fit.unique()
df_b.Composition[0]
df_b.columns

Index(['Art. No.', 'style_id', 'color_id', 'color', 'Fit', 'Price',
       'Composition', 'texts', 'Cotton', 'Polyester', 'Elastane',
       'Elasterell-P', 'Modal', 'Viscose', 'link', 'date'],
      dtype='object')

sku, product_id, color_id, color, fit, price, head_line, 'cotton', 'polyester', 'elastane', 'elasterell_p', 'spandex', 'modal', 'viscose', pocket_lining, text

# Data Requesting

##  Home Page Scraping

In [7]:
# all products url (men's jeans listing)
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# headers for request: a browser-like User-Agent string
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'}

# requesting the listing page
page = requests.get(url=url, headers=headers)

# instantiating the bs4 object from the response text
soup = BeautifulSoup(page.text, 'html.parser')

In [8]:
# element with class 'load-more-products'
p = soup.find('div', class_='load-more-products')

# its <h2> carries both counters as data attributes
counters = p.find('h2')
all_products, products_per_page = (
    int(counters.get(attr)) for attr in ('data-total', 'data-items-shown')
)

# number of listing pages needed to cover every product, rounded up
total_pages = np.ceil(all_products / products_per_page)


##  All products in Home Page Scraping

In [9]:
# request a single listing page large enough to hold every product
page_size = int(total_pages * products_per_page)
url_all_prods = f'{url}?&offset=0&page-size={page_size}'

all_prods = requests.get(url_all_prods, headers=headers)

In [10]:
soup = BeautifulSoup(all_prods.text, 'html.parser')

# every <li class="product-item"> found in the full listing
products = soup.find_all('li', class_='product-item')

# absolute address of each product page, taken from the first anchor of the item
home_links = []
for card in products:
    home_links.append('https://www2.hm.com' + card.find('a').get('href'))

##  All products in Each Product Page

In [11]:
# resulting list of all product pages to scrape
links = []

for link in home_links:
    # request each product page found in the listing
    single_product = requests.get(link, headers=headers)
    soup = BeautifulSoup(single_product.text, 'html.parser')

    # list holding the anchors to all products shown on that page
    products_ul = soup.find('ul', class_='inputlist clearfix')
    if products_ul is None:
        # find() returns None when the list is absent; skip this page instead of
        # aborting the whole crawl with an AttributeError
        print('no product list found, skipping: {}'.format(link))
        continue

    products = products_ul.find_all('a')

    # anchors without an href are ignored (None cannot be concatenated to the prefix)
    links.extend(
        'https://www2.hm.com' + item.get('href')
        for item in products
        if item.get('href')
    )

In [12]:
# keep each product address once, in ascending order
links = sorted(set(links))

In [13]:
# one record per product listed on each page; the dataframe is built once after
# the loop instead of being re-concatenated on every iteration
prod_columns = ['sku', 'product_id', 'color_id', 'color', 'headline', 'link']
records = []

for page_link in links:

    # request each product page
    single_product = requests.get(page_link, headers=headers)
    soup = BeautifulSoup(single_product.text, 'html.parser')

    # list of products shown on the page and the page headline
    products_ul = soup.find('ul', class_='inputlist clearfix')
    headline_tag = soup.find('h1', class_='primary product-item-headline')
    if products_ul is None or headline_tag is None:
        # find() returns None for a missing element; skip the page explicitly
        print('unexpected page layout, skipping: {}'.format(page_link))
        continue

    headline = headline_tag.text

    for product in products_ul.find_all('a'):
        # article code; the last three characters are the color id, the rest the product id
        sku = product.get('data-articlecode')
        if not sku:
            # anchor without an article code cannot be split into ids
            continue

        records.append({
            'sku': sku,
            'product_id': sku[:-3],
            'color_id': sku[-3:],
            'color': product.get('data-color'),
            'headline': headline,
            'link': 'https://www2.hm.com/en_us/productpage.{}.html'.format(sku),
        })

# explicit columns keep the schema stable even when nothing was scraped
df_prods = (
    pd.DataFrame(records, columns=prod_columns)
    .drop_duplicates('sku')
    .reset_index(drop=True)
)

In [14]:
df_prods.head()

Unnamed: 0,sku,product_id,color_id,color,headline,link
0,427159001,427159,1,Black denim,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...
1,427159002,427159,2,Blue washed out,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...
2,427159003,427159,3,Denim blue,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...
3,427159004,427159,4,Light denim blue,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...
4,427159005,427159,5,Dark denim blue,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...


## Individual Scraping

### Scraping Data

In [34]:
# starting the driver

options = Options()
# pass Firefox's own headless switch; the `Options.headless` setter is deprecated
# in Selenium 4 and no longer available in recent releases
options.add_argument('-headless')
driver = webdriver.Firefox(options=options)

In [16]:
links[0]

'https://www2.hm.com/en_us/productpage.0427159001.html'

### Product Description

In [17]:
driver.get("https://www2.hm.com/en_us/productpage.0811993021.html")

# class_ = "BodyText-module--general__32l6J" # if below doesn't work
class_ = "ProductDescription-module--descriptionText__1zy9P"

# wait up to 10 seconds for the description element; WebDriverWait.until raises
# TimeoutException when it never shows up, in which case 'NA' is used
try:
    content = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, class_)))
    desc = content.text
except TimeoutException:
    desc = 'NA'

desc

'H&M Essentials. No. 2: The Jeans. 5-pocket jeans in stretch cotton denim. Regular waist, zip fly with button, and straight legs with good room for movement over thighs and knees.'

### Text

In [18]:
# driver.get(links[0])
driver.get("https://www2.hm.com/en_us/productpage.1024256004.html")

In [19]:
# collect every element on the current page that carries the attribute-list item class
elements = driver.find_elements(by=By.CLASS_NAME, value="ProductAttributesList-module--descriptionListItem__3vUL2")
# print the visible text of each matched element, one per line
for e in elements:
    print(e.text)

FitSlim fit
CompositionShell: Cotton 99%, Spandex 1%Pocket lining: Polyester 65%, Cotton 35%
More sustainable materialsShell: Recycled cotton 20%Lining: Recycled polyester 65%, Recycled cotton 10%
Art. No.1024256004


In [20]:
# visible text of every matched element, in page order
text = [line.text for line in elements]
text

['FitSlim fit',
 'CompositionShell: Cotton 99%, Spandex 1%Pocket lining: Polyester 65%, Cotton 35%',
 'More sustainable materialsShell: Recycled cotton 20%Lining: Recycled polyester 65%, Recycled cotton 10%',
 'Art. No.1024256004']

In [21]:
# single-string version of the attribute texts, joined with ' /'
text_raw =' /'.join(text)
text_raw

'FitSlim fit /CompositionShell: Cotton 99%, Spandex 1%Pocket lining: Polyester 65%, Cotton 35% /More sustainable materialsShell: Recycled cotton 20%Lining: Recycled polyester 65%, Recycled cotton 10% /Art. No.1024256004'

In [22]:
# searching for the words 'fit' and 'Composition' in the texts retrieved from the product page;
# both start as 'NA' so a page without them prints 'NA' instead of raising NameError
# or showing a value left over from a previous run
fit = 'NA'
composition = 'NA'
for element in text:
    if 'fit' in element:
        fit = element
    if 'Composition' in element:
        composition = element
print(fit)
print(composition)

FitSlim fit
CompositionShell: Cotton 99%, Spandex 1%Pocket lining: Polyester 65%, Cotton 35%


### Price

In [23]:
driver.get("https://www2.hm.com/en_us/productpage.0811993021.html")

# class of the price element on the product page
class_price = "ProductPrice-module--productItemPrice__2i2Hc"

# wait up to 10 seconds for the price element to be present
element = WebDriverWait(driver, 10).until( EC.presence_of_element_located( (By.CLASS_NAME, class_price) ) )
# NOTE(review): the text may hold two prices back to back (e.g. '$29.99$22.99');
# presumably original and discounted price — confirm against the page
element.text

'$29.99$22.99'

### Joining Everything

In [35]:
# classes of the page elements that hold each scraped field
class_price = 'ProductPrice-module--productItemPrice__2i2Hc'
class_desc = "ProductDescription-module--descriptionText__1zy9P"
class_text = 'ProductAttributesList-module--descriptionListItem__3vUL2'

comp_columns = ['sku', 'price', 'fit', 'composition', 'description', 'text']
records = []

for idx, link in enumerate(df_prods['link']):
    # sku is the fourth dot-separated piece of the product address
    sku = link.split('.')[3]
    print('scraping page {}/{}: {}'.format(idx + 1, len(df_prods), link))

    # load web page
    driver.get(link)

    # get price; the wait itself is inside the try because it is the call that
    # raises TimeoutException when the element never appears
    try:
        content = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, class_price)))
        price = content.text
    except TimeoutException:
        price = 'NA'

    # get product description, 'NA' when the page has none
    try:
        content = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, class_desc)))
        desc = content.text
    except TimeoutException:
        desc = 'NA'

    # get the attribute texts; a page without them yields an empty list so the
    # remaining products are still scraped
    try:
        contents = WebDriverWait(driver, timeout=10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, class_text)))
        text = [line.text for line in contents]
    except TimeoutException:
        text = []

    # separate fit and composition from text
    # if fit or composition is not informed they stay as 'NA'
    fit = 'NA'
    composition = 'NA'
    for element in text:
        if 'fit' in element:
            fit = element
        if 'Composition' in element:
            composition = element

    # saving raw text
    text_raw = ' /'.join(text)

    # saving results
    records.append({
        'sku': sku,
        'price': price,
        'fit': fit,
        'composition': composition,
        'description': desc,
        'text': text_raw,
    })

# build the frame once; explicit columns keep the schema when nothing was scraped
df_comp = pd.DataFrame(records, columns=comp_columns)

scraping page 1/177: https://www2.hm.com/en_us/productpage.0427159001.html
scraping page 2/177: https://www2.hm.com/en_us/productpage.0427159002.html
scraping page 3/177: https://www2.hm.com/en_us/productpage.0427159003.html
scraping page 4/177: https://www2.hm.com/en_us/productpage.0427159004.html
scraping page 5/177: https://www2.hm.com/en_us/productpage.0427159005.html
scraping page 6/177: https://www2.hm.com/en_us/productpage.0427159006.html
scraping page 7/177: https://www2.hm.com/en_us/productpage.0427159007.html
scraping page 8/177: https://www2.hm.com/en_us/productpage.0427159008.html
scraping page 9/177: https://www2.hm.com/en_us/productpage.0427159010.html
scraping page 10/177: https://www2.hm.com/en_us/productpage.0427159011.html
scraping page 11/177: https://www2.hm.com/en_us/productpage.0427159017.html
scraping page 12/177: https://www2.hm.com/en_us/productpage.0427159022.html
scraping page 13/177: https://www2.hm.com/en_us/productpage.0427159023.html
scraping page 14/177:

In [36]:
df_comp.head()

Unnamed: 0,sku,price,fit,composition,description,text
0,427159001,$39.99,FitSkinny fit,"CompositionCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste..."
1,427159002,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste..."
2,427159003,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste..."
3,427159004,$39.99,FitSkinny fit,"CompositionCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ..."
4,427159005,$39.99,FitSkinny fit,"CompositionCotton 72%, Polyester 20%, Modal 7%...",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste..."


In [26]:
# description of the page currently loaded in the driver
class_ = "ProductDescription-module--descriptionText__1zy9P"
try:
    content = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, class_)))
    desc = content.text
except TimeoutException:
    # WebDriverWait.until raises TimeoutException when the element never appears
    desc = 'NA'

desc

'Edition by is an uncompromising collection of workwear-inspired garments and accessories made from more sustainable materials. Jeans in thick denim made from recycled cotton and dyed with natural, plant-based indigo. Zip fly with button, coin pocket, front pockets, and back pockets. Straight legs. Style with other pieces from the collection for a complete look.'

In [27]:
# list the last three product links; idx counts from 0 within the slice,
# not within df_prods
for idx, link in enumerate(df_prods['link'][-3:]):
    print(idx, ' ', link)


0   https://www2.hm.com/en_us/productpage.1048642002.html
1   https://www2.hm.com/en_us/productpage.1049466001.html
2   https://www2.hm.com/en_us/productpage.1063426001.html


In [28]:
df_comp

Unnamed: 0,sku,price,fit,composition,description,text
0,1048642002,$39.99,FitRegular fit,"CompositionShell: Cotton 99%, Spandex 1%Pocket...",5-pocket jeans in stretch cotton denim with a ...,"FitRegular fit /CompositionShell: Cotton 99%, ..."
1,1049466001,$39.99,FitLoose fit,CompositionCotton 100%,"Jeans in thick cotton denim. Regular waist, zi...",FitLoose fit /CompositionCotton 100% /More sus...
2,1063426001,$49.99,,CompositionCotton 100%,Edition by is an uncompromising collection of ...,"SizeThe model is 188cm/6'2"" and wears a size 3..."


In [29]:
# only the last expression of a cell is rendered, so the checks are printed explicitly
print(df_comp.shape)
print(df_comp.isna().sum())

# timestamp without spaces or colons (colons are not valid in Windows file names),
# same pattern as the existing './backups/df_backup-2021-12-14_16_43_22.csv'
now = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')

df_comp.to_csv(path_or_buf='./backups/df_comp-{}.csv'.format(now), index=False)

In [30]:
driver.get("https://www2.hm.com/en_us/productpage.1024256001.html")

In [31]:
# collect every element on the current page that carries the attribute-list item class
elements = driver.find_elements(by=By.CLASS_NAME, value="ProductAttributesList-module--descriptionListItem__3vUL2")
# print the visible text of each matched element, one per line
for e in elements:
    print(e.text)


SizeThe model is 185cm/6'1" and wears a size 31/32
FitSlim fit
CompositionShell: Cotton 99%, Spandex 1%Pocket lining: Polyester 65%, Cotton 35%
More sustainable materialsShell: Recycled cotton 20%Lining: Recycled cotton 20%
Art. No.1024256001


In [32]:
driver.quit()

# TODO

- separate compositions
- create feature ispromo
- create final df using:

sku, product_id, color_id, color, fit, price, head_line, 'cotton', 'polyester', 'elastane', 'elasterell_p', 'spandex', 'modal', 'viscose', pocket_lining, text

# Data Parsing

### Composition

In [188]:
df_comp.head()

Unnamed: 0,sku,price,fit,composition,description,text
0,427159001,$39.99,FitSkinny fit,"CompositionCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste..."
1,427159002,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste..."
2,427159003,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste..."
3,427159004,$39.99,FitSkinny fit,"CompositionCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ..."
4,427159005,$39.99,FitSkinny fit,"CompositionCotton 72%, Polyester 20%, Modal 7%...",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste..."


In [220]:
def _split_composition(text):
    """Split a raw composition string into (main composition, lining composition).

    Parameters
    ----------
    text : str
        Value of the 'composition' column, e.g.
        'CompositionShell: Cotton 99%, Spandex 1%Pocket lining: Polyester 65%, Cotton 35%'.

    Returns
    -------
    tuple of (str, str)
        The composition that follows the 'Composition' label (without the optional
        'Shell:' label and without any later labelled section), or 'NA' when it
        cannot be found; and the text after 'lining:' / 'Lining:', or
        'Not Informed' when there is no such section.
    """
    if not isinstance(text, str):
        return 'NA', 'Not Informed'

    found = re.search(r'Composition(.*)', text)
    if found is None:
        return 'NA', 'Not Informed'

    # drop the optional 'Shell:' label, then cut at the next '<Label>:' section
    # (e.g. 'Pocket lining:' or 'Lining:'); the first material is not assumed to be Cotton
    body = re.sub(r'^\s*Shell:\s*', '', found.group(1))
    comp = re.split(r'[A-Z][A-Za-z ]*:', body, maxsplit=1)[0].strip()

    # lining section runs from its label to the end of the line
    lining_found = re.search(r'[Ll]ining:\s*(.*)', text)
    lining = lining_found.group(1) if lining_found else 'Not Informed'

    return comp or 'NA', lining


df_test = df_comp.copy()

comps = []
linings = []

for text in df_test['composition']:
    comp, lining = _split_composition(text)
    comps.append(comp)
    linings.append(lining)

df_test['comp'] = comps
df_test['lining'] = linings

In [221]:
df_test.head()

Unnamed: 0,sku,price,fit,composition,description,text,comp,lining
0,427159001,$39.99,FitSkinny fit,"CompositionCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste...","Cotton 91%, Polyester 7%, Spandex 2%",Not Informed
1,427159002,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed
2,427159003,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed
3,427159004,$39.99,FitSkinny fit,"CompositionCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ...","Cotton 99%, Spandex 1%",Not Informed
4,427159005,$39.99,FitSkinny fit,"CompositionCotton 72%, Polyester 20%, Modal 7%...",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste...","Cotton 72%, Polyester 20%, Modal 7%, Spandex 1%",Not Informed


In [222]:
def composition_to_df(list_of_comp):
    """Create a one-row dataframe from a tokenised composition.

    Parameters
    ----------
    list_of_comp : list of str
        Composition split on spaces, e.g. ['Cotton', '99%,', 'Spandex', '1%'].

    Returns
    -------
    pandas.DataFrame
        Single row (index 0) with one column per material; each value is the
        percentage as a string without '%' or ','. Input with no percentage
        token (e.g. ['NA']) gives a row with no columns.
    """
    res = {}
    name_parts = []

    for element in list_of_comp:
        element = element.strip()
        if not element:
            continue
        # a percentage token closes the material name collected so far, so names
        # made of several words ('Recycled cotton') stay paired with their value
        if re.fullmatch(r'\d+(?:\.\d+)?%,?', element):
            if name_parts:
                res[' '.join(name_parts)] = element.strip('%,')
            name_parts = []
        else:
            name_parts.append(element)

    # final dataframe
    return pd.DataFrame(res, index=[0])

# one single-row frame per product, stacked with a single concat instead of
# re-concatenating the growing frame on every iteration
comp_frames = [composition_to_df(composition.split(' ')) for composition in df_test['comp']]

# concat raises on an empty list, so fall back to an empty frame
if comp_frames:
    df_comp_split = pd.concat(comp_frames, axis=0).reset_index(drop=True)
else:
    df_comp_split = pd.DataFrame()

In [223]:

# attach the per-material columns side by side; rows are paired through the index
# NOTE(review): running this cell again appends the material columns a second time
# because df_test is overwritten with the widened frame
df_test = pd.concat( [df_test, df_comp_split], axis = 1 )
df_test.head()


Unnamed: 0,sku,price,fit,composition,description,text,comp,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester
0,427159001,$39.99,FitSkinny fit,"CompositionCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste...","Cotton 91%, Polyester 7%, Spandex 2%",Not Informed,91,7.0,2,,
1,427159002,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,
2,427159003,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,
3,427159004,$39.99,FitSkinny fit,"CompositionCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ...","Cotton 99%, Spandex 1%",Not Informed,99,,1,,
4,427159005,$39.99,FitSkinny fit,"CompositionCotton 72%, Polyester 20%, Modal 7%...",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste...","Cotton 72%, Polyester 20%, Modal 7%, Spandex 1%",Not Informed,72,20.0,1,7.0,


### Fit

In [276]:
# positive lookbehind + the words being searched + positive lookahead:
# the text between the leading 'Fit' label and the trailing ' fit'
regex = r"(?<=Fit).*(?= fit)"


def _clean_fit(raw):
    """Return the fit name found in a raw 'Fit<name> fit' string.

    Values that do not match the pattern (e.g. 'NA' or an already cleaned
    name) are returned unchanged, so the cell can be run more than once;
    non-string values become 'NA'.
    """
    if not isinstance(raw, str):
        return 'NA'
    found = re.search(regex, raw)
    return found.group(0) if found else raw


df_test['fit'] = df_test['fit'].apply(_clean_fit)
df_test.head()

Unnamed: 0,sku,price,fit,composition,description,text,comp,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester
0,427159001,$39.99,Skinny,"CompositionCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste...","Cotton 91%, Polyester 7%, Spandex 2%",Not Informed,91,7.0,2,,
1,427159002,$39.99,Skinny,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,
2,427159003,$39.99,Skinny,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,
3,427159004,$39.99,Skinny,"CompositionCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ...","Cotton 99%, Spandex 1%",Not Informed,99,,1,,
4,427159005,$39.99,Skinny,"CompositionCotton 72%, Polyester 20%, Modal 7%...",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste...","Cotton 72%, Polyester 20%, Modal 7%, Spandex 1%",Not Informed,72,20.0,1,7.0,


### Price

In [289]:
# two prices back to back; raw string so the backslashes reach the regex engine
# untouched, escaped dots so only a literal '.' is accepted, and \d+ so the
# dollar part is not limited to two digits
regex = r"\$\d+\.\d{2}\$\d+\.\d{2}"
price = '$22.22$11.11'

bool(re.match(regex, price))

True

In [376]:
def _parse_price(raw):
    """Parse a scraped price string.

    Parameters
    ----------
    raw : str
        Text of the price element, e.g. '$39.99' or '$39.99$18.99'.

    Returns
    -------
    tuple of (int, str, str)
        (isPromo, originalPrice, finalPrice). isPromo is 1 when two or more
        prices are present, else 0. The prices are the first and the last
        amount found, without the '$'. When no amount is found (e.g. 'NA')
        the result is (0, 'NA', 'NA').
    """
    # any number of dollar digits, a literal dot and two cent digits
    amounts = re.findall(r"\$(\d+\.\d{2})", raw) if isinstance(raw, str) else []
    if not amounts:
        return 0, 'NA', 'NA'
    return int(len(amounts) > 1), amounts[0], amounts[-1]


parsed_prices = [_parse_price(value) for value in df_test['price']]

# if there are 2 prices then there is a discount/promo
df_test['isPromo'] = [is_promo for is_promo, _, _ in parsed_prices]

# first price listed
df_test['originalPrice'] = [original for _, original, _ in parsed_prices]

# last price listed (equal to the first one when there is no promo)
df_test['finalPrice'] = [final for _, _, final in parsed_prices]


In [377]:
df_test[df_test['isPromo'] == True].head()

Unnamed: 0,sku,price,fit,composition,description,text,comp,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester,isPromo,originalPrice,finalPrice
6,427159007,$39.99$18.99,Skinny,"CompositionCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste...","Cotton 91%, Polyester 7%, Spandex 2%",Not Informed,91,7.0,2.0,,,1,39.99,18.99
7,427159008,$39.99$18.99,Skinny,"CompositionCotton 98%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 98%, Spandex ...","Cotton 98%, Spandex 2%",Not Informed,98,,2.0,,,1,39.99,18.99
9,427159011,$39.99$24.99,Skinny,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1.0,,,1,39.99,24.99
10,427159017,$39.99$24.99,Skinny,CompositionCotton 100%,5-pocket jeans in washed stretch denim. Heavil...,FitSkinny fit /CompositionCotton 100% /Art. No...,Cotton 100%,Not Informed,100,,,,,1,39.99,24.99
12,427159023,$39.99$21.99,Skinny,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1.0,,,1,39.99,21.99


### Headline

In [396]:
# trim the newlines, tabs and spaces surrounding each scraped headline

headline_trimmed = df_prods['headline'].map(lambda title: title.strip('\n\t '))
df_prods['headline'] = headline_trimmed
df_prods['headline'].value_counts()

Slim Jeans                        33
Skinny Jeans                      26
Regular Jeans                     23
Trashed Skinny Jeans              15
Relaxed Jeans                     14
Slim Tapered Jeans                14
Hybrid Regular Tapered Joggers     7
Tapered Jeans                      6
Regular Tapered Crop Jeans         6
Skinny Cropped Jeans               5
Freefit® Slim Jeans                5
Relaxed Tapered Pull-on Jeans      4
Loose Jeans                        4
Hybrid Regular Denim Joggers       3
Regular Denim Joggers              3
Relaxed Pull-on Jeans              3
Relaxed Denim Joggers              2
Regular Bootcut Jeans              2
Loose Carpenter Jeans              1
Cotton Denim Jeans                 1
Name: headline, dtype: int64

In [422]:
# join product metadata and parsed page data on the sku key; a side-by-side concat
# pairs rows by position, which mixes different products whenever the page scrape
# covered only part of df_prods or ran in a different order
df_final = df_prods.merge(df_test, on='sku', how='left')

# scrape timestamp stored with every row
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
df_final['date'] = now
df_final.head()

Unnamed: 0,sku,product_id,color_id,color,headline,link,price,fit,composition,description,...,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester,isPromo,originalPrice,finalPrice,date
0,427159001,427159,1,Black denim,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"CompositionCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,91,7.0,2,,,0,39.99,39.99,2022-04-13 21:49:54
1,427159002,427159,2,Blue washed out,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,93,6.0,1,,,0,39.99,39.99,2022-04-13 21:49:54
2,427159003,427159,3,Denim blue,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,93,6.0,1,,,0,39.99,39.99,2022-04-13 21:49:54
3,427159004,427159,4,Light denim blue,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"CompositionCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,99,,1,,,0,39.99,39.99,2022-04-13 21:49:54
4,427159005,427159,5,Dark denim blue,Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...,$39.99,Skinny,"CompositionCotton 72%, Polyester 20%, Modal 7%...",5-pocket jeans in washed stretch denim. Heavil...,...,Not Informed,72,20.0,1,7.0,,0,39.99,39.99,2022-04-13 21:49:54


In [418]:
df_final.columns

Index(['sku', 'product_id', 'color_id', 'color', 'headline', 'link', 'price',
       'fit', 'composition', 'description', 'text', 'comp', 'lining', 'Cotton',
       'Polyester', 'Spandex', 'Modal', 'Elastomultiester', 'isPromo',
       'originalPrice', 'finalPrice', 'date'],
      dtype='object')

In [423]:
# scraped column name -> final column name; a single mapping keeps each pair together
final_cols = {
    'sku': 'sku',
    'product_id': 'product_id',
    'color_id': 'color_id',
    'color': 'color',
    'fit': 'fit',
    'finalPrice': 'final_price',
    'originalPrice': 'original_price',
    'headline': 'headline',
    'Cotton': 'cotton',
    'Polyester': 'polyester',
    'Spandex': 'spandex',
    'Modal': 'modal',
    'Elastomultiester': 'elastomultiester',
    'isPromo': 'is_promo',
    'description': 'description',
    'text': 'text',
    'link': 'link',
    'date': 'date',
}
selected_cols = list(final_cols)
rename_cols = list(final_cols.values())

# select and rename in one expression: a new frame is returned instead of
# renaming a column slice in place
df_final = df_final[selected_cols].rename(columns=final_cols)
df_final.head()



Unnamed: 0,sku,product_id,color_id,color,fit,final_price,original_rice,headline,cotton,polyester,spandex,modal,elastomultiester,is_promo,description,text,link,date
0,427159001,427159,1,Black denim,Skinny,39.99,39.99,Trashed Skinny Jeans,91,7.0,2,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
1,427159002,427159,2,Blue washed out,Skinny,39.99,39.99,Trashed Skinny Jeans,93,6.0,1,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
2,427159003,427159,3,Denim blue,Skinny,39.99,39.99,Trashed Skinny Jeans,93,6.0,1,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
3,427159004,427159,4,Light denim blue,Skinny,39.99,39.99,Trashed Skinny Jeans,99,,1,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
4,427159005,427159,5,Dark denim blue,Skinny,39.99,39.99,Trashed Skinny Jeans,72,20.0,1,7.0,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54


# Data Saving

In [425]:
# timestamp without spaces or colons (colons are not valid in Windows file names),
# same pattern as the existing './backups/df_backup-2021-12-14_16_43_22.csv'
now = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')

# raw scrape; index=False so reloading the file does not add an 'Unnamed: 0' column
df_comp.to_csv(path_or_buf='./backups/df_comp-{}.csv'.format(now), index=False)

# saving df as a local backup
df_final.to_csv('./backups/df_backup-{}.csv'.format(now), index=False)