# Imports

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import re
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import sqlite3

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# functions

def request_soup(url_link):    
    headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'}    
    page = requests.get( url, headers = headers)
    soup_obj = BeautifulSoup(page.text, 'html.parser')
    return( soup_obj )

##  Desired Output Format

In [5]:
df_b = pd.read_csv('./backups/df_backup-2021-12-14_16_43_22.csv')

In [6]:
df_b.Fit.unique()
df_b.Composition[0]
df_b.columns

Index(['Art. No.', 'style_id', 'color_id', 'color', 'Fit', 'Price',
       'Composition', 'texts', 'Cotton', 'Polyester', 'Elastane',
       'Elasterell-P', 'Modal', 'Viscose', 'link', 'date'],
      dtype='object')

sku, product_id, color_id, color, fit, price, head_line, 'cotton', 'polyester', 'elastane', 'elasterell_p', 'spandex' 'modal', 'viscose', pocket_lining, text

# Data Requesting

##  Home Page Scraping

In [7]:
# all products url
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# headers for request
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'}

# requesting
page = requests.get(url=url, headers=headers)

# instatiating bs4 object
soup = BeautifulSoup(page.text, 'html.parser')

In [8]:
# finding load more products element
p = soup.find('div', class_='load-more-products')

# all products
all_products = int(p.find('h2').get('data-total'))

# products per page
products_per_page = int(p.find('h2').get('data-items-shown'))

# rounding up numer of pages needed for web scraping
total_pages = np.ceil(all_products/products_per_page)


##  All products in Home Page Scraping

In [9]:
# creating a page with all products
url_all_prods = url + '?&offset=0&page-size={}'.format(int(total_pages*products_per_page))

all_prods = requests.get(url = url_all_prods, headers=headers)

In [10]:
soup = BeautifulSoup(all_prods.text, 'html.parser')#.get('li', class_='product-item')

# soup.find('li', class_ = 'product-item').find('a').get('href') #.get('item-link')  #.get('item-link') #, class_ = 'item-link')
# all find all products listed in homepage
products = soup.find_all('li', class_='product-item')

# get link to all projects
home_links = ['https://www2.hm.com' + link.find('a').get('href') for link in products ]

##  All products in Each Product Page

In [11]:
# resulting list of all products to scrap
links = []

for link in home_links:
    # scrap each product in home page list
    single_product = requests.get(link, headers = headers)
    soup = BeautifulSoup(single_product.text, 'html.parser')

    # gets the links to all products listed in a page
    products_ul = soup.find('ul', class_='inputlist clearfix')
    products = products_ul.find_all('a')

    links_ul = []
    links_ul = [ 'https://www2.hm.com' + item.get('href') for item in products]
    links.extend(links_ul)

In [12]:
# getting all unique products listed

# converting to a set and then back to list
links = list(set(links))
links.sort()

In [13]:
# defining base dataframe
# df_prods = pd.DataFrame(columns=['product_id', 'color', 'style_id', 'color_id', 'link'])
df_prods = pd.DataFrame()

for link in links:
    
    # scrap each product in home page list
    single_product = requests.get(link, headers = headers)
    soup = BeautifulSoup(single_product.text, 'html.parser')
    
    # scrap all products listed in a page
    products_ul = soup.find('ul', class_='inputlist clearfix')
    products = products_ul.find_all('a')

    # product headline
    headline = soup.find('h1', class_='primary product-item-headline').text


    for product in products:
        
        #product it
        sku = product.get('data-articlecode')
       
        # color
        color = product.get('data-color')
        
        # product id
        product_id = sku[:-3]
        
        # style id
        color_id = sku[-3:]

        # link
        link = 'https://www2.hm.com/en_us/productpage.{}.html'.format(sku)

        df_temp = pd.DataFrame( {'sku': sku, 'product_id' :product_id, 'color_id' : color_id, 'color': color, 'headline' : headline, 'link': link}, index = [0] )
        
        df_prods = pd.concat([df_prods, df_temp], axis = 0)


df_prods.drop_duplicates('sku',inplace = True)

df_prods.reset_index(inplace = True, drop = True)

In [14]:
df_prods.head()

Unnamed: 0,sku,product_id,color_id,color,headline,link
0,427159001,427159,1,Black denim,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...
1,427159002,427159,2,Blue washed out,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...
2,427159003,427159,3,Denim blue,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...
3,427159004,427159,4,Light denim blue,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...
4,427159005,427159,5,Dark denim blue,\n\t\t\t\t\t\t\t Trashed Skinny Jeans,https://www2.hm.com/en_us/productpage.04271590...


## Individual Scraping

### Scraping Data

In [34]:
# starting drive

options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)

In [16]:
links[0]

'https://www2.hm.com/en_us/productpage.0427159001.html'

### Product Description

In [17]:
driver.get("https://www2.hm.com/en_us/productpage.0811993021.html")

# class_ = "BodyText-module--general__32l6J" # if below doesn't work
class_ = "ProductDescription-module--descriptionText__1zy9P"

try: 
    content = WebDriverWait(driver, 10).until(EC.presence_of_element_located( (By.CLASS_NAME, class_) ))
    desc = content.text
except:
    desc = 'NA'

content = WebDriverWait(driver, 10).until(EC.presence_of_element_located( (By.CLASS_NAME, class_) ))
desc = content.text
desc

'H&M Essentials. No. 2: The Jeans. 5-pocket jeans in stretch cotton denim. Regular waist, zip fly with button, and straight legs with good room for movement over thighs and knees.'

### Text

In [18]:
# driver.get(links[0])
driver.get("https://www2.hm.com/en_us/productpage.1024256004.html")

In [19]:
elements = driver.find_elements(by=By.CLASS_NAME, value="ProductAttributesList-module--descriptionListItem__3vUL2")
for e in elements:
    print(e.text)

FitSlim fit
CompositionShell: Cotton 99%, Spandex 1%Pocket lining: Polyester 65%, Cotton 35%
More sustainable materialsShell: Recycled cotton 20%Lining: Recycled polyester 65%, Recycled cotton 10%
Art. No.1024256004


In [20]:
text = str()

text = [text + line.text  for line  in elements]
text

['FitSlim fit',
 'CompositionShell: Cotton 99%, Spandex 1%Pocket lining: Polyester 65%, Cotton 35%',
 'More sustainable materialsShell: Recycled cotton 20%Lining: Recycled polyester 65%, Recycled cotton 10%',
 'Art. No.1024256004']

In [21]:
text_raw =' /'.join(text)
text_raw

'FitSlim fit /CompositionShell: Cotton 99%, Spandex 1%Pocket lining: Polyester 65%, Cotton 35% /More sustainable materialsShell: Recycled cotton 20%Lining: Recycled polyester 65%, Recycled cotton 10% /Art. No.1024256004'

In [22]:
# searching for words fit and composition in all text retrieved from products web page
for element in text:
    if 'fit' in element:
        fit = element
    if 'Composition' in element:
        composition = element
print(fit)
print(composition)

FitSlim fit
CompositionShell: Cotton 99%, Spandex 1%Pocket lining: Polyester 65%, Cotton 35%


### Price

In [23]:
driver.get("https://www2.hm.com/en_us/productpage.0811993021.html")

class_price = "ProductPrice-module--productItemPrice__2i2Hc"

element = WebDriverWait(driver, 10).until( EC.presence_of_element_located( (By.CLASS_NAME, class_price) ) )
element.text

'$29.99$22.99'

### Joining Everything

In [35]:
df_comp = pd.DataFrame()

for idx, link in enumerate(df_prods['link']):
    # sku
    sku = link.split('.')[3]
    print('scraping page {}/{}: {}'.format( idx+1, len(df_prods), link))
    
    # load web page
    driver.get(link)
    
    # get price
    # price = WebDriverWait(driver, 10).until(lambda x: x.find_element(by=By.CLASS_NAME, value='ProductPrice-module--productItemPrice__2i2Hc'))
    class_price = 'ProductPrice-module--productItemPrice__2i2Hc'
    content = WebDriverWait(driver, 10).until( EC.presence_of_element_located( (By.CLASS_NAME, class_price)) )    
    try:
        price = content.text
    except:
        price = 'NA'
    
    # get product description   
    class_desc = "ProductDescription-module--descriptionText__1zy9P"      
    # test if description exists
    try: 
        content = WebDriverWait(driver, 10).until(EC.presence_of_element_located( (By.CLASS_NAME, class_desc) ))
        desc = content.text
    except:
        desc = 'NA'
    
    # get text
    class_text = 'ProductAttributesList-module--descriptionListItem__3vUL2'
    contents = WebDriverWait(driver, timeout=10).until( EC.presence_of_all_elements_located( (By.CLASS_NAME, class_text) ) )
    
    # concatenate all lines of text
    text = str()
    # list with all text
    text = [text + line.text  for line  in contents]

    # separate fit and composition from text
    # if fit or composition is not informed they'll return NA
    fit = 'NA'
    composition = 'NA'
    for element in text:
        if 'fit' in element:
            fit = element
        if 'Composition' in element:
            composition = element    
    
    # saving raw text
    text_raw =' /'.join(text)
    
    # saving results
    df_aux = pd.DataFrame( {'sku' : sku, 'price' : price, 'fit' : fit, 'composition' : composition, 'description' : desc ,'text' : text_raw,}, index = [0] )
    df_comp = pd.concat( [df_comp, df_aux], axis = 0 )     

df_comp.reset_index(inplace = True, drop = True)

scraping page 1/177: https://www2.hm.com/en_us/productpage.0427159001.html
scraping page 2/177: https://www2.hm.com/en_us/productpage.0427159002.html
scraping page 3/177: https://www2.hm.com/en_us/productpage.0427159003.html
scraping page 4/177: https://www2.hm.com/en_us/productpage.0427159004.html
scraping page 5/177: https://www2.hm.com/en_us/productpage.0427159005.html
scraping page 6/177: https://www2.hm.com/en_us/productpage.0427159006.html
scraping page 7/177: https://www2.hm.com/en_us/productpage.0427159007.html
scraping page 8/177: https://www2.hm.com/en_us/productpage.0427159008.html
scraping page 9/177: https://www2.hm.com/en_us/productpage.0427159010.html
scraping page 10/177: https://www2.hm.com/en_us/productpage.0427159011.html
scraping page 11/177: https://www2.hm.com/en_us/productpage.0427159017.html
scraping page 12/177: https://www2.hm.com/en_us/productpage.0427159022.html
scraping page 13/177: https://www2.hm.com/en_us/productpage.0427159023.html
scraping page 14/177:

In [36]:
df_comp.head()

Unnamed: 0,sku,price,fit,composition,description,text
0,427159001,$39.99,FitSkinny fit,"CompositionCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste..."
1,427159002,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste..."
2,427159003,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste..."
3,427159004,$39.99,FitSkinny fit,"CompositionCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ..."
4,427159005,$39.99,FitSkinny fit,"CompositionCotton 72%, Polyester 20%, Modal 7%...",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste..."


In [26]:
# class_ = "ProductDescription-module--descriptionText__1zy9P"
# e = driver.find_element(by=By.CLASS_NAME, value=class_)
# e.text

class_ = "ProductDescription-module--descriptionText__1zy9P"
try: 
    content = WebDriverWait(driver, 10).until(EC.presence_of_element_located( (By.CLASS_NAME, class_) ))
    desc = content.text
except:
    desc = 'NA'

desc

'Edition by is an uncompromising collection of workwear-inspired garments and accessories made from more sustainable materials. Jeans in thick denim made from recycled cotton and dyed with natural, plant-based indigo. Zip fly with button, coin pocket, front pockets, and back pockets. Straight legs. Style with other pieces from the collection for a complete look.'

In [27]:
for idx, link in enumerate(df_prods['link'][-3:]):
    print(idx, ' ', link)


0   https://www2.hm.com/en_us/productpage.1048642002.html
1   https://www2.hm.com/en_us/productpage.1049466001.html
2   https://www2.hm.com/en_us/productpage.1063426001.html


In [28]:
df_comp

Unnamed: 0,sku,price,fit,composition,description,text
0,1048642002,$39.99,FitRegular fit,"CompositionShell: Cotton 99%, Spandex 1%Pocket...",5-pocket jeans in stretch cotton denim with a ...,"FitRegular fit /CompositionShell: Cotton 99%, ..."
1,1049466001,$39.99,FitLoose fit,CompositionCotton 100%,"Jeans in thick cotton denim. Regular waist, zi...",FitLoose fit /CompositionCotton 100% /More sus...
2,1063426001,$49.99,,CompositionCotton 100%,Edition by is an uncompromising collection of ...,"SizeThe model is 188cm/6'2"" and wears a size 3..."


In [29]:
df_comp.shape
df_comp.isna().sum()

now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

df_comp.to_csv(path_or_buf='./backups/df_comp-{}.csv'.format(now), index= False)

In [30]:
driver.get("https://www2.hm.com/en_us/productpage.1024256001.html")

In [31]:
elements = driver.find_elements(by=By.CLASS_NAME, value="ProductAttributesList-module--descriptionListItem__3vUL2")
for e in elements:
    print(e.text)


SizeThe model is 185cm/6'1" and wears a size 31/32
FitSlim fit
CompositionShell: Cotton 99%, Spandex 1%Pocket lining: Polyester 65%, Cotton 35%
More sustainable materialsShell: Recycled cotton 20%Lining: Recycled cotton 20%
Art. No.1024256001


In [32]:
driver.quit()

# Data Parsing

### Composition

In [188]:
df_comp.head()

Unnamed: 0,sku,price,fit,composition,description,text
0,427159001,$39.99,FitSkinny fit,"CompositionCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste..."
1,427159002,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste..."
2,427159003,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste..."
3,427159004,$39.99,FitSkinny fit,"CompositionCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ..."
4,427159005,$39.99,FitSkinny fit,"CompositionCotton 72%, Polyester 20%, Modal 7%...",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste..."


In [220]:
df_test = df_comp.copy()

comps = []
linings = []

for idx, text in enumerate(df_test['composition']):
    # case 1 pocket lining present
    if 'Pocket' in text:
        regex = '(Shell: .*?=Pocket|Cotton.*(?=Pocket))'
        try:
            comp = re.findall( regex, text)[0]
        except:
            comp = 'NA'
    # case 2 pocket lining not present
    else:
        regex = '(Cotton.*(?=Lining)|Cotton.*(?=lining)|Cotton.*%)'
        try:
            comp = re.findall( regex, text)[0]
        except:
            comp = 'NA'
        # print(df_test.loc[idx, 'sku'] + '|' + text +' | ' + comp)
    
    # geting pocket composition:
    regex = '(?<=lining: ).*'
    try:
        lining = re.findall(regex, text)[0]
    except:
        lining = 'Not Informed'
    linings.append(lining)
    
    comps.append(comp)
df_test['comp'] = comps
df_test['lining'] = linings

In [221]:
df_test.head()

Unnamed: 0,sku,price,fit,composition,description,text,comp,lining
0,427159001,$39.99,FitSkinny fit,"CompositionCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste...","Cotton 91%, Polyester 7%, Spandex 2%",Not Informed
1,427159002,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed
2,427159003,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed
3,427159004,$39.99,FitSkinny fit,"CompositionCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ...","Cotton 99%, Spandex 1%",Not Informed
4,427159005,$39.99,FitSkinny fit,"CompositionCotton 72%, Polyester 20%, Modal 7%...",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste...","Cotton 72%, Polyester 20%, Modal 7%, Spandex 1%",Not Informed


In [222]:
def composition_to_df(list_of_comp):
    """ Creates a dataframe from a list of compositions"""
    keys = []
    values = []

    for idx, element in enumerate(list_of_comp):
        # if idx is even, element is a key (column in dataframe)
        if idx % 2 == 0:
            keys.append(element)
        else:
            values.append(element.strip('%,'))

    # final dataframe
    res = dict(zip(keys,values))
    res = pd.DataFrame(res, index = [0])
    return (res)

df_comp_split = pd.DataFrame()

for composition in df_test['comp']:
    comp_list = composition.split(' ') 

    # creating a df of compositions
    df_aux = composition_to_df(comp_list)

    df_comp_split = pd.concat( [df_comp_split, df_aux], axis = 0 )

df_comp_split.reset_index(inplace = True, drop = True)

In [223]:

df_test = pd.concat( [df_test, df_comp_split], axis = 1 )
df_test.head()


Unnamed: 0,sku,price,fit,composition,description,text,comp,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester
0,427159001,$39.99,FitSkinny fit,"CompositionCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste...","Cotton 91%, Polyester 7%, Spandex 2%",Not Informed,91,7.0,2,,
1,427159002,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,
2,427159003,$39.99,FitSkinny fit,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,
3,427159004,$39.99,FitSkinny fit,"CompositionCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ...","Cotton 99%, Spandex 1%",Not Informed,99,,1,,
4,427159005,$39.99,FitSkinny fit,"CompositionCotton 72%, Polyester 20%, Modal 7%...",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste...","Cotton 72%, Polyester 20%, Modal 7%, Spandex 1%",Not Informed,72,20.0,1,7.0,


### Fit

In [276]:
# positive lookbehind + words I'm searching + positive lookahead
regex = "((?<=Fit).*(?= fit)|NA)"

df_test['fit'] = df_test['fit'].apply(lambda x: re.findall(regex, x)[0] )
df_test.head()

Unnamed: 0,sku,price,fit,composition,description,text,comp,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester
0,427159001,$39.99,Skinny,"CompositionCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste...","Cotton 91%, Polyester 7%, Spandex 2%",Not Informed,91,7.0,2,,
1,427159002,$39.99,Skinny,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,
2,427159003,$39.99,Skinny,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1,,
3,427159004,$39.99,Skinny,"CompositionCotton 99%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ...","Cotton 99%, Spandex 1%",Not Informed,99,,1,,
4,427159005,$39.99,Skinny,"CompositionCotton 72%, Polyester 20%, Modal 7%...",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste...","Cotton 72%, Polyester 20%, Modal 7%, Spandex 1%",Not Informed,72,20.0,1,7.0,


### Price

In [289]:
regex = "\$\d{2}.\d{2}\$\d{2}.\d{2}"
price = '$22.22$11.11'

bool(re.match(regex, price ))

True

In [376]:
# df_test = df_test.copy()

# if there are 2 prices then there is a discount/promo
regex = "\$\d{2}.\d{2}\$\d{2}.\d{2}"
df_test['isPromo'] = df_test['price'].apply(lambda x: 1 if bool(re.match(regex, x)) else 0)

# first price
regex = "^\$\d{2}.\d{2}"
df_test['originalPrice'] = df_test['price'].apply( lambda x: re.findall(regex, x)[0] )

# smallest valuea (starts with anny amount of numbers and ends in .dd)
regex = "\$\d*\.\d{2}$"
df_test['finalPrice'] = df_test['price'].apply( lambda x: re.findall(regex, x)[0] )

df_test['originalPrice'] = df_test['originalPrice'].apply(lambda x: x.strip('$'))
df_test['finalPrice'] = df_test['finalPrice'].apply(lambda x: x.strip('$'))


In [377]:
df_test[df_test['isPromo'] == True].head()

Unnamed: 0,sku,price,fit,composition,description,text,comp,lining,Cotton,Polyester,Spandex,Modal,Elastomultiester,isPromo,originalPrice,finalPrice
6,427159007,$39.99$18.99,Skinny,"CompositionCotton 91%, Polyester 7%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste...","Cotton 91%, Polyester 7%, Spandex 2%",Not Informed,91,7.0,2.0,,,1,39.99,18.99
7,427159008,$39.99$18.99,Skinny,"CompositionCotton 98%, Spandex 2%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 98%, Spandex ...","Cotton 98%, Spandex 2%",Not Informed,98,,2.0,,,1,39.99,18.99
9,427159011,$39.99$24.99,Skinny,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1.0,,,1,39.99,24.99
10,427159017,$39.99$24.99,Skinny,CompositionCotton 100%,5-pocket jeans in washed stretch denim. Heavil...,FitSkinny fit /CompositionCotton 100% /Art. No...,Cotton 100%,Not Informed,100,,,,,1,39.99,24.99
12,427159023,$39.99$21.99,Skinny,"CompositionCotton 93%, Polyester 6%, Spandex 1%",5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...","Cotton 93%, Polyester 6%, Spandex 1%",Not Informed,93,6.0,1.0,,,1,39.99,21.99


### Headline

In [396]:
# removing whitespace characteres

df_prods['headline'] = df_prods['headline'].apply(lambda x: x.strip('\n\t ')) 
df_prods.headline.value_counts()

Slim Jeans                        33
Skinny Jeans                      26
Regular Jeans                     23
Trashed Skinny Jeans              15
Relaxed Jeans                     14
Slim Tapered Jeans                14
Hybrid Regular Tapered Joggers     7
Tapered Jeans                      6
Regular Tapered Crop Jeans         6
Skinny Cropped Jeans               5
Freefit® Slim Jeans                5
Relaxed Tapered Pull-on Jeans      4
Loose Jeans                        4
Hybrid Regular Denim Joggers       3
Regular Denim Joggers              3
Relaxed Pull-on Jeans              3
Relaxed Denim Joggers              2
Regular Bootcut Jeans              2
Loose Carpenter Jeans              1
Cotton Denim Jeans                 1
Name: headline, dtype: int64

In [348]:
# df_final = pd.concat( [df_prods, df_test.drop('sku', axis = 1)], axis =1 )

# now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# df_final['date'] = now
# df_final.head()

In [418]:
df_final.columns

Index(['sku', 'product_id', 'color_id', 'color', 'headline', 'link', 'price',
       'fit', 'composition', 'description', 'text', 'comp', 'lining', 'Cotton',
       'Polyester', 'Spandex', 'Modal', 'Elastomultiester', 'isPromo',
       'originalPrice', 'finalPrice', 'date'],
      dtype='object')

In [423]:
#sku, product_id, color_id, color, fit, price, headline, 'cotton', 'polyester', 'elastane', 'elasterell_p', 'spandex' 'modal', 'viscose', pocket_lining, text
selected_cols = ['sku', 'product_id', 'color_id', 'color', 'fit', 'finalPrice', 'originalPrice', "headline", 'Cotton', 'Polyester', 'Spandex', 'Modal', 'Elastomultiester', 'isPromo', 'description', 'text', 'link', 'date'] 
df_final = df_final[selected_cols]

rename_cols = ['sku', 'product_id', 'color_id', 'color', 'fit', 'final_price', 'original_price', "headline", 'cotton', 'polyester', 'spandex', 'modal', 'elastomultiester', 'is_promo', 'description', 'text', 'link', 'date'] 

final_cols = dict(zip(selected_cols, rename_cols))

df_final.rename(columns = final_cols, inplace = True )
df_final.head()



Unnamed: 0,sku,product_id,color_id,color,fit,final_price,original_rice,headline,cotton,polyester,spandex,modal,elastomultiester,is_promo,description,text,link,date
0,427159001,427159,1,Black denim,Skinny,39.99,39.99,Trashed Skinny Jeans,91,7.0,2,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
1,427159002,427159,2,Blue washed out,Skinny,39.99,39.99,Trashed Skinny Jeans,93,6.0,1,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
2,427159003,427159,3,Denim blue,Skinny,39.99,39.99,Trashed Skinny Jeans,93,6.0,1,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
3,427159004,427159,4,Light denim blue,Skinny,39.99,39.99,Trashed Skinny Jeans,99,,1,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
4,427159005,427159,5,Dark denim blue,Skinny,39.99,39.99,Trashed Skinny Jeans,72,20.0,1,7.0,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54


# Data Saving

## Inserting data into local SQLite DB

In [425]:
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

df_comp.to_csv(path_or_buf='./backups/df_comp-{}.csv'.format(now), )

# saving df as a local backup
df_final.to_csv('./backups/df_backup-{}.csv'.format(now), index = False)

## Loading data from previously scraped data

This data was previously stored locally in a sqlite db.

A cronjob ran the web scraping script for +/- 1 month and stored the data in the referred db.

Since I was doing some tests, there is some replicated/duplicated data.

I dropped these out ising the following queries in dbeaver and saving all results to csv

In [72]:
# creating a new column 
"""
ALTER TABLE showroom
ADD COLUMN day_number bigint;
"""

#  extracting day from date
"""
UPDATE showroom 
SET test=STRFTIME("%d", date) 
"""

# creating a subquery with unique dates, dense ranking over day_number (partitioning on days the insertion run more than once)
# selecting only data from the first insertion using a inner join on original data
"""
WITH unique_dates 
AS
(SELECT DISTINCT date, date_rank  
FROM
(SELECT 
	s.date,
	s.day_number 
	, DENSE_RANK() OVER(PARTITION BY s.day_number ORDER BY s.date ASC) as date_rank
FROM showroom s) rnk
WHERE rnk.date_rank=1
)
SELECT * --s2.date, s2.style_id 
FROM showroom s2 
	INNER JOIN unique_dates 
	ON s2.date = unique_dates.date
"""

''

In [3]:
# specifying dtypes
dtypes = { 'Art. No.': 'str', 'style_id' : 'str', 'color_id' : 'str' }

df_old = pd.read_csv('./previous_data.csv', dtype = dtypes)
df_old.head()

Unnamed: 0,Art. No.,style_id,color_id,color,Fit,Price,Composition,texts,Cotton,Polyester,Elastane,Elasterell-P,Modal,Viscose,link,date,day_number,date.1,date_rank
0,690449001,690449,1,Light denim blue/trashed,Skinny fit,16.99,"Cotton 99%, Elastane 1%","Cotton 99%, Elastane 1%",99,0,1,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,11,2021-09-11 17:19:36,1
1,690449002,690449,2,Denim blue,Skinny fit,14.99,"Cotton 98%, Elastane 2%","Cotton 98%, Elastane 2%",98,0,2,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,11,2021-09-11 17:19:36,1
2,690449006,690449,6,Black/washed,Skinny fit,7.99,"Lining: Cotton 100% Cotton 98%, Elastane 2%","Cotton 100% Cotton 98%, Elastane 2%",100,0,2,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,11,2021-09-11 17:19:36,1
3,690449007,690449,7,Light denim blue,Skinny fit,14.99,"Cotton 98%, Elastane 2% Lining: Cotton 100%","Cotton 98%, Elastane 2% Lining",98,0,2,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,11,2021-09-11 17:19:36,1
4,690449009,690449,9,Black washed out,Skinny fit,19.99,"Cotton 99%, Elastane 1%","Cotton 99%, Elastane 1%",99,0,1,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,11,2021-09-11 17:19:36,1


In [4]:
df_old.dtypes

Art. No.         object
style_id         object
color_id         object
color            object
Fit              object
Price           float64
Composition      object
texts            object
Cotton            int64
Polyester         int64
Elastane          int64
Elasterell-P      int64
Modal             int64
Viscose           int64
link             object
date             object
day_number        int64
date.1           object
date_rank         int64
dtype: object

In [5]:
drop_cols = ["day_number", "date.1", "date_rank"]

df_old.drop(columns = drop_cols, inplace = True)
df_old.head()

Unnamed: 0,Art. No.,style_id,color_id,color,Fit,Price,Composition,texts,Cotton,Polyester,Elastane,Elasterell-P,Modal,Viscose,link,date
0,690449001,690449,1,Light denim blue/trashed,Skinny fit,16.99,"Cotton 99%, Elastane 1%","Cotton 99%, Elastane 1%",99,0,1,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
1,690449002,690449,2,Denim blue,Skinny fit,14.99,"Cotton 98%, Elastane 2%","Cotton 98%, Elastane 2%",98,0,2,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
2,690449006,690449,6,Black/washed,Skinny fit,7.99,"Lining: Cotton 100% Cotton 98%, Elastane 2%","Cotton 100% Cotton 98%, Elastane 2%",100,0,2,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
3,690449007,690449,7,Light denim blue,Skinny fit,14.99,"Cotton 98%, Elastane 2% Lining: Cotton 100%","Cotton 98%, Elastane 2% Lining",98,0,2,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
4,690449009,690449,9,Black washed out,Skinny fit,19.99,"Cotton 99%, Elastane 1%","Cotton 99%, Elastane 1%",99,0,1,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36


In [6]:
df_old['color'].value_counts()

Denim blue                  1073
Light denim blue             928
Black                        495
Dark denim blue              443
Dark gray                    294
Dark blue                    251
White                        193
Pale denim blue              119
Gray                         117
Black/washed out             112
Black/No fade black          112
Light gray                   112
Light blue                   112
Midnight blue                 83
Blue                          67
Light denim blue/trashed      58
Black denim                   56
Black/trashed                 56
Black washed out              56
Black/washed                  56
Graphite gray                 55
Denim gray                    32
Cream                         28
Dark blue denim               28
Black washed-out              28
Gray denim                    28
Denim blue washed out         28
Black/bleached                28
Light denim gray/Trashed      28
Dark denim blue/trashed       28
Dark blue/

### Data Cleaning

In [110]:
# for comp in df_old['Composition']:
#     print(comp)

In [7]:
# saving linings
linings = []

regex = "(lining.*(?= Shell)|lining.* Cotton \d{1,}%.*(?=Cotton)|lining.*%|Lining.*Cotton \d{1,}%.*(?=Cotton))"

# searching linings according to regex
for string in df_old['Composition']:
    try:
        lining = re.findall(regex, string)[0]
        lining.strip('lining: Lining')
    except:
        lining = 'Not Informed'

    # gathering all linings for each product
    linings.append(lining)

In [8]:
df_old['lining'] = linings
df_old.head()

Unnamed: 0,Art. No.,style_id,color_id,color,Fit,Price,Composition,texts,Cotton,Polyester,Elastane,Elasterell-P,Modal,Viscose,link,date,lining
0,690449001,690449,1,Light denim blue/trashed,Skinny fit,16.99,"Cotton 99%, Elastane 1%","Cotton 99%, Elastane 1%",99,0,1,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,Not Informed
1,690449002,690449,2,Denim blue,Skinny fit,14.99,"Cotton 98%, Elastane 2%","Cotton 98%, Elastane 2%",98,0,2,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,Not Informed
2,690449006,690449,6,Black/washed,Skinny fit,7.99,"Lining: Cotton 100% Cotton 98%, Elastane 2%","Cotton 100% Cotton 98%, Elastane 2%",100,0,2,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,Lining: Cotton 100%
3,690449007,690449,7,Light denim blue,Skinny fit,14.99,"Cotton 98%, Elastane 2% Lining: Cotton 100%","Cotton 98%, Elastane 2% Lining",98,0,2,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,Not Informed
4,690449009,690449,9,Black washed out,Skinny fit,19.99,"Cotton 99%, Elastane 1%","Cotton 99%, Elastane 1%",99,0,1,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,Not Informed


In [9]:
# cleaning fit column
df_old['Fit'] = df_old['Fit'].apply(lambda x: x.strip(' fit'))

In [10]:
# renaming columns

df_old.columns

new_cols = {'Art. No.':'sku' ,
 'style_id':'product_id' ,
 'color_id': 'color_id', 
 'color': 'color', 
 'Fit': 'fit', 
 'Price':'final_price' ,
'Composition': 'Composition', 
'texts':'text' ,
'Cotton': 'cotton', 
'Polyester':'polyester' ,
'Elastane':'elastane' ,
'Elasterell-P': 'elasterell_p', 
'Modal': 'modal', 
'Viscose': 'viscose', 
'link': 'link', 
'date': 'date'}

# add headling

In [11]:
# adding missing columns and renaming
df_old['headline'] = df_old['color']
df_old['original_price'] = df_old['Price']
df_old['is_promo'] = 0
df_old['description'] = 'Not Informed'


df_old.rename(columns = new_cols, inplace = True)

In [12]:
df_old.columns

Index(['sku', 'product_id', 'color_id', 'color', 'fit', 'final_price',
       'Composition', 'text', 'cotton', 'polyester', 'elastane',
       'elasterell_p', 'modal', 'viscose', 'link', 'date', 'lining',
       'headline', 'original_price', 'is_promo', 'description'],
      dtype='object')

In [13]:
df_old.head()

Unnamed: 0,sku,product_id,color_id,color,fit,final_price,Composition,text,cotton,polyester,...,elasterell_p,modal,viscose,link,date,lining,headline,original_price,is_promo,description
0,690449001,690449,1,Light denim blue/trashed,Skinny,16.99,"Cotton 99%, Elastane 1%","Cotton 99%, Elastane 1%",99,0,...,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,Not Informed,Light denim blue/trashed,16.99,0,Not Informed
1,690449002,690449,2,Denim blue,Skinny,14.99,"Cotton 98%, Elastane 2%","Cotton 98%, Elastane 2%",98,0,...,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,Not Informed,Denim blue,14.99,0,Not Informed
2,690449006,690449,6,Black/washed,Skinny,7.99,"Lining: Cotton 100% Cotton 98%, Elastane 2%","Cotton 100% Cotton 98%, Elastane 2%",100,0,...,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,Lining: Cotton 100%,Black/washed,7.99,0,Not Informed
3,690449007,690449,7,Light denim blue,Skinny,14.99,"Cotton 98%, Elastane 2% Lining: Cotton 100%","Cotton 98%, Elastane 2% Lining",98,0,...,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,Not Informed,Light denim blue,14.99,0,Not Informed
4,690449009,690449,9,Black washed out,Skinny,19.99,"Cotton 99%, Elastane 1%","Cotton 99%, Elastane 1%",99,0,...,0,0,0,https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36,Not Informed,Black washed out,19.99,0,Not Informed


In [14]:
selected_cols = ['sku', 'product_id', 'color_id', 'color', 'fit', 'final_price',
                 'original_price', 'headline', 'cotton', 'polyester', 'elastane',
                 'elasterell_p', 'viscose', 'modal','is_promo', 'description', 
                 'text', 'link', 'date']

# 'elastane','elasterell_p', 'viscose'

df_old = df_old[selected_cols].copy()
df_old.head()

Unnamed: 0,sku,product_id,color_id,color,fit,final_price,original_price,headline,cotton,polyester,elastane,elasterell_p,viscose,modal,is_promo,description,text,link,date
0,690449001,690449,1,Light denim blue/trashed,Skinny,16.99,16.99,Light denim blue/trashed,99,0,1,0,0,0,0,Not Informed,"Cotton 99%, Elastane 1%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
1,690449002,690449,2,Denim blue,Skinny,14.99,14.99,Denim blue,98,0,2,0,0,0,0,Not Informed,"Cotton 98%, Elastane 2%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
2,690449006,690449,6,Black/washed,Skinny,7.99,7.99,Black/washed,100,0,2,0,0,0,0,Not Informed,"Cotton 100% Cotton 98%, Elastane 2%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
3,690449007,690449,7,Light denim blue,Skinny,14.99,14.99,Light denim blue,98,0,2,0,0,0,0,Not Informed,"Cotton 98%, Elastane 2% Lining",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
4,690449009,690449,9,Black washed out,Skinny,19.99,19.99,Black washed out,99,0,1,0,0,0,0,Not Informed,"Cotton 99%, Elastane 1%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36


In [15]:
df_old.head()
df_final.head()

NameError: name 'df_final' is not defined

In [16]:
# read all as string
dtypes = { 'sku' : 'str', 'product_id' : 'str', 'color_id' : 'str'}
df_final = pd.read_csv('./backups/df_backup-2022-04-13 21:51:21.csv', dtype = dtypes)
df_final.rename(columns = {'original_rice' : 'original_price'}, inplace = True)
df_final.head()

Unnamed: 0,sku,product_id,color_id,color,fit,final_price,original_price,headline,cotton,polyester,spandex,modal,elastomultiester,is_promo,description,text,link,date
0,427159001,427159,1,Black denim,Skinny,39.99,39.99,Trashed Skinny Jeans,91,7.0,2,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
1,427159002,427159,2,Blue washed out,Skinny,39.99,39.99,Trashed Skinny Jeans,93,6.0,1,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
2,427159003,427159,3,Denim blue,Skinny,39.99,39.99,Trashed Skinny Jeans,93,6.0,1,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
3,427159004,427159,4,Light denim blue,Skinny,39.99,39.99,Trashed Skinny Jeans,99,,1,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
4,427159005,427159,5,Dark denim blue,Skinny,39.99,39.99,Trashed Skinny Jeans,72,20.0,1,7.0,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54


In [17]:
df_final['spandex'].unique()

array(['2', '1', nan, '2%Lining'], dtype=object)

In [18]:
spandex = df_final['spandex']
values = []
for element in spandex:
    if element == "2%Lining":
        values.append(2)
    else:
        values.append(element)

df_final['spandex'] = values
df_final['spandex'] = df_final['spandex'].astype(float)
df_final['spandex'] = df_final['spandex'].fillna(0)
df_final['spandex'] = df_final['spandex'].astype(int)


### Inserting test data to SQLite DB

In [19]:
# creating database:

path_to_db = '/home/humberto/DS/hm/jeans_db.sqlite'
con = sqlite3.connect(path_to_db, timeout=10)

In [20]:
query = """
SELECT 
    name
FROM
    sqlite_master
WHERE
    type='table'
"""

cursor = con.cursor()
cursor.execute(query)
cursor.fetchall()

[('hm_showroom',)]

In [107]:
con.commit()
con.close()

In [21]:
# changing dtypes to datetime

df_final['date'] = pd.to_datetime(df_final['date'], errors = 'coerce')
df_final['date'].head()


0   2022-04-13 21:49:54
1   2022-04-13 21:49:54
2   2022-04-13 21:49:54
3   2022-04-13 21:49:54
4   2022-04-13 21:49:54
Name: date, dtype: datetime64[ns]

In [23]:
df_final.dtypes

sku                         object
product_id                  object
color_id                    object
color                       object
fit                         object
final_price                float64
original_price             float64
headline                    object
cotton                       int64
polyester                  float64
spandex                      int64
modal                      float64
elastomultiester           float64
is_promo                     int64
description                 object
text                        object
link                        object
date                datetime64[ns]
dtype: object

In [24]:
query = """
SELECT 
    name
FROM
    sqlite_master
WHERE
    type='table'
"""
con = sqlite3.connect(path_to_db)
cursor = con.cursor()
cursor.execute(query)
cursor.fetchall()

[('hm_showroom',)]

In [25]:
con.commit()
con.close()

In [26]:
# checking if data is there:

query = """
SELECT *
FROM hm_showroom
"""

con = sqlite3.connect(path_to_db, timeout=10)

df_sql = pd.read_sql_query(query, con = con)
df_sql

Unnamed: 0,sku,product_id,color_id,color,fit,final_price,original_price,headline,cotton,polyester,...,spandex,elastomultiester,elasterell_p,viscose,modal,is_promo,description,text,link,date
0,0690449001,0690449,001,Light denim blue/trashed,Skinny,16.99,16.99,Light denim blue/trashed,99,0,...,0,0,0.0,0.0,0,0,Not Informed,"Cotton 99%, Elastane 1%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
1,0690449002,0690449,002,Denim blue,Skinny,14.99,14.99,Denim blue,98,0,...,0,0,0.0,0.0,0,0,Not Informed,"Cotton 98%, Elastane 2%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
2,0690449006,0690449,006,Black/washed,Skinny,7.99,7.99,Black/washed,100,0,...,0,0,0.0,0.0,0,0,Not Informed,"Cotton 100% Cotton 98%, Elastane 2%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
3,0690449007,0690449,007,Light denim blue,Skinny,14.99,14.99,Light denim blue,98,0,...,0,0,0.0,0.0,0,0,Not Informed,"Cotton 98%, Elastane 2% Lining",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
4,0690449009,0690449,009,Black washed out,Skinny,19.99,19.99,Black washed out,99,0,...,0,0,0.0,0.0,0,0,Not Informed,"Cotton 99%, Elastane 1%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5555,1027852002,1027852,002,Dark gray,Relaxed,29.99,29.99,Relaxed Denim Joggers,100,0,...,0,0,0.0,0.0,0,0,"Joggers in thick cotton denim. Regular waist, ...","SizeThe model is 187cm/6'2"" and wears a size L...",https://www2.hm.com/en_us/productpage.10278520...,2022-04-13 21:49:54
5556,1027852007,1027852,007,Denim blue,Relaxed,29.99,29.99,Relaxed Denim Joggers,100,0,...,0,0,0.0,0.0,0,0,"Joggers in thick cotton denim. Regular waist, ...","SizeThe model is 185cm/6'1"" and wears a size M...",https://www2.hm.com/en_us/productpage.10278520...,2022-04-13 21:49:54
5557,1048642001,1048642,001,Denim blue,Regular,39.99,39.99,Regular Bootcut Jeans,99,0,...,1,0,0.0,0.0,0,0,5-pocket jeans in stretch cotton denim with a ...,"FitRegular fit /CompositionShell: Cotton 99%, ...",https://www2.hm.com/en_us/productpage.10486420...,2022-04-13 21:49:54
5558,1048642002,1048642,002,Denim black,Regular,39.99,39.99,Regular Bootcut Jeans,99,0,...,1,0,0.0,0.0,0,0,5-pocket jeans in stretch cotton denim with a ...,"FitRegular fit /CompositionShell: Cotton 99%, ...",https://www2.hm.com/en_us/productpage.10486420...,2022-04-13 21:49:54


In [166]:
# adding more data

df_final.to_sql('hm_showroom', con = con, if_exists='append', index = False)


177

In [167]:
con.commit()

In [281]:
# creating sqlalchemy engine for connection
path = 'sqlite:///' + path_to_db
engine = create_engine(path, echo=True)

# creating a Session class
Session = sessionmaker(bind=engine)

# creating a session
session = Session()

In [282]:
# adding data to column

try:
    # adding data
    df_final.to_sql('hm_showroom', con = engine, if_exists='append', index = False)

    # committing changes
    session.commit()
except:
    session.rollback()

finally:
    session.close()

2022-04-15 18:55:10,576 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("hm_showroom")
2022-04-15 18:55:10,638 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-15 18:55:10,642 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-04-15 18:55:10,652 INFO sqlalchemy.engine.Engine INSERT INTO hm_showroom (sku, product_id, color_id, color, fit, final_price, original_price, headline, cotton, polyester, spandex, modal, elastomultiester, is_promo, description, text, link, date) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
2022-04-15 18:55:10,653 INFO sqlalchemy.engine.Engine [generated in 0.00587s] (('0427159001', '0427159', '001', 'Black denim', 'Skinny', 39.99, 39.99, 'Trashed Skinny Jeans', 91, 7.0, '2', None, None, 0, '5-pocket jeans in washed stretch denim. Heavily distressed details, regular waist, and button fly. Skinny legs.', 'FitSkinny fit /CompositionCotton 91%, Polyester 7%, Spandex 2% /Art. No.0427159001', 'https://www2.hm.com/en_us/productpage.0427159001.ht

In [257]:
# testing adding new columns to existing ones
# adding data to column

try:
    # adding data
    df_old.to_sql('hm_showroom', con = engine, if_exists='append', index = False)

    # committing changes
    session.commit()
except:
    session.rollback()

finally:
    session.close()

In [258]:
# checking if data is there:

query = """
SELECT *
FROM hm_showroom
"""

con = sqlite3.connect(path_to_db, timeout=10)

df_sql = pd.read_sql_query(query, con = con)
df_sql

Unnamed: 0,sku,product_id,color_id,color,fit,final_price,original_price,headline,cotton,polyester,elastane,elasterell_p,viscose,modal,is_promo,description,text,link,date
0,0690449001,0690449,001,Light denim blue/trashed,Skinny,16.99,16.99,Light denim blue/trashed,99,0,1,0,0,0,0,Not Informed,"Cotton 99%, Elastane 1%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
1,0690449002,0690449,002,Denim blue,Skinny,14.99,14.99,Denim blue,98,0,2,0,0,0,0,Not Informed,"Cotton 98%, Elastane 2%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
2,0690449006,0690449,006,Black/washed,Skinny,7.99,7.99,Black/washed,100,0,2,0,0,0,0,Not Informed,"Cotton 100% Cotton 98%, Elastane 2%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
3,0690449007,0690449,007,Light denim blue,Skinny,14.99,14.99,Light denim blue,98,0,2,0,0,0,0,Not Informed,"Cotton 98%, Elastane 2% Lining",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
4,0690449009,0690449,009,Black washed out,Skinny,19.99,19.99,Black washed out,99,0,1,0,0,0,0,Not Informed,"Cotton 99%, Elastane 1%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5385,0875105011,0875105,011,Light denim blue,Relaxed,29.99,29.99,Light denim blue,100,0,0,0,0,0,0,Not Informed,Shell: Cotton 100%,https://www2.hm.com/en_us/productpage.08751050...,2021-11-30 15:14:33
5386,0875105015,0875105,015,Light denim blue,Relaxed,29.99,29.99,Light denim blue,100,0,0,0,0,0,0,Not Informed,Cotton 100%,https://www2.hm.com/en_us/productpage.08751050...,2021-11-30 15:14:33
5387,0875105023,0875105,023,Black,Relaxed,29.99,29.99,Black,100,0,0,0,0,0,0,Not Informed,Shell: Cotton 100%,https://www2.hm.com/en_us/productpage.08751050...,2021-11-30 15:14:33
5388,0875105024,0875105,024,Light denim blue,Relaxed,29.99,29.99,Light denim blue,100,0,0,0,0,0,0,Not Informed,Shell: Cotton 100% Pocket,https://www2.hm.com/en_us/productpage.08751050...,2021-11-30 15:14:33


In [272]:
# show all columns and their descriptions
query = "PRAGMA table_info(hm_showroom);"

con = sqlite3.connect(path_to_db, timeout=10)

cursor = con.cursor()
cursor.execute(query)
cols_desc = cursor.fetchall()

In [273]:
con.close()

In [279]:
# for col in cols_desc:
#     print(col[1])

In [283]:
# adding data
# df_final.to_sql('hm_showroom', con = engine, if_exists='append', index = False)

# committing changes
# session.commit()

# session.rollback()

# session.close()

In [284]:
# checking if data is there:

query = """
SELECT *
FROM hm_showroom
"""

con = sqlite3.connect(path_to_db, timeout=10)

df_sql = pd.read_sql_query(query, con = con)
df_sql

Unnamed: 0,sku,product_id,color_id,color,fit,final_price,original_price,headline,cotton,polyester,elastane,elasterell_p,viscose,modal,is_promo,description,text,link,date
0,0690449001,0690449,001,Light denim blue/trashed,Skinny,16.99,16.99,Light denim blue/trashed,99,0,1,0,0,0,0,Not Informed,"Cotton 99%, Elastane 1%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
1,0690449002,0690449,002,Denim blue,Skinny,14.99,14.99,Denim blue,98,0,2,0,0,0,0,Not Informed,"Cotton 98%, Elastane 2%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
2,0690449006,0690449,006,Black/washed,Skinny,7.99,7.99,Black/washed,100,0,2,0,0,0,0,Not Informed,"Cotton 100% Cotton 98%, Elastane 2%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
3,0690449007,0690449,007,Light denim blue,Skinny,14.99,14.99,Light denim blue,98,0,2,0,0,0,0,Not Informed,"Cotton 98%, Elastane 2% Lining",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
4,0690449009,0690449,009,Black washed out,Skinny,19.99,19.99,Black washed out,99,0,1,0,0,0,0,Not Informed,"Cotton 99%, Elastane 1%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5385,0875105011,0875105,011,Light denim blue,Relaxed,29.99,29.99,Light denim blue,100,0,0,0,0,0,0,Not Informed,Shell: Cotton 100%,https://www2.hm.com/en_us/productpage.08751050...,2021-11-30 15:14:33
5386,0875105015,0875105,015,Light denim blue,Relaxed,29.99,29.99,Light denim blue,100,0,0,0,0,0,0,Not Informed,Cotton 100%,https://www2.hm.com/en_us/productpage.08751050...,2021-11-30 15:14:33
5387,0875105023,0875105,023,Black,Relaxed,29.99,29.99,Black,100,0,0,0,0,0,0,Not Informed,Shell: Cotton 100%,https://www2.hm.com/en_us/productpage.08751050...,2021-11-30 15:14:33
5388,0875105024,0875105,024,Light denim blue,Relaxed,29.99,29.99,Light denim blue,100,0,0,0,0,0,0,Not Informed,Shell: Cotton 100% Pocket,https://www2.hm.com/en_us/productpage.08751050...,2021-11-30 15:14:33


In [198]:
session.close()

In [206]:
con.close()

In [None]:
# if using sqlite always do con.commit() to save progress
# if using sqlalchemy create a session, and then commit things

### Merging all data and inserting it to SQLite DB

In [27]:
# # creating sqlalchemy engine for connection
# path = 'sqlite:///' + path_to_db
# engine = create_engine(path, echo=False)

# # creating a Session class
# Session = sessionmaker(bind=engine)

# # creating a session
# session = Session()
# query = """
# DROP TABLE IF EXISTS hm_showroom
# """

# try:
#     engine.execute(query)
#     session.commit()
# except:
#     session.rollback()

In [28]:
df_old.isna().sum()
df_old.shape

(5390, 19)

In [29]:
df_final.head()
df_final.isna().sum()
df_final[df_final['fit'].isna()]

# filling NAs
df_final.loc[df_final['fit'].isna()] = 'Not Informed'


In [30]:
df_final.head()

Unnamed: 0,sku,product_id,color_id,color,fit,final_price,original_price,headline,cotton,polyester,spandex,modal,elastomultiester,is_promo,description,text,link,date
0,427159001,427159,1,Black denim,Skinny,39.99,39.99,Trashed Skinny Jeans,91,7.0,2,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 91%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
1,427159002,427159,2,Blue washed out,Skinny,39.99,39.99,Trashed Skinny Jeans,93,6.0,1,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
2,427159003,427159,3,Denim blue,Skinny,39.99,39.99,Trashed Skinny Jeans,93,6.0,1,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 93%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
3,427159004,427159,4,Light denim blue,Skinny,39.99,39.99,Trashed Skinny Jeans,99,,1,,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 99%, Spandex ...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54
4,427159005,427159,5,Dark denim blue,Skinny,39.99,39.99,Trashed Skinny Jeans,72,20.0,1,7.0,,0,5-pocket jeans in washed stretch denim. Heavil...,"FitSkinny fit /CompositionCotton 72%, Polyeste...",https://www2.hm.com/en_us/productpage.04271590...,2022-04-13 21:49:54


In [31]:
df_old.columns

Index(['sku', 'product_id', 'color_id', 'color', 'fit', 'final_price',
       'original_price', 'headline', 'cotton', 'polyester', 'elastane',
       'elasterell_p', 'viscose', 'modal', 'is_promo', 'description', 'text',
       'link', 'date'],
      dtype='object')

In [32]:
df_final.columns

Index(['sku', 'product_id', 'color_id', 'color', 'fit', 'final_price',
       'original_price', 'headline', 'cotton', 'polyester', 'spandex', 'modal',
       'elastomultiester', 'is_promo', 'description', 'text', 'link', 'date'],
      dtype='object')

In [33]:
df_merged = pd.concat([df_old, df_final], axis = 0)
df_merged.head()

# changing columns order:
cols = ['sku','product_id','color_id','color','fit','final_price','original_price','headline','cotton','polyester', 'elastane', 'spandex','elastomultiester', 'elasterell_p','viscose','modal','is_promo','description','text','link','date']
df_merged = df_merged[cols].copy()

df_merged.isna().sum()

sku                    0
product_id             0
color_id               0
color                  0
fit                    0
final_price            0
original_price         0
headline               0
cotton                 0
polyester            149
elastane             177
spandex             5390
elastomultiester    5555
elasterell_p         177
viscose              177
modal                169
is_promo               0
description            0
text                   0
link                   0
date                   0
dtype: int64

In [34]:
# filling NAs
df_merged.fillna(value = 0, inplace = True)

df_merged.isna().sum()

sku                 0
product_id          0
color_id            0
color               0
fit                 0
final_price         0
original_price      0
headline            0
cotton              0
polyester           0
elastane            0
spandex             0
elastomultiester    0
elasterell_p        0
viscose             0
modal               0
is_promo            0
description         0
text                0
link                0
date                0
dtype: int64

In [35]:
df_merged.head()


Unnamed: 0,sku,product_id,color_id,color,fit,final_price,original_price,headline,cotton,polyester,...,spandex,elastomultiester,elasterell_p,viscose,modal,is_promo,description,text,link,date
0,690449001,690449,1,Light denim blue/trashed,Skinny,16.99,16.99,Light denim blue/trashed,99,0,...,0,0,0.0,0.0,0,0,Not Informed,"Cotton 99%, Elastane 1%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
1,690449002,690449,2,Denim blue,Skinny,14.99,14.99,Denim blue,98,0,...,0,0,0.0,0.0,0,0,Not Informed,"Cotton 98%, Elastane 2%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
2,690449006,690449,6,Black/washed,Skinny,7.99,7.99,Black/washed,100,0,...,0,0,0.0,0.0,0,0,Not Informed,"Cotton 100% Cotton 98%, Elastane 2%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
3,690449007,690449,7,Light denim blue,Skinny,14.99,14.99,Light denim blue,98,0,...,0,0,0.0,0.0,0,0,Not Informed,"Cotton 98%, Elastane 2% Lining",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36
4,690449009,690449,9,Black washed out,Skinny,19.99,19.99,Black washed out,99,0,...,0,0,0.0,0.0,0,0,Not Informed,"Cotton 99%, Elastane 1%",https://www2.hm.com/en_us/productpage.06904490...,2021-09-11 17:19:36


In [36]:
df_merged.shape

(5567, 21)

In [39]:
# removing bug
df_merged.spandex.unique()
df_merged[df_merged.spandex == 'Not Informed']
df_merged = df_merged[ df_merged['spandex'] != 'Not Informed'].copy()


In [53]:
# converting dtypes
df_merged['final_price'] = df_merged['final_price'].astype(float)
df_merged['original_price'] = df_merged['original_price'].astype(float)
df_merged['cotton'] = df_merged['cotton'].astype(int)
df_merged['polyester'] = df_merged['polyester'].astype(int)
df_merged['elastane'] = df_merged['elastane'].astype(int)
df_merged['spandex'] = df_merged['spandex'].astype(int)
df_merged['elastomultiester'] = df_merged['elastomultiester'].astype(int)
df_merged['elasterell_p'] = df_merged['elasterell_p'].astype(int)
df_merged['viscose'] = df_merged['viscose'].astype(int)
df_merged['modal'] = df_merged['modal'].astype(int)
df_merged['is_promo'] = df_merged['is_promo'].astype(int)
df_merged['date'] = pd.to_datetime(df_merged['date'], errors = 'coerce')

In [42]:
df_merged.elastomultiester.unique()

array([0, 8, 9])

In [54]:
df_merged.columns

Index(['sku', 'product_id', 'color_id', 'color', 'fit', 'final_price',
       'original_price', 'headline', 'cotton', 'polyester', 'elastane',
       'spandex', 'elastomultiester', 'elasterell_p', 'viscose', 'modal',
       'is_promo', 'description', 'text', 'link', 'date'],
      dtype='object')

In [66]:
# checking if there are no bugs or errors
# df_merged.cotton.unique()
# df_merged['polyester'].unique()
# df_merged['elastane'].unique()
# df_merged['spandex' ].unique()
# df_merged['elastomultiester' ].unique()
# df_merged['elasterell_p' ].unique()
# df_merged['viscose' ].unique()
# df_merged['modal'].unique()
# df_merged['is_promo'].unique()

In [67]:
df_merged.dtypes

sku                         object
product_id                  object
color_id                    object
color                       object
fit                         object
final_price                float64
original_price             float64
headline                    object
cotton                       int64
polyester                    int64
elastane                     int64
spandex                      int64
elastomultiester             int64
elasterell_p                 int64
viscose                      int64
modal                        int64
is_promo                     int64
description                 object
text                        object
link                        object
date                datetime64[ns]
dtype: object

In [68]:
# Inserting processed data into SQLite DB

# creating sqlalchemy engine for connection
path = 'sqlite:///' + path_to_db
engine = create_engine(path, echo=True)

# creating a Session class
Session = sessionmaker(bind=engine)

# creating a session
session = Session()

In [69]:
# adding data to column

try:
    # adding data
    df_merged.to_sql('hm_showroom', con = engine, if_exists='append', index = False)

    # committing changes
    session.commit()
except:
    session.rollback()

finally:
    session.close()

2022-04-16 16:38:06,259 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("hm_showroom")
2022-04-16 16:38:06,259 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-16 16:38:06,261 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("hm_showroom")
2022-04-16 16:38:06,264 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-16 16:38:06,270 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-04-16 16:38:06,273 INFO sqlalchemy.engine.Engine 
CREATE TABLE hm_showroom (
	sku TEXT, 
	product_id TEXT, 
	color_id TEXT, 
	color TEXT, 
	fit TEXT, 
	final_price FLOAT, 
	original_price FLOAT, 
	headline TEXT, 
	cotton BIGINT, 
	polyester BIGINT, 
	elastane BIGINT, 
	spandex BIGINT, 
	elastomultiester BIGINT, 
	elasterell_p BIGINT, 
	viscose BIGINT, 
	modal BIGINT, 
	is_promo BIGINT, 
	description TEXT, 
	text TEXT, 
	link TEXT, 
	date DATETIME
)


2022-04-16 16:38:06,275 INFO sqlalchemy.engine.Engine [no key 0.00214s] ()
2022-04-16 16:38:06,481 INFO sqlalchemy.engine.Engine COMMIT
2022-04-16 16:3

In [78]:
df_new_col = df_merged[df_merged['date'] < '2021-10-01'].copy()

df_new_col['new_colum'] = 'test'


In [82]:
datetime.now().strftime("%Y-%m-%d")

'2022-04-16'

In [83]:
"hm_showroom_backup-{}".format(datetime.now().strftime("%Y-%m-%d"))

'hm_showroom_backup-2022-04-16'

In [84]:
# testing case a new column is added

try:
    # adding data
    df_new_col.to_sql('hm_showroom', con = engine, if_exists='append', index = False)

    # committing changes
    session.commit()
except:
    try:
        # in case scraped data returns with a new column, it will be added to a new table
        table_name = "hm_showroom_backup-{}".format(datetime.now().strftime("%Y-%m-%d"))
        df_new_col.to_sql( table_name, con = engine, if_exists='append', index = False)

        session.commit()
    except:  
        # if even this fails, undo everything      
        session.rollback()

finally:
    session.close()

# USE THIS SCRIPT TO PREVENT FAILS

2022-04-16 16:51:10,878 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("hm_showroom")
2022-04-16 16:51:10,879 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-16 16:51:10,887 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-04-16 16:51:10,919 INFO sqlalchemy.engine.Engine INSERT INTO hm_showroom (sku, product_id, color_id, color, fit, final_price, original_price, headline, cotton, polyester, elastane, spandex, elastomultiester, elasterell_p, viscose, modal, is_promo, description, text, link, date, new_colum) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
2022-04-16 16:51:10,920 INFO sqlalchemy.engine.Engine [generated in 0.02383s] (('0690449001', '0690449', '001', 'Light denim blue/trashed', 'Skinny', 16.99, 16.99, 'Light denim blue/trashed', 99, 0, 1, 0, 0, 0, 0, 0, 0, 'Not Informed', 'Cotton 99%, Elastane 1%', 'https://www2.hm.com/en_us/productpage.0690449001.html', '2021-09-11 17:19:36.000000', 'test'), ('0690449002', '0690449', '002', 'Denim bl

## TODO

- clean script
- check datatypes and use datetime for date column
- automate data insertion
- create db on AWS
- create EC2 to run script
- create Lambda with cronjob to run script