# Webscrape Gap.com

This notebook contains code to scrape product listings from the [Gap](https://www.gap.com/) online store.

#### Install Dependencies

In [1]:
!pip install -q beautifulsoup4
!pip install -q selenium 
!pip install -q webdriver-manager

#### Import Dependencies

In [2]:
import json
import os
import pathlib
import shutil
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

#### Define Functions

In [3]:
def setup():
    """Prepare the Chrome configuration reused for every page load.

    Returns:
        tuple: ``(options, service)`` where ``options`` is a ``ChromeOptions``
        instance with the ``--headless`` argument added and ``service`` is a
        ``Service`` pointing at the driver path returned by
        ``ChromeDriverManager().install()``.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')

    driver_path = ChromeDriverManager().install()
    driver_service = Service(driver_path)

    return chrome_options, driver_service

def getPage(options, service, url, page_downs=40):
    """Load ``url`` in Chrome, scroll down the page and return its HTML.

    Args:
        options: Chrome options passed to ``webdriver.Chrome``.
        service: Chrome driver service passed to ``webdriver.Chrome``.
        url: Address of the page to load.
        page_downs: Number of PAGE_DOWN key presses sent to ``<body>`` before
            the page source is captured (default 40, the original fixed value).

    Returns:
        str: ``browser.page_source`` after scrolling.

    The browser is always quit, even when loading or scrolling raises, so a
    failing page does not leave a Chrome process behind.
    """
    browser = webdriver.Chrome(options=options, service=service)
    try:
        browser.get(url)

        # Give the page time to render before interacting with it.
        time.sleep(3)

        # The popup's close button sits under a different top-level <div>
        # depending on the page, so every candidate index is tried.
        popup_closed = False
        for i in range(2, 10):
            try:
                browser.find_element(by=By.XPATH, value=f"/html/body/div[{i}]/div/div[2]/div[1]/button").click()
                popup_closed = True
            except Exception:
                # Narrower than a bare ``except`` so Ctrl-C still interrupts the run.
                continue

        if not popup_closed:
            print('Popup was not closed :(')

        time.sleep(0.5)

        # Scroll step by step, pausing briefly after each key press.
        body = browser.find_element(by=By.TAG_NAME, value="body")
        for _ in range(page_downs):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.2)

        return browser.page_source
    finally:
        browser.quit()

def retrieve_all_products(html, elem='div', class_='product-card'):
    """Parse ``html`` with the lxml parser and collect the matching elements.

    Args:
        html: Page source to parse.
        elem: Tag name to search for (default ``'div'``).
        class_: CSS class the tag must carry (default ``'product-card'``).

    Returns:
        The result of ``find_all(elem, class_=class_)`` on the parsed document.
    """
    return BeautifulSoup(html, 'lxml').find_all(elem, class_=class_)



def _absolute_url(url, base):
    """Return ``url`` unchanged if it is absolute, otherwise prefix it with ``base``."""
    if url.startswith(('http://', 'https://')):
        return url
    if url.startswith('//'):
        # Protocol-relative URL: only the scheme is missing.
        return 'https:' + url
    return base + url


def _raw_price_text(p):
    """Return the text of the product card's price element, or None if absent."""
    if p.find('div', class_="product-price__highlight"):
        # Highlighted cards: prefer the struck-through price, else the plain one.
        node = p.find('span', class_="product-price__strike")
        if node is None:
            node = p.find('span', class_="product-price__no-strike")
    else:
        node = p.find('div', class_="product-card-price")
    return None if node is None else node.get_text()


def _normalize_price(text):
    """Strip a leading "Now" and collapse a "$a - $b" range to its average."""
    price = text.strip()
    if price.startswith("Now"):
        price = price[len("Now"):].strip()

    if "-" in price:
        bounds = price.split('-')
        try:
            low = float(bounds[0].strip().lstrip('$').replace(',', ''))
            high = float(bounds[1].strip().lstrip('$').replace(',', ''))
        except ValueError:
            # Not a numeric range after all; keep the text as scraped.
            return price
        # Always two decimals so averaged prices match the "$49.95" format.
        return f'${(low + high) / 2:.2f}'

    return price


def decodeProducts(products, opts, base_url=None, image_base_url='https://www2.assets-gap.com'):
    """Convert scraped product cards into plain catalog dictionaries.

    Args:
        products: Iterable of product-card elements. Each must expose ``.img``
            (with ``"src"`` and ``"alt"``), ``.a`` (with ``"href"``) and
            ``.find(name, class_=...)``.
        opts: Mapping with the keys ``'sex'`` and ``'store'``; both values are
            copied into every record.
        base_url: Prefix for relative product links. Defaults to the
            module-level ``baseUrl``.
        image_base_url: Prefix for relative image paths.

    Returns:
        list[dict]: One record per product with the keys ``Label``, ``Sex``,
        ``Price``, ``ImageURL``, ``ProductURL``, ``Rating``, ``RatingCount``
        and ``Store``. ``Price`` is ``None`` when the card has no price
        element. ``Rating`` is always ``None`` and ``RatingCount`` always
        ``'0'`` because ratings are not read from the card.
    """
    if base_url is None:
        base_url = baseUrl

    productsJson = []

    for p in products:
        price_text = _raw_price_text(p)
        price = None if price_text is None else _normalize_price(price_text)

        productsJson.append({
            "Label": p.img["alt"],
            "Sex": opts['sex'],
            "Price": price,
            "ImageURL": _absolute_url(p.img["src"], image_base_url),
            "ProductURL": _absolute_url(p.a["href"], base_url),
            "Rating": None,
            "RatingCount": '0',
            "Store": opts['store'],
        })

    return productsJson

#### Define URLs

In [4]:
# Root of the storefront; the category paths below are appended to it.
baseUrl = 'https://gap.com'

# Category listing paths per department, built from the numeric category ids.
productUrls = {
    department: [f'/browse/category.do?cid={cid}' for cid in category_ids]
    for department, category_ids in {
        "Men": (
            6998,     # jeans
            5225,     # t-shirts
            83056,    # polos
            15043,    # shirts
            1167929,  # hoodies
            5180,     # sweaters
            5156,     # shorts
            80799,    # pants
        ),
        "Women": (
            5664,     # jeans
            13658,    # dresses
            1152367,  # jumpsuits
            17076,    # t-shirts
            1041168,  # sweats
            5745,     # sweaters
            1011761,  # pants/leggings
            1041308,  # shorts
            1082574,  # skirts
        ),
    }.items()
}

#### Scrape

In [5]:
# One shared Chrome configuration for every page load.
options, service = setup()

# Walk every category of every department and collect the decoded products.
catalog = []
for sex, categoryUrls in productUrls.items():
    for catUrl in categoryUrls:
        url = f'{baseUrl}{catUrl}'
        html = getPage(options, service, url)
        products = retrieve_all_products(html)
        print(f'Scraped {len(products)} products from {url}')
        catalog.extend(decodeProducts(products, opts={'sex': sex, 'store': 'Gap'}))

# Total number of records collected.
len(catalog)



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/98.0.4758.102/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\avento\.wdm\drivers\chromedriver\win32\98.0.4758.102]


Scraped 121 products from https://gap.com/browse/category.do?cid=6998
Scraped 137 products from https://gap.com/browse/category.do?cid=5225
Scraped 25 products from https://gap.com/browse/category.do?cid=83056
Scraped 152 products from https://gap.com/browse/category.do?cid=15043
Scraped 71 products from https://gap.com/browse/category.do?cid=1167929
Scraped 31 products from https://gap.com/browse/category.do?cid=5180
Scraped 134 products from https://gap.com/browse/category.do?cid=5156
Scraped 153 products from https://gap.com/browse/category.do?cid=80799
Scraped 108 products from https://gap.com/browse/category.do?cid=5664
Scraped 140 products from https://gap.com/browse/category.do?cid=13658
Scraped 31 products from https://gap.com/browse/category.do?cid=1152367
Scraped 153 products from https://gap.com/browse/category.do?cid=17076
Scraped 147 products from https://gap.com/browse/category.do?cid=1041168
Scraped 139 products from https://gap.com/browse/category.do?cid=5745
Scraped 15

1857

In [119]:
# Ensure All the Links Are Valid
# Each record's product page and image are fetched once; anything other than
# HTTP 200 is reported. A timeout keeps one unresponsive host from stalling
# the whole check.
for item in tqdm(catalog):
    for key in ("ProductURL", "ImageURL"):
        link = item[key]
        try:
            response = requests.get(link, timeout=10)
        except requests.RequestException:
            print(f'An error happened with {key} {link}')
            continue
        if response.status_code != 200:
            print(f'{key} {link} does not exist')

100%|██████████████████████████████████████████████████████████████████████████████████| 72/72 [01:30<00:00,  1.25s/it]


In [10]:
# Display a single catalog record (index 56) for a manual spot check.
catalog[56]

{'Label': 'The Everyday Jeans with GapFlex',
 'Sex': 'Men',
 'Price': '$49.95',
 'ImageURL': 'https://www3.assets-gap.com/webcontent/0019/939/030/cn19939030.jpg',
 'ProductURL': 'https://www.gap.com/browse/product.do?pid=871867002&cid=1050840&pcid=6998&vid=1',
 'Rating': None,
 'RatingCount': '0',
 'Store': 'Gap'}

#### Save catalog to JSON

In [6]:
# Serialize the whole catalog first, then write it out in one go.
with open('catalog_gap.json', 'w') as outfile:
    outfile.write(json.dumps(catalog, indent=4))

In [9]:
image_folder = '../data/webscraped_images/gap'

# Create directory if not yet exists
pathlib.Path(image_folder).mkdir(parents=True, exist_ok=True)

# Wrap the list (not the enumerate object) so tqdm knows the total.
for i, item in enumerate(tqdm(catalog)):
    url = item['ImageURL']
    file_name = os.path.join(image_folder, f'{i}.jpg')

    try:
        # The context manager releases the streamed connection when done.
        with requests.get(url, stream=True, timeout=30) as res:
            if res.status_code == 200:
                # Have urllib3 undo any Content-Encoding (gzip/deflate) so the
                # bytes written to disk are the image itself.
                res.raw.decode_content = True
                with open(file_name, 'wb') as f:
                    shutil.copyfileobj(res.raw, f)
            else:
                print(f'Failed to download image for catalog item #{i} from {url}')
    except requests.RequestException as err:
        # Report and carry on so one bad URL does not abort the remaining downloads.
        print(f'Failed to download image for catalog item #{i} from {url}: {err}')

1857it [01:27, 21.20it/s]
