# Retrieve data by web scraping

In [1]:
import requests, bs4
from bs4 import BeautifulSoup as bs
import time
import random
import pickle
import pandas as pd

## Part 1: Retrieve soup by page

### 1. Retrieve page url from ulta.com

#### About the page urls
* Moisturizers category: 1350 results (04/06)
* Pages total: 15
* Products per page: 96 (except page 15)

**Pattern as of 04/06**
* page_1 = 'https://www.ulta.com/skin-care-moisturizers?N=2796'

* page_2 = 'https://www.ulta.com/skin-care-moisturizers?N=2796&No=96&Nrpp=96'
* page_3 = 'https://www.ulta.com/skin-care-moisturizers?N=2796&No=192&Nrpp=96'
* page_4 = 'https://www.ulta.com/skin-care-moisturizers?N=2796&No=288&Nrpp=96'
* page_15 = 'https://www.ulta.com/skin-care-moisturizers?N=2796&No=1344&Nrpp=96'

#### Import functions

In [2]:
# functions that I wrote for this project, see files
import webscraping_functions as web

from webscraping_functions import create_page_url, get_page_soup,\
create_product_url_list, get_products_page

### 1. Create the url for each page number

In [3]:
# Get url for each page
base_url = 'https://www.ulta.com/skin-care-moisturizers?N=2796'

# returns a list of url of each page
page_url_list = web.create_page_url(base_url) 

In [4]:
# should be 15
len(page_url_list)

15

### 2. Retrieve the soup of each page (source of the product urls)

Products on page 1

https://www.ulta.com/p/confidence-in-a-cream-anti-aging-moisturizer-xlsImpprod13641053

https://www.ulta.com/p/dramatically-different-moisturizing-lotion-xlsImpprod10791743

In [None]:
pages_soup_list = web.get_page_soup(page_url_list)

In [11]:
# should return 15 soups 
len(pages_soup_list)

15

### 3. Create url for all products

In [12]:
products_url_list = web.create_product_url_list(pages_soup_list)

In [13]:
# should return > 1300 url
len(products_url_list)

1353

In [None]:
# Save the url list to a file, one url per line.
# 'w' = write mode (an existing file is overwritten); the with-block closes
# the file even if one of the writes raises.
with open('products_url.txt', 'w') as output_file:
    for url in products_url_list:
        output_file.write(url + '\n')

## 4. Additional scraping

### a. Scrape global skincare page to identify non-domestic products

In [9]:
# Listing pages of the global skin-care category: the first page carries no
# paging suffix, the next four advance the `No` offset in steps of 96.
global_list = [
    'https://www.ulta.com/global-skin-care?N=27ig' + paging
    for paging in [''] + ['&No={}&Nrpp=96'.format(96 * page) for page in range(1, 5)]
]

In [None]:
# collect the unique product titles found in a list of page soups
def get_brands(soup_list):
    """Return the unique ``h4.prod-title`` texts found across ``soup_list``.

    Every ``<h4 class="prod-title">`` tag of every soup is read; its text is
    flattened onto a single line (the ``'\\n\\n\\t\\t\\t\\t'`` separator and any
    remaining newline become a space) and stripped of surrounding whitespace.
    NOTE(review): the function name suggests these titles are brand names;
    confirm against the page markup.

    Parameters
    ----------
    soup_list : iterable
        Parsed pages; each item must provide ``find_all`` (e.g. the
        BeautifulSoup objects returned for each page url).

    Returns
    -------
    list of str
        The distinct cleaned titles, in order of first appearance. An empty
        list is returned when ``soup_list`` is empty or no title is found.
    """
    seen = set()
    unique_titles = []
    for soup in soup_list:
        for tag in soup.find_all('h4', class_="prod-title"):
            title = tag.text.replace('\n\n\t\t\t\t', " ").replace('\n', " ").strip()
            # eliminate duplicates while keeping a reproducible order
            if title not in seen:
                seen.add(title)
                unique_titles.append(title)
    return unique_titles

In [None]:
# Fetch the soup of each global skin-care listing page, then collect the
# unique product titles found on them.
global_soup = web.get_page_soup(global_list)
global_brands = get_brands(global_soup)

In [None]:
# save as a pickle
with open("global_brands.pickle", "wb") as f:
    pickle.dump(global_brands, f)

In [None]:
# Save the titles as a one-column csv.
# A separate name keeps `global_brands` a plain list, so this cell can be
# re-run without changing its own input.
global_brands_series = pd.Series(global_brands)
global_brands_series.to_csv('global_brands.csv', index=False)

## Face serums

### a) First 2 pages

* ~ 200 products in 6 pages

In [8]:
# Listing pages of the face-serums category: the first page carries no paging
# suffix, the next five advance the `No` offset in steps of 96.
url_face = [
    'https://www.ulta.com/skin-care-treatment-serums-face-serums?N=27he' + paging
    for paging in [''] + ['&No={}&Nrpp=96'.format(96 * page) for page in range(1, 6)]
]

In [None]:
# get soup for each page number 
pages_soup_face = web.get_page_soup(url_face)

In [7]:
# get url of each product on a page
face_url_list = web.create_product_url_list(pages_soup_face)

In [None]:
#print(web.get_products_page(face_url_list, 'face_page'))

In [None]:
#print(web.get_products_page(face_url_list[0:150], 'face_page2_150'))

In [None]:
#print(web.get_products_page(face_url_list[150:300], 'face_page2_300'))

In [None]:
#print(web.get_products_page(face_url_list[300:], 'face_page2_end'))

## Part 2: Retrieve soup for each product

In [17]:
# Test case: fetch only the first two products
# (the cells below repeat this for the entire list, in batches)
list_1 = web.get_products_page(products_url_list[0:2], 'test')

Progress 0
Progress 1


In [None]:
# Get 200
#print(get_products_page(products_url_list[0:200], 'Number_1_200'))

In [None]:
# Get 200
#print(get_products_page(products_url_list[200:400], 'Number_201_400'))

In [None]:
#Get 600
#print(get_products_page(products_url_list[400:1000], 'Number_401_1000'))

In [None]:
# get rest - ~350 (start at index 1000: the previous batch ended at index 999)
# update requests to 150 and sleep after 5 min
#print(get_products_page(products_url_list[1000:], 'Number_1001_end'))