# 03. Web scraping with selenium

### 1. Selenium basics

In [1]:
from selenium import webdriver
import time

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument("disable-gpu")

# chromedriver must be downloaded and located in same path
driver = webdriver.Chrome(executable_path='./chromedriver', options=options)
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
time.sleep(3)

print(driver.find_element_by_id("content").text)
driver.close()

Here is some important text you want to retrieve!
A button to click!


In [2]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(executable_path='./chromedriver', options=options)
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")

try:
    element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton")))
finally:
    print(driver.find_element_by_id("content").text)
    driver.close()

Here is some important text you want to retrieve!
A button to click!


In [3]:
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import StaleElementReferenceException

def wait_for_load(driver):
    elem = driver.find_element_by_tag_name("html")
    count = 0
    while True:
        count += 1
        if count > 20:
            print("Timing out after 10 seconds and returning")
            return
        
        time.sleep(.5)
        try:
            elem == driver.find_element_by_tag_name("html")
        except StaleElementReferenceException:
            return

driver = webdriver.Chrome(executable_path='./chromedriver', options=options)
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")

wait_for_load(driver)
driver.page_source

Timing out after 10 seconds and returning


'<html><head>\n<title>The Destination Page!</title>\n\n</head>\n<body>\nThis is the page you are looking for!\n\n</body></html>'

### 2. Scraping a shopping site with selenium

In [4]:
#-*- coding: utf-8 -*-

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
from datetime import datetime
import re

In [5]:
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument("disable-gpu")

# chromedriver must be downloaded and located in same path
dr = webdriver.Chrome('./chromedriver', options=options)
dr.get('http://store.musinsa.com/app/contents/onsale?d_cat_cd=&brand=&page_kind=onsale&list_kind=small&sort=pop&page=1')

html = dr.page_source

In [6]:
soup = BeautifulSoup(html, 'html.parser')

sale_pdt = soup.find_all('p', attrs={'class': 'list_info'})
sale_prc = soup.find_all('p', attrs={'class': 'price'})

In [7]:
pdt = []
prc = []

for i in range(0, len(sale_pdt)):
    pdt.append(sale_pdt[i].find('a').text.strip())
    prc.append(re.sub('[^0-9]','',sale_prc[i].text[10:]))

In [8]:
data = {'product_name': pdt,
        'price': prc}

dt = DataFrame(data)
print(dt.head())

dt.to_csv('Musinsa_discount_{0}.csv'.format(datetime.today().strftime('%Y%m%d')),encoding='utf-8',index=False)
dr.quit()

                     product_name price
0          오버사이즈 울 트렌치 코트 [BEIGE]      
1                오버사이즈 모던 체크 블레이저      
2  오버사이즈 울 트렌치 코트 [GREYISH BEIGE]      
3                 엔젤 와펜 집업 후드 그레이      
4          오버사이즈 울 트렌치 코트 [BLACK]      


### 3. Scraping wikibooks site

In [9]:
from selenium import webdriver
from bs4 import BeautifulSoup

import requests
import re
import time
import random
import json

In [10]:
def dynamic_get_page(url):
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument("disable-gpu")

        driver = webdriver.Chrome(executable_path='./chromedriver', options=options)
        driver.get(url)

        driver.execute_script("document.getElementById('load-more').style.width = '100%';")

        count = 0
        print("Scraping book list...")
        while driver.find_element_by_id("load-more").text != "더는 자료가 없습니다.":
            driver.find_element_by_css_selector("#load-more").click()
            count += 1
            time.sleep(random.randint(1, 3))

            print("Searching more books: {}".format(count))
        print("All book list saved!")

        html = driver.page_source
        driver.quit()
        return html

    except:
        driver.quit()
        return '0'

def scrape_list_page(html_text):
    soup = BeautifulSoup(html_text ,"lxml")
    for li in soup.find('ul', {'id':'front-book-list'}).find_all('li', {'class':'book-in-front'}):
        url = li.find('a').get('href')
        yield url

def scrape_detail_page(url):
    session = requests.Session()
    response = session.get(url).text

    root = BeautifulSoup(response, "lxml")
    normalize_space = lambda string: re.sub(r'\s+', ' ', string).strip()
    
    if root:
        try:
            book_info = {
                'url': url,
                'title': root.find("div", {'id':'content'}).find("h1", {'class':'main-title'}).text,
                'price': root.find("div", {'id':'content'}).find("ul", {'class':'book-info'}).find_all("li")[4].text.split('|')[0].strip().replace('원', '').replace(',', ''),
                'content': [normalize_space(p.text) for p in root.find("div", {'id':'toc'}).find('ul').find_all('li') if normalize_space(p.text)]
            }
        except AttributeError:
            book_info = None
    
    else:
        book_info = None
    
    return book_info

def json_save(file_name, data):
    with open(file_name+".json", "w", encoding="UTF-8-sig") as f:
        json.dump(data, fp=f, ensure_ascii=False)
    print("JSON data saved - filename: " + file_name)

def develop_data(html):
    print("Scraping book info...")
    book_list = dict()
    index = 0
    for url in scrape_list_page(html):
        book_info = scrape_detail_page(url)
        if book_info:
            book_list[str(index)] = book_info
            index += 1
            print("# of scraped book: {}".format(index))
        
        time.sleep(random.randint(1, 3))
    print("All book info saved!")
        
    print(book_list)
    json_save("wikibook_books", book_list)

In [11]:
html = dynamic_get_page("http://wikibook.co.kr/")

try:
    int(html)
    print('No data found')
except:
    develop_data(html)

Scraping book list...
Searching more books: 1
Searching more books: 2
Searching more books: 3
Searching more books: 4
Searching more books: 5
Searching more books: 6
Searching more books: 7
Searching more books: 8
Searching more books: 9
Searching more books: 10
Searching more books: 11
Searching more books: 12
Searching more books: 13
Searching more books: 14
Searching more books: 15
Searching more books: 16
Searching more books: 17
Searching more books: 18
All book list saved!
Scraping book info...
# of scraped book: 1
# of scraped book: 2
# of scraped book: 3
# of scraped book: 4
# of scraped book: 5
# of scraped book: 6
# of scraped book: 7
# of scraped book: 8
# of scraped book: 9
# of scraped book: 10
# of scraped book: 11
# of scraped book: 12
# of scraped book: 13
# of scraped book: 14
# of scraped book: 15
# of scraped book: 16
# of scraped book: 17
# of scraped book: 18
# of scraped book: 19
# of scraped book: 20
# of scraped book: 21
# of scraped book: 22
# of scraped book: 

KeyboardInterrupt: 