# 01. Web scraping basics

###  1. Robotparser

1. A file that tells crawlers which pages of the website may be crawled
   - file name: robots.txt
   - url address: http://ip(domain):80/robots.txt

2. Related Python library: urllib.robotparser

In [1]:
import urllib.robotparser

# parser object that downloads a site's robots.txt (read) and answers
# "may this user agent fetch this URL?" questions (can_fetch)
rp = urllib.robotparser.RobotFileParser()

In [2]:
# wikibook: point the parser at the site's robots.txt and download it
rp.set_url("http://wikibook.co.kr/robots.txt")
rp.read()

# ask whether the user agent "mybot" is allowed to fetch the top page
data = rp.can_fetch("mybot", "http://wikibook.co.kr")
print(data)

False


In [3]:
# google
# Use a fresh parser for every site: RobotFileParser keeps the rules (and the
# allow-all / disallow-all flags) collected by earlier read() calls, so a
# parser that already read another site's robots.txt would mix those rules
# into the answer for google.
rp = urllib.robotparser.RobotFileParser("http://www.google.com/robots.txt")
rp.read()

# ask whether the user agent "mybot" is allowed to fetch the top page
data = rp.can_fetch("mybot", "http://www.google.com")
print(data)

False


### 2. Usage of Regex

In [4]:
import re
  
# test data - one table cell taken from the tag structure of 'http://www.hanbit.co.kr'
data = '<td class="left"><a href="/store/books/look.php?p_code=B7198274060">book info</a></td>'

In [5]:
# extract '/store/books/look.php?p_code=B7198274060' (the href value) from data
# [^"]* stops at the closing quote, so the capture cannot run past the href
# value even when the tag has more attributes or the line has more tags
d1 = re.search(r'<a href="([^"]*)"', data).group(1)
d1

'/store/books/look.php?p_code=B7198274060'

In [6]:
# extract 'book info' (the visible text) from data by deleting every <...> tag
d2 = re.sub(r'<.*?>', '', data)
d2

'book info'

In [7]:
# extract the product code 'B7198274060' that follows 'p_code=' in d1
d3 = re.search(r'p_code=(.*)', d1).group(1)
d3
   

'B7198274060'

- 'a\nb' is not matched by the regular expression 'a.b', because the metacharacter (.) does not match \n by default  
- Use the re.DOTALL option to make (.) match the \n character as well

### 3. Introduction to request

- Getting web response information with the 'requests' library

In [8]:
import requests

In [9]:
# always pass a timeout: without one, requests can wait forever on a stalled server
r = requests.get('http://hanbit.co.kr', timeout=10)

In [10]:
# HTTP status code of the response
r.status_code

200

In [11]:
# all header fields of the response
r.headers

{'Date': 'Thu, 02 Sep 2021 11:02:59 GMT', 'Server': 'Apache', 'X-Powered-By': 'PHP/5.3.3', 'Set-Cookie': 'PHPSESSID=m4e17kf9dpljf9onjkc1k9b345; path=/, PHPSESSID=m4e17kf9dpljf9onjkc1k9b345; path=/; secure; SameSite=None', 'Expires': 'Thu, 19 Nov 1981 08:52:00 GMT', 'Cache-Control': 'no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Pragma': 'no-cache', 'Connection': 'close', 'Transfer-Encoding': 'chunked', 'Content-Type': 'text/html; charset=UTF-8'}

In [12]:
# extract the Content-Type value from the headers
# (requests looks header names up case-insensitively, so 'content-type' works)
r.headers['content-type']

'text/html; charset=UTF-8'

1. Requests to 'http://www.naver.com/' -> Response data is Korean
    : Encodings for Korean character
        utf-8/cp949/euc-kr
2. Requests to 'http://www.yahoo.com/' -> Response data is English
    : Encodings for Alphabets
        ascii/cp037/cp437

In [13]:
# text encoding that requests derived from the HTTP headers
r.encoding

'UTF-8'

- Opening URL

In [14]:
from urllib.request import urlopen

In [15]:
# the with-block closes the HTTP response (and its socket) once the body is read
with urlopen("http://pythonscraping.com/pages/page1.html") as html:
    # read() returns the raw, undecoded body as bytes
    print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


### 4. Simple scraping with SQLite

http://www.hanbit.co.kr/store/books/full_book_list.html

1. Understanding flow of scraping with python
   - All books list page -> Scrape link information of each book -> Save to DB
    

2. Saved information
   - Book title
   - Hyperlink url of a book
  
3. Process
   - Extract web page : fetch()
   - Scrape html : scrape()
   - Save data to DB : save()

4. Sample data to extract  
url : http://www.hanbit.co.kr/store/books/look.php?p_code=B7198274060  
title :  재미있고 빠른 한글 1권 : 기본 모음과 자음

In [16]:
import re
import sqlite3
import ssl
from urllib.parse import urljoin
from urllib.request import urlopen
from html import unescape

In [17]:
def fetch(url):
    """
    Download the page at 'url' and return it as decoded text.

    The character encoding is taken from the charset of the Content-Type
    header; "utf-8" is used when the header names no charset or names one
    that Python has no codec for.

    url: address of the page to download
    returns: HTML with 'str' type
    """
    # NOTE(review): certificate verification is switched off here (through a
    # private ssl helper), so HTTPS responses are not authenticated. Kept to
    # preserve behaviour; prefer ssl.create_default_context() when the target
    # sites have valid certificates.
    context = ssl._create_unverified_context()

    # the with-block closes the response (and its socket) even if read() fails
    with urlopen(url, context=context) as f:
        encoding = f.info().get_content_charset(failobj="utf-8")
        print("Encoding : ", encoding)
        raw = f.read()

    # decode characters based on extracted encoding
    try:
        html = raw.decode(encoding)
    except LookupError:
        # the server announced a charset name that Python does not know
        html = raw.decode("utf-8")

    print("All document scrapped")

    return html
    

def scrape(html, base_url="http://www.hanbit.co.kr"):
    """
    Extract book information from an html document with regex.

    Every '<td class="left"><a ...>...</td>' cell is treated as one book:
    the href of its link becomes the url, and the text that is left after
    removing all tags becomes the title.

    html: HTML document with 'str' type
    base_url: address that relative hrefs are resolved against
    returns: list of dicts, each with the keys 'url' and 'title'
    """
    books = []

    for partial_html in re.findall(r'<td class="left"><a.*?</td>', html, re.DOTALL):
        href = re.search(r'<a href="(.*?)"', partial_html)
        if href is None:
            # the cell has no '<a href="...">' link, so there is no url to store
            continue

        # attribute values are HTML-escaped too (e.g. '&amp;' in a query string)
        url = urljoin(base_url, unescape(href.group(1)))

        # DOTALL lets a tag that is broken across lines be removed as well
        title = re.sub(r'<.*?>', '', partial_html, flags=re.DOTALL)
        # drop the whitespace that surrounded the removed tags
        title = unescape(title).strip()

        print("Extracted url : ", url)
        print("Title : ", title)

        books.append({'url': url, 'title': title})

    return books
   

def save(db_path, books):
    """
    Save the book list to an SQLite database.

    The 'books' table is dropped and recreated on every call, so the
    database only ever holds the most recently saved list.

    db_path: path of the SQLite database file
    books: iterable of dicts with the keys 'title' and 'url'
    returns: None
    """
    conn = sqlite3.connect(db_path)
    try:
        # 'with conn' commits when the block succeeds and rolls back when it raises
        with conn:
            conn.execute('DROP TABLE IF EXISTS books')
            conn.execute('CREATE TABLE books(title text, url text)')
            # the named placeholders :title and :url are filled from each dict
            conn.executemany('INSERT INTO books VALUES (:title, :url)', books)
    finally:
        # 'with conn' does not close the connection, so close it on every path
        conn.close()

In [18]:
# run the whole pipeline: fetch() the full book list page, scrape() the
# url/title pairs out of it and save() them to the SQLite file 'books.db'
html = fetch('http://www.hanbit.co.kr/store/books/full_book_list.html')

books = scrape(html)
save('books.db', books)

Encoding :  utf-8
All document scrapped
Extracted url :  http://www.hanbit.co.kr/store/books/look.php?p_code=B4448108740
Title :  웹 브라우저 속 머신러닝 TensorFlow.js
Extracted url :  http://www.hanbit.co.kr/store/books/look.php?p_code=B6434597961
Title :  IT CookBook, 처음 만나는 전기기기(2판)
Extracted url :  http://www.hanbit.co.kr/store/books/look.php?p_code=B9923613168
Title :  모바일 UX/UI 디자인 강의 with Adobe XD
Extracted url :  http://www.hanbit.co.kr/store/books/look.php?p_code=B9575488572
Title :  처음 배우는 플라스크 웹 프로그래밍
Extracted url :  http://www.hanbit.co.kr/store/books/look.php?p_code=B9234684497
Title :  인문학 거저보기 : 서양철학 편
Extracted url :  http://www.hanbit.co.kr/store/books/look.php?p_code=B4633012179
Title :  IT CookBook, 현대 센서공학(5판)
Extracted url :  http://www.hanbit.co.kr/store/books/look.php?p_code=B9037188841
Title :  방구석 심리학 실험실
Extracted url :  http://www.hanbit.co.kr/store/books/look.php?p_code=B1265976064
Title :  IT CookBook, 모바일 웹 서비스 구현을 위한 JSP 웹 프로그래밍
Extracted url :  http://www.hanbit.

### 5. HTML handling

1. HTML has a tree structure
   - root -> html tag
   - each tag is called an 'element'
   - all child tags can be reached from the 'html' tag
   
2. lxml.html: An api to deal with tree of html document

In [19]:
import requests
import lxml.html

Topic of sample scraping: Extract permanent links and tables of contents from the 'new book list' page

- First page: http://www.hanbit.co.kr/store/books/new_book_list.html
- Detail page: http://www.hanbit.co.kr/store/books/look.php?p_code=B2417558084
- Table of contents: http://www.hanbit.co.kr/store/books/look.php?p_code=B2417558084  

In [20]:
# always pass a timeout: without one, requests can wait forever on a stalled server
response = requests.get("http://www.hanbit.co.kr/store/books/new_book_list.html", timeout=10)
response

<Response [200]>

In [21]:
# raw (undecoded) response body as bytes
response.content

b'<!DOCTYPE html>\r\n<html lang="ko">\r\n<head>\r\n<!--[if lte IE 8]>\r\n<script>\r\n  location.replace(\'/support/explorer_upgrade.html\');\r\n</script>\r\n<![endif]-->\r\n<meta charset="utf-8"/>\r\n<title>\xed\x95\x9c\xeb\xb9\x9b\xec\xb6\x9c\xed\x8c\x90\xeb\x84\xa4\xed\x8a\xb8\xec\x9b\x8c\xed\x81\xac</title>\r\n<link rel="shortcut icon" href="https://www.hanbit.co.kr/images/common/hanbit.ico"> \r\n<meta http-equiv="X-UA-Compatible" content="IE=Edge" />\r\n<meta property="og:type" content="website"/>\r\n<meta property="og:title" content="\xed\x95\x9c\xeb\xb9\x9b\xec\xb6\x9c\xed\x8c\x90\xeb\x84\xa4\xed\x8a\xb8\xec\x9b\x8c\xed\x81\xac"/>\r\n<meta property="og:description" content="\xeb\x8d\x94 \xeb\x84\x93\xec\x9d\x80 \xec\x84\xb8\xec\x83\x81, \xeb\x8d\x94 \xeb\x82\x98\xec\x9d\x80 \xeb\xaf\xb8\xeb\x9e\x98\xeb\xa5\xbc \xec\x9c\x84\xed\x95\x9c \xec\x95\x84\xec\x8b\x9c\xec\x95\x84 \xec\xb6\x9c\xed\x8c\x90 \xeb\x84\xa4\xed\x8a\xb8\xec\x9b\x8c\xed\x81\xac :: \xed\x95\x9c\xeb\xb9\x9b\xeb\xaf\

In [22]:
# parse the raw bytes into an element tree; `root` is its top-level element
root = lxml.html.fromstring(response.content)
root

<Element html at 0x7fe6a044f228>

In [23]:
# convert every relative link in the tree to an absolute URL,
# using the URL of the response as the base
root.make_links_absolute(response.url)

# root.cssselect() needs the separate 'cssselect' package to be installed
# selector: <a> elements inside class 'book_tit' inside class 'view_box'
for a in root.cssselect('.view_box .book_tit a'):
    url = a.get('href')
    print(url)

https://www.hanbit.co.kr/store/books/look.php?p_code=B4448108740
https://www.hanbit.co.kr/store/books/look.php?p_code=B6434597961
https://www.hanbit.co.kr/store/books/look.php?p_code=B9923613168
https://www.hanbit.co.kr/store/books/look.php?p_code=B9234684497
https://www.hanbit.co.kr/store/books/look.php?p_code=B9575488572
https://www.hanbit.co.kr/store/books/look.php?p_code=B4633012179
https://www.hanbit.co.kr/store/books/look.php?p_code=B9037188841
https://www.hanbit.co.kr/store/books/look.php?p_code=B1265976064
https://www.hanbit.co.kr/store/books/look.php?p_code=B6220676898
https://www.hanbit.co.kr/store/books/look.php?p_code=B6156831644
https://www.hanbit.co.kr/store/books/look.php?p_code=B9134095146
https://www.hanbit.co.kr/store/books/look.php?p_code=B8955111301
https://www.hanbit.co.kr/store/books/look.php?p_code=B9824309282
https://www.hanbit.co.kr/store/books/look.php?p_code=B6758255857
https://www.hanbit.co.kr/store/books/look.php?p_code=B9737048306
https://www.hanbit.co.kr/

### 6. Session

1. Session
   - Describes and manages user information from log-in to log-out
   - Maintains state
   - Continuous connection with the server
   - Why use sessions in scraping?
      - Continuous connection with the server
      - Saves server resources
   
2. Refactoring
   - Rewrite code to be optimal while keeping the same operation and function

In [24]:
import requests
import lxml.html
import re
import time
import random

Extend the above code: refactoring

Process
   - Create session object: requests.Session()
   - Scraping: get()
   - Extract data with lxml: lxml.html.fromstring()

In [25]:
def scrape_list_page(response):
    """
    Yield the href of every book link found on the list page.

    response: response object of the list page; its .content is parsed and
              its .url is used as the base for making links absolute
    """
    tree = lxml.html.fromstring(response.content)
    # turn relative hrefs into absolute ones before they are handed out
    tree.make_links_absolute(response.url)

    yield from (anchor.get('href') for anchor in tree.cssselect('.view_box .book_tit a'))

def scrape_detail_page(response):
    r"""
    Extract the details of one book from its detail page.

    detailed information is processed by dict type
    key - url, title, price, content
    value - text of the elements found with cssselect()

    'title' and 'price' are None when the page has no matching element.
    'content' holds the text of every matching paragraph, where
    1. multiple spaces between characters are collapsed:
       re.sub(r'\s+', ' ', data) and strip()
    2. meaningless (empty) paragraphs are removed: if statement

    response: response object of the detail page; its .content is parsed and
              its .url is stored in the result
    returns: dict with the keys 'url', 'title', 'price' and 'content'
    """
    root = lxml.html.fromstring(response.content)

    def normalize_space(string):
        # collapse every run of whitespace to one space and trim both ends
        return re.sub(r'\s+', ' ', string).strip()

    def first_text(selector):
        # cssselect() return type: list (empty when nothing matches)
        # text_content() return type: str
        found = root.cssselect(selector)
        return found[0].text_content() if found else None

    # normalize each paragraph once and keep only the non-empty results
    content = []
    for p in root.cssselect('#tabs_3 .hanbit_edit_view p'):
        text = normalize_space(p.text_content())
        if text:
            content.append(text)

    # NOTE(review): the first <h3> of the page may be a section heading rather
    # than the book title (the sample output shows 'BOOK') - confirm selector.
    bookInfo = {
        'url': response.url,
        'title': first_text('h3'),
        'price': first_text('.pbr del'),
        'content': content,
    }

    return bookInfo

In [None]:
# the with-block closes the session (and the connections it keeps open)
# when the crawl ends, even if a request or a parse step raises
with requests.Session() as session:
    # timeout: never wait forever on a stalled server
    response = session.get("http://www.hanbit.co.kr/store/books/new_book_list.html", timeout=10)

    # scrape_list_page() is a generator: detail URLs are produced one by one
    urls = scrape_list_page(response)
    for url in urls:
        response = session.get(url, timeout=10)
        book_info = scrape_detail_page(response)
        print(book_info)
        # wait 1-3 seconds between requests to keep the load on the server low
        time.sleep(random.randint(1, 3))

{'url': 'https://www.hanbit.co.kr/store/books/look.php?p_code=B4448108740', 'title': 'BOOK', 'price': '25,000원', 'content': ['1부 머신러닝의 원리와 TensorFlow.js 사용법', 'CHAPTER 1 웹에서의 머신러닝', '_1.1 개발 환경', '_1.2 머신러닝을 웹에서 돌리는 이유', '_1.3 연산 그래프', '_1.3 연산 그래프 시각화하기', '_1.4 TensorFlow.js란?', '_1.5 TensorFlow.js 설치하기', '_1.6 저수준 API', '_1.7 Layers API', '_1.8 마치며', '_1.9 연습 문제', '_1.10 더 읽을거리', 'CHAPTER 2 사전 학습된 모델을 TensorFlow.js로 가져오기', '_2.1 개발 환경', '_2.2 포터블 모델 형식', '_2.3 텐서플로에서 모델 내보내기', '_2.4 tfjs-converter를 사용하여 모델 변환하기', '_2.5 TensorFlow.js에서 모델 불러오기', '_2.6 마치며', '_2.7 연습 문제', '_2.8 더 읽을거리', 'CHAPTER 3 TensorFlow.js 에코시스템', '_3.1 개발 환경', '_3.2 왜 고수준 라이브러리가 필요한가?', '_3.3 기존 모델 사용하기', '_3.4 다양한 종류의 스토리지에서 데이터 불러오기', '_3.5 ML_5.js를 이용한 자세 추정', '_3.6 Magenta.js로 고양이 그리기', '_3.7 machinelearn_.js를 사용한 XOR 분류', '_3.8 마치며', '_3.9 연습 문제', '_3.10 더 읽을거리', '2부 TensorFlow.js를 활용한 실제 애플리케이션 사례', 'CHAPTER 4 다항 회귀', '_4.1 개발 환경', '_4.2 다항 회귀란?', '_4.3 2차원 곡선 피팅', '_4.4 마치며', '_4.5 연습 문제', '_4.6 더 읽을거리', '