ⓒWeb Scraping with Python / 라이언 미첼 저

#### Web Scraping(웹 스크레이핑): 데이터를 수집하는 작업 전체
<sup>API를 활용하는 프로그램이나 사람이 직접 웹 브라우저를 조작하는 방법 제외</sup>
- 데이터 분석, 자연어 처리, 정보 보안 등 다양한 프로그래밍 테크닉과 기술을 포괄.
- 프로그램을 만들어 웹 서버에 쿼리를 보내 데이터(HTML 등)를 요청하고, 이를 파싱해 필요한 정보를 추출하는 작업을 자동으로 함.

[urllib 라이브러리](https://docs.python.org/3/library/urllib.html): 웹을 통해 데이터를 요청하는 함수, 쿠키를 처리하는 함수, 헤더나 유저 에이전트 등 메타데이터를 바꾸는 함수 등.
- urlopen: 네트워크를 통해 원격 객체를 읽음. html파일이나 이미지 파일, 기타 파일 스트림을 열 수 있는 매우 범용적인 함수.

In [None]:
from urllib.request import urlopen

# Open the remote page; the returned response object is read below.
html = urlopen('http://python.cyber.co.kr/pds/books/python2nd/test2.html')
# Print the response body as returned by read().
print(html.read())

#### BeautifulSoup: 잘못된 HTML을 수정하여 XML 형식의 파이썬 객체로 변환
아름다운 수프, 풍부한 녹색 <br>
그릇에서 기다리거라! <br>
누가 이 맛있는 것에 숙이지 않으리? <br>
저녁 수프, 아름다운 수프! <br>
    - Alice in Wonderland
    
BeautifulSoup(객체의 근간인 HTML 텍스트, 구문 분석기)

In [None]:
from bs4 import BeautifulSoup

html = urlopen('http://python.cyber.co.kr/pds/books/python2nd/test2.html')
# Alternative form that passes the already-read markup instead of the response:
# bs = BeautifulSoup(html.read(), "html.parser")
bs = BeautifulSoup(html, "html.parser")
print(bs)
# The same tag can also be reached through its ancestors:
# bs.html.body.h1, bs.body.h1 or bs.html.h1.
print(bs.h1) # bs.html.body.h1 || bs.body.h1 || bs.html.h1

In [None]:
# lxml: 닫히지 않은 태그, 계층 구조가 잘못된 태그, <head><body>태그가 없는 등의 문제를 수정해줌.
! pip install lxml
bs = BeautifulSoup(html.read(), 'lxml')
print(bs)

In [None]:
# html5lib: 더 많은 문제를 해결해주나 느림.
! pip install html5lib
bs = BeautifulSoup(html.read(), 'html5lib')
print(bs)

- 페이지를 찾을 수 없거나, URL 해석에 에러가 생긴 경우 <br>
"404 Page Not Found", "500 Internal Server Error" => HTTPError
- 서버를 찾을 수 없는 경우 <br>
=> URLError
- 존재하지 않는 태그에 접근하는 경우 <br>
=> None 객체를 반환하며, 그 None 객체에서 다시 태그에 접근하면 AttributeError

In [None]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

def getTitle(url):
    """Fetch *url* and return the first <h2> tag found inside its <body>.

    Returns None when the page cannot be retrieved (an HTTP error status or an
    unreachable server), when the document has no <body>, or when the body
    contains no <h2> tag.
    """
    # Imported locally so this function is self-contained. HTTPError is a
    # subclass of URLError, so this single handler covers both failure modes.
    from urllib.error import URLError
    try:
        html = urlopen(url)
    except URLError:
        return None
    try:
        bs = BeautifulSoup(html.read(), 'html.parser')
        title = bs.body.h2
    except AttributeError:
        # bs.body is None when the document has no <body> tag.
        return None
    return title

title = getTitle('http://python.cyber.co.kr/pds/books/python2nd/test2.html')
# Compare against None by identity, not equality.
if title is None:
    print('Title could not be found')
else:
    print(title)

findAll(tagName, tagAttributes)

- findAll(tag, attributes, recursive, text, limit, keywords)
- find(tag, attributes, recursive, text, keywords)

recursive는 True(기본값, 모든 자손 태그를 검색) 또는 False(최상위 태그만 찾음) <br>
ex) <br>
bs.findAll({'h1', 'h2', 'h3', 'h4', 'h5'})
bs.findAll('span', {'class': {'green', 'red'}})

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')
bs = BeautifulSoup(html.read(), 'html.parser')
# Collect every <span class="green"> tag.
nameList = bs.findAll('span', {'class':'green'})
for name in nameList:
    # get_text must be *called*; printing the bare attribute shows the bound
    # method object instead of the tag's text.
    print(name.get_text())

In [None]:
# Search by text content instead of by tag name, then count the matches.
nameList = bs.findAll(text = 'the prince')
print(len(nameList))

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html= urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html.read(), 'html.parser')

# Children: iterate over the nodes directly under the giftList table.
print('자식과 자손')
for child in bs.find('table', {'id': 'giftList'}).children:
    print(child)

# Siblings: every node that follows the table's first <tr>.
print('형제 다루기')
for sibling in bs.find('table', {'id': 'giftList'}).tr.next_siblings:
    print(sibling)

# Parent: from the image, go up to its parent and read the text of the node
# immediately before that parent.
print('부모 다루기:', bs.find('img', {'src': '../img/gifts/img1.jpg'}).parent.previous_sibling.get_text())

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html.read(), 'html.parser')
# Match src values that start with ../img/gifts/img and end with .jpg.
# A raw string keeps the backslashes as regex escapes; in a normal string
# literal they are invalid escape sequences and trigger a warning.
images = bs.findAll('img', {'src': re.compile(r'\.\.\/img\/gifts/img.*\.jpg')})
for image in images:
    print(image['src'])

In [None]:
# Extract every tag that has exactly two attributes.
bs.findAll(lambda tag: len(tag.attrs) == 2)

In [None]:
# Find every tag whose full text equals the given sentence.
bs.findAll(lambda tag: tag.get_text() == 'Or maybe he\'s only resting?')

In [None]:
# 임의의 위키백과 페이지를 가져와 페이지에 들어있는 링크 목록을 가져옴
from urllib.request import urlopen
from bs4 import BeautifulSoup 

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
# Print the href of every anchor tag that actually has one.
for link in bs.find_all('a'):
    if 'href' in link.attrs:
        print(link.attrs['href'])

In [None]:
# 정규 표현식 사용 => 항목 페이지를 가리키는 링크만 가져옴.
from urllib.request import urlopen 
from bs4 import BeautifulSoup 
import re

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
# Restrict the search to the bodyContent div and to hrefs that start with
# /wiki/ and contain no colon.
for link in bs.find('div', {'id':'bodyContent'}).find_all(
    'a', href=re.compile('^(/wiki/)((?!:).)*$')):
    if 'href' in link.attrs:
        print(link.attrs['href'])

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

random.seed(datetime.datetime.now().timestamp())

# Takes a Wikipedia article path of the form /wiki/<article_name> and returns
# all article links found in that page's body.
def getLinks(articleUrl):
    """Return the anchor tags in the article body that point to other articles.

    Only hrefs that start with /wiki/ and contain no colon are matched.
    """
    page = urlopen('http://en.wikipedia.org{}'.format(articleUrl))
    soup = BeautifulSoup(page, 'html.parser')
    articleBody = soup.find('div', {'id':'bodyContent'})
    articleHref = re.compile('^(/wiki/)((?!:).)*$')
    return articleBody.find_all('a', href=articleHref)

# Random walk: keep hopping to a randomly chosen linked article until a page
# with no article links is reached.
links = getLinks('/wiki/Kevin_Bacon')
while links:
    # randrange(n) draws from 0..n-1, the same range as randint(0, n-1).
    newArticle = links[random.randrange(len(links))].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)

In [None]:
# 전체 사이트 크롤링, 중복 크롤링 없이.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set() # set: unordered, no duplicates
def getLinks(pageUrl):
    """Print details of the page at *pageUrl*, then recurse into new /wiki/ links.

    Each href not already in the global ``pages`` set is added to it and
    crawled immediately, so every page is visited at most once.
    """
    # NOTE(review): recursion depth grows with every newly discovered page;
    # a long crawl may hit Python's recursion limit - confirm this is acceptable.
    global pages
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    try:
        # Title, first paragraph of the content, and the edit link.
        print(bs.h1.get_text())
        print(bs.find(id ='mw-content-text').find_all('p')[0])
        print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        # One of the lookups above returned None.
        print('This page is missing something! Continuing.')

    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print('-'*20)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
# Start from the empty path.
getLinks('') 

In [None]:
# 인터넷 크롤링, http://oreilly.com에서 시작해 외부 링크에서 외부 링크로 무작위 이동.
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now().timestamp())

# Retrieves a list of all internal links found on a page.
def getInternalLinks(bs, includeUrl):
    """Return the absolute URLs of all internal links found in *bs*.

    A link is internal when its href starts with "/" or contains the
    scheme://host of *includeUrl*. Relative hrefs are made absolute by
    prefixing scheme://host. The result keeps first-seen order and holds no
    duplicates.
    """
    parsed = urlparse(includeUrl)
    includeUrl = '{}://{}'.format(parsed.scheme, parsed.netloc)
    internalLinks = []
    # re.escape stops the dots in the host name from acting as wildcards.
    pattern = re.compile('^(/|.*' + re.escape(includeUrl) + ')')
    for link in bs.find_all('a', href=pattern):
        href = link.attrs['href']
        if href is None:
            continue
        # Normalise BEFORE the duplicate check: the list stores absolute URLs,
        # so comparing the raw relative href would never detect a repeat.
        if href.startswith('/'):
            href = includeUrl + href
        if href not in internalLinks:
            internalLinks.append(href)
    return internalLinks
            
# Retrieves a list of all external links found on a page.
def getExternalLinks(bs, excludeUrl):
    """Return the unique external hrefs found in *bs*, in first-seen order.

    A link is external when its href starts with "http" or "www" and does not
    contain *excludeUrl* anywhere after that prefix.
    """
    externalLinks = []
    # re.escape makes the excluded host match literally; unescaped, "a.com"
    # would also exclude unrelated hosts such as "axcom.org".
    pattern = re.compile('^(http|www)((?!' + re.escape(excludeUrl) + ').)*$')
    for link in bs.find_all('a', href=pattern):
        href = link.attrs['href']
        if href is not None and href not in externalLinks:
            externalLinks.append(href)
    return externalLinks

def getRandomExternalLink(startingPage):
    """Return one randomly chosen external link reachable from *startingPage*.

    If the page has no external links, a random internal link is followed
    and the search is repeated from there.
    """
    bs = BeautifulSoup(urlopen(startingPage), 'html.parser')
    location = urlparse(startingPage)
    externalLinks = getExternalLinks(bs, location.netloc)
    if len(externalLinks) != 0:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

    print('No external links, looking around the site for one')
    domain = '{}://{}'.format(location.scheme, location.netloc)
    internalLinks = getInternalLinks(bs, domain)
    nextPage = internalLinks[random.randint(0, len(internalLinks)-1)]
    return getRandomExternalLink(nextPage)
    
def followExternalOnly(startingSite):
    """Hop from external link to external link forever, printing each hop.

    Written as a loop rather than a self-call: the walk never terminates on
    its own, so recursion would eventually end in RecursionError.
    """
    while True:
        externalLink = getRandomExternalLink(startingSite)
        print('Random external link is: {}'.format(externalLink))
        startingSite = externalLink
            
followExternalOnly('http://oreilly.com')

In [None]:
# Collects a list of all external URLs found on the site
# 사이트에서 찾은 외부 URL을 모두 리스트로 수집함.
allExtLinks = set()
allIntLinks = set()

def getAllExternalLinks(siteUrl):
    """Crawl the site at *siteUrl*, printing each external link once.

    New external links are recorded in the global ``allExtLinks``; new
    internal links are recorded in ``allIntLinks`` and crawled recursively.
    """
    html = urlopen(siteUrl)
    parsed = urlparse(siteUrl)
    domain = '{}://{}'.format(parsed.scheme, parsed.netloc)
    bs = BeautifulSoup(html, 'html.parser')
    internalLinks = getInternalLinks(bs, domain)
    # Exclude by host name only, as getRandomExternalLink does. The
    # external-link pattern consumes the leading "http" before testing the
    # exclusion, so a value that includes the scheme would never match and
    # same-site absolute links would be reported as external.
    externalLinks = getExternalLinks(bs, parsed.netloc)

    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)

allIntLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')

In [None]:
# 레이아웃 다루기
import requests
from bs4 import BeautifulSoup
class Content:
    """Holds one scraped article: its URL, title and body text."""

    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body


def getPage(url):
    """Download *url* with requests and return it parsed by html.parser."""
    response = requests.get(url)
    markup = response.text
    return BeautifulSoup(markup, 'html.parser')


def scrapeNYTimes(url):
    """Scrape the page at *url* into a Content object.

    The title is the text of the first <h1>; the body is the text of every
    paragraph matched by the story-column selector, joined by newlines.
    """
    page = getPage(url)
    headline = page.find('h1').text
    paragraphs = page.select('div.StoryBodyCompanionColumn div p')
    story = '\n'.join(paragraph.text for paragraph in paragraphs)
    return Content(url, headline, story)

def scrapeBrookings(url):
    """Scrape the page at *url* into a Content object.

    The title is the text of the first <h1>; the body is the text of the
    first <div class="post-body">.
    """
    bs = getPage(url)
    title = bs.find('h1').text
    # The attribute filter must be a dict ({name: value}). The previous
    # {'class', 'post-body'} was a set literal, not a mapping.
    body = bs.find('div', {'class': 'post-body'}).text
    return Content(url, title, body)


# Scrape one Brookings article and print its fields.
url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'
content = scrapeBrookings(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

# Scrape one New York Times article and print its fields the same way.
url = 'https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html'
content = scrapeNYTimes(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

In [None]:
# 검색을 통한 사이트 크롤링

class Content:
    """Common base class for all articles/pages"""

    def __init__(self, topic, url, title, body):
        self.topic, self.url = topic, url
        self.title, self.body = title, body

    def print(self):
        """Flexible printing function controls output"""
        report = (
            'New article found for topic: {}'.format(self.topic),
            'URL: {}'.format(self.url),
            'TITLE: {}'.format(self.title),
            'BODY:\n{}'.format(self.body),
        )
        # One print call per line, in the fixed order above.
        for line in report:
            print(line)
        
class Website:
    """Contains information about website structure"""

    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
        # Site identity.
        self.name, self.url = name, url
        # How to run a search and pick results out of the listing page.
        self.searchUrl, self.resultListing = searchUrl, resultListing
        self.resultUrl, self.absoluteUrl = resultUrl, absoluteUrl
        # Selectors for the title and body of an article page.
        self.titleTag, self.bodyTag = titleTag, bodyTag
        
import requests
from bs4 import BeautifulSoup

class Crawler:
    """Searches websites for a topic and prints every article it can parse."""

    def getPage(self, url):
        """Return the parsed page at *url*, or None if the request fails."""
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """Return the text of the first element matching *selector*, or ''."""
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ''

    def search(self, topic, site):
        """Searches a given website for a given topic and records all pages found"""
        bs = self.getPage(site.searchUrl + topic)
        if bs is None:
            # The search page itself could not be fetched; nothing to iterate.
            print('Something was wrong with that page or URL. Skipping!')
            return
        for result in bs.select(site.resultListing):
            anchors = result.select(site.resultUrl)
            if not anchors:
                # This result entry has no link matching the selector.
                continue
            url = anchors[0].attrs['href']
            # Check to see whether it's a relative or an absolute URL
            if site.absoluteUrl:
                page = self.getPage(url)
            else:
                page = self.getPage(site.url + url)
            if page is None:
                # Skip only this result; the remaining results are still tried.
                print('Something was wrong with that page or URL. Skipping!')
                continue
            title = self.safeGet(page, site.titleTag)
            body = self.safeGet(page, site.bodyTag)
            if title != '' and body != '':
                # Argument order must follow Content(topic, url, title, body).
                content = Content(topic, url, title, body)
                content.print()


crawler = Crawler()

# One row per site, in the positional order of Website.__init__:
# name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag.
siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=',
        'article.product-result', 'p.title a', True, 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content',
        'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',
        'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']
]
# Turn each row into a Website object.
sites = []
for row in siteData:
    sites.append(Website(row[0], row[1], row[2],
                         row[3], row[4], row[5], row[6], row[7]))

# Search every site for every topic.
topics = ['python', 'data science']
for topic in topics:
    print('GETTING INFO ABOUT: ' + topic)
    for targetSite in sites:
        crawler.search(topic, targetSite)

In [None]:
# 링크를 통한 사이트 크롤링
class Website:
    """Describes one site to crawl: where it lives, which links to follow,
    and which selectors hold an article's title and body."""

    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        self.name, self.url = name, url
        self.targetPattern, self.absoluteUrl = targetPattern, absoluteUrl
        self.titleTag, self.bodyTag = titleTag, bodyTag


class Content:
    """Holds one scraped page (URL, title, body) and knows how to print it."""

    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body

    def print(self):
        """Print the URL, title and body, each under its own label."""
        report = (
            'URL: {}'.format(self.url),
            'TITLE: {}'.format(self.title),
            'BODY:\n{}'.format(self.body),
        )
        for line in report:
            print(line)
        
import re


class Crawler:
    """Crawls the links on one site's home page and prints each parsed page."""

    def __init__(self, site):
        self.site = site
        # hrefs already handled, stored exactly as they appear in the markup.
        self.visited = []

    def getPage(self, url):
        """Return the parsed page at *url*, or None if the request fails."""
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """Return the newline-joined text of all matches for *selector*, or ''."""
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, url):
        """Fetch *url* and print it if both a title and a body are found."""
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, self.site.titleTag)
            body = self.safeGet(bs, self.site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

    def crawl(self):
        """Get pages from website home page"""
        bs = self.getPage(self.site.url)
        if bs is None:
            # The home page could not be fetched, so there are no links to
            # follow. Mirrors parse(), which also skips silently on None.
            return
        targetPages = bs.findAll('a', href=re.compile(self.site.targetPattern))
        for targetPage in targetPages:
            targetPage = targetPage.attrs['href']
            if targetPage not in self.visited:
                self.visited.append(targetPage)
                if not self.site.absoluteUrl:
                    targetPage = '{}{}'.format(self.site.url, targetPage)
                self.parse(targetPage)


# Describe the Reuters site, then crawl the links found on its home page.
reuters = Website('Reuters', #Website.name
                  'https://www.reuters.com', #Website.url
                  '^(/article/)', #Website.targetPattern
                  False, #Website.absoluteUrl
                  'h1', #Website.titleTag
                  'div.StandardArticleBody_body_1gnLA') #Website.bodyTag
crawler = Crawler(reuters)
crawler.crawl()

In [None]:
# 원격 url 파일 내려받기
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com')
bs = BeautifulSoup(html, 'html.parser')
# Take the src of the <img> inside the <a id="logo"> element.
imageLocation = bs.find('a', {'id': 'logo'}).find('img')['src']
# Save the file at that location locally as logo.jpg.
# NOTE(review): assumes src is an absolute URL - confirm against the page.
urlretrieve (imageLocation, 'logo.jpg')

In [None]:
# 페이지에서 src 속성이 있는 태그에 연결된 내부 파일을 모두 내려받음.
import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

downloadDirectory = 'downloaded'
baseUrl = 'http://pythonscraping.com'

def getAbsoluteURL(baseUrl, source):
    """Normalise *source* to an absolute http:// URL without a "www." prefix.

    Returns None when the resulting URL does not contain *baseUrl*, i.e. the
    file is not hosted on the site being downloaded.
    """
    if source.startswith('http://www.'):
        url = 'http://{}'.format(source[11:])
    elif source.startswith('http://'):
        url = source
    elif source.startswith('www.'):
        # Build the URL from the stripped value. Formatting the original
        # `source` kept the "www." prefix, so these links never matched baseUrl.
        url = 'http://{}'.format(source[4:])
    else:
        # Anything else is treated as a path relative to baseUrl.
        url = '{}/{}'.format(baseUrl, source)
    if baseUrl not in url:
        return None
    return url

def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map *absoluteUrl* to a local path under *downloadDirectory*.

    The "www." prefix and *baseUrl* are removed from the URL and the rest is
    appended to *downloadDirectory*. The parent directory of the resulting
    path is created if it does not exist yet.
    """
    path = absoluteUrl.replace('www.', '')
    path = path.replace(baseUrl, '')
    path = downloadDirectory + path
    directory = os.path.dirname(path)

    # dirname is '' for a bare file name, and makedirs('') raises.
    # exist_ok=True avoids the check-then-create race.
    if directory:
        os.makedirs(directory, exist_ok=True)

    return path

html = urlopen('http://www.pythonscraping.com')
bs = BeautifulSoup(html, 'html.parser')
# Every tag that carries a src attribute.
downloadList = bs.findAll(src=True)

for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download['src'])
    if fileUrl is not None:
        print(fileUrl)
        # Download inside the loop, and only for internal files. Outside the
        # loop only the last file was fetched, and a trailing external file
        # (fileUrl is None) made the call fail.
        urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))

In [None]:
# CSV(Comma-separated values): 쉼표로 구분된 값
import csv

# Creates test.csv if it does not exist; silently overwrites it if it does.
# newline='' lets the csv module control line endings itself (otherwise extra
# blank rows appear on platforms that translate '\n'), and the with-block
# closes the file even if a write fails.
with open('test.csv', 'w+', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(('number', 'number plus 2', 'number times 2'))
    for i in range(10):
        writer.writerow((i, i+2, i*2))

In [None]:
# HTML 테이블을 가져와 CSV 파일 만들기
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://en.wikipedia.org/wiki/Comparison_of_text_editors')
bs = BeautifulSoup(html, 'html.parser')
# The main comparison table is currently the first table on the page
table = bs.findAll('table',{'class':'wikitable'})[0]
rows = table.findAll('tr')

# newline='' is the mode the csv module expects for files it writes; the
# with-block guarantees the file is closed even if a row fails to serialise.
with open('editors.csv', 'wt+', encoding='utf-8', newline='') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # One CSV row per table row; header and data cells are both kept.
        csvRow = [cell.get_text() for cell in row.findAll(['td', 'th'])]
        writer.writerow(csvRow)

In [None]:
# MySQL과 연결
# !pip install pymysql
import pymysql

# Connection object: handles the database connection, sending information,
# rollbacks, and creating cursor objects.
conn = pymysql.connect(host='127.0.0.1',
                       user='root', passwd='1234', db='mysql')
# Cursor object: tracks state such as which database is in use, and also holds
# the result of the last query.
cur = conn.cursor()
cur.execute('USE scraping')
cur.execute("SELECT * FROM pages WHERE id=1")
print(cur.fetchone()) # fetchone(): access the query result
cur.close()
conn.close() # closing prevents a connection leak

In [None]:
# 위키백과의 이것저것 데이터베이스에 저장
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import pymysql
import re

# Open the connection and a cursor, then switch to the scraping database.
conn = pymysql.connect(host='127.0.0.1',
                       user='root', passwd='1234', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute('USE scraping')

# Seed the random generator from the current time.
random.seed(datetime.datetime.now().timestamp())

def store(title, content):
    """Insert one (title, content) row into the pages table and commit."""
    # Placeholders must NOT be wrapped in quotes: the driver quotes and
    # escapes each parameter itself, so "%s" stored the values with extra
    # literal quote characters around them.
    cur.execute('INSERT INTO pages (title, content) VALUES (%s, %s)', (title, content))
    cur.connection.commit()

def getLinks(articleUrl):
    """Store the article's title and first paragraph, then return its links.

    The title is the text of the first <h1>; the content is the text of the
    first <p> inside the mw-content-text div. Both are passed to store().
    Returns the anchors inside bodyContent whose href starts with /wiki/ and
    contains no colon.
    """
    html = urlopen('http://en.wikipedia.org'+articleUrl)
    bs = BeautifulSoup(html, 'html.parser')
    title = bs.find('h1').get_text()
    content = bs.find('div', {'id':'mw-content-text'}).find('p').get_text()
    store(title, content)
    return bs.find('div', {'id':'bodyContent'}).findAll('a', href=re.compile('^(/wiki/)((?!:).)*$'))

# Random walk over linked articles; getLinks stores every page it visits.
links = getLinks('/wiki/Kevin_Bacon')
try:
    while len(links) > 0:
         newArticle = links[random.randint(0, len(links)-1)].attrs['href']
         print(newArticle)
         links = getLinks(newArticle)
finally:
    # Always release the cursor and the connection, even on error/interrupt.
    cur.close()
    conn.close()

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pymysql
from random import shuffle

# Open the connection and a cursor, then switch to the wikipedia database.
conn = pymysql.connect(host='127.0.0.1',
                       user='root', passwd='1234', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute('USE wikipedia')

# Stores a page the first time it is seen (no duplicates) and doubles as the
# lookup for the page id needed when creating a new link.
def insertPageIfNotExists(url):
    """Return the id of the pages row for *url*, inserting it if missing."""
    # (url,) is a real one-element tuple; the previous (url) was just a
    # parenthesised string.
    cur.execute('SELECT * FROM pages WHERE url = %s', (url,))
    if cur.rowcount == 0:
        cur.execute('INSERT INTO pages (url) VALUES (%s)', (url,))
        conn.commit()
        return cur.lastrowid
    else:
        # presumably the id is the first column of pages - verify against schema
        return cur.fetchone()[0]

# Not strictly needed in theory; used to decide whether a new page still has
# to be visited.
def loadPages():
    """Return the second column of every row in the pages table as a list."""
    cur.execute('SELECT * FROM pages')
    return [record[1] for record in cur.fetchall()]

# Records a link in the database; repeated runs do not create duplicate rows.
def insertLink(fromPageId, toPageId):
    """Insert the (fromPageId, toPageId) link unless it is already stored."""
    edge = (int(fromPageId), int(toPageId))
    cur.execute('SELECT * FROM links WHERE fromPageId = %s AND toPageId = %s', edge)
    if cur.rowcount != 0:
        # Already recorded; nothing to do.
        return
    cur.execute('INSERT INTO links (fromPageId, toPageId) VALUES (%s, %s)', edge)
    conn.commit()
def pageHasLinks(pageId):
    """Return True if at least one stored link starts from *pageId*."""
    cur.execute('SELECT * FROM links WHERE fromPageId = %s', int(pageId))
    return cur.rowcount != 0

def getLinks(pageUrl, recursionLevel, pages):
    """Record the links of *pageUrl* in the database, recursing into new pages.

    Recursion stops once *recursionLevel* exceeds 4. Each linked page that has
    no outgoing links stored yet is appended to *pages* and crawled one level
    deeper.
    """
    if recursionLevel > 4:
        return

    pageId = insertPageIfNotExists(pageUrl)
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    # Article links only: href starts with /wiki/ and contains no colon.
    links = bs.findAll('a', href=re.compile('^(/wiki/)((?!:).)*$'))
    links = [link.attrs['href'] for link in links]

    for link in links:
        linkId = insertPageIfNotExists(link)
        insertLink(pageId, linkId)
        if not pageHasLinks(linkId):
            # A new page was encountered: add it and search its links.
            print("PAGE HAS NO LINKS: {}".format(link))
            pages.append(link)
            getLinks(link, recursionLevel+1, pages)
        
        
# Release the cursor and connection even if the crawl fails or is interrupted,
# matching the try/finally cleanup used by the earlier database cell.
try:
    getLinks('/wiki/Kevin_Bacon', 0, loadPages())
finally:
    cur.close()
    conn.close()

In [None]:
# 이메일 보내기
import smtplib
from email.mime.text import MIMEText

# Build a plain-text message and set its headers.
msg = MIMEText('The body of the email is here')

msg['Subject'] = 'An Email Alert'
msg['From'] = 'ryan@pythonscraping.com'
msg['To'] = 'webmaster@pythonscraping.com'

# Hand the message to the SMTP server on localhost, then end the session.
s = smtplib.SMTP('localhost')
s.send_message(msg)
s.quit()

In [None]:
# 한 시간에 한 번씩 https://isitchristmas.com 웹사이트를 체크. NO 외의 것이면 크리스마스 이메일이 올 것.
import smtplib
from email.mime.text import MIMEText
from bs4 import BeautifulSoup
from urllib.request import urlopen
import time

def sendMail(subject, body):
    """Send a plain-text email with *subject* and *body* via localhost SMTP."""
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] ='christmas_alerts@pythonscraping.com'
    msg['To'] = 'ryan@pythonscraping.com'

    # The with-block ends the SMTP session even if send_message raises, so the
    # connection is not left open on failure.
    with smtplib.SMTP('localhost') as s:
        s.send_message(msg)

bs = BeautifulSoup(urlopen('https://isitchristmas.com/'), 'html.parser')
# Poll once an hour until the answer element's title is no longer 'NO'.
while(bs.find('a', {'id':'answer'}).attrs['title'] == 'NO'):
    print('It is not Christmas yet.')
    time.sleep(3600)
    bs = BeautifulSoup(urlopen('https://isitchristmas.com/'), 'html.parser')
# The message now cites the site that is actually polled above; it previously
# named a different domain (itischristmas.com).
sendMail('It\'s Christmas!',
         'According to https://isitchristmas.com, it is Christmas!')

In [None]:
# 문서 인코딩 - txt
from urllib.request import urlopen
# Fetch a plain-text document and print its body exactly as read.
textPage = urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1.txt')
print(textPage.read())

In [None]:
# 문서 인코딩 - txt, encoding
from urllib.request import urlopen
# textPage = urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1-ru.txt')
textPage = urlopen(
             'http://www.pythonscraping.com/pages/warandpeace/chapter1-ru.txt')
# Decode the bytes as UTF-8 so the Cyrillic text prints as characters.
print(str(textPage.read(), 'utf-8')) # printed in Cyrillic

In [None]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
bs = BeautifulSoup(html, "html.parser")
content = bs.find("div", {"id":"mw-content-text"}).get_text()
# Encode the text to UTF-8 bytes and decode it straight back; the text is
# unchanged, the round trip only demonstrates explicit encoding handling.
content = bytes(content, "UTF-8")
content = content.decode("UTF-8")
print(content)

In [None]:
# 문서 인코딩 - csv(몬티 파이튼 앨범 목록)
from urllib.request import urlopen
from io import StringIO
import csv

# Download the CSV, drop any non-ASCII bytes, and wrap the text in a
# file-like object so csv.reader can consume it without a file on disk.
data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode('ascii', 'ignore')
dataFile = StringIO(data)
csvReader = csv.reader(dataFile) # returns an iterable; each row is a list

# for row in csvReader:
#     print(row)
#     print("The album \""+row[0]+"\" was released in "+str(row[1]))
# NOTE(review): no row is skipped, so a header row (if the file has one) is
# printed like an album - confirm against the file.
for row in csvReader:
    print("The album \""+row[0]+"\" was released in "+str(row[1]))

In [None]:
# DictReader: csvReader 보다 오래 걸리지만 첫 번째 행 무시나 처리 등을 간편하게 해줌.
from urllib.request import urlopen
from io import StringIO
import csv

data = urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv").read().decode('ascii', 'ignore')
dataFile = StringIO(data)
dictReader = csv.DictReader(dataFile)

# Print the field names the reader took from the file.
print(dictReader.fieldnames)

# Print each row as yielded by the reader.
for row in dictReader:
    print(row)

In [None]:
# 문서 인코딩 - PDF, 로컬 파일 객체로 바꿔 문자열로 읽음.
# !pip install pdfminer3k
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open

def readPDF(pdfFile):
    """Return the text extracted from the PDF file object *pdfFile*."""
    rsrcmgr = PDFResourceManager()
    # The with-block closes the output buffer on every exit path.
    with StringIO() as retstr:
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the converter even when processing fails.
            device.close()
        # Read the accumulated text before the buffer is closed.
        return retstr.getvalue()

# Open the remote PDF as a file-like object, extract its text, then close it.
pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")
outputString = readPDF(pdfFile)
print(outputString)
pdfFile.close()

In [None]:
# 문서 인코딩 - DOCX( XML 읽기 )
# 원격 워드 문서를 바이너리 파일 객체로 읽고 압축을 풀어 XML을 읽는다.
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
from bs4 import BeautifulSoup

# Read the remote .docx into memory and open it as a zip archive.
wordFile = urlopen('http://pythonscraping.com/pages/AWordDocument.docx').read()
wordFile = BytesIO(wordFile)
document = ZipFile(wordFile)
# Read the word/document.xml member of the archive.
xml_content = document.read('word/document.xml')
# print(xml_content.decode('utf-8')) # dump the raw XML
wordObj = BeautifulSoup(xml_content.decode('utf-8'), 'xml')
# Collect every <w:t> element and print its text.
textStrings = wordObj.find_all('w:t')

for textElem in textStrings:
    print(textElem.text)

In [None]:
# Beautiful Soup의 내비게이션 기능을 활용
textStrings = wordObj.find_all('w:t')

for textElem in textStrings:
    # Look two levels up from the text element for a <w:pStyle> tag.
    style = textElem.parent.parent.find('w:pStyle')
    # Text whose style value is 'Title' is labelled as the title.
    if style is not None and style['w:val'] == 'Title':
        print('Title is: {}'.format(textElem.text))
    else:
        print(textElem.text)

**n-그램**: 텍스트나 연설에서 연속으로 나타난 단어 n개.
- 자연어를 분석할 때 공통적으로 나타나는 n-그램
- 자주 함께 쓰이는 단어 집합

In [None]:
# 정확한 형태를 갖춘 n-그램을 찾기, 위키백과 항목에서 찾은 2-그램 목록 반환
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string

# Splits a sentence into words, strips punctuation and whitespace, and drops
# single-character words other than "I" and "a".
def cleanSentence(sentence):
    """Return the cleaned words of *sentence* as a list."""
    strippable = string.punctuation + string.whitespace
    kept = []
    for token in sentence.split(' '):
        token = token.strip(strippable)
        if len(token) > 1 or token.lower() in ('a', 'i'):
            kept.append(token)
    return kept

# Removes newlines and citation markers, then splits the text into "sentences"
# on a period followed by a space.
def cleanInput(content):
    """Return *content* as a list of sentences, each a list of cleaned words.

    The text is upper-cased, newlines and citation markers such as [12] are
    replaced by spaces, and non-ASCII characters are dropped.
    """
    content = content.upper()
    # \[\d+\] matches a whole citation marker. The previous pattern
    # '[[\d+\]]' was a single character class, so it blanked out every digit,
    # bracket and plus sign in the text.
    content = re.sub(r'\n|\[\d+\]', ' ', content)
    content = bytes(content, "UTF-8")
    content = content.decode("ascii", "ignore")
    sentences = content.split('. ')
    return [cleanSentence(sentence) for sentence in sentences]

# Core n-gram builder; getNgrams calls it once per sentence so that no n-gram
# spans two sentences.
def getNgramsFromSentence(content, n):
    """Return every run of *n* consecutive items of *content*, in order."""
    windowCount = len(content) - n + 1
    return [content[start:start+n] for start in range(windowCount)]

# Main entry point.
def getNgrams(content, n):
    """Return the n-grams of every sentence in *content* as one flat list."""
    ngrams = []
    for sentence in cleanInput(content):
        ngrams += getNgramsFromSentence(sentence, n)
    return ngrams
        
# Take the text of the article's content div and list its 2-grams.
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
ngrams = getNgrams(content, 2)
print(ngrams)
print('2-grams count is: '+str(len(ngrams)))

In [None]:
# Same page as before, printing only the number of 2-grams found.
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
print(len(getNgrams(content, 2)))

In [None]:
# 중복 n-그램 제거
from collections import Counter

def getNgrams(content, n):
    """Return a Counter mapping each space-joined n-gram to its frequency."""
    ngrams = Counter()
    for sentence in cleanInput(content):
        # Counter.update counts each item of the iterable. The separate
        # ngrams_list was filled but never read, so it has been removed.
        ngrams.update(' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n))
    return ngrams

print(getNgrams(content, 2))

In [None]:
# 데이터 요약: n-그램을 찾고 정렬
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import Counter

def cleanSentence(sentence):
    """Split *sentence* on spaces and return its cleaned words.

    Punctuation and whitespace are stripped from both ends of each word;
    single-character words other than "a" and "i" (any case) and empty
    strings are dropped.
    """
    trimChars = string.punctuation + string.whitespace
    stripped = (piece.strip(trimChars) for piece in sentence.split(' '))
    return [piece for piece in stripped
            if len(piece) > 1 or piece.lower() in ('a', 'i')]

def cleanInput(content):
    """Return *content* as a list of sentences, each a list of cleaned words.

    The text is upper-cased, newlines become spaces, non-ASCII characters are
    dropped, and sentences are split on a period followed by a space.
    """
    flattened = re.sub('\n', ' ', content.upper())
    asciiOnly = bytes(flattened, 'UTF-8').decode('ascii', 'ignore')
    return [cleanSentence(part) for part in asciiOnly.split('. ')]

def getNgramsFromSentence(content, n):
    """Return every window of *n* consecutive items of *content*, in order."""
    lastStart = len(content) - n
    return [content[start:start+n] for start in range(lastStart + 1)]

def getNgrams(content, n):
    """Return a Counter mapping each space-joined n-gram to its frequency."""
    ngrams = Counter()
    for sentence in cleanInput(content):
        # The unused ngrams_list accumulator has been dropped; the Counter is
        # the only result callers receive.
        ngrams.update(' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n))
    return ngrams


# Download the speech, decode it as UTF-8, and print its 3-gram counts.
content = str(
      urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt').read(),
              'utf-8')
ngrams = getNgrams(content, 3)
print(ngrams)

In [None]:
# 필요없는 단어들 필터링.
# 현대 미국 영어 자료 (http://corpus.byu.edu/coca/)
# Built once at import time instead of on every call; a frozenset gives
# constant-time membership tests. Repeated words in the literal are harmless.
_COMMON_WORDS = frozenset({
    'THE', 'BE', 'AND', 'OF', 'A', 'IN', 'TO', 'HAVE', 'IT', 'I', 'THAT', 'FOR', 'YOU', 'HE', 'WITH', 'ON', 'DO',
    'SAY', 'THIS', 'THEY', 'IS', 'AN', 'AT', 'BUT', 'WE', 'HIS', 'FROM', 'THAT', 'NOT', 'BY', 'SHE', 'OR', 'AS',
    'WHAT', 'GO', 'THEIR', 'CAN', 'WHO', 'GET', 'IF', 'WOULD', 'HER', 'ALL', 'MY', 'MAKE', 'ABOUT', 'KNOW', 'WILL',
    'AS', 'UP', 'ONE', 'TIME', 'HAS', 'BEEN', 'THERE', 'YEAR', 'SO', 'THINK', 'WHEN', 'WHICH', 'THEM', 'SOME', 'ME',
    'PEOPLE', 'TAKE', 'OUT', 'INTO', 'JUST', 'SEE', 'HIM', 'YOUR', 'COME', 'COULD', 'NOW', 'THAN', 'LIKE', 'OTHER',
    'HOW', 'THEN', 'ITS', 'OUR', 'TWO', 'MORE', 'THESE', 'WANT', 'WAY', 'LOOK', 'FIRST', 'ALSO', 'NEW', 'BECAUSE',
    'DAY', 'MORE', 'USE', 'NO', 'MAN', 'FIND', 'HERE', 'THING', 'GIVE', 'MANY', 'WELL',
})


def isCommon(ngram):
    """Return True if any word of *ngram* is in the common-word list.

    Words are compared exactly as given, and the list is upper-case.
    """
    return any(word in _COMMON_WORDS for word in ngram)

def getNgramsFromSentence(content, n):
    """Return the n-item windows of content that isCommon() does not flag.

    Windows are taken left to right; any window for which isCommon()
    returns a true value is left out of the result.
    """
    windows = (content[start:start + n]
               for start in range(len(content) - n + 1))
    return [window for window in windows if not isCommon(window)]

# Recount the 3-grams of the text; getNgrams() now uses the filtering
# getNgramsFromSentence() defined in this cell.
ngrams = getNgrams(content, 3)
print(ngrams)

In [None]:
# Upper-case the given text (content) and split it into sentences on ". ",
# then look for the n-gram in each sentence in turn.
# The first sentence that contains it is returned with a trailing newline;
# if no sentence does, an empty string is returned.
def getFirstSentenceContaining(ngram, content):
    matches = (candidate + '\n'
               for candidate in content.upper().split(". ")
               if ngram in candidate)
    return next(matches, "")


# Print the first sentence of the text in which each of these phrases occurs.
for phrase in ('EXCLUSIVE METALLIC CURRENCY',
               'EXECUTIVE DEPARTMENT',
               'GENERAL GOVERNMENT',
               'CALLED UPON',
               'CHIEF MAGISTRATE'):
    print(getFirstSentenceContaining(phrase, content))

In [None]:
# 마르코프 모델: 어떤 특정 사건이 다른 특정 사건에 뒤이어, 일정 확률로 일어나는 대규모 무작위 분포를 분석할 때 쓰임.
from urllib.request import urlopen
from random import randint

def wordListSum(wordList):
    """Return the total of all counts stored in wordList.

    wordList maps a word to a number; an empty mapping gives 0.
    """
    # The built-in sum() replaces the manual loop, whose accumulator was
    # itself named `sum` and therefore shadowed that built-in.
    return sum(wordList.values())

def retrieveRandomWord(wordList):
    """Pick one word from wordList at random, weighted by its count.

    A target between 1 and the total count (wordListSum) is drawn, then the
    words are walked in dictionary order until the running total of their
    counts reaches the target. An empty wordList makes randint() raise
    ValueError because the range 1..0 is empty.
    """
    target = randint(1, wordListSum(wordList))
    runningTotal = 0
    for candidate, weight in wordList.items():
        runningTotal += weight
        if runningTotal >= target:
            return candidate

def buildWordDict(text):
    """Build a table of word-to-next-word counts from text.

    Returns a dict of dicts in which wordDict[a][b] is the number of times
    word b directly follows word a. The marks , . ; : count as words.
    """
    # Remove newlines (replaced by spaces) and double quotes.
    text = text.replace('\n', ' ').replace('"', '')

    # Surround punctuation with spaces so that each mark becomes a "word"
    # of its own and is included in the Markov chain.
    for symbol in (',', '.', ';', ':'):
        text = text.replace(symbol, ' {} '.format(symbol))

    # Split on single spaces and discard the empty strings this leaves.
    words = [token for token in text.split(' ') if token != '']

    wordDict = {}
    for previous, current in zip(words, words[1:]):
        # Create the follower table for this word on first sight.
        followers = wordDict.setdefault(previous, {})
        followers[current] = followers.get(current, 0) + 1
    return wordDict

# Download the source text and build the word-transition table from it.
with urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt') as response:
    # Leaving the with-block closes the HTTP response once the body is read.
    text = str(response.read(), 'utf-8')
wordDict = buildWordDict(text)

# Generate a Markov chain of up to `length` words after the seed word
# (the length can be set freely).
length = 100
chain = ['I']
for i in range(0, length):
    followers = wordDict.get(chain[-1])
    if not followers:
        # Dead end: the current word has no recorded successor (it only
        # occurs as the very last word, or the seed is not in the text),
        # so stop here instead of raising KeyError.
        break
    newWord = retrieveRandomWord(followers)
    chain.append(newWord)

print(' '.join(chain))

**너비 우선 탐색**: 방향성 그래프에서 가장 짧은 경로를 찾을 때(여기서는 시작 페이지에서 목표 페이지에 도달하는 링크 체인을 찾는 문제) 가장 좋은 방법.<br>
우선 시작 페이지에서 출발하는 링크를 모두 검색한다. 검색한 링크에 목표 페이지가 없으면 2단계 링크, 즉 <br>
시작 페이지에서 링크된 페이지에서 다시 링크된 페이지를 검색하고, 목표 페이지를 찾을 때까지 단계를 늘려 가며 반복한다.

In [None]:
import pymysql

# Open a connection to the local MySQL server, create a cursor on it and
# switch to the `wikipedia` database used by the helpers below.
# NOTE(review): the credentials are hard-coded here; consider loading them
# from configuration instead.
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='1234', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute('USE wikipedia')

# Helper: take a page ID and fetch that page's URL from the database.
def getUrl(pageId):
    # Query parameters go in a one-element tuple (note the trailing comma);
    # the original `(int(pageId))` was just a parenthesised int.
    cur.execute('SELECT url FROM pages WHERE id = %s', (int(pageId),))
    # Only the first column of the first row is used. If the query matched
    # no row, fetchone() presumably returns None and the subscript raises
    # TypeError; verify against the caller.
    return cur.fetchone()[0]

# Helper: take the integer ID of the current page (fromPageId) and return
# the IDs of every page the current page links to.
def getLinks(fromPageId):
    # Query parameters go in a one-element tuple (note the trailing comma).
    cur.execute('SELECT toPageId FROM links WHERE fromPageId = %s', (int(fromPageId),))
    # With no matching rows fetchall() yields nothing and the result is an
    # empty list, so a separate rowcount check is not needed.
    return [row[0] for row in cur.fetchall()]

# Main function. Starting from the search page, it extends every known path
# by one link at a time until a path reaches the target page.
def searchBreadth(targetPageId, paths=None):
    """Return a shortest link path to targetPageId as a list of page IDs.

    paths is a list of starting paths (each a list of page IDs); when it is
    omitted the search starts from page ID 1. If the target cannot be
    reached, an empty list is returned.
    """
    # None stands in for the default so that no list object is shared
    # between calls.
    if paths is None:
        paths = [[1]]
    # Pages already reached by a path. A page reached again later cannot
    # give a shorter path, so it is not expanded a second time.
    visited = {path[-1] for path in paths}
    # Level-by-level loop instead of one recursive call per level, so a
    # deep or fruitless search cannot exhaust the recursion limit.
    while paths:
        newPaths = []
        for path in paths:
            for link in getLinks(path[-1]):
                if link == targetPageId:
                    return path + [link]
                if link not in visited:
                    visited.add(link)
                    newPaths.append(path + [link])
        paths = newPaths
    # Every reachable page was explored without meeting the target.
    return []
                
# NOTE(review): `nodes` is not used by the rest of this cell.
nodes = getLinks(1)
targetPageId = 500
# Search for a link path to the target page and print the URL of every
# page on that path, in order.
pageIds = searchBreadth(targetPageId)
for pageId in pageIds:
    print(getUrl(pageId))

In [None]:
import requests

# Send the two form fields with an HTTP POST request and print the text of
# the response.
params = {'firstname': 'Jieun', 'lastname': 'Park'}
r = requests.post("http://pythonscraping.com/pages/processing.php", data=params)
print(r.text)

In [None]:
!pip uninstall selenium

In [None]:
# JSON 파싱
from urllib.request import urlopen
import json

def getCountry(ipAddress):
    """Return the 'country_code' field of the ipstack reply for ipAddress.

    The reply body is decoded as UTF-8 and parsed as JSON; None is returned
    when the reply has no 'country_code' key.
    """
    url = 'http://api.ipstack.com/' + ipAddress
    # Query parameters are separated by a plain '&'. The HTML entity
    # '&amp;' had been pasted in here, which turned the second parameter's
    # name into 'amp;format'.
    # NOTE(review): ACCESS_KEY looks like a placeholder for a real API key.
    url += '?access_key=ACCESS_KEY&format=1'
    # The with-block closes the HTTP response after the body is read.
    with urlopen(url) as reply:
        response = reply.read().decode('utf-8')
    responseJson = json.loads(response)
    return responseJson.get('country_code')

print(getCountry('50.78.253.58')) # Prints the country code for IP address 50.78.253.58 (US)

In [None]:
# 위키백과를 크롤링해 개정 내역 페이지를 찾아보고 그 페이지에서 IP 주소를 찾아내는 스크립트
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
import datetime
import random
import re

random.seed(datetime.datetime.now().timestamp())
def getLinks(articleUrl):
    """Return the article links inside the bodyContent div of a wiki page.

    articleUrl is a path such as '/wiki/Some_Article' that is appended to
    http://en.wikipedia.org. The result is the list of <a> tags whose href
    starts with /wiki/ and contains no colon. When the page has no
    bodyContent div an empty list is returned.
    """
    # The with-block closes the HTTP response once the page is parsed.
    with urlopen('http://en.wikipedia.org{}'.format(articleUrl)) as html:
        bs = BeautifulSoup(html, 'html.parser')
    body = bs.find('div', {'id':'bodyContent'})
    if body is None:
        # No article body on this page, so there is nothing to follow.
        return []
    return body.find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))

def getHistoryIPs(pageUrl):
    """Return the set of anonymous-editor link texts on a page's history.

    pageUrl is an article path such as '/wiki/Title'; the '/wiki/' part is
    removed and the rest is used as the title of the revision history URL.
    The history URL is printed before it is fetched.
    """
    # Format of revision history pages is:
    # http://en.wikipedia.org/w/index.php?title=Title_in_URL&action=history
    pageUrl = pageUrl.replace('/wiki/', '')
    historyUrl = 'http://en.wikipedia.org/w/index.php?title={}&action=history'.format(pageUrl)
    print('history url is: {}'.format(historyUrl))
    # The with-block closes the HTTP response once the page is parsed.
    with urlopen(historyUrl) as html:
        bs = BeautifulSoup(html, 'html.parser')
    # Find only the links with class "mw-anonuserlink", which hold IP
    # addresses instead of usernames.
    ipAddresses = bs.find_all('a', {'class':'mw-anonuserlink'})
    return {ipAddress.get_text() for ipAddress in ipAddresses}

# Starting from the Python article: print the history IPs of every linked
# article, then move on to one randomly chosen link and repeat for as long
# as the current page has links.
links = getLinks('/wiki/Python_(programming_language)')

while links:
    for link in links:
        print('-' * 20)
        for historyIP in getHistoryIPs(link.attrs['href']):
            print(historyIP)

    nextIndex = random.randint(0, len(links) - 1)
    links = getLinks(links[nextIndex].attrs['href'])

In [None]:
def getCountry(ipAddress):
    """Return the 'country_code' reported by freegeoip.net for ipAddress.

    Returns None when the request fails or when the reply has no
    'country_code' key.
    """
    # Imported here so the function does not depend on another cell having
    # imported the exception class.
    from urllib.error import URLError
    try:
        # The with-block closes the HTTP response after the body is read.
        with urlopen(
                'http://freegeoip.net/json/{}'.format(ipAddress)) as reply:
            response = reply.read().decode('utf-8')
    except URLError:
        # URLError is the base class of HTTPError, so this covers HTTP
        # error statuses as well as failures to reach the server.
        return None
    responseJson = json.loads(response)
    return responseJson.get('country_code')
    
# Same crawl as before, but each history IP is resolved to a country and
# printed only when a country code was obtained.
links = getLinks('/wiki/Python_(programming_language)')

while links:
    for link in links:
        print('-' * 20)
        for historyIP in getHistoryIPs(link.attrs["href"]):
            country = getCountry(historyIP)
            if country is None:
                continue
            print('{} is from {}'.format(historyIP, country))

    nextIndex = random.randint(0, len(links) - 1)
    links = getLinks(links[nextIndex].attrs['href'])

In [None]:
# unittest: 단위 테스트 모듈
import unittest

class TestAddition(unittest.TestCase):
    """Minimal unittest example: one arithmetic check plus the two hooks."""

    def setUp(self):
        # Hook that unittest runs before each test method.
        print('Setting up the test')

    def tearDown(self):
        # Hook that unittest runs after each test method.
        print('Tearing down the test')

    def test_twoPlusTwo(self):
        expected, total = 4, 2 + 2
        print(total)
        self.assertEqual(expected, total)

if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)

In [None]:
# 웹사이트 프론트엔드 테스트
from urllib.request import urlopen
from bs4 import BeautifulSoup
import unittest

class TestWikipedia(unittest.TestCase):
    """Front-end checks on the Wikipedia 'Monty Python' page."""

    # Parsed page shared by all tests; filled in once by setUpClass().
    bs = None

    @classmethod
    def setUpClass(cls):
        # unittest documents setUpClass() as a class method that receives
        # the class as its only argument, hence the decorator and `cls`.
        url = 'http://en.wikipedia.org/wiki/Monty_Python'
        TestWikipedia.bs = BeautifulSoup(urlopen(url), 'html.parser')

    def test_titleText(self):
        # Check that the page title (first h1) is "Monty Python".
        pageTitle = TestWikipedia.bs.find('h1').get_text()
        self.assertEqual('Monty Python', pageTitle)

    def test_contentExists(self):
        # Check that the page has a content div.
        content = TestWikipedia.bs.find('div',{'id':'mw-content-text'})
        self.assertIsNotNone(content)


if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)
    %reset

In [None]:
# 웹사이트 프론트엔드 테스트, 반복 실행
from urllib.request import urlopen
from bs4 import BeautifulSoup
import unittest
import re
import random
from urllib.parse import unquote

class TestWikipedia(unittest.TestCase):
    """Front-end test that walks from page to page checking each one."""

    def test_PageProperties(self):
        self.url = 'http://en.wikipedia.org/wiki/Monty_Python'
        # Test the first 10 pages we encounter. range(10) gives the ten
        # iterations the comment promises; range(1, 10) gave only nine.
        for i in range(10):
            self.bs = BeautifulSoup(urlopen(self.url), 'html.parser')
            titles = self.titleMatchesURL()
            self.assertEqual(titles[0], titles[1])
            self.assertTrue(self.contentExists())
            self.url = self.getNextLink()
        print('Done!')

    def titleMatchesURL(self):
        """Return [page title, title taken from the URL], both lower-cased.

        The URL title is everything after '/wiki/' in self.url, with
        underscores turned into spaces and percent-escapes decoded.
        """
        pageTitle = self.bs.find('h1').get_text()
        urlTitle = self.url[(self.url.index('/wiki/')+6):]
        urlTitle = urlTitle.replace('_', ' ')
        urlTitle = unquote(urlTitle)
        return [pageTitle.lower(), urlTitle.lower()]

    def contentExists(self):
        """Return True if the page has a div with id 'mw-content-text'."""
        content = self.bs.find('div',{'id':'mw-content-text'})
        return content is not None

    def getNextLink(self):
        """Return the URL of a randomly chosen article link on the page."""
        # Returns random link on page, using technique from Chapter 3
        links = self.bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
        randomLink = random.SystemRandom().choice(links)
        return 'https://wikipedia.org{}'.format(randomLink.attrs['href'])
    

if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)
    %reset

### 병렬 웹 크롤링

In [None]:
# 멀티 스레드 크롤링
import _thread
import time

def print_time(threadName, delay, iterations):
    """Print one line every `delay` seconds, `iterations` times.

    Each line is threadName when it is truthy; otherwise it is the number
    of whole seconds elapsed since the function started.
    """
    startedAt = int(time.time())
    for _ in range(iterations):
        time.sleep(delay)
        elapsed = str(int(time.time()) - startedAt)
        print(threadName or elapsed)

# Start three printer threads: a seconds counter, "Fizz" every 3 seconds
# and "Buzz" every 5 seconds.
try:
    _thread.start_new_thread(print_time, (None, 1, 100))
    _thread.start_new_thread(print_time, ("Fizz", 3, 33))
    _thread.start_new_thread(print_time, ("Buzz", 5, 20))
except _thread.error:
    # Only thread-specific failures are reported here; a bare `except:`
    # would also have hidden unrelated errors such as KeyboardInterrupt.
    print ("Error: unable to start thread")

# Keep the main thread alive while the workers run. Sleeping in the loop
# avoids occupying a CPU core the way `while 1: pass` does.
while 1:
    time.sleep(1)

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random

import _thread
import time

visited = [] # Paths already scraped; used to avoid revisiting them
def getLinks(thread_name, bsObj):
    """Return the article links on the page whose path is not in visited.

    bsObj is the parsed page. The links are the <a> tags inside the
    bodyContent div whose href starts with /wiki/ and contains no colon.
    thread_name is only used in the progress message.
    """
    print('Getting links in {}'.format(thread_name))
    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
    # `visited` holds path strings, so each tag's href is what has to be
    # looked up; comparing the tag object itself never matched anything.
    return [link for link in links if link.attrs['href'] not in visited]

# Function run by each thread.
def scrape_article(thread_name, path):
    """Scrape the article at path, then keep following random links.

    Each article's path is added to visited, its title is printed, and one
    link returned by getLinks() is picked at random as the next article.
    The function returns when a page yields no links.
    """
    # A loop replaces the original self-call per article, which added one
    # stack frame per page and would eventually hit the recursion limit.
    while True:
        visited.append(path)
        html = urlopen('http://en.wikipedia.org{}'.format(path))
        # Pause between requests.
        time.sleep(5)
        bsObj = BeautifulSoup(html, 'html.parser')
        title = bsObj.find('h1').get_text()
        print('Scraping {} in thread {}'.format(title, thread_name))
        links = getLinks(thread_name, bsObj)
        if len(links) == 0:
            return
        path = links[random.randint(0, len(links)-1)].attrs['href']
        print(path)


# Create two threads, each starting from a different article.
try:
    _thread.start_new_thread(scrape_article, ('Thread 1', '/wiki/Kevin_Bacon',))
    _thread.start_new_thread(scrape_article, ('Thread 2', '/wiki/Monty_Python',))
except _thread.error:
    # Only thread-specific failures are reported here; a bare `except:`
    # would also have hidden unrelated errors such as KeyboardInterrupt.
    print ('Error: unable to start threads')

# Keep the main thread alive while the workers run. Sleeping in the loop
# avoids occupying a CPU core the way `while 1: pass` does.
while 1:
    time.sleep(1)

In [None]:
# queue(큐): 선입선출 or 후입선출 방식으로 작동하는 리스트 비슷 객체.
# queue.put(): 어떤 스레드로부터 메시지를 받음, queue.get(): 호출하는 어떤 스레드에든 메시지 전달.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import _thread
from queue import Queue
import time
import pymysql


def storage(queue):
    """Take articles off queue forever and store the new ones in MySQL.

    Each queue item is a dict with "title" and "path" keys. An article is
    inserted into the pages table only if no row with the same path exists.
    This function never returns.
    """
    conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd='', db='mysql', charset='utf8')
    cur = conn.cursor()
    cur.execute('USE wiki_threads')
    while 1:
        # get() blocks until an article is available, so there is no need
        # to spin on queue.empty() while the queue is idle.
        article = queue.get()
        # Query parameters go in a tuple (note the trailing comma).
        cur.execute('SELECT * FROM pages WHERE path = %s', (article["path"],))
        if cur.rowcount == 0:
            print("Storing article {}".format(article["title"]))
            cur.execute('INSERT INTO pages (title, path) VALUES (%s, %s)', (article["title"], article["path"]))
            conn.commit()
        else:
            print("Article already exists: {}".format(article['title']))

visited = []  # Paths already scraped; used to avoid revisiting them
def getLinks(thread_name, bsObj):
    """Return the article links on the page whose path is not in visited.

    bsObj is the parsed page. The links are the <a> tags inside the
    bodyContent div whose href starts with /wiki/ and contains no colon.
    thread_name is only used in the progress message.
    """
    print('Getting links in {}'.format(thread_name))
    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
    # `visited` holds path strings, so each tag's href is what has to be
    # looked up; comparing the tag object itself never matched anything.
    return [link for link in links if link.attrs['href'] not in visited]

def scrape_article(thread_name, path, queue):
    """Scrape the article at path, queue it for storage, follow a link.

    For each article a {"title", "path"} dict is put on queue, then one
    link returned by getLinks() is picked at random as the next article.
    The function returns when a page yields no links.
    """
    # A loop replaces the original self-call per article, which added one
    # stack frame per page and would eventually hit the recursion limit.
    while True:
        visited.append(path)
        html = urlopen('http://en.wikipedia.org{}'.format(path))
        # Pause between requests.
        time.sleep(5)
        bsObj = BeautifulSoup(html, 'html.parser')
        title = bsObj.find('h1').get_text()
        print('Added {} for storage in thread {}'.format(title, thread_name))
        queue.put({"title":title, "path":path})
        links = getLinks(thread_name, bsObj)
        if len(links) == 0:
            return
        path = links[random.randint(0, len(links)-1)].attrs['href']

# One queue is shared by the two scraper threads (producers) and the
# storage thread (consumer).
queue = Queue()
try:
    _thread.start_new_thread(scrape_article, ('Thread 1', '/wiki/Kevin_Bacon', queue,))
    _thread.start_new_thread(scrape_article, ('Thread 2', '/wiki/Monty_Python', queue,))
    _thread.start_new_thread(storage, (queue,))
except _thread.error:
    # Only thread-specific failures are reported here; a bare `except:`
    # would also have hidden unrelated errors such as KeyboardInterrupt.
    print ('Error: unable to start threads')

# Keep the main thread alive while the workers run. Sleeping in the loop
# avoids occupying a CPU core the way `while 1: pass` does.
while 1:
    time.sleep(1)

In [None]:
# threading: _thread의 모든 기능을 노출하면서 더 깨끗한 상위 인터페이스
import threading
import time

def print_time(threadName, delay, iterations):
    """Print "<elapsed seconds> <threadName>" every `delay` seconds.

    The line is printed `iterations` times; the elapsed time is counted in
    whole seconds from the moment the function started.
    """
    startedAt = int(time.time())
    for _ in range(iterations):
        time.sleep(delay)
        elapsed = int(time.time()) - startedAt
        print ('{} {}'.format(elapsed, threadName))

# Thread.start() returns None, so the Thread objects are created first and
# started afterwards; that way `threads` and `t` refer to real threads
# instead of None.
threads = [
    threading.Thread(target=print_time, args=('Fizz', 3, 33)),
    threading.Thread(target=print_time, args=('Buzz', 5, 20)),
    threading.Thread(target=print_time, args=('Counter', 1, 100)),
]
for t in threads:
    t.start()

In [None]:
import threading

def crawler(url):
    # threading.local() is an easy way to create thread-local data. It is
    # useful when several threads scrape different sites and each keeps
    # its own list of visited pages.
    threadData = threading.local()
    threadData.visited = []
    # Crawling code goes here.
# args must be a tuple. Without the trailing comma it is a plain string,
# and each of its characters would be passed as a separate argument.
threading.Thread(target=crawler, args=('http://brookings.edu',)).start()

In [None]:
# threading.Thread객체 확장해 모니터링에 쓸 메서드 추가
import threading
import time

class Crawler(threading.Thread):
    """Thread that marks itself done and then fails, for monitoring demos."""

    def __init__(self):
        super().__init__()
        # Set to True by run() once the (simulated) crawl has finished.
        self.done = False

    def isDone(self):
        # Report whether the crawl has completed.
        return self.done

    def run(self):
        time.sleep(5)
        self.done = True
        raise Exception('Something bad happened!')

t = Crawler()
t.start()

# Poll the crawler once a second: stop when it reports completion, and
# start a fresh one if the thread ended without finishing.
while True:
    time.sleep(1)
    if t.isDone():
        print('Done')
        break
    # is_alive() is the supported spelling; the camelCase isAlive() alias
    # is no longer available in current Python versions.
    if not t.is_alive():
        t = Crawler()
        t.start()

In [None]:
# 멀티프로세스 크롤링
from multiprocessing import Process
import time

def print_time(threadName, delay, iterations):
    """Print one line every `delay` seconds, `iterations` times.

    Each line is threadName when it is truthy; otherwise it is the number
    of whole seconds elapsed since the function started.
    """
    startedAt = int(time.time())
    for _ in range(iterations):
        time.sleep(delay)
        elapsed = str(int(time.time()) - startedAt)
        print (threadName or elapsed)


# Child processes may re-import this module (the "spawn" start method does
# so), so the code that creates processes is kept behind the __main__
# guard to prevent every child from starting children of its own.
if __name__ == '__main__':
    processes = [
        Process(target=print_time, args=(None, 1, 100)),
        Process(target=print_time, args=("Fizz", 3, 33)),
        Process(target=print_time, args=("Buzz", 5, 20)),
    ]

    for p in processes:
        p.start()

    # Wait for every process to finish before reporting completion.
    for p in processes:
        p.join()

    print("Program complete")

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random

from multiprocessing import Process, Queue
import os
import time
import Thread

# NOTE(review): in this cell the queue is used to pass ONE shared list of
# visited paths between the processes. A process takes the list out, reads
# or updates it, and puts it back, so only one process holds it at a time.
# This is one reading of what the original code intended; confirm.
def getLinks(bsObj, queue):
    """Return the article links on the page whose path is not yet visited.

    bsObj is the parsed page; queue carries the shared visited list.
    """
    print('Getting links in {}'.format(os.getpid()))
    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
    # Borrow the shared list and hand it straight back; the local copy is
    # enough for the filtering below.
    visited = queue.get()
    queue.put(visited)
    # The list holds path strings, so each tag's href is what is looked up.
    return [link for link in links if link.attrs['href'] not in visited]

def scrape_article(path, queue):
    """Scrape the article at path, then keep following random links.

    Each path is recorded in the shared visited list carried by queue.
    The function returns when a page yields no unvisited links.
    """
    # A loop is used instead of one self-call per article so that a long
    # crawl cannot hit the recursion limit.
    while True:
        # Take the shared list, record this path, and put the list back.
        visited = queue.get()
        visited.append(path)
        queue.put(visited)
        print("Process {} list is now: {}".format(os.getpid(), visited))
        html = urlopen('http://en.wikipedia.org{}'.format(path))
        # Pause between requests.
        time.sleep(5)
        bsObj = BeautifulSoup(html, 'html.parser')
        title = bsObj.find('h1').get_text()
        print('Scraping {} in process {}'.format(title, os.getpid()))
        links = getLinks(bsObj, queue)
        if len(links) == 0:
            return
        path = links[random.randint(0, len(links)-1)].attrs['href']
        print(path)

# Child processes may re-import this module, so process creation is kept
# behind the __main__ guard.
if __name__ == '__main__':
    processes = []
    queue = Queue()
    # Seed the queue with the (empty) visited list; without it the first
    # queue.get() in a worker would block forever.
    queue.put([])
    processes.append(Process(target=scrape_article, args=('/wiki/Kevin_Bacon', queue,)))
    processes.append(Process(target=scrape_article, args=('/wiki/Monty_Python', queue,)))

    for p in processes:
        p.start()

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
from multiprocessing import Process, Queue
import os
import time


def task_delegator(taskQueue, foundUrlsQueue):
    """Hand out article paths to the scrapers, each path at most once.

    Two starting paths are put on taskQueue. After that every list of links
    received on foundUrlsQueue is filtered against the paths already handed
    out, and the new ones are put on taskQueue. This function never returns.
    """
    # Initialize with a task for each process.
    # A set is used so that each membership test is a hash lookup.
    visited = {'/wiki/Kevin_Bacon', '/wiki/Monty_Python'}
    taskQueue.put('/wiki/Kevin_Bacon')
    taskQueue.put('/wiki/Monty_Python')

    while 1:
        # get() blocks until a scraper reports links, so there is no need
        # to spin on foundUrlsQueue.empty() while nothing is waiting.
        for link in foundUrlsQueue.get():
            # Checked one link at a time against the up-to-date set, so a
            # link that appears twice in the same batch is queued once.
            if link in visited:
                continue
            # Add the new link to the task queue and mark it as handed out.
            taskQueue.put(link)
            visited.add(link)

def get_links(bsObj):
    """Return the href of every article link in the page's bodyContent div.

    Article links are the <a> tags whose href starts with /wiki/ and
    contains no colon.
    """
    articleHref = re.compile('^(/wiki/)((?!:).)*$')
    body = bsObj.find('div', {'id':'bodyContent'})
    hrefs = []
    for anchor in body.find_all('a', href=articleHref):
        hrefs.append(anchor.attrs['href'])
    return hrefs

def scrape_article(taskQueue, foundUrlsQueue):
    """Worker loop: scrape each path from taskQueue and report its links.

    For every path taken from taskQueue the article is fetched, its title
    is printed, and the list of link paths found on it is put on
    foundUrlsQueue for the delegator. This function never returns.
    """
    while 1:
        # get() blocks until a task is available. This replaces the
        # empty()-then-sleep polling loop, which was racy when several
        # workers share the queue.
        path = taskQueue.get()
        html = urlopen('http://en.wikipedia.org{}'.format(path))
        # Pause between requests.
        time.sleep(5)
        bsObj = BeautifulSoup(html, 'html.parser')
        title = bsObj.find('h1').get_text()
        print('Scraping {} in process {}'.format(title, os.getpid()))
        links = get_links(bsObj)
        # Send these to the delegator for processing.
        foundUrlsQueue.put(links)


# Child processes may re-import this module (the "spawn" start method does
# so), so the code that creates processes is kept behind the __main__
# guard to prevent every child from starting children of its own.
if __name__ == '__main__':
    taskQueue = Queue()
    foundUrlsQueue = Queue()
    # One delegator process and two scraper processes share both queues.
    processes = [
        Process(target=task_delegator, args=(taskQueue, foundUrlsQueue,)),
        Process(target=scrape_article, args=(taskQueue, foundUrlsQueue,)),
        Process(target=scrape_article, args=(taskQueue, foundUrlsQueue,)),
    ]

    for p in processes:
        p.start()