In [12]:
import requests
from bs4 import BeautifulSoup
class Content:
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

def getPage(url):
    req = requests.get(url)
    return BeautifulSoup(req.text, 'html.parser') # BS의 객체 return

def scrapeBrookings(url):
    bs = getPage(url)
    title = bs.find('h1').text
    body = bs.find('div', {'class': 'post-body'}).text
    return Content(url, title, body)

In [6]:
url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'

content = scrapeBrookings(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

Title: Delivering inclusive urban access: 3 uncomfortable truths
URL: https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/


The past few decades have been filled with a deep optimism about the role of cities and suburbs across the world. These engines of economic growth host a majority of world population, are major drivers of economic innovation, and have created pathways to opportunities for untold amounts of people.







Jeffrey Gutman

					Former Nonresident Fellow, Global Economy and Development										







Adie Tomer

					Senior Fellow - Brookings Metro 

 Twitter
AdieTomer





But all is not well within our so-called Urban Century. Rapid urbanization, rising gentrification, concentrated poverty, and shortages of basic infrastructure have combined to create spatial inequity in cities and suburbs across the globe. The challenges of housing, moving, and employing so many people have led to longer travel ti

In [13]:
getPage(url)

<!DOCTYPE html>

<html lang="en-US">
<head>
<meta charset="utf-8"/><script type="text/javascript">(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"8eaa2e2b30",applicationID:"59653415"};window.NREUM||(NREUM={}),__nr_require=function(t,e,n){function r(n){if(!e[n]){var i=e[n]={exports:{}};t[n][0].call(i.exports,function(e){var i=t[n][1][e];return r(i||e)},i,i.exports)}return e[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var i=0;i<n.length;i++)r(n[i]);return r}({1:[function(t,e,n){function r(){}function i(t,e,n,r){return function(){return s.recordSupportability("API/"+e+"/called"),o(t+e,[u.now()].concat(c(arguments)),n?null:this,r),n?void 0:this}}var o=t("handle"),a=t(9),c=t(10),f=t("ee").get("tracer"),u=t("loader"),s=t(4),d=NREUM;"undefined"==typeof window.newrelic&&(newrelic=d);var p=["setPageViewName","setCustomAttribute","setErrorHandler","finished","addToTrace","inlineHit","addRelease

In [14]:
import time
class Content:
    """
    글/페이지 전체에 사용할 기반 클래스
    """
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body
    def print(self):
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))
        print()

class Website:
    """
    웹사이트 구조에 관한 정보를 저장할 클래스
    """
    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [18]:
class Crawler:
    def getPage(self, url):
        try:
            req = requests.get(url)
            time.sleep(2) # 페이지 요청 후 응답을 기다릴 시간
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """
        BeautifulSoup 객체와 선택자를 받아 콘텐츠 문자열을 추출하는 함수
        """
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        else:
            return ''

    def parse(self, site, url):
        """
        URL을 받아 콘텐츠를 추출함
        """
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                print('-' * 100)
                content.print()

In [19]:
crawler = Crawler()
siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'h1', 'div.content > div.metadata'],
    ['Reuters', 'http://reuters.com', 'h1', 'p.Paragraph-paragraph-2Bgue'],
    ['Brookings', 'http://www.brookings.edu', 'h1', 'div.post-body']
    ]
websites = []
for row in siteData:
    websites.append(Website(row[0], row[1], row[2], row[3]))

crawler.parse(websites[0], 'http://shop.oreilly.com/product/0636920028154.do')
crawler.parse(websites[1], 'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')
crawler.parse(websites[2], 'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')



----------------------------------------------------------------------------------------------------
URL: http://shop.oreilly.com/product/0636920028154.do
TITLE: Learning Python, 5th Edition
BODY:

Learning Python, 5th Edition
by 
Released 
Publisher(s): 
ISBN: None

Read it now on the O’Reilly learning platform with a 10-day free trial.
O’Reilly members get unlimited access to live online training experiences, plus books, videos, and digital content from O’Reilly and nearly 200 trusted publishing partners.


Start your free trial





----------------------------------------------------------------------------------------------------
URL: http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0
TITLE: EPA chief wants scientists to debate climate on TV
BODY:
WASHINGTON (Reuters) - The U.S. Environmental Protection Agency is in the early stages of launching a debate about climate change that could air on television – challenging scientists to prove the widespread view that global 

In [20]:
class Content:
    def __init__(self, topic, url, title, body):
        self.topic = topic
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        print('New article found for topic: {}'.format(self.topic))
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n {}'.format(self.body))

class Website:

    def __init__(self, name, url, searchUrl, resultListing, resultUrl,
                 absoluteUrl, titleTag, bodyTag):

        self.name = name
        self.url = url
        self.searchUrl = searchUrl # URL에 검색어 추가
        self.resultListing = resultListing # 각 결과에 대한 정보 저장
        self.resultUrl = resultUrl # 결과에서 정확한 URL을 추출할 때 사용
        self.absoluteUrl = absoluteUrl # 절대 경로인지, 상대 경로인지 구분
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [23]:
class Crawler:
    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        else:
            return ''

    def getAllBody(self, pageObj, selector):
        # 해당 tag를 가지는 모든 내용을 출력함
        childObj = pageObj.select(selector)
        bodyText = ""
        if childObj is not None:
            for i in range (len(childObj)):
                bodyText = bodyText + childObj[i].get_text() + '\n'
            return bodyText
        else:
            return ''
    def search(self, topic, site):
        # site: Website 객체
        print('searchUrl+topic:', site.searchUrl + topic)

        bs = self.getpage(site.searchUrl + topic)
        searchResults = bs.select(site.resultListing)

        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs['href']
            if(site.absoluteUrl):
                bs = self.getPage(url)
            else:
                bs = self.getPage(self.url + url)
            if bs is None:
                print('Something was wrong with that page or URL. Skipping')
                return
            title = self.safeGet(bs, site.titleTag)
            #body = self.safeGet(bs, site.bodyTag) # 첫 번째 paragraph만 출력
            body = self.getAllBody(bs, site.bodyTag) # 전체 기사 출력

            if title != '' and body != '':
                content = Content(topic, url, title, body)
                content.print()

In [26]:
crawler = Crawler()

siteData1 = [
    ['Reuters', # Website.name
     'http://reuters.com', # Website.url
     'http://www.reuters.com/search/news?blob=', # Website.searchUrl: 검색을 위한 URL
     'div.search-result-content', # Website.resultListing: 검색 결과에 대한 정보
     'h3.search-result-title > a', # Website.resultUrl: 결과에서 URL을 추출할 때 사용할 태그
     False, # Website.absoluteUrl 사용 여부
     'h1', # Website.titleTag
     'p.Paragraph-paragraph-2Bgue'] # Website.bodyTag
     ]

sites = []
for row in siteData1:
    sites.append(Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))

topics = ['python']
for topic in topics:
    print('GETTING INFO ABOUT: ' + topic)
    for targetSite in sites:
        crawler.search(topic, targetSite)

GETTING INFO ABOUT: python
searchUrl+topic: http://www.reuters.com/search/news?blob=python


AttributeError: 'Crawler' object has no attribute 'getpage'

- 1. 검색을 위한 URL
- 2. ?search = '검색어'
- 3. 검색결과 활용. 내부링크 or 외부링크 검색
- 4. 해당 페이지로 이동
- 5. h1 정보, body 정보 추출(크롤링)