### 크롤링 수업 3

In [2]:
# 모듈 로딩
import pandas as pd 
import numpy as np
import requests
from bs4 import BeautifulSoup

In [1]:
# Target site
url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'

# Title text is taken from: h1
# Article text is taken from: post-body

In [3]:
# Wrap scraped results in a class so crawled pages are easier to manage.
class Content:
    """Container for one scraped page: its URL, title and body text."""

    def __init__(self, url, title, body) -> None:
        # Store the three fields exactly as supplied by the caller.
        self.url, self.title, self.body = url, title, body

def getPage(url):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    The request is a plain ``requests.get`` with no error handling here, so
    any exception it raises propagates to the caller.
    """
    response = requests.get(url)
    html = response.text
    return BeautifulSoup(html, 'html.parser')

def scrapeBrookings(url):
    """Scrape one Brookings article page into a ``Content`` object.

    The title is the text of the first ``<h1>``; the body is the text of the
    first ``<div class="post-body">``.  If either element is missing,
    ``find`` returns ``None`` and the ``.text`` access raises AttributeError.
    """
    page = getPage(url)
    heading = page.find('h1')
    article = page.find('div', class_='post-body')
    return Content(url, heading.text, article.text)
# Wish-list note: would like to download more pages than just this one.
content = scrapeBrookings(url)
# One print call with a newline separator emits the same three lines.
print(f'Title: {content.title}', f'URL: {content.url}', content.body, sep='\n')

Title: Delivering inclusive urban access: 3 uncomfortable truths
URL: https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/

The past few decades have been filled with a deep optimism about the role of cities and suburbs across the world. These engines of economic growth host a majority of world population, are major drivers of economic innovation, and have created pathways to opportunities for untold amounts of people.







Jeffrey Gutman

					Former Nonresident Fellow, Global Economy and Development										







Adie Tomer

					Senior Fellow - Brookings Metro 

 Twitter
AdieTomer





But all is not well within our so-called Urban Century. Rapid urbanization, rising gentrification, concentrated poverty, and shortages of basic infrastructure have combined to create spatial inequity in cities and suburbs across the globe. The challenges of housing, moving, and employing so many people have led to longer travel tim

In [7]:
def scrapeOreilly(url):
    """Scrape one O'Reilly library page into a ``Content`` object.

    The title is the text of the first ``<h1>``; the body is the text of the
    first ``<p class="t-promo">``.  If either element is missing, ``find``
    returns ``None`` and the ``.text`` access raises AttributeError.
    """
    page = getPage(url)
    heading = page.find('h1')
    promo = page.find('p', class_='t-promo')
    return Content(url, heading.text, promo.text)

# Scrape a sample O'Reilly page and show what was extracted.
content2 = scrapeOreilly('https://www.oreilly.com/library/view/learning-python-5th/9781449355722/')
print(f'Title: {content2.title}', f'URL: {content2.url}', content2.body, sep='\n')

Title: Learning Python, 5th Edition
URL: https://www.oreilly.com/library/view/learning-python-5th/9781449355722/
Read it now on the O’Reilly learning platform with a 10-day free trial.


In [9]:
# TODO: needs fixing (author's note; the intended fix is not recorded here).
def scrapeReuter(url):
    """Scrape one Reuters article page into a ``Content`` object.

    The title is the text of the first ``<h1>``; the body is the text of the
    first ``<p class="Paragraph-paragraph-2Bgue">`` only.  If either element
    is missing, ``find`` returns ``None`` and the ``.text`` access raises
    AttributeError.
    """
    page = getPage(url)
    heading = page.find('h1')
    first_paragraph = page.find('p', class_='Paragraph-paragraph-2Bgue')
    return Content(url, heading.text, first_paragraph.text)


# Scrape a sample Reuters article and show what was extracted.
content3 = scrapeReuter('https://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')
print(f'Title: {content3.title}', f'URL: {content3.url}', content3.body, sep='\n')

Title: EPA chief wants scientists to debate climate on TV
URL: https://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0
WASHINGTON (Reuters) - The U.S. Environmental Protection Agency is in the early stages of launching a debate about climate change that could air on television – challenging scientists to prove the widespread view that global warming is a serious threat, the head of the agency said.


### 웹사이트의 유연성 -> 크롤링 하기 어려움
- 검색을 통한 크롤링
    - 웹페이지의 내부 링크 및 외부 링크를 검색
    - 해당 링크(내부,외부)를 사용하여 사이트 전체를 크롤링
- 검색 방법
    - URL에 검색어를 삽입해서 검색결과를 얻음
    - http://example.com?search='검색어'
    - 링크 목록 확인
    - 결과링크의 속성 저장 (절대경로 -> 외부링크 or 상대경로 -> 내부링크)
#### 단계
1. 검색을 위한 URL: ?search='검색어' 
2. 검색결과 활용, 내부링크 or 외부링크 검색
3. 해당 페이지로 이동
4. h1 정보, body 정보 접근


In [11]:
# 검색 결과에서 href 속성 추출
# 해당 url로 이동
# <h1> 태그 추출
# 기사 전체 내용 추출
class Content:
    """One crawled article, tagged with the search topic it was found under."""

    def __init__(self, topic, url, title, body) -> None:
        # Store every field exactly as supplied by the caller.
        self.topic, self.url = topic, url
        self.title, self.body = title, body

    def print(self):
        """Write the title, the URL and the body to stdout, one per line."""
        report = (f'Title: {self.title}', f'URL: {self.url}', self.body)
        for line in report:
            print(line)

class Website:
    """Plain record describing how to search and scrape one site.

    All constructor arguments are stored unchanged under the same names.
    As used by ``Crawler.search`` in this file:

    - ``searchUrl`` is concatenated with the topic to form the search URL.
    - ``resultListing`` is the selector for each entry on the result page.
    - ``resultUrl`` is the selector, inside an entry, for the element whose
      ``href`` is followed.
    - ``absoluteUrl`` is truthy when that ``href`` can be requested as-is.
    - ``titleTag`` / ``bodyTag`` are selectors applied to the article page.

    ``name`` and ``url`` are presumably the site's display name and base
    URL -- TODO confirm against the site definitions.
    """

    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag) -> None:
        # Site identity.
        self.name, self.url = name, url
        # How to run a search and pick links out of the result page.
        self.searchUrl, self.resultListing = searchUrl, resultListing
        self.resultUrl, self.absoluteUrl = resultUrl, absoluteUrl
        # Where the title and the body live on an article page.
        self.titleTag, self.bodyTag = titleTag, bodyTag

class Crawler:
    """Search a site for a topic and fetch the title/body of every result."""

    def getPage(self, url):
        """Fetch *url* and return it parsed as a BeautifulSoup tree.

        Returns ``None`` when the request raises a ``requests`` exception.
        """
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """Return the text of the first element matching *selector*.

        Returns ``''`` when nothing matches, so callers always get a string
        (consistent with ``getAllBody``).
        """
        matches = pageObj.select(selector)
        if matches:
            return matches[0].text
        return ''

    def getAllBody(self, pageObj, selector):
        """Return the text of every element matching *selector*.

        Each element's text is followed by a newline; the result is ``''``
        when nothing matches.
        """
        matches = pageObj.select(selector)
        if not matches:
            return ''
        return ''.join(node.text + '\n' for node in matches)

    def search(self, topic, site):
        """Run *site*'s search for *topic* and visit every result link.

        For each entry on the result page the link is resolved (prefixed
        with ``site.url`` unless ``site.absoluteUrl`` is truthy), the page is
        fetched, and its title and body are extracted.  The crawl stops,
        after printing a notice, as soon as any page cannot be fetched.
        Entries that contain no link element are skipped.
        """
        print('search + topic: ', site.searchUrl + topic)

        listing = self.getPage(site.searchUrl + topic)
        if listing is None:
            # getPage signals a failed request with None; bail out instead of
            # crashing on ``None.select``.
            print('soup is None')
            return

        for result in listing.select(site.resultListing):
            links = result.select(site.resultUrl)
            if not links:
                # Result entry without a link element: nothing to follow.
                continue
            url = links[0].attrs['href']

            # Relative links must be prefixed with the *site's* base URL;
            # the crawler itself has no ``url`` attribute.
            target = url if site.absoluteUrl else site.url + url
            soup = self.getPage(target)

            if soup is None:
                print('soup is None')
                return

            title = self.safeGet(soup, site.titleTag)
            body = self.getAllBody(soup, site.bodyTag)

            # TODO: reporting is currently disabled; the intended follow-up is
            # if title != '' and body != '':
            #     content = Content(topic, url, title, body)
            #     content.print()

# Create the crawler instance used to run searches.
crawler = Crawler()
# NOTE(review): the assignment below has no right-hand side in this copy of
# the file, so the cell is a syntax error as written -- restore the intended
# value before running.
sitedate = 



[]

In [34]:
# Fetch the Reuters search results for "python" and collect every result link.
url_search = 'https://www.reuters.com/search/news?blob=python'
soup = getPage(url_search)
# Each result title anchor carries a site-relative href, so prefix the site
# root.  A plain list comprehension replaces the pandas Series round trip,
# which added nothing here and depends on how pandas coerces bs4 tags.
href_list = [
    'https://www.reuters.com' + anchor['href']
    for anchor in soup.select('h3.search-result-title > a')
]
href_list

['https://www.reuters.com/article/idUSKCN11S04G',
 'https://www.reuters.com/article/idUSKBN0L31PS20150130',
 'https://www.reuters.com/article/idUSBRE9081CL20130110',
 'https://www.reuters.com/article/idUSBRE9081CL20130109',
 'https://www.reuters.com/article/idUSKBN1OD2CM',
 'https://www.reuters.com/article/idUSBRE9640KE20130705',
 'https://www.reuters.com/article/idUSKCN11W1LT',
 'https://www.reuters.com/article/idUSBREA141OP20140205',
 'https://www.reuters.com/article/idUSKBN1711E3',
 'https://www.reuters.com/article/idUSL5N0J50QB20131120']

In [54]:
import requests
from bs4 import BeautifulSoup
import re

# Collect the distinct links found on the Reuters front page.
url = 'https://www.reuters.com'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')

# Anchors whose data-testid is either "Heading" or "Link".
data_link = soup.find_all('a', attrs={'data-testid': ['Heading', 'Link']})

# Tag.get returns None for an anchor without an href (indexing would raise
# KeyError); dict.fromkeys drops duplicates while keeping first-seen order.
hrefs = [link.get('href') for link in data_link]
link_list = list(dict.fromkeys(href for href in hrefs if href is not None))

for i, href in enumerate(link_list):
    print(f"{i}: {href}")


0: https://www.reuters.com/world/europe/
1: /world/
2: /world/europe/ukraine-russia-what-you-need-know-right-now-2022-07-03/
3: /world/europe/ukraine-says-18-medics-killed-hundreds-facilities-damaged-since-invasion-2022-07-24/
4: /world/europe/zelenskiy-says-ukraine-unbowed-even-russians-expect-defeat-2022-07-24/
5: /world/europe/lavrov-offers-reassurance-over-russian-grain-supplies-cairo-visit-2022-07-24/
6: /world/europe/russia-says-it-hit-military-boat-odesa-port-ukraine-2022-07-24/
7: /world/europe/odesa-strike-shows-it-will-not-be-easy-export-grain-via-ports-ukraine-2022-07-24/
8: /world/europe/russian-investigator-says-wants-new-tribunal-ukraine-2022-07-25/
9: /world/middle-east/ukraine-works-resume-grain-exports-flags-russian-strikes-risk-2022-07-24/
10: /world/asia-pacific/myanmar-junta-execute-four-democracy-activists-state-media-2022-07-25/
11: /world/americas/pope-arrives-canada-tour-penance-indigenous-abuse-2022-07-24/
12: /world/china/chinas-population-expected-start-shrin

In [53]:
# Example: selecting elements with a search pattern.
# Pick out tags whose class (or other attribute) contains a given word.

link_pattern = '^media-story-card__placement-container+'
body_pattern = '^text__text__+'

# NOTE(review): the trailing '+' only repeats the final character of each
# pattern -- confirm whether a plain prefix match was what was intended.
link_regex = re.compile(link_pattern)
body_regex = re.compile(body_pattern)

data_link2 = soup.find_all('div', class_=link_regex)
data_link3 = soup.find_all('p', class_=body_regex)
data_link3

[<p class="text__text__1FZLe text__dark-grey__3Ml43 text__regular__2N1Xr text__extra_small__1Mw6v body__base__22dCE body__extra_small_body__3QTYe media-story-card__description__2icjO" data-testid="Body">Ukraine pressed ahead on Sunday with efforts to restart grain exports from its Black Sea ports under a deal aimed at easing global food shortages but warned deliveries would suffer if a Russian missile strike on Odesa was a sign of more to come.</p>,
 <p class="text__text__1FZLe text__dark-grey__3Ml43 text__regular__2N1Xr text__extra_small__1Mw6v body__base__22dCE body__extra_small_body__3QTYe text-story-card__description__3PNIg" data-testid="Body">Myanmar's military authorities have executed four democracy activists accused of helping carry out "terror acts", state media reported on Monday, marking the first executions conducted in decades in the Southeast Asian country.</p>,
 <p class="text__text__1FZLe text__dark-grey__3Ml43 text__regular__2N1Xr text__extra_small__1Mw6v body__base__2