In [5]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## 1.1 연결

In [1]:
from urllib.request import urlopen

html = urlopen("http://pythonscraping.com/pages/page1.html")

print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


## 1.2 BeautifulSoup 소개

- 잘못된 HTML을 수정하여 쉽게 탐색할 수 있는 XML 형식의 파이썬 객체로 변환

### 1.2.1 BeautifulSoup 설치

- 참고 사이트 : [Crummy.com](http://www.crummy.com/software/BeautifulSoup/bs4/doc/)

### 1.2.2 BeautifulSoup 실행

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://pythonscraping.com/pages/page1.html")

bsObj = BeautifulSoup(html.read(), "html.parser")

print(bsObj)

<html>
<head>
<title>A Useful Page</title>
</head>
<body>
<h1>An Interesting Title</h1>
<div>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</div>
</body>
</html>



In [4]:
print(bsObj.h1)

<h1>An Interesting Title</h1>


In [6]:
print(bsObj.html.body.h1)
print(bsObj.html.h1)
print(bsObj.body.h1)

<h1>An Interesting Title</h1>
<h1>An Interesting Title</h1>
<h1>An Interesting Title</h1>


### 1.2.3 신뢰할 수 있는 연결

In [7]:
from urllib.request import urlopen
from urllib.request import HTTPError
from bs4 import BeautifulSoup

try:
    html = urlopen("http://pythonscraping.com/pages/page1.html")
except HTTPError as e:
    print(e)
    # null을 반환하거나,break 문을 실행하거나, 기타 다른 방법을 사용
else:
    # 프로그램을 계속 실행
    # except 절에서 return이나 break를 사용했다면 else 절은 필요 없다.
    bsObj = BeautifulSoup(html.read(), "html.parser")
    print(bsObj)

<html>
<head>
<title>A Useful Page</title>
</head>
<body>
<h1>An Interesting Title</h1>
<div>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</div>
</body>
</html>



In [10]:
print(bsObj.nonExistentTag)
print(bsObj.find("nonExistent"))

None
None


  tag_name, tag_name))


In [11]:
print(bsObj.find("nonExistent").someTag)

AttributeError: 'NoneType' object has no attribute 'someTag'

In [12]:
try:
    badContent = bsObj.find("nonExistent").anotherTag
except AttributeError as e:
    print("Tag was not found")
else:
    if badContent == None:
        print("Tag was not found")
    else:
        print(badContent)

Tag was not found


In [13]:
from urllib.request import urlopen
from urllib.request import HTTPError
from bs4 import BeautifulSoup

def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None

    try:
        bsObj = BeautifulSoup(html.read(), "html.parser")
        title = bsObj.body.h1
    except AttributeError as e:
        return None
    
    return title

In [14]:
title = getTitle("http://pythonscraping.com/pages/page1.html")

if title == None:
    print("Tag was not found")
else:
    print(title)

<h1>An Interesting Title</h1>
