## 웹 페이지 가져오기

In [12]:
from urllib.request import urlopen

# Fetch the page and dump the raw response. The `with` block closes the
# HTTP connection as soon as the body has been read, instead of leaving the
# response open for the lifetime of the kernel.
with urlopen('https://www.daangn.com/hot_articles') as html:
    print(type(html))
    # read() hands back the undecoded response body (bytes).
    print(html.read())

<class 'http.client.HTTPResponse'>


## BeautifulSoup 라이브러리

In [21]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Download the sample page and parse it. The `with` block releases the
# connection once the body has been read.
with urlopen('http://www.pythonscraping.com/pages/page1.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

print(bs)            # the whole parsed document
print(bs.h1)         # the <h1> element, markup included
print(bs.h1.string)  # .string: only the text inside the tag

<html>
<head>
<title>A Useful Page</title>
</head>
<body>
<h1>An Interesting Title</h1>
<div>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</div>
</body>
</html>

<h1>An Interesting Title</h1>
An Interesting Title


## 신뢰할 수 있는 연결과 예외 처리

In [29]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

# Open the page and report which of the three outcomes occurred.
try:
    # NOTE(review): the original kept
    # 'http://www.pythonscraping.com/pages/error.html' as a commented-out
    # alternative URL, presumably to exercise the HTTPError branch — confirm.
    html = urlopen('http://www.pythonscraping.com/pages/page1.html')
except HTTPError as e:
    # The server answered, but with an HTTP error status.
    print(e)
except URLError:
    # HTTPError is handled above, so this branch covers the remaining
    # URLError cases; the exception object itself is not needed.
    print('The server could not be found!')
else:
    print('It worked!')
    # Nothing reads the body in this cell, so release the connection now.
    html.close()

It worked!


## 존재하지 않는 태그 예외 처리

In [34]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

def getTitle(url, tag):
    """Fetch ``url`` and return the first ``tag`` element inside its <body>.

    Args:
        url: Address of the page to download.
        tag: Name of the tag to look for, e.g. ``'h1'``.

    Returns:
        The result of ``body.find(tag)``, or ``None`` when the page cannot
        be opened, the document has no <body>, or no such tag exists.
    """
    # Imported locally so the function does not depend on an earlier cell
    # having brought URLError into scope.
    from urllib.error import URLError

    try:
        # The context manager closes the connection after the body is read.
        with urlopen(url) as response:
            markup = response.read()
    except URLError:
        # URLError is the base class of HTTPError, so this covers both HTTP
        # error statuses and targets that cannot be reached at all.
        return None

    try:
        return BeautifulSoup(markup, 'html.parser').body.find(tag)
    except AttributeError:
        # .body is None when the document has no <body> element.
        return None

# Look up a tag and report whether getTitle() found it.
tag = 'h2'
value = getTitle('http://www.pythonscraping.com/pages/page1.html', tag)

# Identity comparison is the idiomatic None check and cannot be affected by
# a custom __eq__ on the returned object.
if value is None:
    # NOTE(review): "coud" is a typo for "could"; the message is left as-is
    # so it keeps matching the recorded cell output.
    print(f'{tag} coud not be found')
else:
    print(value)

h2 coud not be found


## 멜론 웹사이트 접근 #1

In [42]:
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup

# Sample code 1
# NOTE(review): the original note records "urllib.error.HTTPError: HTTP Error
# 406: Not Acceptable"; presumably that is what a request without the
# User-Agent header below gets back — confirm.

melon_url = 'https://www.melon.com/chart/index.htm'
# Build the HTTP request explicitly with Request() so a User-Agent header
# can be attached.
urlrequest = Request(melon_url, headers={'User-Agent': 'Mozilla/5.0'})

# The `with` block closes the connection once the body has been read.
with urlopen(urlrequest) as html:
    soup = BeautifulSoup(html.read().decode('utf-8'), 'html.parser')

print(soup)

<!DOCTYPE html>

<html lang="ko">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<title>멜론차트&gt;TOP100&gt;멜론</title>
<meta content="음악서비스, 멜론차트, 멜론TOP100, 최신음악, 인기가요, 뮤직비디오, 앨범, 플레이어, 스트리밍, 다운로드, 아티스트플러스, 아티스트채널" name="keywords"/>
<meta content="No.1 뮤직플랫폼 멜론! 최신 트렌드부터 나를 아는 똑똑한 음악추천까지!" name="description"/>
<meta content="ee85ff6db1fa8f2226bcb671ecb2bcdbcffb6f8b" name="naver-site-verification"/>
<meta content="q4tbTQhmxa4La3OdNLsNOCxrJ_WV6lUlBFrFW4-HqQc" name="google-site-verification"/>
<meta content="357952407588971" property="fb:app_id"/>
<meta content="Melon" property="og:title"/>
<meta content="https://cdnimg.melon.co.kr/resource/image/web/common/logo_melon142x99.png" property="og:image"/>
<meta content="음악이 필요한 순간, 멜론" property="og:description"/>
<meta content="http://www.melon.com/chart/index.htm" property="og:url"/>
<meta content="website" property="og:type"/>
<meta content="멜론" proper

## CSS 속성을 이용한 검색

In [None]:
from bs4 import BeautifulSoup

# Parse an inline snippet and inspect its <span>: the tag itself, its
# attribute dict, the value of its class attribute, and its text.
html_text = '<span class="red">Heavens! what a virulent attack!</span>'
soup = BeautifulSoup(html_text, 'html.parser')

object_tag = soup.find('span')
for label, shown in (
    ('object_tag:', object_tag),
    ('attrs:', object_tag.attrs),
    ('value:', object_tag.attrs['class']),
    ('text:', object_tag.text),
):
    print(label, shown)

object_tag: <span class="red">Heavens! what a virulent attack!</span>
attrs: {'class': ['red']}
value: ['red']
text: Heavens! what a virulent attack!


In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# BeautifulSoup reads the response inside the `with` block, which then
# closes the connection.
with urlopen('https://www.pythonscraping.com/pages/warandpeace.html') as html:
    soup = BeautifulSoup(html, 'html.parser')

# Character names: the spans marked up with class "green".
name_list = soup.find_all('span', {'class': 'green'})
for name in name_list:
    print(name.string)

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


## 특정 단어 찾기

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# BeautifulSoup reads the response inside the `with` block, which then
# closes the connection.
with urlopen('https://www.pythonscraping.com/pages/warandpeace.html') as html:
    soup = BeautifulSoup(html, 'html.parser')

# string= selects text nodes by their content; the recorded output lists
# only the lower-case 'the prince' occurrences.
prince_list = soup.find_all(string='the prince')
print(prince_list)
print('the prince count: ', len(prince_list))

['the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince']
the prince count:  7


## 트리 이동: 자식과 자손

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# BeautifulSoup reads the response inside the `with` block, which then
# closes the connection.
with urlopen('https://www.pythonscraping.com/pages/page3.html') as html:
    soup = BeautifulSoup(html, 'html.parser')

table_tag = soup.find('table', {'id': 'giftList'})
print('children 개수: ', len(list(table_tag.children)))

# Number the direct children from 1. The recorded output shows that the
# whitespace between rows is counted as a child as well.
for index, child in enumerate(table_tag.children, start=1):
    print(f"[{index}]: {child}")
    print('-'*30)

children 개수:  13
[1]: 

------------------------------
[2]: <tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
------------------------------
[3]: 

------------------------------
[4]: <tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
------------------------------
[5]: 

------------------------------
[6]: <tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>
------------------------------
[7]: 

-----------------------------

## 트리 이동: 자손

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# BeautifulSoup reads the response inside the `with` block, which then
# closes the connection.
with urlopen('https://www.pythonscraping.com/pages/page3.html') as html:
    soup = BeautifulSoup(html, 'html.parser')

# Descendants: materialised into a list once so the same sequence can be
# both counted and printed.
desc = soup.find('table', {'id': 'giftList'}).descendants
list_desc = list(desc)
print('descendants 개수: ', len(list_desc))

for tag in list_desc:
    print(tag)

descendants 개수:  86


<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<th>
Item Title
</th>

Item Title

<th>
Description
</th>

Description

<th>
Cost
</th>

Cost

<th>
Image
</th>

Image



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
<td>
Vegetable Basket
</td>

Vegetable Basket

<td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td>

This vegetable basket is the perfect gift for your health conscious (or overweight) friends!

<span class="excitingNote">Now with super-colorful bell peppers!</span>
Now with super-colorful bell peppers!


<td>
$15.00
</td>

$15.00

<td>
<img src="..

## 트리 이동: 형제 다루기 #1

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# BeautifulSoup reads the response inside the `with` block, which then
# closes the connection.
with urlopen('https://www.pythonscraping.com/pages/page3.html') as html:
    soup = BeautifulSoup(html, 'html.parser')

# next_siblings attribute: everything that follows the table's first <tr>.
for sibling in soup.find('table', {'id': 'giftList'}).tr.next_siblings:
    print(sibling)
    print('-' * 30)



------------------------------
<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
------------------------------


------------------------------
<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>
------------------------------


------------------------------
<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</spa

## 트리 이동: 형제 다루기 #2

In [None]:
# Print the nodes that come before the gift2 row; the recorded output lists
# the nearest one first.
print('previous_siblings')
gift2_row = soup.find('tr', id='gift2')
for sibling in gift2_row.previous_siblings:
    print(sibling)

previous_siblings


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>




## 트리 이동: 형제 다루기 #3

In [None]:
# Take the node immediately after the gift3 row.
gift3_row = soup.find('tr', id='gift3')
sibling1 = gift3_row.next_sibling
print('sibling1:', sibling1)
# ord(char) returns the character's Unicode code point; the recorded output
# is 10, i.e. a newline.
print(ord(sibling1))

sibling1: 

10


In [None]:
# Step over the node directly after the gift3 row and print the one that
# follows it.
after_gift3 = soup.find('tr', id='gift3').next_sibling
sibling2 = after_gift3.next_sibling
print(sibling2)

<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parrot! <span class="excitingNote">Or maybe he's only resting?</span>
</td><td>
$0.50
</td><td>
<img src="../img/gifts/img4.jpg"/>
</td></tr>


## 트리 이동: 부모 다루기 #1

In [None]:
# Show the element that encloses the document's <style> tag.
style_tag = soup.style
style_parent = style_tag.parent
print(style_parent)

<head>
<style>
img{
	width:75px;
}
table{
	width:50%;
}
td{
	margin:10px;
	padding:10px;
}
.wrapper{
	width:800px;
}
.excitingNote{
	font-style:italic;
	font-weight:bold;
}
</style>
</head>


## 트리 이동: 부모 다루기 #2

In [None]:
# From the image, go up to its enclosing element, step back to that
# element's previous sibling, and print the sibling's text.
img1 = soup.find('img', src='../img/gifts/img1.jpg')
image_cell = img1.parent
text = image_cell.previous_sibling.get_text()
print(text)


$15.00

