### SSL, 한글처리

In [1]:
# urllib 패키지의 request 모듈에서 urlopen() 함수를 가져옴
from urllib.request import urlopen

# Retrieve the raw HTML from the URL.
# read() returns bytes, so print() shows a b'...' literal; non-ASCII text
# (e.g. Korean) would appear as escaped bytes until the data is decoded.
html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


In [2]:
# English-language site: decode the response bytes as UTF-8 and preview
# the first 300 characters of the resulting text.
html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon").read()
print(html.decode("utf-8")[:300])


<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Kevin Bacon - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFo


In [6]:
# TLS context passed to urlopen() for the HTTPS requests below.
# ssl.create_default_context() keeps certificate and hostname verification on.
# The private helper ssl._create_unverified_context() switches both off, which
# lets anyone on the network path tamper with the downloaded pages.
import ssl                      # HTTPS support

context = ssl.create_default_context()

In [10]:
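# Korean page title in the URL. The percent-encoded form of the address is:
# https://ko.wikipedia.org/wiki/%EC%BC%80%EB%B9%88_%EB%B2%A0%EC%9D%B4%EC%BB%A8

# Encoding ERROR: passing the raw Korean title fails because the URL has to be
# percent-encoded first (urllib.parse.quote does this).
# html = urlopen("https://ko.wikipedia.org/wiki/" + "케빈_베이컨", context=context).read()
# print(html[:300]) # first 300 bytes only; Korean text would show as escaped bytes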

In [13]:
# Python's own default string encoding (this is not a browser setting)
import sys
print(sys.getdefaultencoding()) # utf-8
from urllib.parse import quote  # percent-encodes non-ASCII text (e.g. Korean) for use in a URL

utf-8


In [14]:
# https://ko.wikipedia.org/wiki/케빈_베이컨
# quote() percent-encodes the Korean title so it can be placed in the URL.
html = urlopen("https://ko.wikipedia.org/wiki/" + quote("케빈_베이컨"), context=context).read()
print(html[:500]) # first 500 bytes, still undecoded, so Korean text shows as \x.. escapes

b'\n<!DOCTYPE html>\n<html class="client-nojs" lang="ko" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>\xec\xbc\x80\xeb\xb9\x88 \xeb\xb2\xa0\xec\x9d\xb4\xec\xbb\xa8 - \xec\x9c\x84\xed\x82\xa4\xeb\xb0\xb1\xea\xb3\xbc, \xec\x9a\xb0\xeb\xa6\xac \xeb\xaa\xa8\xeb\x91\x90\xec\x9d\x98 \xeb\xb0\xb1\xea\xb3\xbc\xec\x82\xac\xec\xa0\x84</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"ko","wgMonthNames":["","1\xec\x9b\x94","2\xec\x9b\x94","3\xec\x9b\x94","4\xec\x9b\x94","5\xec\x9b\x94","6\xec\x9b\x94","7\xec\x9b\x94","8\xec\x9b\x94","9\xec\x9b\x94","10\xec\x9b\x94","11\xec\x9b\x94","12\xec\x9b\x94"],"wgRequestId":"15c93f3d-2f55-471e-b3ea-55e0'


In [15]:
print('한글 처리한 경우')
# Same request as before, but the bytes are decoded as UTF-8 so the Korean
# text in the response is readable.
html = urlopen(f"https://ko.wikipedia.org/wiki/{quote('케빈_베이컨')}", context=context).read()
print(html.decode("utf-8")[:500])

한글 처리한 경우

<!DOCTYPE html>
<html class="client-nojs" lang="ko" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>케빈 베이컨 - 위키백과, 우리 모두의 백과사전</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"ko","wgMonthNames":["","1월","2월","3월","4월","5월","6월","7월","8월","9월","10월","11월","12월"],"wgRequestId":"15c93f3d-2f55-471e-b3ea-55e0a740966e","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanon


### 웹에 접속하여 태그를 크롤링

In [16]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
bs = BeautifulSoup(html, "html.parser") # parse the response with the html.parser backend

# body > h1
# The four navigation paths below all print the same <h1>; intermediate
# levels can be skipped.
tag = bs.html.body.h1
print(type(tag))           # <class 'bs4.element.Tag'>
print(bs.html.body.h1)     # <h1>An Interesting Title</h1>
print(bs.body.h1)          # <h1>An Interesting Title</h1>
print(bs.html.h1)          # <h1>An Interesting Title</h1>
print(bs.h1)               # <h1>An Interesting Title</h1>
print(bs.html.body.h1.get_text())  # text content only, without the tag markup

<class 'bs4.element.Tag'>
<h1>An Interesting Title</h1>
<h1>An Interesting Title</h1>
<h1>An Interesting Title</h1>
<h1>An Interesting Title</h1>
An Interesting Title


In [17]:
from urllib.error import HTTPError

def get(url):
    """Fetch ``url`` and return the text of the first ``<h1>`` inside ``<body>``.

    Args:
        url: Address of the page to download.

    Returns:
        The heading text, or ``None`` when the server answers with an HTTP
        error or the page has no ``<body>`` / ``<h1>`` element.

    The closing message is printed in every case, including when an
    exception other than ``HTTPError`` propagates to the caller.
    """
    try:
        # The context manager closes the connection once the body is read.
        with urlopen(url) as response:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            bs = BeautifulSoup(response.read(), 'html.parser')
    except HTTPError as e:
        # The requested page does not exist (or another HTTP error status).
        print(e)
        return None
    else:
        # No exception: look the heading up defensively, because a missing
        # element is returned as None and would break a chained attribute access.
        body = bs.body
        heading = body.h1 if body is not None else None
        if heading is None:
            return None
        return heading.get_text()
    finally:
        print('처리를 종료합니다.')

In [26]:
# title = get('https://www.naver.com') ??
title = get('http://www.pythonscraping.com/exercises/exercise1.html')

# get() signals "no title" with None; test identity rather than equality.
if title is None:
    print('요청 페이지의 타이틀이 존재하지 않습니다.')
else:
    print(title)

처리를 종료합니다.
An Interesting Title


In [30]:
from urllib.error import HTTPError

def getbs(url):
    """Download ``url`` and return it parsed as a ``BeautifulSoup`` document.

    Args:
        url: Address of the page to download.

    Returns:
        The parsed document, or ``None`` when the server answers with an
        HTTP error (the error is printed).

    The closing message is printed in every case, including when an
    exception other than ``HTTPError`` propagates to the caller.
    """
    try:
        # Read the whole body inside the context manager so the connection
        # is closed as soon as the bytes are in memory.
        with urlopen(url) as response:
            bs = BeautifulSoup(response.read(), 'html.parser')
    except HTTPError as e:
        # The requested page does not exist (or another HTTP error status).
        print(e)
        return None
    else:
        # No exception was raised.
        return bs
    finally:
        print('처리를 종료합니다.')
# bs = getbs('http://www.pythonscraping.com/exercises/exercise1.html')
# Request a page that does not exist to exercise the error path.
bs = getbs('http://www.pythonscraping.com/exercises/exercise100.html')
# Identity test: `== None` would go through the parsed document's own
# equality operator instead of simply checking for the None sentinel.
if bs is None:
    print('요청 페이지가 존재하지 않습니다.')
else:
    print(bs)

HTTP Error 404: Not Found
처리를 종료합니다.
요청 페이지가 존재하지 않습니다.


In [51]:
# http://www.pythonscraping.com/pages/warandpeace.html
bs = getbs('http://www.pythonscraping.com/pages/warandpeace.html')
print(type(bs)) # <class 'bs4.BeautifulSoup'>
# Attribute filters are given as a dict: this matches every <span class="green">.
# find_all is the current name of the method; findAll is its deprecated alias.
# select() expects a CSS selector instead (e.g. bs.select('span.green'));
# calling bs.select('span', {'class':'green'}) did not find the tags correctly.
tags = bs.find_all('span', {'class':'green'})
# Show only the first three matches.
for tag in tags[:3]:
    print(tag.get_text())
    print('---------------------------------------------------')

처리를 종료합니다.
<class 'bs4.BeautifulSoup'>
Anna
Pavlovna Scherer
---------------------------------------------------
Empress Marya
Fedorovna
---------------------------------------------------
Prince Vasili Kuragin
---------------------------------------------------


In [57]:
# Look up a single tag by its id attribute
bs = getbs('http://en.wikipedia.org/wiki/Kevin_Bacon')

item = bs.find(id="firstHeading") # <h1 id="firstHeading" class="firstHeading" lang="en">Kevin Bacon</h1>
print(type(item))
print(len(item))  # 1
print(item)
print(item.get_text()) # text content of the tag
print(item.string) # text content of the tag

처리를 종료합니다.
<class 'bs4.element.Tag'>
1
<h1 class="firstHeading" id="firstHeading" lang="en">Kevin Bacon</h1>
Kevin Bacon
Kevin Bacon


In [72]:
# http://www.pythonscraping.com/pages/page3.html
bs = getbs('http://www.pythonscraping.com/pages/page3.html')
# Find the <table> whose id attribute is "giftList" and take its direct
# children; .children is an iterator rather than a list (see the printed type).
tags_iter = bs.find('table', {'id': 'giftList'}).children # Jupyter tip: Shift+Tab shows the function's help
print(type(tags_iter))
print(tags_iter)

처리를 종료합니다.
<class 'list_iterator'>
<list_iterator object at 0x0000016461D14048>


In [82]:
# Child tags. The <tbody> that Chrome generates automatically is not included:
# it is visible in the F12 developer tools but is not part of the page source.

# An iterator can be consumed only once; looping over it again yields nothing.
for tag in tags_iter:
    print(tag)
    print('-------------------------')

<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
-------------------------
<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>
-------------------------
<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>
-------------------------
<tr

In [83]:
# http://www.pythonscraping.com/pages/page3.html
bs = getbs('http://www.pythonscraping.com/pages/page3.html')
# Collect every <tr class="gift"> row.
# find_all (the current name of the deprecated findAll alias) returns a
# collection of tags, so .children cannot be used on the result itself.
tags_iter = bs.find_all('tr', {'class': 'gift'}) # Jupyter tip: Shift+Tab shows the function's help

처리를 종료합니다.


In [84]:
# Materialize tags_iter as a plain list so it can be traversed repeatedly
tags = list(tags_iter)

In [90]:
# Print the markup of every element held in tags
for tag in tags:
    print(tag)

<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>
<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>
<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parrot! <spa

In [92]:
# Sibling-tag search
bs = getbs("http://www.pythonscraping.com/pages/page3.html") # fetch and parse the page
# Start at the table's first <tr> and take the siblings that follow it,
# because the first child of a table is usually a header rather than data.
# The whitespace between rows is yielded as sibling nodes too, which is why
# blank entries appear between the rows in the printed output.
items = bs.find('table', {'id':'giftList'}).tr.next_siblings 

for item in items:
    print(item)
    print('------------------------------------------')

처리를 종료합니다.


------------------------------------------
<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
------------------------------------------


------------------------------------------
<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>
------------------------------------------


------------------------------------------
<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish

In [93]:
# Parent and sibling navigation, starting from one <img> tag.
# Locate the image once, then walk the tree step by step from that node.
item = bs.find('img', {'src':'../img/gifts/img1.jpg'})
print(item)
# Printed form: <img src="../img/gifts/img1.jpg"/>
# Actual page source: <img src="../img/gifts/img1.jpg">
print('------------------------------------------')
# Up one level: the <td> that contains the image.
item = item.parent
print(item)
print('------------------------------------------')
# Sideways: the <td> immediately before it.
item = item.previous_sibling
print(item)
print('------------------------------------------')
# And forward again: back to the <td> holding the image.
item = item.next_sibling
print(item)

<img src="../img/gifts/img1.jpg"/>
------------------------------------------
<td>
<img src="../img/gifts/img1.jpg"/>
</td>
------------------------------------------
<td>
$15.00
</td>
------------------------------------------
<td>
<img src="../img/gifts/img1.jpg"/>
</td>


In [95]:
# Filtering tags with a regular expression
import re

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bs = BeautifulSoup(html, "html.parser") # parse the response with the html.parser backend
# \. matches a literal dot, .* matches any run of characters.
# The pattern is a raw string so the backslashes reach the regex engine
# unchanged instead of being treated as (invalid) string escape sequences.
images = bs.find_all("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
print(type(images)) # <class 'bs4.element.ResultSet'>
# First pass: the complete tags.
for image in images:
    print(image)

# Second pass: only the value of each src attribute.
for image in images:
    print(image['src'])

<class 'bs4.element.ResultSet'>
<img src="../img/gifts/img1.jpg"/>
<img src="../img/gifts/img2.jpg"/>
<img src="../img/gifts/img3.jpg"/>
<img src="../img/gifts/img4.jpg"/>
<img src="../img/gifts/img6.jpg"/>
../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg
