### HTML 분석 개념
- 복잡한 웹페이지에서 필요한 정보 가져오기
- 원하지 않는 콘텐츠 제거

- 원하는 정보는 다양한 곳에 존재
    - 페이지 타이틀
    - 페이지 URL
    - 원하는 정보가 정형화되어 있지 않은 경우, 문제 발생

#### CSS 속성을 이용한 검색

In [2]:
import requests
from bs4 import BeautifulSoup
import re

In [3]:
# Parse an inline HTML snippet directly. The variable is called `url`, but it
# holds markup, so no HTTP request is made for this cell.
url = '"<span class="red">Heavens! what a virulent attack!</span>"'
soup = BeautifulSoup(markup=url, features='html.parser')
soup

"<span class="red">Heavens! what a virulent attack!</span>"

In [4]:
# Grab the first <span> of the parsed snippet.
object_tag = soup.find(name='span')
# Subscripting a tag reads from its attribute dict; this value is not echoed
# because only the last expression of the cell is displayed.
object_tag['class']
object_tag.get_text()

'Heavens! what a virulent attack!'

In [5]:
# Extract the character names: they are the <span> elements with class "green".
url = 'https://www.pythonscraping.com/pages/warandpeace.html'
html = requests.get(url)  # `html` is the response object; the markup is html.text
soup = BeautifulSoup(markup=html.text, features='html.parser')
namelist = soup.find_all('span', attrs={'class': 'green'})
print(len(namelist))
for name in namelist:
    print(name.get_text())

41
Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


#### 특정 단어 찾기 (검색)

In [6]:
# `string=` is the current name of this filter; `text=` is its deprecated alias.
# A plain str filter is case-sensitive and must equal the whole text node.
princelist = soup.find_all(string='the prince')
print(princelist)

# Idea to explore: a DataFrame listing each character name with its occurrence count.

['the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince']


#### 트리 이동

In [7]:
# Download the sample gift-list page and parse it; the trailing bare `soup`
# makes the notebook echo the parsed document.
url = 'https://www.pythonscraping.com/pages/page3.html'
html = requests.get(url)
soup = BeautifulSoup(markup=html.text, features='html.parser')
soup

<html>
<head>
<style>
img{
	width:75px;
}
table{
	width:50%;
}
td{
	margin:10px;
	padding:10px;
}
.wrapper{
	width:800px;
}
.excitingNote{
	font-style:italic;
	font-weight:bold;
}
</style>
</head>
<body>
<div id="wrapper">
<img src="../img/gifts/logo.jpg" style="float:left;"/>
<h1>Totally Normal Gifts</h1>
<div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is
hand-curated by well-paid, free-range Tibetan monks.<p>
We haven't figured out how to make online shopping carts yet, but you can send us a check to:<br/>
123 Main St.<br/>
Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</p></div>
<table id="giftList">
<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) frien

In [8]:
# Walk the direct children of the gift table. The whitespace-only text nodes
# between rows are children too, which is why blank entries are printed.
table_tag = soup.find(id='giftList')
for child in table_tag.children:
    print(f'아이: {child}')

아이: 

아이: <tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
아이: 

아이: <tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
아이: 

아이: <tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>
아이: 

아이: <tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<im

In [9]:
# Walk every descendant (children, grandchildren, ...) of the gift table.
# `.descendants` is a one-shot generator: counting it with list() would leave
# it exhausted, so materialise it once and reuse the list for both the count
# and the loop instead of querying the document a second time.
desc = list(soup.select_one('#giftList').descendants)
print('descendants 개수: ', len(desc))

for child in desc:
    print(child)
    print('-' * 80)

descendants 개수:  86


--------------------------------------------------------------------------------
<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
--------------------------------------------------------------------------------
<th>
Item Title
</th>
--------------------------------------------------------------------------------

Item Title

--------------------------------------------------------------------------------
<th>
Description
</th>
--------------------------------------------------------------------------------

Description

--------------------------------------------------------------------------------
<th>
Cost
</th>
--------------------------------------------------------------------------------

Cost

--------------------------------------------------------------------------------
<th>
Image
</th>
--------------------------------------------------------------------------------

Image

---------------------------------------------

In [10]:
# Tree navigation: siblings via next_siblings.
# Start from the first <tr> of the table (the header row) and visit every node
# that follows it, i.e. the remaining table rows plus the text nodes between them.
first_row = soup.select_one('#giftList').tr
for sibling in first_row.next_siblings:
    print(sibling)




<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parr

In [11]:
# Tree navigation: siblings via previous_siblings.
# Iteration runs backwards from the starting row, nearest sibling first.
second_gift = soup.select_one('tr#gift2')
for sibling in second_gift.previous_siblings:
    print(sibling)



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>




In [12]:
# Tree navigation: next_sibling / previous_sibling return a single node
# (not necessarily a tag: here it is the text node right after the row).
gift3_row = soup.select_one('tr#gift3')
sibling1 = gift3_row.next_sibling
print(sibling1)

# ord(char) returns the Unicode code point of a one-character string; used
# here to check what the invisible sibling actually is.
print(ord(sibling1))



10


In [13]:
# Hop twice: the first next_sibling is the text node after gift3, the second
# one is the following table row.
gift3_row = soup.select_one('tr#gift3')
sibling2 = gift3_row.next_sibling.next_sibling
print(sibling2)

<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parrot! <span class="excitingNote">Or maybe he's only resting?</span>
</td><td>
$0.50
</td><td>
<img src="../img/gifts/img4.jpg"/>
</td></tr>


In [14]:
# Tree navigation: moving up with .parent.
# The parent of <style> is <head>.
style_tag = soup.find('style')
print(style_tag.parent)

<head>
<style>
img{
	width:75px;
}
table{
	width:50%;
}
td{
	margin:10px;
	padding:10px;
}
.wrapper{
	width:800px;
}
.excitingNote{
	font-style:italic;
	font-weight:bold;
}
</style>
</head>


In [15]:
# Locate the first gift image by its src, step up to the <td> that holds it,
# then step back one sibling to reach the price cell.
img1 = soup.find('img', src='../img/gifts/img1.jpg')
text = img1.parent.previous_sibling.get_text()
print(text)


$15.00



In [16]:
# Same lookup by position: index 0 is the logo image, so index 1 is the first gift image.
soup.find_all('img')[1].parent.previous_sibling.get_text()

'\n$15.00\n'

### 정규 표현식
- 정규 표현식: 특정한 규칙을 가진 문자열의 집합을 표현하는데 사용하는 형식 언어
- 정규 표현식 사용:
    1. 문자열과 관련된 문제 해결을 위해 사용
    2. 문자열 치환, 검색, 추출 등
    3. 문자열의 유효성 검사
- 장점
    1. 다양한 입력 문자열 처리가 간결
    2. 범용성: 다양한 프로그래밍 언어에서 지원
    3. 생산성 향상
- 단점
    1. 정규 표현식 자체의 어려움
    2. 소스코드가 어려워짐

In [20]:
import re

# Module-level functions, no explicit compile() call.
# match() only tries at position 0, so the capital 'P' makes it return None.
m = re.match(pattern='[a-z]+', string='Python')
print(m)
# search() scans the whole string, so 'apple' is found mid-sentence.
print(re.search(pattern='apple', string='I like apple!'))


None
<re.Match object; span=(7, 12), match='apple'>


In [21]:
# Using compile(): build the pattern object once and reuse it.
p = re.compile(pattern='[a-z]+')  # one or more lowercase letters
m = p.match(string='python')
print(m)
print(p.search(string='I like apple 123'))


<re.Match object; span=(0, 6), match='python'>
<re.Match object; span=(2, 6), match='like'>


In [24]:
# Index 0 of a match object is the whole matched text (same as .group()).
print(p.search('I like apple 123')[0])

like


In [28]:
# Collect every non-overlapping match as a list of strings
# (what findall returns for a pattern without capture groups).
p = re.compile('[a-z]+')
result = [hit.group() for hit in p.finditer('I like apple 123')]
print(result)

['like', 'apple']


In [45]:
p = re.compile('[a-z]+')
p.search('I like apple 123')  # result is discarded; nothing is echoed for it

# Phone-number match: 2-3 digits, 3-4 digits, 4 digits, separated by hyphens.
# Written as a raw string so that \d reaches the regex engine untouched; in a
# plain string literal '\d' is an invalid escape sequence.
tel = re.compile(r"^(\d{2,3})-(\d{3,4})-(\d{4})$")
print(tel.match('02-123-4567'))    # valid
print(tel.match('053-950-45678'))  # last group has 5 digits -> None
print(tel.match('053950-4567'))    # missing first hyphen -> None

<re.Match object; span=(0, 11), match='02-123-4567'>
None
None


In [53]:
# Inspect the capture groups of a successful phone-number match.
m = tel.match('02-123-4567')
m.groups()  # all groups as a tuple; not echoed because it is not the last expression
m[3]

'4567'

### 전방 탐색

In [54]:
# Positive lookahead (?=...):
# the match succeeds only if the lookahead pattern follows, and the lookahead
# text itself is not included in the returned match.
lookahead1 = re.search(r'.+(?=won)', '1000 won')
print(lookahead1)
lookahead2 = re.search(r'.+(?=log:)', '2022-07-01 00:00:01 ABC.log: 전방탐색')
print(lookahead2)
# Negative lookahead (?!...):
# the match succeeds only if the lookahead pattern does NOT follow.
# Raw string: '\d' in a plain string literal is an invalid escape sequence.
lookahead3 = re.search(r'\d{4}(?!-)', '010-1234-5678')
print(lookahead3)


<re.Match object; span=(0, 5), match='1000 '>
<re.Match object; span=(0, 24), match='2022-07-01 00:00:01 ABC.'>
<re.Match object; span=(9, 13), match='5678'>


### 후방 탐색

In [57]:
# Positive lookbehind (?<=...): match only what comes right after the given text.
lookbehind1 = re.search(
    pattern='(?<=log:).+',
    string='2022-07-01 00:00:01 ABC.log: this is python',
)
print(lookbehind1)
lookbehind2 = re.search(pattern='(?<=:).+', string='USD: $51')
print(lookbehind2)

# Negative lookbehind (?<!...). Note that \b is a word boundary (the edge
# between a word character and a non-word character), not a blank.
lookbehind3 = re.search(
    pattern=r'\b(?<!\$)\d+\b',
    string='I paid $30 for 100 apples.',
)
print(lookbehind3)
# Same pattern spelled without a raw string: every backslash must be doubled.
lookbehind4 = re.search(
    pattern='\\b(?<!\\$)\\d+\\b',
    string='I paid $30 for 100 apples.',
)
print(lookbehind4)

# Experiment: the positive variant picks the number that follows the '$'.
lookbehind5 = re.search(
    pattern=r'\b(?<=\$)\d+\b',
    string='I paid $30 for 100 apples.',
)
print(lookbehind5)

<re.Match object; span=(28, 43), match=' this is python'>
<re.Match object; span=(4, 8), match=' $51'>
<re.Match object; span=(15, 18), match='100'>
<re.Match object; span=(15, 18), match='100'>
<re.Match object; span=(8, 10), match='30'>


In [58]:
string = 'Python is fun'
# \A anchors the pattern at the very start of the string. Raw string: '\A' in
# a plain string literal is an invalid escape sequence.
match = re.search(r'\APython', string)
print(match)
if match:
    print('pattern found')
else:
    print('Pattern not found')

<re.Match object; span=(0, 6), match='Python'>
pattern found


In [61]:
s = 'Python 3.10 was released on October 04, 2021'

# First run of exactly four consecutive digits. Raw string: '\d' in a plain
# string literal is an invalid escape sequence.
pattern = r'\d{4}'
match = re.search(pattern, s)
# span() gives the (start, end) indexes of the match, usable for slicing.
start, end = match.span()
s[start:end]

'2021'

In [19]:
# Passing True as the filter matches every tag; .name yields each tag's name.
for tag in soup.find_all(name=True):
    print(tag.name)

# A function can be passed as the filter to find(), select(), etc.
def has_class_but_no_id(tag):
    """Return True when *tag* has a 'class' attribute and no 'id' attribute."""
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')

# Keep only the tags that have a 'class' attribute but no 'id' attribute.
soup.find_all(name=has_class_but_no_id)



html
head
style
body
div
img
h1
div
p
br
br
table
tr
th
th
th
th
tr
td
td
span
td
td
img
tr
td
td
span
td
td
img
tr
td
td
span
td
td
img
tr
td
td
span
td
td
img
tr
td
td
span
td
td
img
div
br


[<span class="excitingNote">Now with super-colorful bell peppers!</span>,
 <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>,
 <span class="excitingNote">Also hand-painted by trained monkeys!</span>,
 <span class="excitingNote">Or maybe he's only resting?</span>,
 <span class="excitingNote">Keep your friends guessing!</span>]

In [67]:
# Lookbehind and lookahead combined: keep only what sits between the
# 'http://' prefix and the '.html' suffix.
url = 'http://www.pythonscraping.com/pages/page3.html'
m = re.compile(pattern=r'(?<=http://).*(?=\.html)')
re.search(m, url)

<re.Match object; span=(7, 41), match='www.pythonscraping.com/pages/page3'>

In [88]:
# Print the direct children of the #wrapper <div>.
# select_one() returns the first match directly, which is what the other cells
# use, instead of building the full result list only to take element [0].
for child in soup.select_one('#wrapper').children:
    print(child)



<img src="../img/gifts/logo.jpg" style="float:left;"/>


<h1>Totally Normal Gifts</h1>


<div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is
hand-curated by well-paid, free-range Tibetan monks.<p>
We haven't figured out how to make online shopping carts yet, but you can send us a check to:<br/>
123 Main St.<br/>
Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</p></div>


<table id="giftList">
<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
H

### 정규 표현식과 BeautifulSoup

In [90]:
# A compiled regex can be used as an attribute filter: keep the <img> tags
# whose src contains a gift-image path.
# The dot before 'jpg' is escaped so it matches a literal '.', not any character.
img_tag = re.compile(r'/img/gifts/img.*\.jpg')
images = soup.find_all('img', src=img_tag)
for image in images:
    print(image, end=', ')
    print(image['src'])

<img src="../img/gifts/img1.jpg"/>, ../img/gifts/img1.jpg
<img src="../img/gifts/img2.jpg"/>, ../img/gifts/img2.jpg
<img src="../img/gifts/img3.jpg"/>, ../img/gifts/img3.jpg
<img src="../img/gifts/img4.jpg"/>, ../img/gifts/img4.jpg
<img src="../img/gifts/img6.jpg"/>, ../img/gifts/img6.jpg


### 정규 표현식과 BeautifulSoup2

In [95]:

html = requests.get('http://www.pythonscraping.com/pages/warandpeace.html')
bs = BeautifulSoup(html.text, 'html.parser')
# A plain string filter matches only text nodes equal to 'the prince'.
# `string=` is the current name of this argument; `text=` is its deprecated alias.
princeList = bs.find_all(string='the prince')
print('the prince count: ', len(princeList))

# Using a regular expression with find_all(): accept either capitalisation.
# Inside a character class '|' is a literal pipe, so the class is just [Tt];
# the {1} quantifier was redundant.
prince_list = bs.find_all(string=re.compile(r'[Tt]he prince'))
print('T|the prince count:', len(prince_list))



the prince count:  7
T|the prince count: 11
