# `beautifulSoup` 套件介紹

In [None]:
!pipenv install beautifulsoup4
!pipenv install html5lib
!pipenv install lxml

## 使用`requests`獲得網頁，並且使用`bs`來技取傳回資料的內容

In [None]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.must.edu.tw/'
#發送 GET 請求到 url，並將回應物件放到 resp
resp = requests.get(url)
# 將 resp.text 也就是 HTML 資料定義到 BeautifulSoup 物件內，並用 html5lib 解析 HTML 內容
soup = BeautifulSoup(resp.text, 'lxml')

# 輸出網頁的 title
print(soup.title.getText())

#輸出第一個尋找到的 <li> 元素的文字
print('li1')
print(soup.li.getText())

#輸出第一個尋找到的 <li> 元素的文字(相同效果)
print('li2')
print(soup.find('li').getText())

#尋找全部 <li> 元素的文字
lst = soup.find_all('li')
for li in lst:
    print(li.getText())

## 取得標籤屬性
若想在一個標籤內取得該標籤的屬性，只需像字典一樣操作即可。

例如有個標籤為 `<a href='https://www.google.com'>OwO</a>`，`soup.a['href']` 即可取得該標籤的屬性`https://www.google.com`

In [None]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.must.edu.tw/'
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'html5lib')

links = soup.find_all('a')
for link in links:
    if 'href' in link.attrs:
        print(link['href'])

## BeautifulSoup 定位
* `soup.find()` : 根據條件回傳"第一個"符合的元素，由字串表示，若沒有符合的則回傳 None。
* `soup.find_all()` : 根據條件回傳"全部"符合的元素，由串列表示，若沒有符合的則回傳空串列。
* `soup.select()` : Css Selector。

可以透過標籤、 `id` 或 `class`來定位元素， `soup.find('p', id='myid', class_='myclass')` ，注意 class 後方必須加上底線，為了避免與 Python 的關鍵字`class`衝突。

In [None]:
import requests
from bs4 import BeautifulSoup
url = "https://www.ptt.cc/bbs/Beauty/index.html"
res = requests.get(url,cookies = {'over18':'1'}) 
#記得八卦版要over18=1這個cookies！
Soup = BeautifulSoup(res.text,'lxml') 
print(Soup.title) #可以讀取的網頁的title
print(Soup.title.text) 
Soup.find(class_='r-ent') #我們就可以找到第一個class為'r-ent'的元素了
Soup.find(class_='r-ent').find(class_='title') 
Soup.find(class_='r-ent').find(class_='title').find('a')
Soup.find_all(class_='r-ent')



## CSS Selector

In [None]:
# CSS Selector

Soup.select_one('.r-ent > .title > a ')
for i in Soup.select('.r-ent > .title > a '):
    print(i.text) #印出標題
    print('https://www.ptt.cc',end='') #補上不完整的網址
    print(i['href']) #讀出藏在href中的超連結網址

In [41]:
import requests
from bs4 import BeautifulSoup
homepage = requests.get('https://www.ptt.cc/bbs/hotboards.html')

soup = BeautifulSoup(homepage.text,'lxml')
board_find = soup.find_all('a', class_= 'board')
board_select = soup.select_one('a.board')
for i in board_find:
    print(i.text)


Gossiping
12829
綜合
◎[八卦] 黃牛法實名制是佛地魔嗎？


Baseball
9755
棒球
◎[棒球] 吳念庭先發2棒出戰羅德


C_Chat
4249
閒談
◎[希恰] 本屆的勝負：早坂愛的勝利


Stock
4227
學術
◎[股票] 台積電四月營收 mom+1.7%


Elephants
3577
CPBL
◎[兄弟] 助下恰上大震撼


NBA
2953
NBA.
◎[NBA] 新任板主熱烈募集中!!!              


LoL
1933
遊戲
◎[LoL] 2023 季中邀請賽淘汰賽階段


Lifeismoney
1459
省錢
◎[省錢] 省錢板 發文要有省錢點


KoreaStar
1096
韓國
◎[韓星] 史哲，實名制是佛地魔嗎？


BaseballXXXX
1052
４Ｘ
◎[ 4X ] 歡迎Josh大駕光臨


home-sale
962
房屋
◎[房屋] 禁無關看板新聞/租賃廣告


BabyMother
845
家庭
◎[寶媽] 天氣多變化 多準備衣物


HatePolitics
823
Hate
◎[政黑] 進板圖 徵至 05/22


car
750
車車
◎[汽車] 大家要注意板規1-8喔


Tech_Job
696
工作
◎[科技] 版規三政治水桶恢復至7天


movie
656
綜合
◎[電影] 請注意防雷/分類/電影點


sex
655
男女
◎[西斯] 


MobileComm
639
資訊
◎[通訊] 手機情報o資費比較o交易x


Beauty
610
聊天
◎[表特] 貼AI圖 一律水桶+退文


Military
579
軍事
◎[軍事] 誠徵 Military 板板主 


PC_Shopping
567
硬體
◎[電蝦]TWZLR50294/TWACR50160華碩


basketballTW
499
籃球
◎[台籃] 季後賽


Japan_Travel
487
旅遊
◎發問前請注意是否有事先查過資訊


SportLottery
456
博弈
◎[運彩] 量力而為 樂趣加倍


creditcard
412
理財
◎[卡板] 禁發花旗揪團團主宣傳文章


TaichungBun
409
台中
◎[台中] 節約用水


WomenTalk
350
聊天

## 測試不同parser的速度

In [37]:
import requests
from bs4 import BeautifulSoup
import time
start = time.time()
homepage = requests.get('https://www.ptt.cc/bbs/hotboards.html')
# print(homepage.text)
a = 0
b = 0
c = 0
d = 0
for i in range(1,100):
    with open ('nosoup.text','a', encoding="utf-8") as f:
        f.write(homepage.text)
    end = time.time()
    elapsed = end - start
    a = a + elapsed
    print("Time taken nosoup: ", elapsed, "seconds.")

    start = time.time()
    soup = BeautifulSoup(homepage.text,'lxml')
    # print(soup)
    with open ('lxml.text','a', encoding="utf-8") as f:
        f.write(soup.text)
    end = time.time()
    elapsed = end - start
    b = b + elapsed
    print("Time taken lxml.text: ", elapsed, "seconds.")    

    start = time.time()
    h5soup = BeautifulSoup(homepage.text,'html5lib')
    # print(h5soup)
    with open ('html5lib.text','a', encoding="utf-8") as f:
        f.write(h5soup.text)
    end = time.time()
    elapsed = end - start
    c = c + elapsed
    print("Time taken html5lib: ", elapsed, "seconds.")    

    start = time.time()    
    pasersoup = BeautifulSoup(homepage.text,'html.parser')
    # print(pasersoup)
    with open ('htmlparser.text','a', encoding="utf-8") as f:
        f.write(pasersoup.text)

    end = time.time()
    elapsed = end - start
    d = d + elapsed
    print("Time taken htmlparser: ", elapsed, "seconds.")

print("Time taken nosoup: ", a, "seconds.")
print("Time taken lxml: ", b, "seconds.")
print("Time taken html5lib: ", c, "seconds.")
print("Time taken htmlparser: ", d, "seconds.")

Time taken nosoup:  0.58732008934021 seconds.
Time taken lxml.text:  0.04574775695800781 seconds.
Time taken html5lib:  0.2320704460144043 seconds.
Time taken htmlparser:  0.041170358657836914 seconds.
Time taken nosoup:  0.041170358657836914 seconds.
Time taken lxml.text:  0.028324365615844727 seconds.
Time taken html5lib:  0.06614828109741211 seconds.
Time taken htmlparser:  0.04533219337463379 seconds.
Time taken nosoup:  0.05366253852844238 seconds.
Time taken lxml.text:  0.027497291564941406 seconds.
Time taken html5lib:  0.06992125511169434 seconds.
Time taken htmlparser:  0.09487366676330566 seconds.
Time taken nosoup:  0.11673712730407715 seconds.
Time taken lxml.text:  0.03943443298339844 seconds.
Time taken html5lib:  0.13004279136657715 seconds.
Time taken htmlparser:  0.04869651794433594 seconds.
Time taken nosoup:  0.0509028434753418 seconds.
Time taken lxml.text:  0.031079530715942383 seconds.
Time taken html5lib:  0.054123640060424805 seconds.
Time taken htmlparser:  0.1

In [45]:
article = requests.get(
            url = url,
            cookies = {'over18': 'yes'}  # ptt18歲的認證
        )
soup = BeautifulSoup(article.text,'lxml')
r_ent = soup.select('div.r-ent')[0].text
a_url = soup.select('div.title > a')[0]['href']
a_title = soup.select('div.title')[0].text
print(a_title)
a_author = soup.select('div.author')[0].text
print(a_author)
a_date = soup.select('div.date')[0].text
print(a_date)
print('https://www.ptt.cc/'+a_url)


			
				(本文已被刪除) [Flitz]
			
			
-
 5/10
https://www.ptt.cc//bbs/Beauty/M.1683702297.A.A2D.html
