# [莫凡爬蟲教學](https://morvanzhou.github.io/tutorials/data-manipulation/scraping/)

## 用Python登錄網頁

In [22]:
from urllib.request import urlopen

# if has Chinese, apply decode()
html = urlopen(
    "https://morvanzhou.github.io/static/scraping/basic-structure.html"
).read().decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>Scraping tutorial 1 | 莫烦Python</title>
	<link rel="icon" href="https://morvanzhou.github.io/static/img/description/tab_icon.png">
</head>
<body>
	<h1>爬虫测试1</h1>
	<p>
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
		<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>

</body>
</html>


In [29]:
import re
res = re.findall("<title>(.+?)</title>", html)
print("\nPage title is: ", res[0])

# Page title is:  Scraping tutorial 1 | 莫烦Python


Page title is:  Scraping tutorial 1 | 莫烦Python


In [33]:
res = re.findall(r"<p>(.*?)</p>", html, flags=re.DOTALL)    # re.DOTALL if multi line (沒放不能跑)
print("\nPage paragraph is: ", res[0])

# Page paragraph is:
#  这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
#  <a href="https://morvanzhou.github.io/tutorials/scraping">爬虫教程</a> 中的简单测试.


Page paragraph is:  
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
		<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	


In [37]:
res = re.findall(r'href="(.*?)"', html) ## 讀取所有link
print("\nAll links: ", res)
# All links:
['https://morvanzhou.github.io/static/img/description/tab_icon.png',
'https://morvanzhou.github.io/',
'https://morvanzhou.github.io/tutorials/scraping']


All links:  ['https://morvanzhou.github.io/static/img/description/tab_icon.png', 'https://morvanzhou.github.io/', 'https://morvanzhou.github.io/tutorials/data-manipulation/scraping/']


['https://morvanzhou.github.io/static/img/description/tab_icon.png',
 'https://morvanzhou.github.io/',
 'https://morvanzhou.github.io/tutorials/scraping']

---
## 我的練習

* 爬[台大社會系網頁](http://sociology.ntu.edu.tw/zh_tw/teacher/FullTime)

In [39]:
from urllib.request import urlopen
html = urlopen(
    "http://sociology.ntu.edu.tw/zh_tw/teacher/FullTime"
).read().decode("utf-8")
print(html)

<!DOCTYPE html>
<html lang="zh_tw" class="orbit">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link href="/assets/favicon.ico" rel="shortcut icon" type="image/vnd.microsoft.icon">
  <title>台灣大學社科院-社會系</title>
  <link href="//cdnjs.cloudflare.com/ajax/libs/font-awesome/4.3.0/css/font-awesome.min.css" media="screen" rel="stylesheet">
  <link href="/assets/bootstrap/bootstrap.min-cfd64c67a341584d2fc093d1c6737cff.css" media="screen" rel="stylesheet">
  <link href="/assets/template/template-5a204b05325a2786c31792bc9937774b.css" media="screen" rel="stylesheet">
  <link rel="stylesheet" media="print" type="text/css" href="/assets/template/print.css">
  <script src="/assets/plugin/modernizr-6a5ad612c54982e085aff743118bd2d0.js"></script>
  <script src="/assets/plugin/picturefill.min-292c56cabcb10d5120d55aa48aa6e442.js"></script>
  <scr

In [44]:
res = re.findall("<title>(.+?)</title>", html)
print(res)

['台灣大學社科院-社會系']


只能做到這樣，其他位置的標記比較複雜，之後再練習。

---

## Beautifulsoup

In [45]:
!pip install beautifulsoup4



In [62]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# if has Chinese, apply decode()
html = urlopen("https://morvanzhou.github.io/static/scraping/basic-structure.html").read().decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>Scraping tutorial 1 | 莫烦Python</title>
	<link rel="icon" href="https://morvanzhou.github.io/static/img/description/tab_icon.png">
</head>
<body>
	<h1>爬虫测试1</h1>
	<p>
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
		<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>

</body>
</html>


In [63]:
soup = BeautifulSoup(html, features='lxml')
print(soup.h1)

<h1>爬虫测试1</h1>


In [64]:
print(soup.p)

<p>
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>


In [98]:
all_href = soup.find_all('a')
for x in all_href:
 print(x['href'])
all_href = [x['href'] for x in all_href] ## 簡化以上式子
print('\n', all_href)

# ['https://morvanzhou.github.io/', 'https://morvanzhou.github.io/tutorials/scraping']

https://morvanzhou.github.io/
https://morvanzhou.github.io/tutorials/data-manipulation/scraping/

 ['https://morvanzhou.github.io/', 'https://morvanzhou.github.io/tutorials/data-manipulation/scraping/']


### 練習

In [102]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
html = urlopen(
    "http://sociology.ntu.edu.tw/zh_tw/teacher/FullTime"
).read().decode("utf-8")
soup = BeautifulSoup(html, features='lxml')
print(soup)

<!DOCTYPE html>
<html class="orbit" lang="zh_tw">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="/assets/favicon.ico" rel="shortcut icon" type="image/vnd.microsoft.icon"/>
<title>台灣大學社科院-社會系</title>
<link href="//cdnjs.cloudflare.com/ajax/libs/font-awesome/4.3.0/css/font-awesome.min.css" media="screen" rel="stylesheet"/>
<link href="/assets/bootstrap/bootstrap.min-cfd64c67a341584d2fc093d1c6737cff.css" media="screen" rel="stylesheet"/>
<link href="/assets/template/template-5a204b05325a2786c31792bc9937774b.css" media="screen" rel="stylesheet"/>
<link href="/assets/template/print.css" media="print" rel="stylesheet" type="text/css"/>
<script src="/assets/plugin/modernizr-6a5ad612c54982e085aff743118bd2d0.js"></script>
<script src="/assets/plugin/picturefill.min-292c56cabcb10d5120d55aa48aa6e442.js"></script>
<script src="//cdnjs

---

## 什麼是CSS 

* 觀察`class`

In [115]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# if has Chinese, apply decode()
html = urlopen("https://morvanzhou.github.io/static/scraping/list.html").read().decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>爬虫练习 列表 class | 莫烦 Python</title>
	<style>
	.jan {
		background-color: yellow;
	}
	.feb {
		font-size: 25px;
	}
	.month {
		color: red;
	}
	</style>
</head>

<body>

<h1>列表 爬虫练习</h1>

<p>这是一个在 <a href="https://morvanzhou.github.io/" >莫烦 Python</a> 的 <a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/" >爬虫教程</a>
	里无敌简单的网页, 所有的 code 让你一目了然, 清晰无比.</p>

<ul>
	<li class="month">一月</li>
	<ul class="jan">
		<li>一月一号</li>
		<li>一月二号</li>
		<li>一月三号</li>
	</ul>
	<li class="feb month">二月</li>
	<li class="month">三月</li>
	<li class="month">四月</li>
	<li class="month">五月</li>
</ul>

</body>
</html>


In [129]:
soup = BeautifulSoup(html, features='lxml')

# use class to narrow search
month = soup.find_all('li', 
                     {'class' : 'month'}) ## 使用字典形式
for m in month:
#    print(m)
    print(m.get_text()) ## 只顯示文字，不顯示<li>

一月
二月
三月
四月
五月


In [142]:
jan = soup.find('ul', {"class" : "jan"})
d_jan = jan.find_all('li')
for d in d_jan:
    print(d.get_text())

一月一号
一月二号
一月三号


### 練習

In [166]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
html = urlopen(
    "http://sociology.ntu.edu.tw/zh_tw/teacher/FullTime"
).read().decode("utf-8")
soup = BeautifulSoup(html, features='lxml')
status = soup.find_all('span', {'class' : 'i-member-value member-data-value-job-title'})
name = soup.find_all('span', {'class' : 'i-member-title member-data-title-name'})

In [182]:
name = soup.find_all('span', {'class' : 'i-member-value member-data-value-name'})

怎麼把姓名和url 分開??

## 正則表達

In [6]:
from  bs4  import  BeautifulSoup
from urllib.request import urlopen
import re

# if has Chinese, apply decode()
html = urlopen("https://morvanzhou.github.io/static/scraping/table.html").read().decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>爬虫练习 表格 table | 莫烦 Python</title>

	<style>
	img {
		width: 250px;
	}
	table{
		width:50%;
	}
	td{
		margin:10px;
		padding:15px;
	}
	</style>
</head>
<body>

<h1>表格 爬虫练习</h1>

<p>这是一个在 <a href="https://morvanzhou.github.io/" >莫烦 Python</a> 的 <a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/" >爬虫教程</a>
	里无敌简单的网页, 所有的 code 让你一目了然, 清晰无比.</p>

<br>
<table id="course-list">
	<tr>
		<th>
			分类
		</th><th>
			名字
		</th><th>
			时长
		</th><th>
			预览
		</th>
	</tr>

	<tr id="course1" class="ml">
		<td>
			机器学习
		</td><td>
			<a href="https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/">
				Tensorflow 神经网络</a>
		</td><td>
			2:00
		</td><td>
			<img src="https://morvanzhou.github.io/static/img/course_cover/tf.jpg">
		</td>
	</tr>

	<tr id="course2" class="ml">
		<td>
			机器学习
		</td><td>
			<a href="https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/">
			

In [7]:
soup = BeautifulSoup(html, features='lxml')

img_links = soup.find_all("img", {"src": re.compile('.*?\.jpg')}) ## 抓出jpg檔
for link in img_links:
    print(link['src'])

https://morvanzhou.github.io/static/img/course_cover/tf.jpg
https://morvanzhou.github.io/static/img/course_cover/rl.jpg
https://morvanzhou.github.io/static/img/course_cover/scraping.jpg


## 爬百度百科

In [13]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import random

base_url = "https://baike.baidu.com"
his = ["/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711"] ## 靜態頁面

In [17]:
url = base_url + his[-1]

html = urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')
print(soup.find('h1').get_text(), '    url: ', his[-1]) ## 顯示標題

网络爬虫     url:  /item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711


In [22]:
# find valid urls
sub_urls = soup.find_all("a", {"target": "_blank", "href": re.compile("/item/(%.{2})+$")})

if len(sub_urls) != 0:
    his.append(random.sample(sub_urls, 1)[0]['href'])
else:
    # no valid sub link found
    his.pop()
print(his)

['/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711', '/item/%E9%94%9A%E7%82%B9', '/item/%E7%BD%91%E7%BB%9C%E6%95%B0%E6%8D%AE']


In [24]:
his = ["/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711"]

for i in range(20):
    url = base_url + his[-1]

    html = urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(html, features='lxml')
    print(i, soup.find('h1').get_text(), '    url: ', his[-1])

    # find valid urls
    sub_urls = soup.find_all("a", {"target": "_blank", "href": re.compile("/item/(%.{2})+$")})

    if len(sub_urls) != 0:
        his.append(random.sample(sub_urls, 1)[0]['href'])
    else:
        # no valid sub link found
        his.pop()

0 网络爬虫     url:  /item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711
1 主机     url:  /item/%E4%B8%BB%E6%9C%BA
2 服务器     url:  /item/%E6%9C%8D%E5%8A%A1%E5%99%A8
3 操作系统     url:  /item/%E6%93%8D%E4%BD%9C%E7%B3%BB%E7%BB%9F
4 作业     url:  /item/%E4%BD%9C%E4%B8%9A
5 闲窗括异志     url:  /item/%E9%97%B2%E7%AA%97%E6%8B%AC%E5%BC%82%E5%BF%97
6 赵昀     url:  /item/%E7%90%86%E5%AE%97
7 马廷鸾     url:  /item/%E9%A9%AC%E5%BB%B7%E9%B8%BE
8 窗间莫问唤祁嘉     url:  /item/%E7%AA%97%E9%97%B4%E8%8E%AB%E9%97%AE%E5%94%A4%E7%A5%81%E5%98%89
9 祁嘉     url:  /item/%E7%A5%81%E5%98%89
10 前凉     url:  /item/%E5%89%8D%E5%87%89
11 张芳     url:  /item/%E5%BC%A0%E8%8A%B3
12 百度百科：多义词     url:  /item/%E7%99%BE%E5%BA%A6%E7%99%BE%E7%A7%91%EF%BC%9A%E5%A4%9A%E4%B9%89%E8%AF%8D
13 赵氏孤儿大报仇     url:  /item/%E8%B5%B5%E6%B0%8F%E5%AD%A4%E5%84%BF
14 百度百科：多义词     url:  /item/%E7%99%BE%E5%BA%A6%E7%99%BE%E7%A7%91%EF%BC%9A%E5%A4%9A%E4%B9%89%E8%AF%8D
15 赵氏孤儿大报仇     url:  /item/%E8%B5%B5%E6%B0%8F%E5%AD%A4%E5%84%BF
16 义项     url:  /item/%E4%B9%89%E9%A1%B9

## 多功能的Requests

### requests: an alternative to urllib

get / post 網頁

In [45]:
!pip install requests 
import requests ## 是request`s`
import webbrowser ## 使用瀏覽器
param = {"wd": "莫烦Python"}
r = requests.get('http://www.baidu.com/s', params=param)
print(r.url)
#webbrowser.open(r.url) ## 打開瀏覽器

http://www.baidu.com/s?wd=%E8%8E%AB%E7%83%A6Python


### 練習

In [47]:
import requests
param = {'q' : 'python'}
r = requests.get('https://www.google.com.tw/search', params=param)
print(r.url)

https://www.google.com.tw/search?q=python


In [56]:
fdata = {'firstname': 'Michael', 'lastname': 'Jordan'}
r = requests.post('http://pythonscraping.com/pages/files/processing.php', data=fdata)
print(r.text)

Hello there, Michel Jordan!


### Post

In [74]:
payloadpayload = {'username': 'Morvan', 'password': 'password'}
r = requests.post('http://pythonscraping.com/pages/cookies/welcome.php', data=payload)
print(r.cookies.get_dict())

{'loggedin': '1', 'username': 'Morvan'}


In [84]:
r = requests.get('http://pythonscraping.com/pages/cookies/login.html', 
                 cookies=r.cookies)
print(r.text)

<h2>Log In Here!</h2>
<form method="post" action="welcome.php">
Username (use anything!): <input type="text" name="username"><br>
Password (try "password"): <input type="password" name="password"><br>
<input type="submit" value="Login">
</form>


In [86]:
session = requests.Session()
payload = {'username': 'Morvan', 'password': 'password'}
r = session.post('http://pythonscraping.com/pages/cookies/welcome.php', data=payload)
print(r.cookies.get_dict())
r = session.get("http://pythonscraping.com/pages/cookies/login.html")
print(r.text)

{'loggedin': '1', 'username': 'Morvan'}
<h2>Log In Here!</h2>
<form method="post" action="welcome.php">
Username (use anything!): <input type="text" name="username"><br>
Password (try "password"): <input type="password" name="password"><br>
<input type="submit" value="Login">
</form>


## 下載文件

In [95]:
from urllib.request import urlretrieve
IMAGE_URL = "https://morvanzhou.github.io/static/img/description/learning_step_flowchart.png"
urlretrieve(IMAGE_URL, '.image1.png')

('.image1.png', <http.client.HTTPMessage at 0x51ec278>)

In [97]:
import requests
r = requests.get(IMAGE_URL)
with open('image2.png', 'wb') as f:
    f.write(r.content)

In [98]:
r = requests.get(IMAGE_URL, stream=True)    # stream loading

with open('image3.png', 'wb') as f:
    for chunk in r.iter_content(chunk_size=32):
        f.write(chunk)

In [99]:
from bs4 import BeautifulSoup
import requests

URL = "http://www.nationalgeographic.com.cn/animals/"

In [100]:
html = requests.get(URL).text
soup = BeautifulSoup(html, 'lxml')
img_ul = soup.find_all('ul', {"class": "img_list"})

In [104]:
import os

os.makedirs('./img/', exist_ok=True)

In [105]:
for ul in img_ul:
    imgs = ul.find_all('img')
    for img in imgs:
        url = img['src']
        r = requests.get(url, stream=True)
        image_name = url.split('/')[-1]
        with open('img/%s' % image_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=128):
                f.write(chunk)
        print('Saved %s' % image_name)

Saved 20181011112449225.jpg
Saved 20181010034625605.jpg
Saved 20181010122414791.jpg
Saved 20181008105419570.jpg
Saved 20180930112419433.jpg
Saved 20180919015152330.jpg


## Selenium

[安裝參考](https://medium.com/@NorthBei/%E5%9C%A8windows%E4%B8%8A%E5%AE%89%E8%A3%9Dpython-selenium-%E7%B0%A1%E6%98%93%E6%95%99%E5%AD%B8-eade1cd2d12d)

In [106]:
!pip install selenium



In [110]:
from selenium import webdriver
#使用chrome的webdriver


In [112]:
browser = webdriver.Chrome()
#開啟google首頁
browser.get('http://google.com/')
#如果需要執行完自動關閉，就要加上下面這一行
#browser.close()

In [114]:
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.google.com")
#找到輸入框
element = driver.find_element_by_name("q");
#輸入內容
element.send_keys("AKB48");
#提交表單
element.submit();
#driver.close()

In [116]:
#!python
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("http://140.113.209.24/0656000")

assert "Title" in driver.title
elem = driver.find_element_by_xpath('//*[@id="id"]')
id = (elem.text)
print(id)
elem = driver.find_element_by_xpath('//*[@id="time"]')
time = (elem.text)
print(time)
elem = driver.find_element_by_xpath('//*[@id="secret"]')
hash = (elem.text)
print(hash)
#driver.close()

KeyboardInterrupt: 

In [117]:
from selenium import webdriver

driver = webdriver.Chrome()     # 打开 Chrome 浏览器

# 将刚刚复制的帖在这
driver.get("https://morvanzhou.github.io/")
driver.find_element_by_xpath(u"//img[@alt='强化学习 (Reinforcement Learning)']").click()
driver.find_element_by_link_text("About").click()
driver.find_element_by_link_text(u"赞助").click()
driver.find_element_by_link_text(u"教程 ▾").click()
driver.find_element_by_link_text(u"数据处理 ▾").click()
driver.find_element_by_link_text(u"网页爬虫").click()

# 得到网页 html, 还能截图
html = driver.page_source       # get html
driver.get_screenshot_as_file("./img/sreenshot1.png")
driver.close()