## 抽取信息的几种方式

### 解析网页速度比较（BeautifulSoup、PyQuery、lxml、正则）

In [3]:
import re
import sys
import time
import requests
from lxml.html import fromstring
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup as bs

def Timer():
    """Yield the seconds elapsed since the previous ``next()`` call.

    The first value yielded is the (near-zero) time between the generator
    starting to run and reaching its first ``yield``; every later value is
    the time since the previous measurement was taken.

    ``time.perf_counter()`` is used because it is monotonic and
    high-resolution, so a system clock adjustment cannot distort (or make
    negative) a measured interval.
    """
    last = time.perf_counter()
    while True:
        now = time.perf_counter()
        yield now - last
        # The next interval starts at the instant that was just reported.
        last = now
# Benchmark: repeat the same "find every <p>" lookup `num` times with each
# library and report the total time per approach.
timer = Timer()
url = "http://www.python.org/"
# A timeout keeps the cell from hanging forever if the site is unreachable.
html = requests.get(url, timeout=10).text
num = 10000
print('\n==== Python version: %s =====' % sys.version)
print('\n==== Total trials: %s =====' % num)

# Prime the timer; each later next(timer) returns the time since the
# previous call, so every interval below also includes the one-off parse
# that precedes its loop.
next(timer)

# BeautifulSoup (lxml backend); find_all is the current name of findAll.
soup = bs(html, 'lxml')
for x in range(num):
    paragraphs = soup.find_all('p')
t = next(timer)
print('bs4 total time: %.1f' % t)

# PyQuery with a CSS selector.
d = pq(html)
for x in range(num):
    paragraphs = d('p')
t = next(timer)
print('pq total time: %.1f' % t)

# lxml with a CSS selector.
tree = fromstring(html)
for x in range(num):
    paragraphs = tree.cssselect('p')
t = next(timer)
print('lxml (cssselect) total time: %.1f' % t)

# lxml with an XPath expression.
tree = fromstring(html)
for x in range(num):
    paragraphs = tree.xpath('.//p')
t = next(timer)
print('lxml (xpath) total time: %.1f' % t)

# Regular expression. `[p ]` is a character class (a single 'p' or space),
# so only bare `<p>` openers match, and `.` does not cross newlines; this
# is why the result line says it doesn't find all p.
for x in range(num):
    paragraphs = re.findall(r'<[p ]>.*?</p>', html)
t = next(timer)
print('regex total time: %.1f (doesn\'t find all p)\n' % t)


==== Python version: 3.7.4 (v3.7.4:e09359112e, Jul  8 2019, 14:36:03) 
[GCC 4.2.1 (Apple Inc. build 5666) (dot 3)] =====

==== Total trials: 10000 =====
bs4 total time: 6.7
pq total time: 1.0
lxml (cssselect) total time: 0.9
lxml (xpath) total time: 0.6
regex total time: 1.0 (doesn't find all p)



## 抓取数据的四个途径

- 静态网页
- 接口
- 利用第三方库
- 动态网页（借助 phantomjs、headless_chrome、headless_firefox、pyppeteer、Splash 等工具渲染）

In [4]:
# 静态网页

import requests
from lxml import etree

# Scrape a static HTML page: download it, parse it with lxml and print the
# second and third cell of every table row.
url = 'http://www.xicidaili.com/nn/1/'

header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }

response = requests.get(url, headers=header, timeout=5)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()

tree = etree.HTML(response.text)

# position()>1 skips the first row of the table (presumably the header row).
for item in tree.xpath('//table[@id="ip_list"]/tr[position()>1]'):
    ip_cells = item.xpath('./td[2]/text()')
    port_cells = item.xpath('./td[3]/text()')
    # xpath() returns an empty list when a cell has no text; skip such rows
    # rather than crashing with IndexError.
    if not ip_cells or not port_cells:
        continue
    ip = ip_cells[0]
    port = port_cells[0]
    print(ip, port)

27.152.91.94 9999
114.239.249.22 9999
106.110.212.53 9999
120.83.109.40 9999
114.239.198.110 9999
121.233.226.59 9999
117.30.113.104 9999
121.226.188.233 9999
114.239.2.227 808
114.239.149.111 808
117.95.162.42 9999
117.69.201.141 9999
117.30.113.84 9999
123.163.122.218 9999
117.30.112.252 9999
27.152.91.29 9999
182.35.81.3 9999
114.239.254.241 9999
117.95.214.22 9999
106.110.212.220 9999
114.239.249.87 9999
182.35.81.235 9999
114.239.252.188 9999
117.57.90.66 9999
114.239.149.126 808
117.69.200.244 9999
117.30.112.135 9999
121.226.215.29 9999
114.239.42.5 9999
180.123.235.167 9999
182.35.84.11 9999
183.164.239.249 9999
183.154.50.228 9999
117.57.90.98 9999
182.34.33.71 9999
27.152.90.108 9999
27.152.90.213 9999
27.152.91.5 9999
183.166.86.163 9999
117.69.200.150 36483
117.30.112.232 9999
114.239.150.230 9999
27.152.91.231 9999
117.69.200.250 9999
120.83.122.155 9999
59.57.149.197 9999
27.152.91.8 9999
49.89.223.201 9999
113.194.29.178 9999
121.233.251.150 9999
61.145.49.246 9999
117.3

In [5]:
# 利用接口

import requests
import json

# Fetch data from a JSON API: request the newsflash endpoint, print the raw
# payload, then print a trimmed-down dict for each item.
header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }

url = 'https://36kr.com/api/newsflash?&per_page=20'
response = requests.get(url,
                        headers=header,
                        timeout=5
                        )
# Fail loudly on an HTTP error instead of trying to decode an error body.
response.raise_for_status()

# Decode the body once and reuse the result for both the dump and the loop.
payload = json.loads(response.text)
print(payload)

data = payload['data']

items = data['items']

for item in items:
    # Keep only the fields of interest, renaming two of them on the way.
    item_info = {
        'title': item['title'],
        'content': item['description'],
        'published_time': item['published_at'],
    }
    print(item_info)

{'code': 0, 'timestamp': 1573528987, 'timestamp_rt': 1573528987, 'data': {'items': [{'id': 190117, 'project_id': 1, 'column_id': 72, 'post_id': None, 'is_top': 0, 'pin': 0, 'title': '转转“二手双11”战报：当天下午3时30分GMV超去年全天', 'catch_title': '', 'description': '36氪获悉，二手交易平台转转发布的2019年“二手双11”战报显示，11月11日当天转转平台GMV较去年双11同比增长72%，中午12时8分，转转平台GMV突破1亿元，比去年提前了53分钟破亿；下午3时30分，GMV超过去年双11全天交易总额。11月11日凌晨1时，转转帮助超过7000名用户成功买到经过验机质检的手机，iPhoneX蝉联销冠。', 'cover': '', 'news_url_type': '', 'news_url': '', 'user_id': 13560344, 'published_at': '2019-11-12 11:14:38', 'created_at': '2019-11-12 11:14:38', 'updated_at': '2019-11-12 11:14:38', 'counters': {'view_count': 2, 'pv': 2, 'pv_mobile': 0, 'pv_app': 0, 'comment': 0}, 'extraction_tags_arr': [], 'extraction_tags': '[]', 'column': {'id': 72, 'name': '其他', 'bg_color': '#000000', 'type': 'normal'}, 'db_counters': [{'id': 297826238, 'entity_type': 'newsflash', 'entity_id': 190117, 'count_type': 'pv', 'key': 'kr_newssite_counter:newsflash_190117_pv', 'value': 2, 'created_at': 

In [9]:
# Use a third-party library

import ccxt

# pip install ccxt

# Print the `exchanges` attribute exposed by the ccxt package.
print(ccxt.exchanges)

['_1btcxe', 'acx', 'adara', 'allcoin', 'anxpro', 'bcex', 'bequant', 'bibox', 'bigone', 'binance', 'binanceje', 'binanceus', 'bit2c', 'bitbank', 'bitbay', 'bitfinex', 'bitfinex2', 'bitflyer', 'bitforex', 'bithumb', 'bitkk', 'bitlish', 'bitmart', 'bitmax', 'bitmex', 'bitso', 'bitstamp', 'bitstamp1', 'bittrex', 'bitz', 'bl3p', 'bleutrade', 'braziliex', 'btcalpha', 'btcbox', 'btcchina', 'btcmarkets', 'btctradeim', 'btctradeua', 'btcturk', 'buda', 'bytetrade', 'cex', 'chilebit', 'cobinhood', 'coinbase', 'coinbaseprime', 'coinbasepro', 'coincheck', 'coinegg', 'coinex', 'coinexchange', 'coinfalcon', 'coinfloor', 'coingi', 'coinmarketcap', 'coinmate', 'coinone', 'coinspot', 'cointiger', 'coolcoin', 'coss', 'crex24', 'deribit', 'digifinex', 'dsx', 'dx', 'exmo', 'exx', 'fcoin', 'fcoinjp', 'flowbtc', 'foxbit', 'ftx', 'fybse', 'gateio', 'gemini', 'hitbtc', 'hitbtc2', 'huobipro', 'huobiru', 'ice3x', 'idex', 'independentreserve', 'indodax', 'itbit', 'kkex', 'kraken', 'kucoin', 'kuna', 'lakebtc', 'la

In [13]:
# 使用pyppeteer

# 见code/

# pyppeteer_1.py    与百度首页交互
# pyppeteer_2.py    用渲染方式抓取36kr

## Splash

Splash是一个针对JavaScript的渲染服务，内置了一个浏览器和HTTP接口。它基于Python3和Twisted引擎实现，因此可以异步处理任务。

安装（只有linux和mac能安装）：
https://splash.readthedocs.io/en/stable/install.html

首先需要安装docker
docker是什么？
Docker是基于Go语言实现的开源容器项目，诞生于2013年初。
所谓容器，可以简单地理解为隔断。（桶装方便面）

docker hub：镜像服务器

安装：
docker pull scrapinghub/splash

500M

运行：
sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash

访问
http://localhost:8050

修改网址，点击“Render me!”

lua脚本语言

### splash提供的http接口

对于抓取网页，最重要的就是 : render.html

渲染html

curl 'http://localhost:8050/render.html?url=http://www.baidu.com/&timeout=30&wait=0.5'

curl 'http://10.211.10.54:8050/render.html?url=http://www.100ppi.com/news/detail-20190911-1518447.html'

参数解读：

url：必填，要请求的网址
timeout：选填，超时时间
wait：选填，页面加载完毕后，等待的时间

In [14]:
# 抓取36Kr，对比渲染和没有渲染的效果

import requests
from lxml import etree

# Ask the local Splash service to render the page and return the final HTML.
# The query parameters are passed via `params` so that the nested target URL
# is percent-encoded correctly instead of being pasted into the query string.
url = 'http://localhost:8050/render.html'
params = {
    'url': 'https://36kr.com/newsflashes',
    'timeout': 30,
    'wait': 0.5,
}
# To compare with the un-rendered page, request
# 'https://36kr.com/newsflashes' directly (without `params`) instead.

# The client-side timeout is slightly longer than the 30 s given to Splash,
# so Splash gets the chance to report its own timeout first.
response = requests.get(url, params=params, timeout=35)

print(response.text)

tree = etree.HTML(response.text)

article_titles = tree.xpath('//div[@class="newsflash-item"]/a/text()')

print(article_titles)

Unable to round-trip http request to upstream: dial tcp [::1]:8050: connect: connection refused
[]


In [None]:
# python执行一段lua脚本

# 抓取京东商品信息

import json
import requests
from lxml import etree
from urllib.parse import quote

# Lua script executed by Splash: fetch the JD search page over HTTP and
# return its body (as a string), its URL and its status code.
lua = '''
function main(splash, args)
    local treat = require("treat")
    local response = splash:http_get("https://search.jd.com/Search?keyword=相机&enc=utf-8")
        return {
            html = treat.as_string(response.body),
            url = response.url,
            status = response.status
        }    
end
'''

# For a deployed service, replace localhost with the server's public address
# (not its internal network address).
url = 'http://localhost:8050/execute?lua_source=' + quote(lua)
# Bound the wait so the cell cannot hang when Splash is unreachable.
response = requests.get(url, timeout=60)

html = json.loads(response.text)['html']

tree = etree.HTML(html)

# Single products
products_1 = tree.xpath('//div[@class="gl-i-wrap"]')

for item in products_1:
    try:
        name_1 = item.xpath('./div[@class="p-name p-name-type-2"]/a/em/text()')[0]
        price_1 = item.xpath('./div[@class="p-price"]/strong/@data-price | ./div[@class="p-price"]/strong/i/text()')[0]
    except IndexError:
        # xpath() returned an empty list: the node has no name or no price,
        # so skip it. Any other error is allowed to surface.
        continue
    print(name_1)
    print(price_1)

# Bundles
products_2 = tree.xpath('//div[@class="tab-content-item tab-cnt-i-selected"]')

for item in products_2:
    # Same best-effort handling as for single products above.
    try:
        name_2 = item.xpath('./div[@class="p-name p-name-type-2"]/a/em/text()')[0]
        price_2 = item.xpath('./div[@class="p-price"]/strong/@data-price | ./div[@class="p-price"]/strong/i/text()')[0]
    except IndexError:
        continue
    print(name_2)
    print(price_2)

## 接口分析

In [15]:
# 抓取当当网书评
# http://product.dangdang.com/25340451.html

import json
import requests
from lxml import etree


# The request headers are identical for every page, so build them once
# instead of on each loop iteration.
header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }

# Fetch comment pages 1 to 4 from the comment-list interface.
for i in range(1,5):
    # Alternative URL with fewer query parameters (left by the author):
    # 'http://product.dangdang.com/index.php?r=comment/list&productId=25340451&pageIndex=1'
    url = 'http://product.dangdang.com/index.php?r=comment/list&productId=25340451&categoryPath=01.07.07.04.00.00&mainProductId=25340451&mediumId=0&pageIndex={}'.format(i)

    response = requests.get(url,
                            headers=header,
                            timeout=5
                            )
    # Fail loudly on an HTTP error instead of trying to decode an error body.
    response.raise_for_status()

    result = json.loads(response.text)

    # The JSON payload carries the comments as an HTML fragment.
    comment_html = result['data']['list']['html']

    tree = etree.HTML(comment_html)

    comments = tree.xpath('//div[@class="items_right"]')

    for item in comments:
        times = item.xpath('./div[contains(@class,"starline")]/span[1]/text()')
        contents = item.xpath('./div[contains(@class,"describe_detail")]/span[1]//text()')
        # xpath() returns an empty list when the node is missing or has no
        # text; skip such comments rather than crashing with IndexError.
        if not times or not contents:
            continue
        comment_time = times[0]
        comment_content = contents[0]
        print(comment_time)
        print(comment_content)

2018-10-04 12:34:43
喜欢定期不定期地给自己送礼物。最近眼前的苟且太多，又不能去看远方的田野，于是送自己一本美美的绘本。非常喜欢陈丝雨的《楚辞》，比之前的《山海经》有了更丰富的色彩、更宏大的场景、更梦幻的想象。周末点个香翻翻画，心就很舒服很宁静，偶尔也描描线，假装自己在创作……之前在书展看过这套书，特别喜欢，说是只做了样品，不能卖，当当上终于出现了。感觉出版社有心了，设计非常精美，复刻古籍古色古香，印制的色彩很饱满，纸张的颜色手感都很好。值得收藏，适合送自己，也适合送朋友送客户。反正，是很喜欢啊，正在到处安利呢
2018-10-15 20:51:55
很美的手绘本，让自已的藏书柜又多了一本喜爱的精品！内容很好，很怡情安心！装帧很精美结实！总之，感谢！
2018-10-16 13:06:42
文字正，图画工笔越看越好看，美是可以与好的灵魂共渡的。
2018-09-26 10:25:55
下班回到出租屋，立刻开一瓶酒，光着脚，在微醺中翻看着这本楚辞绘本，一时间，暴涨的房租、无尽的撕逼、迫在眉睫的死线，好像都不见了。这一刻，我才是我自己，我才可以过人的生活
2018-12-11 22:35:51
画集非常的精美，画风细腻，买来送朋友的，她特别喜欢。
2018-11-22 14:41:16
原来是画集，不过感觉好多配的字不认识，需要学习学习
2018-12-11 21:32:50
对于我来说，感觉有点失望，意境可能有了，但总觉得用来收藏吧，画面不够精细，也可能是因为大家的评价太高了。
2018-09-25 14:35:21
拿到先见本特别惊喜，被线条和色彩震惊到了，跟之前的《山海经》相比，大概是丝雨大大进化了吧，尤其是大折页视觉冲击里很强。线装书，印刷质量好，纸张质感和颜色都非常好，疯狂安利丝雨大大画的楚辞，已经安利了十几本，羞涩.jpg
2018-09-26 11:16:04
原来都是给女儿买绘本，这回奢侈一下，给自己也买了一本。绘画很好看，特别有中国的古韵，但又不是其他插画那种土土的感觉，看着特别愉悦。出乎意料的是，女儿虽然看不懂什么，也很喜欢翻呢
2018-09-26 09:07:16
这本楚辞真的很精美，纸质选材很好，在更大程度上把大大的作品细腻化了。在欣赏画集的同时感受到中国古代文化的博大精深和屈原的神奇想象力，总的来说这本《楚辞·观》很值得收藏

In [17]:
# 抓取非小号的图表接口
# https://www.feixiaohao.com/currencies/raiden-network-token/

import requests
import json

# Call the chart-data interface, then show the decoded payload followed by
# its top-level keys.
url = 'https://dncapi.bqiapp.com/api/coin/web-charts?code=raiden-network-token&type=w&webp=0'

header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
}

# Same headers and 5-second timeout, gathered into one options mapping.
request_options = {'headers': header, 'timeout': 5}
response = requests.get(url, **request_options)

body = response.text
result = json.loads(body)

# First the whole payload, then just its keys.
for view in (result, result.keys()):
    print(view)

{'code': 200, 'msg': 'success', 'value': '[1572936605000,0.1599,0.0000172,8033252,1160062],[1572937173000,0.1622,0.00001747,8149579,1148984],[1572937805000,0.159,0.00001724,7987929,1140105],[1572938405000,0.1615,0.00001753,8110421,1158267],[1572939005000,0.162,0.00001757,8138311,1151043],[1572939605000,0.1622,0.00001757,8149343,1155558],[1572940205000,0.1619,0.00001755,8133336,1152696],[1572940806000,0.1589,0.00001722,7984360,1133325],[1572941405000,0.1617,0.00001753,8121857,1155028],[1572942004000,0.1588,0.00001721,7976878,1132491],[1572942605000,0.1587,0.00001717,7974287,1130274],[1572943206000,0.1615,0.00001748,8113257,1122955],[1572943774000,0.1607,0.00001738,8072852,1070222],[1572944405000,0.1614,0.00001746,8108775,968595],[1572944973000,0.1589,0.0000172,7980536,970409],[1572945605000,0.1596,0.00001722,8016021,980851],[1572946205000,0.16,0.00001724,8037980,984350],[1572946805000,0.1603,0.00001722,8054908,983574],[1572947406000,0.1628,0.00001747,8178428,992261],[1572948004000,0.162

## 模拟登录

什么是模拟登录？

要抓取的信息，只有在登录之后才能查看。这种情况下，就需要爬虫做模拟登录，绕过登录页。

cookies和session的区别：

cookie数据存放在客户的浏览器上，session数据放在服务器上；

cookie不是很安全，别人可以分析存放在本地的cookie并进行cookie欺骗，考虑到安全应当使用session；

session会增加服务器的负载；

In [None]:
# 代码见/code/github_login.py