# urllib 库的使用

## urlopen函数

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
可以实现网页的请求

In [None]:
import urllib.request

# Fetch the page; the context manager closes the HTTP response (and its
# underlying connection) as soon as the block is left.
with urllib.request.urlopen('http://www.baidu.com') as response:
    # read() returns the response body as bytes; decode it before printing.
    print(response.read().decode('utf-8'))


### data参数
> data参数常用于post请求   
> data 需要为bytes类型  
> urllib.parse.urlencode() 将字典编码为查询字符串(str),再用 bytes() 转换为 bytes 类型  
> httpbin 为http请求测试网站   
> 

In [None]:
import urllib.parse
import urllib.request

# urlencode() builds the form string; urlopen() needs the request body as
# bytes, so encode it explicitly.
data = urllib.parse.urlencode({'word': 'hello'}).encode('utf-8')

# Passing data turns the request into a POST. The context manager closes the
# response once the body has been printed.
with urllib.request.urlopen('http://httpbin.org/post', data=data) as response:
    print(response.read())

### timeout 参数

> 设置超时时间  


In [None]:
import urllib.request

# timeout is given in seconds; the context manager closes the response after
# the body has been printed.
with urllib.request.urlopen('http://httpbin.org/get', timeout=1) as response:
    print(response.read())


In [None]:
import socket
import urllib.error
import urllib.request

try:
    # A deliberately tiny timeout (in seconds) so the request is expected to
    # time out. Only the timeout behaviour matters here, so the response is
    # closed immediately if the request happens to succeed.
    with urllib.request.urlopen('http://httpbin.org/get', timeout=0.1):
        pass
except urllib.error.URLError as e:
    # A timeout while connecting/sending arrives wrapped in URLError.reason.
    if not isinstance(e.reason, socket.timeout):
        raise  # not a timeout: do not hide the real failure
    print('time out')
except socket.timeout:
    # A timeout while waiting for the response may be raised directly instead
    # of being wrapped in URLError.
    print('time out')
    

## 响应

### 响应类型

In [None]:
import urllib.request

# Show which class urlopen() returns; the context manager closes the
# response afterwards.
with urllib.request.urlopen('https://www.python.org') as response:
    print(type(response))

### 响应头 状态码

In [None]:
import urllib.request

# Inspect the status line and headers; the context manager closes the
# response when the block ends.
with urllib.request.urlopen('http://httpbin.org') as response:
    print(response.status)               # HTTP status code
    print(response.getheaders())         # all response headers
    print(response.getheader('Server'))  # one response header, looked up by name

### 响应体

In [None]:
import urllib.request

# The context manager closes the response once the body has been consumed.
with urllib.request.urlopen('http://www.baidu.com') as response:
    # read() returns the response body as bytes; decode it before printing.
    print(response.read().decode('utf-8'))


## request
> 可以补充更多请求信息例如headers ,而urlopen()内没有这种参数选项   
```class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)```  


### 基本使用

In [None]:
import urllib.request

# Build an explicit Request object; urlopen() accepts it in place of a URL.
# It is named `req` so that it does not shadow the `request` module name
# that other cells import from urllib.
req = urllib.request.Request('http://www.baidu.com')

# The context manager closes the response once the body has been printed.
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

### 请求中加入 headers

In [None]:
from urllib import parse, request

url = 'http://httpbin.org/post'

# Request headers sent along with the POST (values carry no leading spaces).
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Host': 'httpbin.org',
}

# Form fields; deliberately not named `dict`, which would replace the builtin
# for every cell executed afterwards.
form = {
    'name': 'Gar',
}

# The POST body must be bytes. Only `parse` and `request` are imported in
# this cell, so they are used directly instead of through `urllib.`.
data = parse.urlencode(form).encode('utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')

# The context manager closes the response once the body has been printed.
with request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

### 另外的加入header的方法
> request.add_header(key,value)  
> 可以用循环多次调用这个函数加入请求头

In [None]:
from urllib import parse, request

url = 'http://httpbin.org/post'

# Form fields; deliberately not named `dict`, which would replace the builtin
# for every cell executed afterwards.
form = {
    'name': 'Gar',
}

# The POST body must be bytes. Only `parse` and `request` are imported in
# this cell, so they are used directly instead of through `urllib.`.
data = parse.urlencode(form).encode('utf8')
req = request.Request(url=url, data=data, method='POST')

# Headers can also be attached one at a time after the Request is built.
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64)')

# The context manager closes the response once the body has been printed.
with request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

## Handler

### 代理
    urllib.request.ProxyHandler(proxies = None )  
    用于ip伪装


##### http代理

In [None]:
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener

# Placeholder credentials and address; replace them with a real proxy.
proxy = 'username:password@127.0.0.1:9743'

# Map each URL scheme to the proxy URL that should serve it.
proxy_handler = ProxyHandler({
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
})
opener = build_opener(proxy_handler)

try:
    # The context manager closes the response once the body has been printed.
    with opener.open('http://httpbin.org/get') as response:
        print(response.read().decode('utf-8'))
except URLError as e:
    # Report why the request failed (for example, the proxy is unreachable).
    print(e.reason)

##### socks5代理

In [None]:
# Send a urllib request through a local SOCKS5 proxy.
import socks # requires the third-party PySocks package
import socket
from urllib import request

# Register a SOCKS5 proxy at 127.0.0.1:1080 as the PySocks default.
socks.set_default_proxy(socks.SOCKS5,'127.0.0.1',1080)
# Replace the socket class process-wide; this must happen before the request
# below. Presumably every socket created afterwards goes through the proxy --
# confirm against the PySocks documentation.
socket.socket = socks.socksocket

response  = request.urlopen('http://httpbin.org/get')
print(response.read())

### Cookie

``` class urllib.request.HTTPCookieProcessor(cookiejar=None) ```

In [None]:
import http.cookiejar
import urllib.request

# CookieJar keeps the cookies in memory; HTTPCookieProcessor fills it from the
# responses of every request made through the opener.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

# The body is not needed: the request is made only so that the handler
# collects the cookies. The context manager closes the response right away.
with opener.open('http://www.baidu.com'):
    pass

for item in cookie:
    # Printing with sep='=' also works when a cookie has no value (None),
    # where string concatenation would raise TypeError.
    print(item.name, item.value, sep='=')

#### cookie 保存 两种方式

```class http.cookiejar.MozillaCookieJar(filename, delayload=None, policy=None)```

In [None]:
import http.cookiejar
import urllib.request

filename = 'cookie.txt'  # written to the current working directory
# MozillaCookieJar stores cookies in the Mozilla cookies.txt file format.
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

# The body is not needed: the request is made only so that the handler
# collects the cookies. The context manager closes the response right away.
with opener.open('http://www.baidu.com'):
    pass

# FileCookieJar.save(filename=None, ignore_discard=False, ignore_expires=False)
# Both flags are set so that session cookies and expired cookies are saved too.
cookie.save(filename=filename, ignore_discard=True, ignore_expires=True)


```class http.cookiejar.LWPCookieJar(filename, delayload=None, policy=None)```

In [None]:
import http.cookiejar
import urllib.request

filename = 'cookie.txt'  # written to the current working directory
# LWPCookieJar stores cookies in the libwww-perl Set-Cookie3 file format.
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

# The body is not needed: the request is made only so that the handler
# collects the cookies. The context manager closes the response right away.
with opener.open('http://www.baidu.com'):
    pass

# FileCookieJar.save(filename=None, ignore_discard=False, ignore_expires=False)
# Both flags are set so that session cookies and expired cookies are saved too.
cookie.save(filename=filename, ignore_discard=True, ignore_expires=True)


#### cookie文件导入
```FileCookieJar.load(filename=None, ignore_discard=False, ignore_expires=False)```

In [None]:
import http.cookiejar
import urllib.request

# Load cookies from cookie.txt; the file must have been saved in the LWP
# format for LWPCookieJar to read it.
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

# Requests made through this opener go through the cookie handler loaded
# above. The context manager closes the response once the body is printed.
with opener.open('http://www.baidu.com') as response:
    print(response.read().decode('utf-8'))


## 异常处理

### URLError
> 来自error模块 继承自OSError类,error异常模块的基类,request产生的异常可以通过捕获这个类来处理

In [76]:
# Minimal example: request a page that does not exist and catch URLError so
# that the program does not terminate with a traceback.
from urllib import error, request

try:
    response = request.urlopen('http://123.com/index.html')
except error.URLError:
    # The exception object itself is not inspected in this example.
    print('NOT FOUND')

NOT FOUND


### HTTPError
> URLError 的子类 专门处理HTTP的请求错误 例如认证错误  
> 有三个属性  
>code:返回HTTP 状态码 404...  
>reason:返回错误原因  
>headers:返回响应头  


In [5]:
from urllib import error, request

try:
    response = request.urlopen('http://123.com/index.html')
except error.HTTPError as http_err:
    # Print the reason, the status code and the response headers, one per line.
    for detail in (http_err.reason, http_err.code, http_err.headers):
        print(detail)

Not Found
404
Content-Length: 1308
Content-Type: text/html
Server: Microsoft-IIS/6.0
X-Powered-By: ASP.NET
Date: Sat, 26 Jan 2019 09:19:21 GMT
Connection: close




In [8]:
# URLError is the parent class of HTTPError, so the subclass has to be caught
# first and the parent class afterwards.

from urllib import error, request

try:
    response = request.urlopen('http://123.com/index.html')
except error.HTTPError as http_err:
    # HTTP-level failure: status code first, then the reason.
    print(http_err.code)
    print(http_err.reason)
except error.URLError as url_err:
    # Any other URL failure that is not an HTTPError.
    print(url_err.reason)
else:
    # Runs only when no exception was raised.
    print('ok')

404
Not Found


In [12]:
# The reason attribute is not necessarily a string; it can be an exception
# object, as this example shows.

import socket
from urllib import error, request

try:
    response = request.urlopen('https://www.baidu.com', timeout=0.01)
except error.URLError as url_err:
    cause = url_err.reason
    print(type(cause))
    # A timeout is reported as a socket.timeout instance stored in reason.
    if isinstance(cause, socket.timeout):
        print('time out')

<class 'socket.timeout'>
time out


## 解析链接  
> urllib parse模块   
> 实现URL的抽取,合并,链接转换


### urlparse  
```urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)```
> 实现URL的识别和分段  
> 按照链接的一个格式 scheme://netloc/path;params?query#fragment 来进行拆分  
> 大致为六部分  
>scheme:协议  
>netloc:域名  
>path:访问路径  
>params:参数  
>query: 查询条件  
>fragment:锚点  

* urlstring 必填 解析的URL  
* scheme 默认的协议 在URL 不包含scheme信息的时候生效,若URL包含scheme信息则返回该信息  
* allow_fragments 是否忽略fragment,为false时忽略,会被解析为path,parameters,query的一部分,fragment部分为空


In [15]:
from urllib.parse import urlparse

# urlparse() returns a ParseResult holding the six URL components:
# scheme, netloc, path, params, query and fragment.
sample_url = 'http://www.baidu.com/index.html;user?id=5#comment'
result = urlparse(sample_url)
print(type(result), result)

<class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')


#### allow_fragments=False 例子

In [16]:
# Example 1: with allow_fragments=False the "#comment" part is kept as part
# of the query and the fragment stays empty.
from urllib.parse import urlparse

sample_url = 'http://www.baidu.com/index.html;user?id=5#comment'
result = urlparse(sample_url, allow_fragments=False)
print(result)

ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')


In [17]:
# Example 2: the URL has no query, so with allow_fragments=False the
# "#comment" part is kept as part of the path.
from urllib.parse import urlparse

sample_url = 'http://www.baidu.com/index.html#comment'
result = urlparse(sample_url, allow_fragments=False)
print(result)

ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')


In [20]:
# Example 3: a ParseResult is a tuple, so each component can be read either
# by attribute name or by positional index.
from urllib.parse import urlparse

sample_url = 'http://www.baidu.com/index.html#comment'
result = urlparse(sample_url, allow_fragments=False)
print(result.scheme)  # access by attribute name
print(result[0])      # access by index

http
http


### urlunparse

In [21]:
from urllib.parse import urlunparse

# urlunparse() takes exactly six components, in this order.
data = [
    'http',           # scheme
    'www.baidu.com',  # netloc
    'index.html',     # path
    'user',           # params
    'a=6',            # query
    'comment',        # fragment
]
rebuilt = urlunparse(data)
print(rebuilt)

http://www.baidu.com/index.html;user?a=6#comment


### urlsplit 
> 与urlparse类似 不再单独解析params 这一部分,params合并到path中  


In [22]:
from urllib.parse import urlsplit

# urlsplit() returns a SplitResult, a tuple of five components
# (there is no separate params field).
sample_url = 'http://www.baidu.com/index.html#comment'
result = urlsplit(sample_url)
print(result)

SplitResult(scheme='http', netloc='www.baidu.com', path='/index.html', query='', fragment='comment')


### urlunsplit


In [24]:
from urllib.parse import urlunsplit

# urlunsplit() takes exactly five components, in this order.
data = [
    'http',           # scheme
    'www.baidu.com',  # netloc
    'index.html',     # path
    'a=6',            # query
    'comment',        # fragment
]
rebuilt = urlunsplit(data)
print(rebuilt)

http://www.baidu.com/index.html?a=6#comment


### urljoin
```urllib.parse.urljoin(base, url, allow_fragments=True)```
* base 提供base_url 该方法会分析base_url 的scheme,netloc,path的内容,对url部分进行补充 
* url 新链接 base用于补充url部分中上述提到三部分缺失部分

In [34]:
from urllib.parse import urljoin

# (base, url) pairs: the base only supplies the parts that the second URL
# is missing.
cases = [
    ('http://www.baidu.com', 'FAQ.html'),
    ('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'),
    ('http://www.baidu.com/FAQ.html', 'https://cuiqingcai.com/FAQ.html'),
    ('http://www.baidu.com/FAQ.html', 'https://cuiqingcai.com/FAQ.html?question=2'),
    ('http://www.baidu.com/FAQ.html?wd=abc', 'https://cuiqingcai.com/FAQ.html'),
    ('www.baidu.com/', 'index.html'),
]
for base, url in cases:
    print(urljoin(base, url))

http://www.baidu.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html?question=2
https://cuiqingcai.com/FAQ.html
www.baidu.com/index.html


### urlencode 
> 常用于构建参数  序列化


In [35]:
from urllib.parse import urlencode

# Query parameters to serialize.
params = dict(name='germey', age=22)
base_url = 'https://www.baidu.com?'

# urlencode() turns the mapping into a "key=value&key=value" string.
query_string = urlencode(params)
url = base_url + query_string
print(url)

https://www.baidu.com?name=germey&age=22


### parse_qs()&&parse_qsl() 反序列化

In [36]:
# parse_qs() deserializes a query string into a dict that maps each name to
# a list of values.
from urllib.parse import parse_qs

query = 'name=germey&age=22'
parsed = parse_qs(query)
print(parsed)

{'name': ['germey'], 'age': ['22']}


In [38]:
# parse_qsl() deserializes a query string into a list of (name, value) tuples.

from urllib.parse import parse_qsl

query = 'name=germey&age=22'
pairs = parse_qsl(query)
print(pairs)

[('name', 'germey'), ('age', '22')]


### quote&&unquote
* quote 可以将中文参数转化为url编码
* unquote url 解码

In [42]:
from urllib.parse import quote, unquote

# quote() percent-encodes the non-ASCII keyword so it can be put in a URL.
keyword = '中文'
url = 'https://www.baidu.com/?wd=' + quote(keyword)
print(url)

# unquote() reverses the percent-encoding.
url = 'https://www.baidu.com/?wd=%E4%B8%AD%E6%96%87'
decoded = unquote(url)
print(decoded)

https://www.baidu.com/?wd=%E4%B8%AD%E6%96%87
https://www.baidu.com/?wd=中文
