### Requests

它是一个Python第三方库，处理URL资源特别方便


In [None]:
import requests      # 如果这里出错，证明你还没有安装这个库

In [None]:
r = requests.get('https://www.toutiao.com/')  # Toutiao news portal home page
print("查看返回状态", r.status_code)   # 200 means success; look up other codes such as 404, 403 or 501 if unfamiliar

In [None]:
# Take a look at the response body.
# The string literal below is never executed; it is a cheat sheet of the other
# accessors: r.text is the decoded text, r.content the raw bytes (decompressed
# but not decoded), r.json() the parsed body when the page is JSON, and
# r.encoding the page encoding.
"""
print(r.text)               # 返回正常的网页内容, 即解压解码之后的内容
print(r.content)            # 返回byte类型的网页内容, 即值解压, 没有解码
print(r.json())             # 如果网页内容为json, 直接返回一个json对象
print(r.encoding)           # 返回网页的编码: "utf-8"
"""
# Bare expression: its value is only displayed when run interactively (e.g. as a notebook cell).
r.content

In [None]:
# Header fields of the response fetched above.
# Bare expression: its value is only displayed when run interactively.
r.headers

In [None]:
from urllib.parse import urlencode
# 获取一些有意思的内容
def get_page(offset, keyword='搞笑', timeout=10):
    """Fetch one page of Toutiao search results and return it as parsed JSON.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword sent as the ``keyword`` query parameter.
            Defaults to the keyword this function originally hard-coded.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        The decoded JSON body when the server answers with status 200 and a
        valid JSON payload; ``None`` for any request failure, non-200 status
        or undecodable body.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    print(url)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # RequestException is the base class of ConnectionError, Timeout, etc.,
        # so every transport-level failure is reported the same way.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The body was not valid JSON (e.g. an HTML error or anti-bot page).
        return None

In [None]:
# Request the search results with offset=1; get_page() may return None when the request fails.
contents = get_page(1)

In [None]:
# Dump the raw result so its structure can be inspected before parsing it.
print(contents)

In [None]:
# Walk the search response and collect, for every result item, its title and
# the list of its image URLs, keyed by the item's id.
# get_page() returns None on failure, so fall back to an empty mapping instead
# of crashing on `None.get`.
data = (contents or {}).get('data')
all_images = {}
if data:
    for item in data:
        # An item without an 'image_list' (missing or None) simply has no images.
        image_list = item.get('image_list') or []
        title = item.get('title')
        item_id = item.get('id')
        imgs = []
        for image in image_list:
            url = image.get('url')
            if not url:
                # Skip image entries that carry no URL at all.
                continue
            # Strip the leading '//' of scheme-less URLs (presumably the form
            # this endpoint returns); leave any other URL untouched rather
            # than chopping off its first two characters.
            imgs.append(url[2:] if url.startswith('//') else url)

        all_images[item_id] = {
            'title': title,
            'images': imgs
        }
print(all_images)

In [None]:
# 练习：保存图片   提示：os.path、字符串处理（给地址加上 http 前缀，把地址中的 list 替换成 large）、文件操作
# 建议使用Pycharm来写

In [None]:
# 补充知识

# One helper function per HTTP method; each call returns a Response object.
r0 = requests.get("https://github.com/timeline.json")
r1 = requests.post("http://httpbin.org/post")
r2 = requests.put("http://httpbin.org/put")
r3 = requests.delete("http://httpbin.org/delete")
r4 = requests.head("http://httpbin.org/get")
r5 = requests.options("http://httpbin.org/get")
# PATCH must target httpbin's dedicated /patch endpoint; /get is the GET endpoint.
r6 = requests.patch("http://httpbin.org/patch")

# Custom request headers: pass a dict through the `headers` argument.
headers = {"user-agent": "my-app/0.0.1"}
r = requests.get("https://api.github.com/some/endpoint", headers=headers)
print(r.request.headers)    # headers of the request that was sent
print(r.headers)            # headers of the response that came back

# Example of a mobile User-Agent string that could be used to imitate a phone:
# Mozilla/5.0 (Linux; Android 8.1.0; ALP-AL00 Build/HUAWEIALP-AL00; wv)
# AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/63.0.3239.83
# Mobile Safari/537.36 T7/10.13 baiduboxapp/10.13.0.11 (Baidu; P1 8.1.0)

# Illustrative example of what a response's headers can look like:
# {
#     "content-encoding": "gzip",
#     "transfer-encoding": "chunked",
#     "connection": "close",
#     "server": "nginx/1.0.4",
#     "x-runtime": "148ms",
#     "etag": "e1ca502697e5c9317743dc078f67693f",
#     "content-type": "application/json"
# }
# Header names can be looked up with any capitalization, by indexing or with .get().
print(r.headers["Content-Type"])        # "application/json"
print(r.headers.get("content-type"))    # "application/json"

# A more involved POST request: send form fields by passing a dict as `data`.
post_dict = {"key1": "value1", "key2": "value2"}
r = requests.post("http://httpbin.org/post", data=post_dict)
print(r.text)

# To send your own cookies to the server, pass a dict through the `cookies` argument.
cookies = {"cookies_are": "working"}
r = requests.get("http://httpbin.org/cookies", cookies=cookies)
print(r.text)

# Session objects: a Session lets you keep certain parameters across requests,
# and it also keeps cookies across all requests made from the same Session instance.
s = requests.Session()
s.get("http://httpbin.org/cookies/set/sessioncookie/123456789")
s.get("http://httpbin.org/cookies")
# Print every cookie currently stored on the session.
for cookie in s.cookies:
    print(cookie)

# To add cookies to a session by hand, use the cookie utility functions to manipulate Session.cookies.
requests.utils.add_dict_to_cookiejar(s.cookies, {"cookie_key": "cookie_value"})

# A session can also supply default data to the request methods; this is done
# by setting attributes on the session object.
s.auth = ("user", "pass")
s.headers.update({"x-test": "true"})
s.get("http://httpbin.org/headers", headers={"x-test2": "true"})
# both "x-test" and "x-test2" are sent