In [1]:
# Inspect what urllib sends by default, e.g. user-agent: "Python-urllib/3.10"
from urllib.request import Request, urlopen


req = Request('http://httpbin.org/anything')
with urlopen(req) as response:
    # Grab status and headers together, then drain the body while the
    # connection is still open.
    status_code, header = response.status, response.headers
    body = response.read().decode('utf-8')
# One print call with a newline separator emits the same three lines.
print(status_code, header, body, sep='\n')

200
Date: Tue, 20 Sep 2022 15:10:53 GMT
Content-Type: application/json
Content-Length: 364
Connection: close
Server: gunicorn/19.9.0
Access-Control-Allow-Origin: *
Access-Control-Allow-Credentials: true


{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.10", 
    "X-Amzn-Trace-Id": "Root=1-6329d7fd-017a78d7453ca5cc5b85f287"
  }, 
  "json": null, 
  "method": "GET", 
  "origin": "120.229.19.121", 
  "url": "http://httpbin.org/anything"
}



In [2]:
# Send a custom User-Agent instead of urllib's default one
from urllib.request import Request, urlopen

header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
}
req = Request('http://httpbin.org/anything', headers=header)
with urlopen(req) as response:
    # From here on `header` refers to the response headers, not the
    # request dict built above.
    status_code, header = response.status, response.headers
    body = response.read().decode('utf-8')
# One print call with a newline separator emits the same three lines.
print(status_code, header, body, sep='\n')

200
Date: Tue, 20 Sep 2022 15:10:56 GMT
Content-Type: application/json
Content-Length: 447
Connection: close
Server: gunicorn/19.9.0
Access-Control-Allow-Origin: *
Access-Control-Allow-Credentials: true


{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36", 
    "X-Amzn-Trace-Id": "Root=1-6329d800-796103482b17ea7348135082"
  }, 
  "json": null, 
  "method": "GET", 
  "origin": "120.229.19.121", 
  "url": "http://httpbin.org/anything"
}



In [3]:
# HTTPS request with certificate verification
from urllib.request import Request, urlopen
import os
import ssl

header = {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"}
# Start from the platform's default trust store. Passing capath straight to
# create_default_context() makes that directory the only source of CA
# certificates, which breaks verification on machines where it is missing or
# is not an OpenSSL hashed directory.
context = ssl.create_default_context()
ca_dir = '/etc/ssl/certs/'
if os.path.isdir(ca_dir):
    # Keep trusting the original directory where it exists.
    context.load_verify_locations(capath=ca_dir)
req = Request('https://httpbin.org/anything', headers=header)
with urlopen(req, context=context) as response:
    status_code = response.status
    # `header` now holds the response headers rather than the request dict.
    header = response.headers
    body = response.read().decode('utf-8')
    print(status_code)
    print(header)
    print(body)

200
Date: Tue, 20 Sep 2022 15:11:00 GMT
Content-Type: application/json
Content-Length: 448
Connection: close
Server: gunicorn/19.9.0
Access-Control-Allow-Origin: *
Access-Control-Allow-Credentials: true


{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36", 
    "X-Amzn-Trace-Id": "Root=1-6329d804-588aeb8b0e9be18c52acc0a2"
  }, 
  "json": null, 
  "method": "GET", 
  "origin": "120.229.19.121", 
  "url": "https://httpbin.org/anything"
}



In [None]:
# HTTPS request routed through a SOCKS5 proxy
from urllib.request import Request, urlopen
import os
import ssl
import socket, socks

header = {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"}
# Start from the platform's default trust store. Passing capath straight to
# create_default_context() makes that directory the only source of CA
# certificates, which breaks verification where it is missing or unhashed.
context = ssl.create_default_context()
ca_dir = '/etc/ssl/certs/'
if os.path.isdir(ca_dir):
    # Keep trusting the original directory where it exists.
    context.load_verify_locations(capath=ca_dir)
req = Request('https://httpbin.org/anything', headers=header)
# Requires the third-party PySocks package [1]
socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 1080)
# Replacing socket.socket affects every connection made by this kernel, so
# remember the current class and put it back once the request is done.
# Otherwise every later (or re-run) cell would silently go through the proxy.
original_socket = socket.socket
socket.socket = socks.socksocket
try:
    with urlopen(req, context=context) as response:
        status_code = response.status
        # `header` now holds the response headers rather than the request dict.
        header = response.headers
        body = response.read().decode('utf-8')
        print(status_code)
        print(header)
        print(body)
finally:
    socket.socket = original_socket

In [None]:
# Download a tar archive through a SOCKS5 proxy and unpack it
from urllib.request import urlopen, Request
import os
import ssl
import socket, socks
import tarfile

url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/"
file = "housing.tgz"
chunk_size = 1024 * 64
socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 1080)
# Start from the platform's default trust store. Passing capath straight to
# create_default_context() makes that directory the only source of CA
# certificates, which breaks verification where it is missing or unhashed.
context = ssl.create_default_context()
ca_dir = "/etc/ssl/certs"
if os.path.isdir(ca_dir):
    # Keep trusting the original directory where it exists.
    context.load_verify_locations(capath=ca_dir)
# Replacing socket.socket affects every connection made by this kernel, so
# remember the current class and restore it once the download has finished.
original_socket = socket.socket
socket.socket = socks.socksocket
try:
    with urlopen(url+file, context=context) as response:
        with open(file, 'wb') as down_file:
            # Stream in fixed-size chunks so the whole archive never has to
            # sit in memory at once.
            while chunk := response.read(chunk_size):
                down_file.write(chunk)
finally:
    socket.socket = original_socket

# The archive comes from the network, so use the "data" extraction filter
# where this Python provides it; older releases have no `filter` argument,
# hence the feature check.
extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
# The context manager closes the archive even if extraction fails.
with tarfile.open(file) as tgz_file:
    tgz_file.extractall(**extract_kwargs)
print("download complete!!")

In [None]:
# Last edited: 2022-09-20

In [None]:
# References:
# [1] PySocks documentation: https://pypi.org/project/PySocks/