In [1]:
import json
import logging
import re
import socket

import requests

In [2]:
def bilibili_headers(referer=None, cookie=None):
    """Build browser-like HTTP request headers for Bilibili.

    The User-Agent imitates a desktop browser; per the original note, the
    site rejects requests that do not look like they come from one.

    Args:
        referer: Optional value for the ``Referer`` header.
        cookie: Optional value for the ``Cookie`` header.

    Returns:
        A new dict holding ``User-Agent`` plus whichever optional headers
        were supplied (``None`` values are left out).
    """
    browser_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    result = {'User-Agent': browser_ua}
    optional_fields = (('Referer', referer), ('Cookie', cookie))
    for field_name, field_value in optional_fields:
        if field_value is not None:
            result[field_name] = field_value
    return result

In [3]:
def get_content(url, headers=None, decoded=True):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        headers: Optional mapping of HTTP request headers. ``None`` (the
            default) sends no extra headers; a fresh default is used on every
            call instead of a shared mutable ``{}``.
        decoded: Accepted for backward compatibility but not used; the body
            is always returned decoded, via ``response.text``.

    Returns:
        The response body as a ``str``.

    Raises:
        Whatever ``requests`` raises for a failed request; nothing is caught.
    """
    logging.debug('get_content: %s' % url)
    # The context manager closes the session once the body has been read.
    with requests.Session() as session:
        # NOTE(review): verify=False turns off TLS certificate verification.
        # Kept to preserve existing behaviour; confirm it is still needed.
        response = session.get(url, headers=headers, verify=False)
        return response.text

In [4]:
def match1(text, *patterns):
    """Return the first capture group of each regex pattern found in ``text``.

    Args:
        text: The string to search.
        patterns: Any number of regex patterns; group 1 of each is used.

    Returns:
        With exactly one pattern: group 1 of its first match, or None when
        the pattern does not match.
        Otherwise: a list with group 1 of every pattern that matched, in
        pattern order (patterns without a match are skipped, so the list may
        be empty).
    """
    if len(patterns) == 1:
        found = re.search(patterns[0], text)
        return found.group(1) if found else None

    hits = (re.search(regex, text) for regex in patterns)
    return [hit.group(1) for hit in hits if hit]

In [5]:
# Known stream formats, listed from the highest 'quality' code to the lowest.
# Every full row is (id, quality, audio_quality, container, video_resolution, desc).
_stream_fields = ('id', 'quality', 'audio_quality', 'container', 'video_resolution', 'desc')
_stream_rows = (
    ('flv_p60', 116, 30280, 'FLV', '1080p', '高清 1080P60'),
    # 'id': 'hdflv2', 'quality': 112?
    ('flv', 80, 30280, 'FLV', '1080p', '高清 1080P'),
    ('flv720_p60', 74, 30280, 'FLV', '720p', '高清 720P60'),
    ('flv720', 64, 30280, 'FLV', '720p', '高清 720P'),
    ('hdmp4', 48, 30280, 'MP4', '720p', '高清 720P (MP4)'),
    ('flv480', 32, 30280, 'FLV', '480p', '清晰 480P'),
    ('flv360', 16, 30216, 'FLV', '360p', '流畅 360P'),
    # 'quality': 15?
)
stream_types = [dict(zip(_stream_fields, row)) for row in _stream_rows]
# The last entry, 'mp4', carries only an id and a quality code.
stream_types.append({'id': 'mp4', 'quality': 0})

In [6]:
# Index the stream descriptors by their numeric 'quality' code for direct lookup.
stream_qualities = dict((entry['quality'], entry) for entry in stream_types)
stream_qualities

{0: {'id': 'mp4', 'quality': 0},
 16: {'audio_quality': 30216,
  'container': 'FLV',
  'desc': '流畅 360P',
  'id': 'flv360',
  'quality': 16,
  'video_resolution': '360p'},
 32: {'audio_quality': 30280,
  'container': 'FLV',
  'desc': '清晰 480P',
  'id': 'flv480',
  'quality': 32,
  'video_resolution': '480p'},
 48: {'audio_quality': 30280,
  'container': 'MP4',
  'desc': '高清 720P (MP4)',
  'id': 'hdmp4',
  'quality': 48,
  'video_resolution': '720p'},
 64: {'audio_quality': 30280,
  'container': 'FLV',
  'desc': '高清 720P',
  'id': 'flv720',
  'quality': 64,
  'video_resolution': '720p'},
 74: {'audio_quality': 30280,
  'container': 'FLV',
  'desc': '高清 720P60',
  'id': 'flv720_p60',
  'quality': 74,
  'video_resolution': '720p'},
 80: {'audio_quality': 30280,
  'container': 'FLV',
  'desc': '高清 1080P',
  'id': 'flv',
  'quality': 80,
  'video_resolution': '1080p'},
 116: {'audio_quality': 30280,
  'container': 'FLV',
  'desc': '高清 1080P60',
  'id': 'flv_p60',
  'quality': 116,
  'video_

In [7]:
# Sample video pages used during development; the second one is fetched below.
#https://www.bilibili.com/video/av29306544
# https://www.bilibili.com/video/av30080993
url = "https://www.bilibili.com/video/av30080993"
# Download the page HTML with browser-like headers (no Referer or Cookie).
html_content = get_content(url, headers=bilibili_headers())
html_content



'<!DOCTYPE html><html><head itemprop="video" itemscope itemtype="http://schema.org/VideoObject"><title data-vue-meta="true">尚硅谷Java视频教程_IDEA视频教程_哔哩哔哩 (゜-゜)つロ 干杯~-bilibili</title> <meta data-vue-meta="true" http-equiv="Content-Type" content="text/html" charset="utf-8"><meta data-vue-meta="true" name="renderer" content="webkit"><meta data-vue-meta="true" http-equiv="X-UA-Compatible" content="IE=edge"><meta data-vue-meta="true" name="spm_prefix" content="333.788"><meta data-vue-meta="true" itemprop="keywords" name="keywords" content="尚硅谷Java视频教程_IDEA视频教程,IDEA,Java,尚硅谷,科技,演讲·公开课,哔哩哔哩,Bilibili,B站,弹幕"><meta data-vue-meta="true" itemprop="description" name="description" content="http://www.atguigu.com/\r\rIntelliJ IDEA在业界被公认为最好的Java开发工具之一，因其功能强悍、设置人性化，而深受Java、大数据、移动端程序员的喜爱。本着&amp;quot;工欲善其事必先利其器&amp;quot;的精神，本套视频从IDEA的介绍、安装、设置入手，讲解IDEA中多种项目的创建、模板的使用、断点调试、数据库的关联、插件的下载、Maven及版本控制工具的配置等内容"><meta data-vue-meta="true" itemprop="author" name="author" content="尚硅谷官方"><meta data-vue-meta="true" itemp

In [8]:
# The page assigns a JSON object to __INITIAL_STATE__; capture its text up to
# the ";(function()" that follows it.
initial_state_pattern = r'__INITIAL_STATE__=(.*?);\(function\(\)'
initial_state_text = match1(html_content, initial_state_pattern)
initial_state_text

'{"aid":"30080993","p":"","videoData":{"aid":30080993,"videos":19,"tid":39,"tname":"演讲• 公开课","copyright":2,"pic":"http:\\u002F\\u002Fi2.hdslb.com\\u002Fbfs\\u002Farchive\\u002Fb23f644eb8de853e72d43154366f40bea83e1140.png","title":"尚硅谷Java视频教程_IDEA视频教程","pubdate":1534992510,"ctime":1534992507,"desc":"http:\\u002F\\u002Fwww.atguigu.com\\u002F\\r\\n\\r\\nIntelliJ IDEA在业界被公认为最好的Java开发工具之一，因其功能强悍、设置人性化，而深受Java、大数据、移动端程序员的喜爱。本着&quot;工欲善其事必先利其器&quot;的精神，本套视频从IDEA的介绍、安装、设置入手，讲解IDEA中多种项目的创建、模板的使用、断点调试、数据库的关联、插件的下载、Maven及版本控制工具的配置等内容。","state":0,"attribute":16384,"duration":14155,"rights":{"bp":0,"elec":0,"download":1,"movie":0,"pay":0,"hd5":0,"no_reprint":0,"autoplay":1,"ugc_pay":0,"is_cooperation":0,"ugc_pay_preview":0},"owner":{"mid":302417610,"name":"尚硅谷官方","face":"http:\\u002F\\u002Fi0.hdslb.com\\u002Fbfs\\u002Fface\\u002F07e341afff5f08f4afbfb37427341e261d4f93e6.jpg"},"stat":{"aid":30080993,"view":"--","danmaku":"--","reply":43,"favorite":"--","coin":"--","share":"--","now_rank":0,"his_rank

In [9]:
# Parse the captured JSON text into Python objects.
# NOTE(review): json.loads raises TypeError if the pattern above did not match
# (initial_state_text would be None) — there is no guard here.
initial_state = json.loads(initial_state_text)
initial_state

{'aid': '30080993',
 'cidMap': {},
 'comment': {'count': 43,
  'list': ['资料链接：链接：https://pan.baidu.com/s/11biVBv9EI9yfL6Cee0r0LQ \n密码：n7hn',
   '本视频加配置文件网盘链接:https://pan.baidu.com/s/1nXxl95jrjMw9wlDK6H7Y1Q\xa0提取码:n4sh',
   '请问有用IDEA开发web项目，和maven项目的演示视频吗？']},
 'error': {},
 'insertScripts': ['//s1.hdslb.com/bfs/static/jinkela/video/stardust-video.6d881c6b7684b56dbe818cc4209d6ec79d914194.js'],
 'isClient': False,
 'p': '',
 'player': '',
 'playurl': {},
 'related': [{'aid': 34324988,
   'cid': 60669449,
   'duration': 5257,
   'owner': {'mid': 193580090, 'name': '御风大世界'},
   'pic': 'http://i0.hdslb.com/bfs/archive/a59aa37d859dceec59fde1f22ee47802719ef6fb.jpg',
   'stat': {'danmaku': 70, 'view': 8998},
   'title': 'IDEA教程-干货大合集'},
  {'aid': 43448506,
   'cid': 76148692,
   'duration': 2899,
   'owner': {'mid': 99817089, 'name': 'firecoooool'},
   'pic': 'http://i0.hdslb.com/bfs/archive/95c0d4e2c4407cff152029fa353ab46882e957ee.jpg',
   'stat': {'danmaku': 6, 'view': 1703},
   'title': 'ja

In [10]:
# Capture the JSON assigned to __playinfo__ (up to the closing script tag) and
# parse it; playinfo stays None when the page contains no such assignment.
playinfo_text = match1(html_content, r'__playinfo__=(.*?)</script><script>')
playinfo = json.loads(playinfo_text) if playinfo_text else None
playinfo

{'code': 0,
 'data': {'accept_description': ['高清 1080P', '高清 720P', '清晰 480P', '流畅 360P'],
  'accept_format': 'flv,flv720,flv480,flv360',
  'accept_quality': [80, 64, 32, 16],
  'durl': [{'ahead': 'E4g=',
    'backup_url': ['http://upos-hz-mirrorks3u.acgvideo.com/upgcxcode/25/08/52430825/52430825-1-32.flv?e=ig8euxZM2rNcNbug7WdVtWug7WdVNEVEuCIv29hEn0lqXg8Y2ENvNCImNEVEUJ1miI7MT96fqj3E9r1qNCNEto8g2ENvN03eN0B5tZlqNxTEto8BTrNvNeZVuJ10Kj_g2UB02J0mN0B5tZlqNCNEto8BTrNvNC7MTX502C8f2jmMQJ6mqF2fka1mqx6gqj0eN0B599M=&deadline=1559271845&gen=playurl&nbs=1&oi=3698564895&os=ks3u&platform=pc&trid=57d719aaeb7c4a5dacbe6f9375f8cfa9&uipk=5&upsig=29d31edcebb405371717dacff5e4790e&uparams=e,deadline,gen,nbs,oi,os,platform,trid,uipk&mid=0',
     'http://upos-hz-mirrorcosu.acgvideo.com/upgcxcode/25/08/52430825/52430825-1-32.flv?e=ig8euxZM2rNcNbug7WdVtWug7WdVNEVEuCIv29hEn0lqXg8Y2ENvNCImNEVEUJ1miI7MT96fqj3E9r1qNCNEto8g2ENvN03eN0B5tZlqNxTEto8BTrNvNeZVuJ10Kj_g2UB02J0mN0B5tZlqNCNEto8BTrNvNC7MTX502C8f2jmMQJ6mqF2fka1m

In [11]:
# 'videos' field of the video metadata — presumably the number of parts; confirm.
pn = initial_state['videoData']['videos']
pn

19

In [12]:
# Title of the video as given in the page's embedded state.
title = initial_state['videoData']['title']
title

'尚硅谷Java视频教程_IDEA视频教程'

In [13]:
# Quality code of the stream described by playinfo; used below as a key into
# stream_qualities.
currentQuality = playinfo['data']['quality']
currentQuality

32

In [14]:
# Quality codes the playinfo payload lists as available for this video.
playinfo['data']['accept_quality']

[80, 64, 32, 16]

In [15]:
# Only qualities that have a full entry in stream_types are considered for download.
supported_codes = (116, 80, 74, 64, 48, 32, 16)
qualityIDs = [code for code in playinfo['data']['accept_quality'] if code in supported_codes]
qualityIDs

[80, 64, 32, 16]

In [16]:
# Human-readable description of every accepted quality code.
acceptQualitys = list(map(lambda quality_id: stream_qualities[quality_id]['desc'], qualityIDs))
acceptQualitys

['高清 1080P', '高清 720P', '清晰 480P', '流畅 360P']

In [17]:
# Part number: read from a "p=" query parameter or an "/index_N" path segment
# of the URL, falling back to part 1.
page_text = match1(url, r'[\?&]p=(\d+)') or match1(url, r'/index_(\d+)') or '1'
p = int(page_text)
cid = initial_state['videoData']['pages'][p - 1]['cid']
# Request the danmaku (comment overlay) XML for this cid and decode it.
danmaku_url = 'http://comment.bilibili.com/%s.xml' % cid
response = requests.get(danmaku_url, headers=bilibili_headers())
danmaku = response.content.decode()
danmaku

'<?xml version="1.0" encoding="UTF-8"?><i><chatserver>chat.bilibili.com</chatserver><chatid>52430825</chatid><mission>0</mission><maxlimit>3000</maxlimit><state>0</state><real_name>0</real_name><source>k-v</source><d p="24.09300,1,25,16777215,1535006675,0,5185c473,4123263411159040">老乡？？？</d><d p="57.56500,1,25,16777215,1535023406,0,4b1926f5,4132035102244932">抓娃应用程序</d><d p="621.68300,1,25,16777215,1535117387,0,65e33995,4181308134653952">Java内裤</d><d p="632.18300,1,25,16777215,1535296188,0,8234dd96,4275051176656900">python 为何不适用pycharm</d><d p="12.14000,1,25,16777215,1539424173,0,a8bf1a49,6439304422752256">中式发音？</d><d p="19.15700,1,25,16777215,1539869129,0,83b42cdb,6672589716979712">从远古eclipse来</d><d p="117.83600,1,25,16777215,1539869553,0,83b42cdb,6672811972100096">╮(╯▽╰)╭</d><d p="317.64900,1,25,16777215,1539869753,0,83b42cdb,6672916719075328">莫名</d><d p="675.59100,1,25,16777215,1539870111,0,83b42cdb,6673104381149184">便宜</d><d p="60.08200,1,25,16777215,1540117154,0,8e3f51ce,6802625978

In [18]:
# Look up the descriptor of the current quality once, then report its fields.
current_stream = stream_qualities[currentQuality]
format_id = current_stream['id']
container = current_stream['container'].lower()
print(container)
desc = current_stream['desc']
print(desc)
print(format_id)

flv
清晰 480P
flv480


In [19]:
# Collect the segment URLs and the combined size of the current stream,
# keyed by its format id. Nothing is recorded when playinfo has no 'durl'.
streams = {}
if 'durl' in playinfo['data']:
    segments = playinfo['data']['durl']
    src = [segment['url'] for segment in segments]
    size = sum(segment['size'] for segment in segments)
    streams[format_id] = {
        'container': container,
        'quality': desc,
        'size': size,
        'src': src,
    }
streams

{'flv480': {'container': 'flv',
  'quality': '清晰 480P',
  'size': 31884839,
  'src': ['http://cn-gdfs2-cc-acache-01.acgvideo.com/upgcxcode/25/08/52430825/52430825-1-32.flv?e=ig8euxZM2rNcNbug7WdVtWug7WdVNEVEuCIv29hEn0lqXg8Y2ENvNCImNEVEUJ1miI7MT96fqj3E9r1qNCNEto8g2ENvN03eN0B5tZlqNxTEto8BTrNvNeZVuJ10Kj_g2UB02J0mN0B5tZlqNCNEto8BTrNvNC7MTX502C8f2jmMQJ6mqF2fka1mqx6gqj0eN0B599M=&deadline=1559271845&gen=playurl&nbs=1&oi=3698564895&os=acache&platform=pc&trid=57d719aaeb7c4a5dacbe6f9375f8cfa9&uipk=5&upsig=6ab18847ff868f6fd959ac193aa9a4fa&uparams=e,deadline,gen,nbs,oi,os,platform,trid,uipk&mid=0',
   'http://cn-gdfs2-cc-acache-04.acgvideo.com/upgcxcode/25/08/52430825/52430825-2-32.flv?e=ig8euxZM2rNcNb4VhwdVtW4VhwdVNEVEuCIv29hEn0lqXg8Y2ENvNCImNEVEUJ1miI7MT96fqj3E9r1qNCNEto8g2ENvN03eN0B5tZlqNxTEto8BTrNvNeZVuJ10Kj_g2UB02J0mN0B5tZlqNCNEto8BTrNvNC7MTX502C8f2jmMQJ6mqF2fka1mqx6gqj0eN0B599M=&deadline=1559271845&gen=playurl&nbs=1&oi=3698564895&os=acache&platform=pc&trid=57d719aaeb7c4a5dacbe6f9375f8cfa9&uip

In [36]:
# Per-part metadata for the video; each entry's 'cid' is what the danmaku
# request above is keyed on.
initial_state['videoData']['pages']

[{'cid': 52430825,
  'dimension': {'height': 768, 'rotate': 0, 'width': 1366},
  'duration': 1051,
  'from': 'vupload',
  'page': 1,
  'part': '1.尚硅谷-IDEA-IntelliJ IDEA的介绍和优势',
  'vid': '',
  'weblink': ''},
 {'cid': 52430409,
  'dimension': {'height': 768, 'rotate': 0, 'width': 1366},
  'duration': 576,
  'from': 'vupload',
  'page': 2,
  'part': '2.尚硅谷-IDEA-版本介绍与安装前的准备',
  'vid': '',
  'weblink': ''},
 {'cid': 52432611,
  'dimension': {'height': 768, 'rotate': 0, 'width': 1366},
  'duration': 314,
  'from': 'vupload',
  'page': 3,
  'part': '3.尚硅谷-IDEA-IDEA的卸载',
  'vid': '',
  'weblink': ''},
 {'cid': 52432650,
  'dimension': {'height': 768, 'rotate': 0, 'width': 1366},
  'duration': 348,
  'from': 'vupload',
  'page': 4,
  'part': '4.尚硅谷-IDEA-IDEA的安装',
  'vid': '',
  'weblink': ''},
 {'cid': 52432620,
  'dimension': {'height': 768, 'rotate': 0, 'width': 1366},
  'duration': 445,
  'from': 'vupload',
  'page': 5,
  'part': '5.尚硅谷-IDEA-安装目录和设置目录结构的说明',
  'vid': '',
  'weblink': ''},
 

In [20]:
from urllib import request,parse,error
def urlopen_with_retry(*args, **kwargs):
    """Call ``request.urlopen`` up to three times before giving up.

    All positional and keyword arguments are forwarded unchanged to
    ``request.urlopen``. A ``socket.timeout`` or ``error.HTTPError`` on an
    attempt is logged at debug level and the call is retried; any other
    exception propagates immediately.

    Returns:
        The object returned by the first successful ``request.urlopen`` call.

    Raises:
        socket.timeout, error.HTTPError: whichever of the two the final
            attempt raised, re-raised with its original traceback.
    """
    retry_time = 3
    for attempt in range(1, retry_time + 1):
        try:
            return request.urlopen(*args, **kwargs)
        except socket.timeout:
            logging.debug('request attempt %s timeout' % str(attempt))
            if attempt == retry_time:
                raise
        # HTTP errors are retried as well (original note: "try to tackle
        # youku CDN fails").
        except error.HTTPError as http_error:
            logging.debug('HTTP Error with code{}'.format(http_error.code))
            if attempt == retry_time:
                raise


In [None]:
def url_size(url, faker=False, headers=None):
    """Return the size in bytes that the server reports for ``url``.

    The resource is opened with ``urlopen_with_retry`` and only the response
    headers are inspected; the connection is closed without reading the body.

    Args:
        url: Address of the resource.
        faker: When true, start from ``bilibili_headers()`` so the request
            looks like it comes from a browser.
        headers: Optional request headers; applied on top of the browser-like
            ones when ``faker`` is set. The mapping is not modified.

    Returns:
        The ``Content-Length`` value as an ``int``, or ``float('inf')`` when
        the response carries no such header.
    """
    request_headers = bilibili_headers() if faker else {}
    request_headers.update(headers or {})
    target = request.Request(url, headers=request_headers) if request_headers else url
    response = urlopen_with_retry(target)
    try:
        content_length = response.headers['content-length']
    finally:
        response.close()
    return int(content_length) if content_length is not None else float('inf')


def urls_size(urls, faker=False, headers=None):
    """Return the combined ``url_size`` of every address in ``urls``.

    The total is ``float('inf')`` as soon as one size is unknown.
    """
    return sum(url_size(url, faker=faker, headers=headers) for url in urls)

In [21]:
# Probe the first segment of the current stream and show the response headers.
headers = bilibili_headers(referer = "https://www.bilibili.com/video/av30080993")
# Take the address from `streams` instead of pasting it: the signed URL carries
# a `deadline` parameter, so a pasted copy presumably stops working later.
# Note that this rebinds `url` from the page address to the segment address.
url = streams[format_id]['src'][0]
print(headers)
req = request.urlopen(request.Request(url,headers = headers))
print(req.headers)

{'Referer': 'https://www.bilibili.com/video/av30080993', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}
Server: openresty
Date: Fri, 31 May 2019 01:04:51 GMT
Content-Type: video/x-flv
Content-Length: 9566182
Connection: close
alt-svc: quic=":443"; ma=2592000; v="43,42,41,39,38,37,35"
X-Upsig-Version: 190111
Last-Modified: Thu, 23 Aug 2018 10:44:23 GMT
X-Acache-Cache: cn-gdfs2-cc-acache-05
X-Request-ID: 7ffabfb61ca679001753d37c125713df
X-Acache-Gate: cn-gdfs2-cc-acache-01
Accept-Ranges: bytes




In [22]:
# Ask for the resource as a byte range that starts at offset 0.
headers['Range'] = 'bytes=%d-' % 0
headers

{'Range': 'bytes=0-',
 'Referer': 'https://www.bilibili.com/video/av30080993',
 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}

In [23]:
# Repeat the request with the current `headers` (now including Range) and
# show what the server answers.
ranged_request = request.Request(url, headers=headers)
req = request.urlopen(ranged_request)
print(req.headers)

Server: openresty
Date: Fri, 31 May 2019 01:05:02 GMT
Content-Type: video/x-flv
Content-Length: 9566182
Connection: close
alt-svc: quic=":443"; ma=2592000; v="43,42,41,39,38,37,35"
X-Upsig-Version: 190111
Last-Modified: Fri, 31 May 2019 01:05:01 GMT
X-Acache-Cache: cn-gdfs2-cc-acache-05
X-Request-ID: 9123b70bfaf5247bcdf1222a499b5af6
X-Acache-Gate: cn-gdfs2-cc-acache-01
Content-Range: bytes 0-9566181/9566182




In [24]:
# Read up to 256 KiB (1024*256 bytes) from the open response.
a = req.read(1024*256)

In [31]:
# Check whether `url` carries a "p=<digits>" query parameter; match1 returns
# None when it does not.
print(match1(url, r'[\?&]p=(\d+)'))

None


In [76]:
# Content-Range looks like "bytes START-END/TOTAL": drop the 6-character
# "bytes " prefix, keep the part before "/", and split it into START and END.
byte_span = req.headers['content-range'][6:].split('/')[0].split('-')
print(byte_span[0])
print(byte_span[1])

0
9566181


In [21]:
import os
import urllib.request
def url_save(url,filepath,refer = None,is_part = False,faker = False,headers = None,timeout = None,**kwargs):
    """Download ``url`` (or a list of segment URLs) into a local file.

    Args:
        url: A single address, or a list of addresses whose bodies are
            written one after another into the same file.
        filepath: Destination path. When the total size is known the data is
            written to ``filepath + '.download'``; otherwise straight to
            ``filepath``.
        refer: Optional value for the ``Referer`` request header.
        is_part: Accepted but not used in the code visible here.
        faker: When true, every request uses ``bilibili_headers()`` (plus
            ``Referer``/``Range`` where applicable) instead of ``headers``.
        headers: Optional base request headers; copied, never modified.
        timeout: Optional timeout in seconds, forwarded to every
            ``urlopen_with_retry`` call when set.
        **kwargs: Accepted but not used in the code visible here.

    NOTE(review): nothing visible here renames the ``.download`` file to
    ``filepath`` once the transfer is complete.
    NOTE(review): when a segment of a multi-URL download has to be re-opened,
    it is requested again from its start without a Range header.
    """
    tmp_headers = headers.copy() if headers is not None else {}
    if refer is not None:
        tmp_headers['Referer'] = refer
    if type(url) is list:
        file_size = urls_size(url,faker=faker,headers = tmp_headers)
        is_chunked,urls = True,url
    else:
        file_size = url_size(url,faker=faker,headers=tmp_headers)
        is_chunked,urls = False,[url]
        # TODO: support resuming an interrupted download
    temp_filepath = filepath+'.download' if file_size != float('inf') else filepath
    # Pass a timeout only when one was given, so urlopen keeps its own default.
    open_kwargs = {'timeout': timeout} if timeout else {}
    unknown_size = float('inf')
    received = 0
    open_mode = 'wb'
    for segment_url in urls:
        received_chunk = 0
        if received < file_size:
            if faker:
                # A fresh dict per segment: the Range/Referer entries added
                # below never leak into shared state.
                tmp_headers = bilibili_headers()
            # Single-URL download that already has data: ask for the rest only.
            if received and not is_chunked:
                tmp_headers['Range'] = 'bytes='+str(received)+'-'
            if refer:
                tmp_headers['Referer'] = refer
            response = urlopen_with_retry(
                request.Request(segment_url,headers=tmp_headers),**open_kwargs
            )
            try:
                # Content-Range looks like "bytes START-END/TOTAL"; the bytes
                # still to come are TOTAL - START.
                span, total = response.headers['content-range'][6:].split('/')
                range_length = int(total)-int(span.split('-')[0])
            except (KeyError, TypeError, ValueError, IndexError):
                # No usable Content-Range: fall back to Content-Length.
                content_length = response.headers.get('content-length')
                range_length = int(content_length) if content_length is not None else unknown_size
            if is_chunked:
                # Resuming is not implemented, so the first segment truncates
                # any stale file and only later segments append.
                open_mode = 'ab' if received else 'wb'
            elif file_size!=received+range_length:
                # The server did not honour the requested range: start over.
                received = 0
                open_mode = 'wb'
            with open(temp_filepath,open_mode) as output:
                while True:
                    buffer = None
                    try:
                        buffer = response.read(1024*256)
                    except socket.timeout:
                        # buffer stays None: handled below as "re-open".
                        pass
                    if not buffer:
                        if is_chunked and received_chunk == range_length:
                            break
                        elif not is_chunked and received == file_size:
                            break
                        expected = range_length if is_chunked else file_size
                        if buffer is not None and expected == unknown_size:
                            # Clean end of stream and no size to compare
                            # against: treat the body as complete rather
                            # than re-opening forever.
                            break
                        if not is_chunked:
                            tmp_headers['Range'] = 'bytes='+str(received)+'-'
                        response.close()
                        response = urlopen_with_retry(
                            request.Request(segment_url,headers=tmp_headers),**open_kwargs
                        )
                        continue
                    output.write(buffer)
                    received += len(buffer)
                    received_chunk +=len(buffer)
            response.close()
                    