diff --git a/test/test_compat.py b/test/test_compat.py
index e233b1ae1b7..b83c8cb4100 100644
--- a/test/test_compat.py
+++ b/test/test_compat.py
@@ -23,6 +23,7 @@
     compat_urllib_parse_unquote,
     compat_urllib_parse_unquote_plus,
     compat_urllib_parse_urlencode,
+    compat_urllib_request,
 )
 
 
@@ -135,6 +136,19 @@ def test_compat_casefold(self):
         self.assertEqual(compat_casefold('\u03a3'), '\u03c3')
         self.assertEqual(compat_casefold('A\u0345\u03a3'), 'a\u03b9\u03c3')
 
+    def test_compat_urllib_request_Request(self):
+        self.assertEqual(
+            compat_urllib_request.Request('http://127.0.0.1', method='PUT').get_method(),
+            'PUT')
+
+        class PUTrequest(compat_urllib_request.Request):
+            def get_method(self):
+                return 'PUT'
+
+        self.assertEqual(
+            PUTrequest('http://127.0.0.1').get_method(),
+            'PUT')
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/test_subtitles.py b/test/test_subtitles.py
index 1197721ff09..e005c78fc2a 100644
--- a/test/test_subtitles.py
+++ b/test/test_subtitles.py
@@ -295,6 +295,7 @@ class TestNRKSubtitles(BaseTestSubtitles):
     def test_allsubtitles(self):
         self.DL.params['writesubtitles'] = True
         self.DL.params['allsubtitles'] = True
+        self.DL.params['format'] = 'best/bestvideo'
         subtitles = self.getSubtitles()
         self.assertEqual(set(subtitles.keys()), set(['nb-ttv']))
         self.assertEqual(md5(subtitles['nb-ttv']), '67e06ff02d0deaf975e68f6cb8f6a149')
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 3c526a78dc5..818ccebd0a6 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -58,19 +58,26 @@
 # Also fix up lack of method arg in old Pythons
 try:
-    _req = compat_urllib_request.Request
-    _req('http://127.0.0.1', method='GET')
+    type(compat_urllib_request.Request('http://127.0.0.1', method='GET'))
 except TypeError:
-    class _request(object):
-        def __new__(cls, url, *args, **kwargs):
-            method = kwargs.pop('method', None)
-            r = _req(url, *args, **kwargs)
-            if method:
-                r.get_method = types.MethodType(lambda _: method, r)
-            return r
+    def _add_init_method_arg(cls):
 
-    compat_urllib_request.Request = _request
+        init = cls.__init__
+
+        def wrapped_init(self, *args, **kwargs):
+            method = kwargs.pop('method', 'GET')
+            init(self, *args, **kwargs)
+            if any(callable(x.__dict__.get('get_method')) for x in (self.__class__, self) if x != cls):
+                # allow instance or its subclass to override get_method()
+                return
+            if self.has_data() and method == 'GET':
+                method = 'POST'
+            self.get_method = types.MethodType(lambda _: method, self)
+
+        cls.__init__ = wrapped_init
+
+    _add_init_method_arg(compat_urllib_request.Request)
+    del _add_init_method_arg
 
 try:
     import urllib.error as compat_urllib_error
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 0eca9f84490..d33557135b4 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -596,6 +596,14 @@ def set_downloader(self, downloader):
         """Sets the downloader for this IE."""
         self._downloader = downloader
 
+    @property
+    def cache(self):
+        return self._downloader.cache
+
+    @property
+    def cookiejar(self):
+        return self._downloader.cookiejar
+
     def _real_initialize(self):
         """Real initialization process. Redefine in subclasses."""
         pass
@@ -942,14 +950,47 @@ def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
             else:
                 self.report_warning(errmsg + str(ve))
 
-    def report_warning(self, msg, video_id=None):
+    def __ie_msg(self, *msg):
+        return '[{0}] {1}'.format(self.IE_NAME, ''.join(msg))
+
+    # msg, video_id=None, *args, only_once=False, **kwargs
+    def report_warning(self, msg, *args, **kwargs):
+        if len(args) > 0:
+            video_id = args[0]
+            args = args[1:]
+        else:
+            video_id = kwargs.pop('video_id', None)
         idstr = '' if video_id is None else '%s: ' % video_id
         self._downloader.report_warning(
-            '[%s] %s%s' % (self.IE_NAME, idstr, msg))
+            self.__ie_msg(idstr, msg), *args, **kwargs)
 
     def to_screen(self, msg):
         """Print msg to screen, prefixing it with '[ie_name]'"""
-        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
+        self._downloader.to_screen(self.__ie_msg(msg))
+
+    def write_debug(self, msg, only_once=False, _cache=[]):
+        '''Log debug message or Print message to stderr'''
+        if not self.get_param('verbose', False):
+            return
+        message = '[debug] ' + self.__ie_msg(msg)
+        logger = self.get_param('logger')
+        if logger:
+            logger.debug(message)
+        else:
+            if only_once and hash(message) in _cache:
+                return
+            self._downloader.to_stderr(message)
+            _cache.append(hash(message))
+
+    # name, default=None, *args, **kwargs
+    def get_param(self, name, *args, **kwargs):
+        default, args = (args[0], args[1:]) if len(args) > 0 else (kwargs.pop('default', None), args)
+        if self._downloader:
+            return self._downloader.params.get(name, default, *args, **kwargs)
+        return default
+
+    def report_drm(self, video_id):
+        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
 
     def report_extraction(self, id_or_name):
         """Report information extraction."""
@@ -977,6 +1018,15 @@ def raise_login_required(msg='This video is only available for registered users'
     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
         raise GeoRestrictedError(msg, countries=countries)
 
+    def raise_no_formats(self, msg, expected=False, video_id=None):
+        if expected and (
+                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
+            self.report_warning(msg, video_id)
+        elif isinstance(msg, ExtractorError):
+            raise msg
+        else:
+            raise ExtractorError(msg, expected=expected, video_id=video_id)
+
     # Methods for following #608
     @staticmethod
     def url_result(url, ie=None, video_id=None, video_title=None):
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 3bf483c1c8a..db840fc4556 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -2,6 +2,7 @@
 
 from __future__ import unicode_literals
 
+import collections
 import itertools
 import json
 import os.path
@@ -23,10 +24,10 @@
 )
 from ..jsinterp import JSInterpreter
 from ..utils import (
-    ExtractorError,
     clean_html,
     dict_get,
     error_to_compat_str,
+    ExtractorError,
     float_or_none,
     extract_attributes,
     get_element_by_attribute,
@@ -36,7 +37,9 @@
     LazyList,
     merge_dicts,
     mimetype2ext,
+    NO_DEFAULT,
     parse_codecs,
+    parse_count,
     parse_duration,
     parse_qs,
     qualities,
@@ -44,7 +47,9 @@
     smuggle_url,
     str_or_none,
     str_to_int,
+    T,
     traverse_obj,
+    try_call,
     try_get,
     txt_or_none,
     unescapeHTML,
@@ -1247,7 +1252,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'title': 'IMG 3456',
             'description': '',
             'upload_date': '20170613',
-            'uploader': 'ElevageOrVert',
+            'uploader': "l'Or Vert asbl",
             'uploader_id': '@ElevageOrVert',
         },
         'params': {
@@ -1460,6 +1465,30 @@ def __init__(self, *args, **kwargs):
         self._code_cache = {}
         self._player_cache = {}
 
+    # *ytcfgs, webpage=None
+    def _extract_player_url(self, *ytcfgs, **kw_webpage):
+        if ytcfgs and not isinstance(ytcfgs[0], dict):
+            webpage = kw_webpage.get('webpage') or ytcfgs[0]
+            if webpage:
+                player_url = self._search_regex(
+                    r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
+                    webpage or '', 'player URL', fatal=False)
+                if player_url:
+                    ytcfgs = ytcfgs + ({'PLAYER_JS_URL': player_url},)
+
+        return traverse_obj(
+            ytcfgs, (Ellipsis, 'PLAYER_JS_URL'), (Ellipsis, 'WEB_PLAYER_CONTEXT_CONFIGS', Ellipsis, 'jsUrl'),
+            get_all=False, expected_type=lambda u: urljoin('https://www.youtube.com', u))
+
+    def _download_player_url(self, video_id, fatal=False):
+        res = self._download_webpage(
+            'https://www.youtube.com/iframe_api',
+            note='Downloading iframe API JS', video_id=video_id, fatal=fatal)
+        player_version = self._search_regex(
+            r'player\\?/([0-9a-fA-F]{8})\\?/', res or '', 'player version', fatal=fatal,
+            default=NO_DEFAULT if res else None)
+        if player_version:
+            return 'https://www.youtube.com/s/player/{0}/player_ias.vflset/en_US/base.js'.format(player_version)
+
     def _signature_cache_id(self, example_sig):
         """ Return a string representation of a signature """
         return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
@@ -1474,46 +1503,49 @@ def _extract_player_info(cls, player_url):
             raise ExtractorError('Cannot identify player %r' % player_url)
         return id_m.group('id')
 
-    def _get_player_code(self, video_id, player_url, player_id=None):
+    def _load_player(self, video_id, player_url, fatal=True, player_id=None):
         if not player_id:
             player_id = self._extract_player_info(player_url)
         if player_id not in self._code_cache:
-            self._code_cache[player_id] = self._download_webpage(
-                player_url, video_id,
+            code = self._download_webpage(
+                player_url, video_id, fatal=fatal,
                 note='Downloading player ' + player_id,
                 errnote='Download of %s failed' % player_url)
-        return self._code_cache[player_id]
+            if code:
+                self._code_cache[player_id] = code
+        return self._code_cache[player_id] if fatal else self._code_cache.get(player_id)
 
     def _extract_signature_function(self, video_id, player_url, example_sig):
         player_id = self._extract_player_info(player_url)
 
         # Read from filesystem cache
-        func_id = 'js_%s_%s' % (
+        func_id = 'js_{0}_{1}'.format(
             player_id, self._signature_cache_id(example_sig))
         assert os.path.basename(func_id) == func_id
 
-        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
-        if cache_spec is not None:
-            return lambda s: ''.join(s[i] for i in cache_spec)
+        self.write_debug('Extracting signature function {0}'.format(func_id))
+        cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None
 
-        code = self._get_player_code(video_id, player_url, player_id)
-        res = self._parse_sig_js(code)
+        if not cache_spec:
+            code = self._load_player(video_id, player_url, player_id)
+        if code:
+            res = self._parse_sig_js(code)
+            test_string = ''.join(map(compat_chr, range(len(example_sig))))
+            cache_spec = [ord(c) for c in res(test_string)]
+            self.cache.store('youtube-sigfuncs', func_id, cache_spec)
 
-        test_string = ''.join(map(compat_chr, range(len(example_sig))))
-        cache_res = res(test_string)
-        cache_spec = [ord(c) for c in cache_res]
-
-        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
-        return res
+        return lambda s: ''.join(s[i] for i in cache_spec)
 
     def _print_sig_code(self, func, example_sig):
+        if not self.get_param('youtube_print_sig_code'):
+            return
+
         def gen_sig_code(idxs):
             def _genslice(start, end, step):
                 starts = '' if start == 0 else str(start)
                 ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                 steps = '' if step == 1 else (':%d' % step)
-                return 's[%s%s%s]' % (starts, ends, steps)
+                return 's[{0}{1}{2}]'.format(starts, ends, steps)
 
             step = None
             # Quelch pyflakes warnings - start will be set when step is set
@@ -1564,143 +1596,137 @@ def _parse_sig_js(self, jscode):
             jscode, 'Initial JS player signature function name', group='sig')
 
         jsi = JSInterpreter(jscode)
-        initial_function = jsi.extract_function(funcname)
-        return lambda s: initial_function([s])
 
-    def _decrypt_signature(self, s, video_id, player_url):
-        """Turn the encrypted s field into a working signature"""
+    def _cached(self, func, *cache_id):
+        def inner(*args, **kwargs):
+            if cache_id not in self._player_cache:
+                try:
+                    self._player_cache[cache_id] = func(*args, **kwargs)
+                except ExtractorError as e:
+                    self._player_cache[cache_id] = e
+                except Exception as e:
+                    self._player_cache[cache_id] = ExtractorError(traceback.format_exc(), cause=e)
 
-        if player_url is None:
-            raise ExtractorError('Cannot decrypt signature without player_url')
+            ret = self._player_cache[cache_id]
+            if isinstance(ret, Exception):
+                raise ret
+            return ret
+        return inner
 
-        try:
-            player_id = (player_url, self._signature_cache_id(s))
-            if player_id not in self._player_cache:
-                func = self._extract_signature_function(
-                    video_id, player_url, s
-                )
-                self._player_cache[player_id] = func
-            func = self._player_cache[player_id]
-            if self._downloader.params.get('youtube_print_sig_code'):
-                self._print_sig_code(func, s)
-            return func(s)
-        except Exception as e:
-            tb = traceback.format_exc()
-            raise ExtractorError(
-                'Signature extraction failed: ' + tb, cause=e)
-
-    def _extract_player_url(self, webpage):
-        player_url = self._search_regex(
-            r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
-            webpage or '', 'player URL', fatal=False)
-        if not player_url:
-            return
-        if player_url.startswith('//'):
-            player_url = 'https:' + player_url
-        elif not re.match(r'https?://', player_url):
-            player_url = compat_urllib_parse.urljoin(
-                'https://www.youtube.com', player_url)
-        return player_url
+    def _decrypt_signature(self, s, video_id, player_url):
+        """Turn the encrypted s field into a working signature"""
+        extract_sig = self._cached(
+            self._extract_signature_function, 'sig', player_url, self._signature_cache_id(s))
+        func = extract_sig(video_id, player_url, s)
+        self._print_sig_code(func, s)
+        return func(s)
 
     # from yt-dlp
     # See also:
     # 1. https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-894619419
     # 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116
     # 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377
+    def _decrypt_nsig(self, n, video_id, player_url):
+        """Turn the encrypted n field into a working signature"""
+        if player_url is None:
+            raise ExtractorError('Cannot decrypt nsig without player_url')
+
+        try:
+            jsi, player_id, func_code = self._extract_n_function_code(video_id, player_url)
+        except ExtractorError as e:
+            raise ExtractorError('Unable to extract nsig function code', cause=e)
+        if self.get_param('youtube_print_sig_code'):
+            self.to_screen('Extracted nsig function from {0}:\n{1}\n'.format(
+                player_id, func_code[1]))
+
+        try:
+            extract_nsig = self._cached(self._extract_n_function_from_code, 'nsig func', player_url)
+            ret = extract_nsig(jsi, func_code)(n)
+        except JSInterpreter.Exception as e:
+            self.report_warning(
+                '%s (%s %s)' % (
+                    self.__ie_msg(
+                        'Unable to decode n-parameter: download likely to be throttled'),
+                    error_to_compat_str(e),
+                    traceback.format_exc()))
+            return
+
+        self.write_debug('Decrypted nsig {0} => {1}'.format(n, ret))
+        return ret
+
     def _extract_n_function_name(self, jscode):
-        target = r'(?P<nfunc>[a-zA-Z_$][\w$]*)(?:\[(?P<idx>\d+)\])?'
-        nfunc_and_idx = self._search_regex(
-            r'\.get\("n"\)\)&&\(b=(%s)\([\w$]+\)' % (target, ),
-            jscode, 'Initial JS player n function name')
-        nfunc, idx = re.match(target, nfunc_and_idx).group('nfunc', 'idx')
+        func_name, idx = self._search_regex(
+            r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z_$][\w$]*)(?:\[(?P<idx>\d+)\])?\([\w$]+\)',
+            jscode, 'Initial JS player n function name', group=('nfunc', 'idx'))
         if not idx:
-            return nfunc
+            return func_name
 
-        VAR_RE_TMPL = r'var\s+%s\s*=\s*(?P<name>\[(?P<alias>%s)\])[;,]'
-        note = 'Initial JS player n function {0} (%s[%s])' % (nfunc, idx)
+        return self._parse_json(self._search_regex(
+            r'var {0}\s*=\s*(\[.+?\])\s*[,;]'.format(re.escape(func_name)), jscode,
+            'Initial JS player n function list ({0}.{1})'.format(func_name, idx)),
+            func_name, transform_source=js_to_json)[int(idx)]
 
-        def search_function_code(needle, group):
-            return self._search_regex(
-                VAR_RE_TMPL % (re.escape(nfunc), needle), jscode,
-                note.format(group), group=group)
+    def _extract_n_function_code(self, video_id, player_url):
+        player_id = self._extract_player_info(player_url)
+        func_code = self.cache.load('youtube-nsig', player_id)
+        jscode = func_code or self._load_player(video_id, player_url)
+        jsi = JSInterpreter(jscode)
 
-        if int_or_none(idx) == 0:
-            real_nfunc = search_function_code(r'[a-zA-Z_$][\w$]*', group='alias')
-            if real_nfunc:
-                return real_nfunc
-        return self._parse_json(
-            search_function_code('.+?', group='name'),
-            nfunc, transform_source=js_to_json)[int(idx)]
+        if func_code:
+            return jsi, player_id, func_code
 
-    def _extract_n_function(self, video_id, player_url):
-        player_id = self._extract_player_info(player_url)
-        func_code = self._downloader.cache.load('youtube-nsig', player_id)
+        func_name = self._extract_n_function_name(jscode)
 
+        # For redundancy
+        func_code = self._search_regex(
+            r'''(?xs)%s\s*=\s*function\s*\((?P<var>[\w$]+)\)\s*
+                    # NB: The end of the regex is intentionally kept strict
+                    {(?P<code>.+?}\s*return\ [\w$]+.join\(""\))};''' % func_name,
+            jscode, 'nsig function', group=('var', 'code'), default=None)
         if func_code:
-            jsi = JSInterpreter(func_code)
+            func_code = ([func_code[0]], func_code[1])
         else:
-            jscode = self._get_player_code(video_id, player_url, player_id)
-            funcname = self._extract_n_function_name(jscode)
-            jsi = JSInterpreter(jscode)
-            func_code = jsi.extract_function_code(funcname)
-            self._downloader.cache.store('youtube-nsig', player_id, func_code)
-
-        if self._downloader.params.get('youtube_print_sig_code'):
-            self.to_screen('Extracted nsig function from {0}:\n{1}\n'.format(player_id, func_code[1]))
-
-        return lambda s: jsi.extract_function_from_code(*func_code)([s])
-
-    def _n_descramble(self, n_param, player_url, video_id):
-        """Compute the response to YT's "n" parameter challenge,
-        or None
-
-        Args:
-        n_param -- challenge string that is the value of the
-                   URL's "n" query parameter
-        player_url -- URL of YT player JS
-        video_id
-        """
+            self.write_debug('Extracting nsig function with jsinterp')
+            func_code = jsi.extract_function_code(func_name)
 
-        sig_id = ('nsig_value', n_param)
-        if sig_id in self._player_cache:
-            return self._player_cache[sig_id]
+        self.cache.store('youtube-nsig', player_id, func_code)
+        return jsi, player_id, func_code
+
+    def _extract_n_function_from_code(self, jsi, func_code):
+        func = jsi.extract_function_from_code(*func_code)
+
+        def extract_nsig(s):
+            try:
+                ret = func([s])
+            except JSInterpreter.Exception:
+                raise
+            except Exception as e:
+                raise JSInterpreter.Exception(traceback.format_exc(), cause=e)
 
-        try:
-            player_id = ('nsig', player_url)
-            if player_id not in self._player_cache:
-                self._player_cache[player_id] = self._extract_n_function(video_id, player_url)
-            func = self._player_cache[player_id]
-            ret = func(n_param)
             if ret.startswith('enhanced_except_'):
-                raise ExtractorError('Unhandled exception in decode')
-            self._player_cache[sig_id] = ret
-            if self._downloader.params.get('verbose', False):
-                self._downloader.to_screen('[debug] [%s] %s' % (self.IE_NAME, 'Decrypted nsig {0} => {1}'.format(n_param, self._player_cache[sig_id])))
-            return self._player_cache[sig_id]
-        except Exception as e:
-            self._downloader.report_warning(
-                '[%s] %s (%s %s)' % (
-                    self.IE_NAME,
-                    'Unable to decode n-parameter: download likely to be throttled',
-                    error_to_compat_str(e),
-                    traceback.format_exc()))
+                raise JSInterpreter.Exception('Signature function returned an exception')
+            return ret
+
+        return extract_nsig
+
+    def _unthrottle_format_urls(self, video_id, player_url, *formats):
+
+        def decrypt_nsig(n):
+            return self._cached(self._decrypt_nsig, 'nsig', n, player_url)
 
-    def _unthrottle_format_urls(self, video_id, player_url, formats):
         for fmt in formats:
             parsed_fmt_url = compat_urllib_parse.urlparse(fmt['url'])
             n_param = compat_parse_qs(parsed_fmt_url.query).get('n')
             if not n_param:
                 continue
             n_param = n_param[-1]
-            n_response = self._n_descramble(n_param, player_url, video_id)
+            n_response = decrypt_nsig(n_param)(n_param, video_id, player_url)
             if n_response is None:
                 # give up if descrambling failed
                 break
-            for fmt_dct in traverse_obj(fmt, (None, (None, ('fragments', Ellipsis))), expected_type=dict):
-                fmt_dct['url'] = update_url(
-                    fmt_dct['url'], query_update={'n': [n_response]})
+            fmt['url'] = update_url_query(fmt['url'], {'n': n_response})
 
     # from yt-dlp, with tweaks
     def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
@@ -1708,16 +1734,16 @@
         """
         Extract signatureTimestamp (sts)
         Required to tell API what sig/player version is in use.
         """
-        sts = int_or_none(ytcfg.get('STS')) if isinstance(ytcfg, dict) else None
+        sts = traverse_obj(ytcfg, 'STS', expected_type=int)
         if not sts:
             # Attempt to extract from player
             if player_url is None:
                 error_msg = 'Cannot extract signature timestamp without player_url.'
                 if fatal:
                     raise ExtractorError(error_msg)
-                self._downloader.report_warning(error_msg)
+                self.report_warning(error_msg)
                 return
-            code = self._get_player_code(video_id, player_url)
+            code = self._load_player(video_id, player_url, fatal=fatal)
             sts = int_or_none(self._search_regex(
                 r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code or '',
                 'JS player signature timestamp', group='sts', fatal=fatal))
@@ -1733,12 +1759,18 @@ def _mark_watched(self, video_id, player_response):
         # cpn generation algorithm is reverse engineered from base.js.
         # In fact it works even with dummy cpn.
         CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
-        cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
-
-        playback_url = update_url(
-            playback_url, query_update={
-                'ver': ['2'],
-                'cpn': [cpn],
+        cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))
+
+        # more consistent results setting it to right before the end
+        qs = parse_qs(playback_url)
+        video_length = '{0}'.format(float((qs.get('len') or ['1.5'])[0]) - 1)
+
+        playback_url = update_url_query(
+            playback_url, {
+                'ver': '2',
+                'cpn': cpn,
+                'cmt': video_length,
+                'el': 'detailpage',  # otherwise defaults to "shorts"
             })
 
         self._download_webpage(
@@ -1986,8 +2018,11 @@ def feed_entry(name):
             else:
                 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
 
+        if not player_url:
+            player_url = self._extract_player_url(webpage)
+
         formats = []
-        itags = []
+        itags = collections.defaultdict(set)
         itag_qualities = {}
         q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])
         CHUNK_SIZE = 10 << 20
@@ -2003,58 +2038,92 @@ def build_fragments(f):
                 })
            } for range_start in range(0, f['filesize'], CHUNK_SIZE))
 
+        lower = lambda s: s.lower()
+
         for fmt in streaming_formats:
-            if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
+            if fmt.get('targetDurationSec'):
                 continue
 
             itag = str_or_none(fmt.get('itag'))
-            quality = fmt.get('quality')
-            if itag and quality:
+            audio_track = traverse_obj(fmt, ('audioTrack', T(dict))) or {}
+
+            quality = traverse_obj(fmt, ((
+                # The 3gp format (17) in android client has a quality of "small",
+                # but is actually worse than other formats
+                T(lambda _: 'tiny' if itag == 17 else None),
+                ('quality', T(lambda q: q if q and q != 'tiny' else None)),
+                ('audioQuality', T(lower)),
+                'quality'), T(txt_or_none)), get_all=False)
+            if quality and itag:
                 itag_qualities[itag] = quality
             # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
             # (adding `&sq=0` to the URL) and parsing emsg box to determine the
-            # number of fragment that would subsequently requested with (`&sq=N`)
+            # number of fragments that would subsequently be requested with (`&sq=N`)
             if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
                 continue
 
             fmt_url = fmt.get('url')
             if not fmt_url:
                 sc = compat_parse_qs(fmt.get('signatureCipher'))
-                fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
-                encrypted_sig = try_get(sc, lambda x: x['s'][0])
-                if not (sc and fmt_url and encrypted_sig):
+                fmt_url = traverse_obj(sc, ('url', -1, T(url_or_none)))
+                encrypted_sig = traverse_obj(sc, ('s', -1))
+                if not (fmt_url and encrypted_sig):
                     continue
+                player_url = player_url or self._extract_player_url(webpage)
                 if not player_url:
-                    player_url = self._extract_player_url(webpage)
-                if not player_url:
                     continue
-                signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
-                sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
-                fmt_url += '&' + sp + '=' + signature
+
+                try:
+                    fmt_url = update_url_query(fmt_url, {
+                        traverse_obj(sc, ('sp', -1)) or 'signature':
+                            [self._decrypt_signature(encrypted_sig, video_id, player_url)],
+                    })
+                except ExtractorError as e:
+                    self.report_warning('Signature extraction failed: Some formats may be missing',
+                                        video_id=video_id, only_once=True)
+                    self.write_debug(error_to_compat_str(e), only_once=True)
+                    continue
 
-            if itag:
-                itags.append(itag)
-            tbr = float_or_none(
-                fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
+            language_preference = (
+                10 if audio_track.get('audioIsDefault')
+                else -10 if 'descriptive' in (traverse_obj(audio_track, ('displayName', T(lower))) or '')
+                else -1)
+            name = (
+                traverse_obj(fmt, ('qualityLabel', T(txt_or_none)))
+                or quality.replace('audio_quality_', ''))
             dct = {
-                'asr': int_or_none(fmt.get('audioSampleRate')),
-                'filesize': int_or_none(fmt.get('contentLength')),
-                'format_id': itag,
-                'format_note': fmt.get('qualityLabel') or quality,
-                'fps': int_or_none(fmt.get('fps')),
-                'height': int_or_none(fmt.get('height')),
-                'quality': q(quality),
-                'tbr': tbr,
+                'format_id': join_nonempty(itag, fmt.get('isDrc') and 'drc'),
                 'url': fmt_url,
-                'width': fmt.get('width'),
+                # Format 22 is likely to be damaged: see https://github.com/yt-dlp/yt-dlp/issues/3372
+                'source_preference': ((-5 if itag == '22' else -1)
+                                      + (100 if 'Premium' in name else 0)),
+                'quality': q(quality),
+                'language': join_nonempty(audio_track.get('id', '').split('.')[0],
+                                          'desc' if language_preference < -1 else '') or None,
+                'language_preference': language_preference,
+                # Strictly de-prioritize 3gp formats
+                'preference': -2 if itag == '17' else None,
             }
-            mimetype = fmt.get('mimeType')
-            if mimetype:
-                mobj = re.match(
-                    r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype)
-                if mobj:
-                    dct['ext'] = mimetype2ext(mobj.group(1))
-                    dct.update(parse_codecs(mobj.group(2)))
+            if itag:
+                itags[itag].add(('https', dct.get('language')))
+            self._unthrottle_format_urls(video_id, player_url, dct)
+            dct.update(traverse_obj(fmt, {
+                'asr': ('audioSampleRate', T(int_or_none)),
+                'filesize': ('contentLength', T(int_or_none)),
+                'format_note': ('qualityLabel', T(lambda x: x or quality)),
+                # for some formats, fps is wrongly returned as 1
+                'fps': ('fps', T(int_or_none), T(lambda f: f if f > 1 else None)),
+                'audio_channels': ('audioChannels', T(int_or_none)),
+                'height': ('height', T(int_or_none)),
+                'has_drm': ('drmFamilies', T(bool)),
+                'tbr': (('averageBitrate', 'bitrate'), T(lambda t: float_or_none(t, 1000))),
+                'width': ('width', T(int_or_none)),
+                '_duration_ms': ('approxDurationMs', T(int_or_none)),
+            }, get_all=False))
+            mime_mobj = re.match(
+                r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
+            if mime_mobj:
+                dct['ext'] = mimetype2ext(mime_mobj.group(1))
+                dct.update(parse_codecs(mime_mobj.group(2)))
             single_stream = 'none' in (dct.get(c) for c in ('acodec', 'vcodec'))
             if single_stream and dct.get('ext'):
                 dct['container'] = dct['ext'] + '_dash'
@@ -2069,32 +2138,62 @@ def build_fragments(f):
             formats.append(dct)
 
+        def process_manifest_format(f, proto, client_name, itag, all_formats=False):
+            key = (proto, f.get('language'))
+            if not all_formats and key in itags[itag]:
+                return False
+            itags[itag].add(key)
+
+            if itag:
+                f['format_id'] = (
+                    '{0}-{1}'.format(itag, proto)
+                    if all_formats or any(p != proto for p, _ in itags[itag])
+                    else itag)
+
+            if f.get('source_preference') is None:
+                f['source_preference'] = -1
+
+            if itag in ('616', '235'):
+                f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ')
+                f['source_preference'] += 100
+
+            f['quality'] = q(traverse_obj(f, (
+                'format_id', T(lambda s: itag_qualities[s.split('-')[0]])), default=-1))
+            if try_call(lambda: f['fps'] <= 1):
+                del f['fps']
+
+            if proto == 'hls' and f.get('has_drm'):
+                f['has_drm'] = 'maybe'
+                f['source_preference'] -= 5
+            return True
+
         hls_manifest_url = streaming_data.get('hlsManifestUrl')
         if hls_manifest_url:
             for f in self._extract_m3u8_formats(
                     hls_manifest_url, video_id, 'mp4', fatal=False):
-                itag = self._search_regex(
-                    r'/itag/(\d+)', f['url'], 'itag', default=None)
-                if itag:
-                    f['format_id'] = itag
-                formats.append(f)
+                if process_manifest_format(
+                        f, 'hls', None, self._search_regex(
+                            r'/itag/(\d+)', f['url'], 'itag', default=None)):
+                    formats.append(f)
 
         if self._downloader.params.get('youtube_include_dash_manifest', True):
             dash_manifest_url = streaming_data.get('dashManifestUrl')
             if dash_manifest_url:
                 for f in self._extract_mpd_formats(
                         dash_manifest_url, video_id, fatal=False):
-                    itag = f['format_id']
-                    if itag in itags:
-                        continue
-                    if itag in itag_qualities:
-                        f['quality'] = q(itag_qualities[itag])
-                    filesize = int_or_none(self._search_regex(
-                        r'/clen/(\d+)', f.get('fragment_base_url')
-                        or f['url'], 'file size', default=None))
-                    if filesize:
-                        f['filesize'] = filesize
-                    formats.append(f)
+                    if process_manifest_format(
+                            f, 'dash', None, f['format_id']):
+                        f['filesize'] = traverse_obj(f, (
+                            ('fragment_base_url', 'url'), T(lambda u: self._search_regex(
+                                r'/clen/(\d+)', u, 'file size', default=None)),
+                            T(int_or_none)), get_all=False)
+                        formats.append(f)
+
+        playable_formats = [f for f in formats if not f.get('has_drm')]
+        if formats and not playable_formats:
+            # If there are no formats that definitely don't have DRM, all have DRM
+            self.report_drm(video_id)
+        formats[:] = playable_formats
 
         if not formats:
             if streaming_data.get('licenseInfos'):
@@ -2166,6 +2265,17 @@ def build_fragments(f):
             video_details.get('lengthSeconds')
             or microformat.get('lengthSeconds')) \
             or parse_duration(search_meta('duration'))
+
+        for f in formats:
+            # Some formats may have much smaller duration than others (possibly damaged during encoding)
+            # but avoid false positives with small duration differences.
+            # Ref: https://github.com/yt-dlp/yt-dlp/issues/2823
+            if try_call(lambda x: float(x.pop('_duration_ms')) / duration < 500, args=(f,)):
+                self.report_warning(
+                    '{0}: Some possibly damaged formats will be deprioritized'.format(video_id), only_once=True)
+                # Strictly de-prioritize damaged formats
+                f['preference'] = -10
+
         is_live = video_details.get('isLive')
 
         owner_profile_url = self._yt_urljoin(self._extract_author_var(
@@ -2174,10 +2284,6 @@ def build_fragments(f):
         uploader = self._extract_author_var(
             webpage, 'name', videodetails=video_details, metadata=microformat)
 
-        if not player_url:
-            player_url = self._extract_player_url(webpage)
-        self._unthrottle_format_urls(video_id, player_url, formats)
-
         info = {
             'id': video_id,
             'title': self._live_title(video_title) if is_live else video_title,
@@ -2370,6 +2476,14 @@ def chapter_time(mmlir):
                             'like_count': str_to_int(like_count),
                             'dislike_count': str_to_int(dislike_count),
                         })
+                    else:
+                        info['like_count'] = traverse_obj(vpir, (
+                            'videoActions', 'menuRenderer', 'topLevelButtons', Ellipsis,
+                            'segmentedLikeDislikeButtonViewModel', 'likeButtonViewModel', 'likeButtonViewModel',
+                            'toggleButtonViewModel', 'toggleButtonViewModel', 'defaultButtonViewModel',
+                            'buttonViewModel', (('title', ('accessibilityText', T(lambda s: s.split()), Ellipsis))), T(parse_count)),
+                            get_all=False)
+
                 vsir = content.get('videoSecondaryInfoRenderer')
                 if vsir:
                     rows = try_get(
@@ -2484,7 +2598,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         'playlist_mincount': 94,
         'info_dict': {
             'id': 'UCqj7Cz7revf5maW9g5pgNcg',
-            'title': 'Igor Kleiner - Playlists',
+            'title': r're:Igor Kleiner(?: Ph\.D\.)? - Playlists',
            'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
             'uploader': 'Igor Kleiner',
             'uploader_id': '@IgorDataScience',
@@ -2495,7 +2609,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         'playlist_mincount': 94,
         'info_dict': {
             'id': 'UCqj7Cz7revf5maW9g5pgNcg',
-            'title': 'Igor Kleiner - Playlists',
+            'title': r're:Igor Kleiner(?: Ph\.D\.)? - Playlists',
             'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
             'uploader': 'Igor Kleiner',
             'uploader_id': '@IgorDataScience',
@@ -2607,7 +2721,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
         'info_dict': {
             'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
-            'title': 'lex will - Channels',
+            'title': r're:lex will - (?:Home|Channels)',
             'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
             'uploader': 'lex will',
             'uploader_id': '@lexwill718',