In [1]:
import os
import json
import cv2

In [None]:
"""
Downloader
"""

import os
import json
import cv2


def download(video_path, ytb_id, proxy=None):
    """
    Download one YouTube video with yt-dlp, using aria2c as the external
    downloader, and merge it into an mp4 at ``video_path``.

    Nothing is done (and nothing is printed) when ``video_path`` already
    exists. The command is printed before it is run. If yt-dlp cannot be
    started or exits with a non-zero status, a "video not found" line is
    printed; no exception is raised for a failed download.

    video_path: destination path of the downloaded video file
    ytb_id: youtube_id
    proxy: proxy url, default None (no --proxy option is passed)
    """
    # Imported here so the function does not depend on edits to the
    # notebook's import cell.
    import shlex
    import subprocess

    if os.path.exists(video_path):
        return

    # Build an argument list instead of a shell string: paths, proxies or
    # ids containing spaces or shell metacharacters are passed verbatim
    # rather than being split or interpreted by a shell.
    cmd = ["yt-dlp"]
    if proxy is not None:
        cmd += ["--proxy", str(proxy)]
    cmd += [
        "-f", "bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio",
        "--skip-unavailable-fragments",
        "--merge-output-format", "mp4",
        "https://www.youtube.com/watch?v=" + ytb_id,
        "--output", video_path,
        "--external-downloader", "aria2c",
        "--external-downloader-args", "-x 16 -k 1M",
    ]
    # Show a copy-pasteable shell form of what is about to run.
    print(" ".join(shlex.quote(arg) for arg in cmd))

    try:
        status = subprocess.run(cmd).returncode
    except OSError as err:
        # yt-dlp missing or not executable: report it like any other
        # failed download instead of aborting the whole batch.
        print(f"failed to launch yt-dlp: {err}")
        status = 1
    if status != 0:
        print(f"video not found: {ytb_id}")


def process_ffmpeg(raw_vid_path, save_folder, save_vid_name,
                   bbox, time):
    """
    Cut a time range out of a raw video and crop it to a square region
    with ffmpeg.

    The frame size is read with OpenCV. The normalized bbox is expanded by
    0.02 on every side (clamped to [0, 1]), converted to pixels, and shrunk
    to a centered square before being passed to ffmpeg's crop filter.

    raw_vid_path: path of the downloaded source video
    save_folder: folder the processed clip is written to
    save_vid_name: file name of the processed clip
    bbox: format: top, bottom, left, right. the values are normalized to 0~1
    time: begin_sec, end_sec

    Returns the output path (save_folder joined with save_vid_name). The
    ffmpeg exit status is not checked, so the path is returned even if
    ffmpeg failed.
    """
    # Imported here so the function does not depend on edits to the
    # notebook's import cell.
    import subprocess

    def secs_to_timestr(secs):
        # Work in whole centiseconds: truncating (secs - int(secs)) * 100
        # loses a hundredth to float error (e.g. 2.07 -> "...02.06").
        total_cs = int(round(secs * 100))
        whole_secs, centis = divmod(total_cs, 100)
        hrs, rem = divmod(whole_secs, 3600)
        mins, sec = divmod(rem, 60)
        return "{:02d}:{:02d}:{:02d}.{:02d}".format(hrs, mins, sec, centis)

    def expand(bbox, ratio):
        # Grow the normalized box by `ratio` per side, clamped to [0, 1].
        top, bottom = max(bbox[0] - ratio, 0), min(bbox[1] + ratio, 1)
        left, right = max(bbox[2] - ratio, 0), min(bbox[3] + ratio, 1)

        return top, bottom, left, right

    def to_square(bbox):
        # Keep the box center and use half of the shorter side as the
        # half-width, so the result is a square inside the original box.
        top, bottom, leftx, right = bbox
        h = bottom - top
        w = right - leftx
        c = min(h, w) // 2
        c_h = (top + bottom) / 2
        c_w = (leftx + right) / 2

        top, bottom = c_h - c, c_h + c
        leftx, right = c_w - c, c_w + c
        return top, bottom, leftx, right

    def denorm(bbox, height, width):
        # Normalized coordinates -> integer pixel coordinates.
        top, bottom, left, right = \
            round(bbox[0] * height), \
            round(bbox[1] * height), \
            round(bbox[2] * width), \
            round(bbox[3] * width)

        return top, bottom, left, right

    out_path = os.path.join(save_folder, save_vid_name)

    cap = cv2.VideoCapture(raw_vid_path)
    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    finally:
        # Only the frame size is needed; free the handle right away so a
        # long batch does not accumulate open captures.
        cap.release()

    top, bottom, left, right = to_square(
        denorm(expand(bbox, 0.02), height, width))
    start_sec, end_sec = time

    # Argument list instead of a shell string, so paths containing spaces
    # or shell metacharacters reach ffmpeg unchanged.
    crop = f"crop=w={right-left}:h={bottom-top}:x={left}:y={top}"
    cmd = [
        "ffmpeg", "-i", raw_vid_path,
        "-vf", crop,
        "-ss", secs_to_timestr(start_sec),
        "-to", secs_to_timestr(end_sec),
        "-loglevel", "error",
        out_path,
    ]
    try:
        subprocess.run(cmd)
    except OSError as err:
        # ffmpeg missing or not executable: report and carry on, matching
        # the previous best-effort behavior.
        print(f"failed to launch ffmpeg: {err}")
    return out_path


def load_data(file_path):
    """
    Iterate over the clips described in a JSON annotation file.

    The file is parsed in full on the first iteration; its top-level
    'clips' mapping is then walked in order. For every clip this yields

        ytb_id, save_name, (start_sec, end_sec), [top, bottom, left, right]

    where save_name is the clip key with ".mp4" appended.
    """
    with open(file_path) as handle:
        parsed = json.load(handle)

    clips = parsed['clips']
    for clip_key in clips:
        info = clips[clip_key]
        video_id = info['ytb_id']
        span = (info['duration']['start_sec'], info['duration']['end_sec'])
        box = [info['bbox'][side]
               for side in ('top', 'bottom', 'left', 'right')]
        yield video_id, clip_key + ".mp4", span, box


if __name__ == '__main__':
    # ---- configuration -------------------------------------------------
    json_path = 'celebvhq_info.json'  # annotation file describing the clips
    raw_vid_root = './downloaded_celebvhq/raw/'  # full-length downloads go here
    processed_vid_root = './downloaded_celebvhq/processed/'  # cropped clips go here
    proxy = None  # proxy url, or None to connect without a proxy

    # Make sure both output folders exist before anything is written.
    for out_dir in (raw_vid_root, processed_vid_root):
        os.makedirs(out_dir, exist_ok=True)

    # Downloading is I/O bound while processing is CPU bound, so it is
    # better to download every video first and then process them across
    # multiple CPU cores. That is why the processing call is disabled here.
    for vid_id, save_vid_name, time, bbox in load_data(json_path):
        raw_vid_path = os.path.join(raw_vid_root, vid_id + ".mp4")
        download(raw_vid_path, vid_id, proxy)
        # process_ffmpeg(raw_vid_path, processed_vid_root, save_vid_name, bbox, time)

    # Optional second pass: retry ids listed in an error log.
    # with open('./ytb_id_errored.log', 'r') as f:
    #     lines = f.readlines()
    # for line in lines:
    #     raw_vid_path = os.path.join(raw_vid_root, line + ".mp4")
    #     download(raw_vid_path, line)

yt-dlp  -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio' --skip-unavailable-fragments --merge-output-format mp4 https://www.youtube.com/watch?v=M2Ohb0FAaJU --output ./downloaded_celebvhq/raw/M2Ohb0FAaJU.mp4 --external-downloader aria2c --external-downloader-args "-x 16 -k 1M"




[youtube] Extracting URL: https://www.youtube.com/watch?v=M2Ohb0FAaJU
[youtube] M2Ohb0FAaJU: Downloading webpage
[youtube] M2Ohb0FAaJU: Downloading ios player API JSON
[youtube] M2Ohb0FAaJU: Downloading mweb player API JSON
[youtube] M2Ohb0FAaJU: Downloading player 4e23410d
[youtube] M2Ohb0FAaJU: Downloading m3u8 information
[info] M2Ohb0FAaJU: Downloading 1 format(s): 137+140
[download] Destination: ./downloaded_celebvhq/raw/M2Ohb0FAaJU.f137.mp4
[download]   1.4% of   18.10MiB at    9.21MiB/s ETA 00:01



[download] 100% of   18.10MiB in 00:00:01 at 16.04MiB/s    
[download] Destination: ./downloaded_celebvhq/raw/M2Ohb0FAaJU.f140.m4a
[download] 100% of    2.07MiB in 00:00:00 at 10.00MiB/s  
yt-dlp  -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio' --skip-unavailable-fragments --merge-output-format mp4 https://www.youtube.com/watch?v=_0tf2n3rlJU --output ./downloaded_celebvhq/raw/_0tf2n3rlJU.mp4 --external-downloader aria2c --external-downloader-args "-x 16 -k 1M"




[youtube] Extracting URL: https://www.youtube.com/watch?v=_0tf2n3rlJU
[youtube] _0tf2n3rlJU: Downloading webpage
[youtube] _0tf2n3rlJU: Downloading ios player API JSON
[youtube] _0tf2n3rlJU: Downloading mweb player API JSON
[youtube] _0tf2n3rlJU: Downloading m3u8 information
[info] _0tf2n3rlJU: Downloading 1 format(s): 137+140
[download] Destination: ./downloaded_celebvhq/raw/_0tf2n3rlJU.f137.mp4
[download]   1.8% of   54.96MiB at   14.41MiB/s ETA 00:03



[download] 100% of   54.96MiB in 00:00:05 at 10.40MiB/s    
[download] Destination: ./downloaded_celebvhq/raw/_0tf2n3rlJU.f140.m4a
[download] 100% of    7.52MiB in 00:00:00 at 9.96MiB/s   
yt-dlp  -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio' --skip-unavailable-fragments --merge-output-format mp4 https://www.youtube.com/watch?v=dvbePi1o_Q0 --output ./downloaded_celebvhq/raw/dvbePi1o_Q0.mp4 --external-downloader aria2c --external-downloader-args "-x 16 -k 1M"
video not found: dvbePi1o_Q0
yt-dlp  -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio' --skip-unavailable-fragments --merge-output-format mp4 https://www.youtube.com/watch?v=ekdWbe1s_d4 --output ./downloaded_celebvhq/raw/ekdWbe1s_d4.mp4 --external-downloader aria2c --external-downloader-args "-x 16 -k 1M"


Traceback (most recent call last):
  File "/Users/jessie/anaconda3/bin/yt-dlp", line 5, in <module>
    from yt_dlp import main
  File "/Users/jessie/anaconda3/lib/python3.11/site-packages/yt_dlp/__init__.py", line 18, in <module>
    from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS, CookieLoadError
  File "/Users/jessie/anaconda3/lib/python3.11/site-packages/yt_dlp/cookies.py", line 8, in <module>
    import http.cookiejar
  File "/Users/jessie/anaconda3/lib/python3.11/http/cookiejar.py", line 36, in <module>
    import urllib.parse, urllib.request
  File "/Users/jessie/anaconda3/lib/python3.11/urllib/request.py", line 88, in <module>
    import http.client
  File "/Users/jessie/anaconda3/lib/python3.11/http/client.py", line 71, in <module>
    import email.parser
  File "<frozen importlib._bootstrap>", line 1176, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1147, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 690, in _load_unlocke

[youtube] Extracting URL: https://www.youtube.com/watch?v=ekdWbe1s_d4
[youtube] ekdWbe1s_d4: Downloading webpage
[youtube] ekdWbe1s_d4: Downloading ios player API JSON
[youtube] ekdWbe1s_d4: Downloading mweb player API JSON
[youtube] ekdWbe1s_d4: Downloading m3u8 information
[info] ekdWbe1s_d4: Downloading 1 format(s): 137+140
[download] Destination: ./downloaded_celebvhq/raw/ekdWbe1s_d4.f137.mp4
[download]   1.6% of   31.18MiB at   14.17MiB/s ETA 00:02



[download] 100% of   31.18MiB in 00:00:01 at 17.99MiB/s    
[download] Destination: ./downloaded_celebvhq/raw/ekdWbe1s_d4.f140.m4a
[download] 100% of    2.03MiB in 00:00:00 at 8.97MiB/s   
yt-dlp  -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio' --skip-unavailable-fragments --merge-output-format mp4 https://www.youtube.com/watch?v=ekdWbe1s_d4 --output ./downloaded_celebvhq/raw/ekdWbe1s_d4.mp4 --external-downloader aria2c --external-downloader-args "-x 16 -k 1M"




[youtube] Extracting URL: https://www.youtube.com/watch?v=ekdWbe1s_d4
[youtube] ekdWbe1s_d4: Downloading webpage
[youtube] ekdWbe1s_d4: Downloading ios player API JSON
[youtube] ekdWbe1s_d4: Downloading mweb player API JSON
[youtube] ekdWbe1s_d4: Downloading m3u8 information
[info] ekdWbe1s_d4: Downloading 1 format(s): 137+140
[download] ./downloaded_celebvhq/raw/ekdWbe1s_d4.f137.mp4 has already been downloaded
[download] 100% of   31.18MiB
[download] ./downloaded_celebvhq/raw/ekdWbe1s_d4.f140.m4a has already been downloaded
[download] 100% of    2.03MiB
yt-dlp  -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio' --skip-unavailable-fragments --merge-output-format mp4 https://www.youtube.com/watch?v=HV150LwK8Kc --output ./downloaded_celebvhq/raw/HV150LwK8Kc.mp4 --external-downloader aria2c --external-downloader-args "-x 16 -k 1M"




[youtube] Extracting URL: https://www.youtube.com/watch?v=HV150LwK8Kc
[youtube] HV150LwK8Kc: Downloading webpage
[youtube] HV150LwK8Kc: Downloading ios player API JSON
[youtube] HV150LwK8Kc: Downloading mweb player API JSON
[youtube] HV150LwK8Kc: Downloading m3u8 information
[info] HV150LwK8Kc: Downloading 1 format(s): 137+140
[download] Destination: ./downloaded_celebvhq/raw/HV150LwK8Kc.f137.mp4
[download]   0.3% of  292.55MiB at   14.21MiB/s ETA 00:20



[download]  48.1% of  292.55MiB at   11.52MiB/s ETA 00:13  