In [None]:
import numpy as np
import pandas as pd
import yt_dlp # pip install "yt-dlp[default,curl-cffi]"
from tqdm import tqdm
import os
import glob
import json
from config import MEDIA_DIR, DOWNLOAD_DIR

In [None]:
# Make sure the download root exists; exist_ok avoids an error on re-runs.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
# Zero-based index of the metadata slice this run is responsible for.
# The names presumably map to indices 0-4 in order — TODO confirm.
part_index = 0 # Dat, Huan, Tuan, Phat, Khoa

In [None]:
# Collect every JSON metadata file found directly under MEDIA_DIR.
video_list = glob.glob(f"{MEDIA_DIR}/*.json")

# One dict per metadata file; these become the rows of the DataFrame.
data_list = []
# Keys dropped from each record before it is tabulated.
ignore_fields = ["description", "keywords", "title"]

for file in video_list:
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)  # Load JSON content

    # Row key = file name without its extension. splitext removes only the
    # final suffix, whereas str.replace('.json', '') would also strip a
    # '.json' that happens to appear in the middle of the name.
    data['name'] = os.path.splitext(os.path.basename(file))[0]
    for ignore_field in ignore_fields:
        data.pop(ignore_field, None)

    data_list.append(data)

# With no records there is no 'name' column and set_index would fail with an
# opaque KeyError; fail early with a message that names the real cause.
if not data_list:
    raise FileNotFoundError(f"No .json metadata files found in {MEDIA_DIR}")

# Index by video name and sort so the row order is deterministic.
df = pd.DataFrame(data_list).set_index('name').sort_index()
df.head()

In [None]:
# Number of slices the metadata table is divided into.
NUM_PARTS = 5
if not 0 <= part_index < NUM_PARTS:
    raise IndexError(
        f"part_index must be between 0 and {NUM_PARTS - 1}, got {part_index}"
    )

# Split row positions instead of the DataFrame itself: np.array_split on a
# DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
# Splitting the positions yields exactly the same part boundaries.
df_parts = [
    df.iloc[positions]
    for positions in np.array_split(np.arange(len(df)), NUM_PARTS)
]

# NOTE: `df` is rebound to the selected slice, so running this cell twice in a
# row would split the already-selected slice again. Re-run the loading cell
# before re-running this one.
df = df_parts[part_index] # select part here

In [None]:
# Maximum number of worker threads used for the parallel downloads.
MAX_WORKERS = 4

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed


# yt-dlp options shared by every download; the per-video output template is
# merged in separately for each call.
base_opts = dict(
    format='mp4',
    quiet=True,
    writesubtitles=True,
    subtitleslangs=['vi'],
    writeautomaticsub=True,
)

def download_video(name, row, download_dir=None):
    """Download a single video with yt-dlp unless it is already on disk.

    Args:
        name: Video identifier. The text before the first "_" is used as the
            sub-directory name and the full string as the file stem.
        row: Mapping-like record that provides the video's ``'watch_url'``.
        download_dir: Root directory for downloads. Defaults to the
            module-level ``DOWNLOAD_DIR`` when omitted.

    Returns:
        A status string saying whether the video was skipped, downloaded, or
        failed (including the error text). Download errors are reported in
        the return value rather than raised.
    """
    if download_dir is None:
        download_dir = DOWNLOAD_DIR

    batch = name.split("_")[0]  # prefix before the first "_" picks the sub-folder
    target_dir = f"{download_dir}/{batch}"
    # yt-dlp fills in %(ext)s itself when it writes the file.
    output_template = f"{target_dir}/{name}.%(ext)s"

    # The template (with a literal "%(ext)s") never exists on disk, so testing
    # it can never skip anything. Check for the finished video file instead;
    # the ".mp4" suffix mirrors base_opts['format'] — keep the two in sync.
    if os.path.exists(f"{target_dir}/{name}.mp4"):
        return f"Video {name} already exists, skipping."

    ydl_opts = {**base_opts, 'outtmpl': output_template}

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([row['watch_url']])
        return f"Downloaded {name}"
    except Exception as e:
        # Best effort: one failing video must not abort the whole batch.
        return f"Failed {name}: {e}"

# Fan the downloads out over a pool of MAX_WORKERS threads and print each
# status message as soon as its download finishes.
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(download_video, *item) for item in df.iterrows()]
    progress = tqdm(
        as_completed(futures),
        total=len(futures),
        desc="Downloading videos",
    )
    for finished in progress:
        print(finished.result())