In [None]:
import numpy as np
import pandas as pd
import yt_dlp
from tqdm import tqdm
import os
import glob
import json

In [None]:
# Directory that is globbed for the *.json metadata files loaded into the table.
MEDIA_PATH = '/mnt/d/AI Challenge/Data/media-info'

In [None]:
# Build a metadata table with one row per JSON file found in MEDIA_PATH.
# The list is sorted so that row order -- and therefore any positional split of
# the table -- is reproducible; glob.glob() returns files in arbitrary,
# OS-dependent order.
video_list = sorted(glob.glob(f"{MEDIA_PATH}/*.json"))
if not video_list:
    # Fail early with a clear message instead of the KeyError that
    # set_index('name') would raise on an empty frame.
    raise FileNotFoundError(f"No .json files found in {MEDIA_PATH}")

# Fields dropped from every record before it goes into the table.
ignore_fields = ["description", "keywords", "title"]

# List to hold data
data_list = []

for file in video_list:
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)  # Load JSON content

    # Row key is the file name without its extension. splitext strips only the
    # trailing ".json"; str.replace would also remove a ".json" that happens to
    # occur in the middle of the name.
    data['name'] = os.path.splitext(os.path.basename(file))[0]
    for ignore_field in ignore_fields:
        data.pop(ignore_field, None)

    data_list.append(data)

# Index by file stem; chained instead of inplace so re-running the cell always
# starts from a freshly built frame.
df = pd.DataFrame(data_list).set_index('name')
df.head()

In [None]:
# Split the table into N_PARTS consecutive, near-equal chunks of rows.
# The split is computed on row positions and applied with .iloc rather than
# passing the DataFrame itself to np.array_split, which goes through the
# deprecated DataFrame.swapaxes in recent pandas releases.
N_PARTS = 5
df_parts = [df.iloc[rows] for rows in np.array_split(np.arange(len(df)), N_PARTS)]

# Show the first and last index label of each part
for i, part in enumerate(df_parts, 1):
    if part.empty:
        # Happens when df has fewer than N_PARTS rows; index[0] would raise.
        print(f"Part {i}:\n", "(empty)", "\n")
    else:
        print(f"Part {i}:\n", part.index[0], part.index[-1], "\n")

In [None]:
# Keep only the first chunk; the download loop iterates over `df`.
# NOTE: this rebinds `df`, so re-running the split cell after this one would
# split the already-reduced table rather than the full one.
df = df_parts[0]

In [None]:
# Destination directory for downloaded videos and subtitle files.
DOWNLOAD_FOLDER = '/mnt/d/AI Challenge/Data/video'
os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

# yt-dlp options.
# 'mp4' asks yt-dlp for the best format served as a single MP4 file; it does
# not cap the resolution. The commented-out selector below is the one that
# limits height to 360p.
ydl_opts = {
    'format': 'mp4',
    # 'format': 'bestvideo[height<=360]+bestaudio/best[height<=360]',
    # 'merge_output_format': 'mp4',
    'outtmpl': f'{DOWNLOAD_FOLDER}/%(id)s.%(ext)s',
    'quiet': True,

    # Subtitle options
    'writesubtitles': True,           # Download subtitles
    'subtitleslangs': ['vi'],         # Vietnamese only
    'writeautomaticsub': True,        # Download auto-generated if manual not available
    # 'embedsubtitles': True,           # Embed subtitles into the video (for mp4/mkv)

    # 'postprocessor_args': ['-c:v', 'libx264', '-preset', 'veryfast', '-crf', '23']
}

# Index labels of rows whose download raised, kept so they can be retried.
failed_downloads = []

# One downloader instance is reused for every row of df.
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    for name, row in tqdm(df.iterrows(), total=len(df), desc="Downloading videos"):
        # NOTE(review): this checks for "<name>.mp4" while 'outtmpl' writes
        # "<video id>.<ext>". The skip only triggers if the index label equals
        # the id yt-dlp extracts -- confirm against the metadata files.
        if os.path.exists(os.path.join(DOWNLOAD_FOLDER, f"{name}.mp4")):
            # tqdm.write keeps the progress bar intact; print() would break it.
            tqdm.write(f"Video {name} already exists, skipping download.")
            continue
        try:
            ydl.download([row['watch_url']])
        except Exception as e:
            # Best effort: record the failure and move on to the next video.
            failed_downloads.append(name)
            tqdm.write(f"Failed to download {name}: {e}")

if failed_downloads:
    print(f"{len(failed_downloads)} of {len(df)} downloads failed; see failed_downloads.")