In [1]:
import bertopic
from loguru import logger
import os
import yt_dlp
import glob
import json
from pathlib import Path
import pandas as pd
from dataclasses import dataclass, field
import sys
from pathlib import Path, PosixPath
import requests
from tqdm.notebook import tqdm

In [2]:
@dataclass
class video_info:
    """Record describing one video and the data collected for it.

    Only ``id`` participates in equality comparisons; every other field is
    declared with ``compare=False``, so two records with the same ``id`` are
    equal regardless of their remaining contents. All fields except ``id``
    default to the "empty" value of their type (``''``, ``0``, ``False`` or
    the current-directory path).
    """
    # Platform-specific video identifier (e.g. the YouTube ``v=`` value).
    id: str
    # Account name; needed to build a TikTok URL.
    username: str = field(default_factory=str, compare=False)
    # Whether the media file has been fetched (not set by this class itself).
    downloaded: bool = field(default_factory=bool, compare=False)
    # Location of the downloaded file. ``Path`` (not ``PosixPath``) is used as
    # the factory so the record can also be created on non-POSIX systems;
    # on POSIX ``Path()`` still yields a ``PosixPath``.
    video_location: Path = field(default_factory=Path, compare=False)
    # Engagement counters, all defaulting to 0.
    comment_count: int = field(default_factory=int, compare=False)
    # presumably TikTok's name for the like count — TODO confirm
    digg_count: int = field(default_factory=int, compare=False)
    play_count: int = field(default_factory=int, compare=False)
    share_count: int = field(default_factory=int, compare=False)
    whatsapp_share_count: int = field(default_factory=int, compare=False)
    # Free-text fields, all defaulting to the empty string.
    description: str = field(default_factory=str, compare=False)
    # presumably text recognised in the frames / transcribed from the audio
    # by later pipeline steps — TODO confirm against the code that fills them
    ocr_text: str = field(default_factory=str, compare=False)
    whisper_text: str = field(default_factory=str, compare=False)
    concatenated: str = field(default_factory=str, compare=False)

    def as_url(self, site='yt'):
        """Build the public URL of this video.

        Args:
            site: ``'yt'`` for a YouTube watch URL (default) or ``'tiktok'``
                for a TikTok video URL (which also uses ``self.username``).

        Returns:
            The URL as a string.

        Raises:
            ValueError: if ``site`` is not one of the supported values.
                (Previously the method fell through and returned ``None``.)
        """
        if site == 'yt':
            return f'https://www.youtube.com/watch?v={self.id}'
        if site == 'tiktok':
            return f'https://www.tiktok.com/@{self.username}/video/{self.id}'
        raise ValueError(f"Unsupported site {site!r}; expected 'yt' or 'tiktok'")

In [3]:
# Load the YouTube search export (one row per video) into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='Data/YT/videolist_search7071_2023_12_12-14_24_28.csv',
)

In [4]:
# Inspect which columns the search export provides.
df.columns

Index(['position', 'channelId', 'channelTitle', 'videoId', 'publishedAt',
       'publishedAtSQL', 'videoTitle', 'videoDescription', 'tags',
       'videoCategoryId', 'videoCategoryLabel', 'topicCategories', 'duration',
       'durationSec', 'dimension', 'definition', 'caption', 'defaultLanguage',
       'defaultLAudioLanguage', 'thumbnail_maxres', 'licensedContent',
       'locationDescription', 'latitude', 'longitude', 'viewCount',
       'likeCount', 'dislikeCount', 'favoriteCount', 'commentCount'],
      dtype='object')

In [6]:
def download(
        file,
        savepath,
        overwrite = False,
        max_download = None,
        audio_only = True,
        log_file = None,
        show_tqdm = True
    ):
    """Download every video listed in a CSV export with yt-dlp.

    Args:
        file: Path to a CSV file containing a ``videoId`` column.
        savepath: Existing directory below which the media is stored. Files
            go to ``<savepath>/audio`` or ``<savepath>/videos``.
        overwrite: When False (default) ids that already have a file in the
            target folder are skipped. When True they are passed to yt-dlp
            again with its ``overwrites`` option enabled.
        max_download: Optional cap on the number of download attempts
            (successful or failed; skipped files do not count).
        audio_only: When True (default) only the audio track is fetched and
            converted to m4a. When False yt-dlp's default format selection is
            used and no audio extraction is applied.
        log_file: If given, loguru output is written to this file; otherwise
            INFO and above go to stderr.
        show_tqdm: Wrap the loop in a tqdm progress bar.

    Returns:
        None. A summary of downloaded / skipped / failed counts is logged.

    Raises:
        AssertionError: if ``savepath`` does not exist.
    """
    # Replace every existing loguru sink so repeated calls do not stack handlers.
    logger.remove()
    if log_file:
        logger.add(log_file, backtrace=True, diagnose=True)
    else:
        logger.add(sys.stderr, level="INFO", backtrace=True, diagnose=True)

    assert os.path.exists(savepath), f'savepath does not exist: {savepath}'

    savefolder = 'audio' if audio_only else 'videos'
    # The output folder lives below savepath (not below the current working
    # directory), matching the 'outtmpl' used further down.
    target_dir = os.path.join(savepath, savefolder)
    os.makedirs(target_dir, exist_ok=True)

    if max_download is not None:
        max_download = int(max_download)

    # Drop missing ids and duplicates (keeping first-seen order) so no request
    # is wasted on 'nan' or on a video that was already handled in this run.
    video_ids = pd.read_csv(file)['videoId'].dropna().astype(str)
    all_ids = [video_info(id=video_id) for video_id in dict.fromkeys(video_ids)]

    # File stems are the video ids because of the '%(id)s.%(ext)s' template.
    existing_videos = {
        Path(found).stem for found in glob.glob(os.path.join(target_dir, '*.*'))
    }
    logger.debug(f'{sorted(existing_videos)[:10]}')

    ydl_opts = {
        'outtmpl': os.path.join(savepath, f"{savefolder}/%(id)s.%(ext)s"),
        'overwrites': overwrite,
        'logger': logger,
        # 'ffmpeg_location':'//opt//homebrew//bin//ffmpeg',
    }
    if audio_only:
        ydl_opts['format'] = 'm4a/bestaudio/best'
        # See help(yt_dlp.postprocessor) for a list of available
        # postprocessors and their arguments.
        ydl_opts['postprocessors'] = [{  # Extract audio using ffmpeg
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'm4a',
        }]

    skipped = 0
    success = 0
    errored = 0

    progress = tqdm(all_ids) if show_tqdm else all_ids

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        for counter, video in enumerate(progress):
            # Honour `overwrite`: existing files are only skipped when the
            # caller did not ask for them to be replaced.
            if not overwrite and video.id in existing_videos:
                logger.info(f'{video.id} already downloaded. Continuing...')
                skipped += 1
                continue
            # Attempts so far = successes + failures (skips are excluded).
            if max_download is not None and success + errored >= max_download:
                logger.info(f'Max download of {max_download} reached. Terminating...')
                break
            logger.info(f"processing {counter} of {len(all_ids)}, skipped {skipped}, errored: {errored}, max dl:{max_download}")
            try:
                # yt-dlp documents download() as taking a list of URLs.
                ydl.download([video.as_url(site='yt')])
                success += 1
            except Exception:
                # Keep going with the next video, but record which one failed
                # and why (logger.exception attaches the traceback).
                logger.exception(f'Something went wrong while downloading {video.id}')
                errored += 1
    logger.info(f'Downloaded {success} succesfully, skipped {skipped} existing, {errored} failed.')

In [7]:
# Fetch the audio for every video in the search export; log output is written
# to the file below instead of the notebook.
download(
    file='Data/YT/videolist_search7071_2023_12_12-14_24_28.csv',
    savepath='./Data/YT/',
    log_file='./logs/YT_audio_download.log',
    show_tqdm=True,
    # max_download=5,
)

  0%|          | 0/7071 [00:00<?, ?it/s]