In [37]:
from youtube_transcript_api import YouTubeTranscriptApi, Transcript
from pprint import pprint

import warnings

import pathlib
import yt_dlp
import re

In [2]:
YT_ID_OLA_CIENCIA_01 = 'hVNEKpSZ_dE'
YT_ID_HNAC_01 = '6ERinp7x3kc'
YT_ID_HNAC_02 = 'fJOlZxNbRaY'

### Utils

In [3]:
def show_video_transcript_obj(transcript: Transcript, show_dirs=False, show_vars=False) -> None:
    """
    Display information about a YouTube transcript object.

    Args:
        transcript (Transcript): The transcript object to display information about.
        show_dirs (bool, optional): Whether to display the transcript object's methods and attributes. Defaults to False.
        show_vars (bool, optional): Whether to display the transcript object's variables and their values. Defaults to False.

    Returns:
        None
    """
    
    display_str = '\n\t'.join([
        'Transcript:',
        f'{transcript.video_id=}',
        f'{transcript.is_generated=}',
        f'{transcript.language=}',
        f'{transcript.language_code=}',
        f'translation_languages: {len(transcript.translation_languages)}',
    ])

    print(display_str)

    if show_dirs:
        display(transcript.__dir__())
    if show_vars:
        display(vars(transcript))


def show_video_metadata(metadata: dict, show_everything=False) -> None:

    # Show all (the many) available information
    if show_everything:
        pprint(metadata)
        return

    # Show only main information
    metadata = {
        'id': metadata.get('id'),
        'title': metadata.get('title'),
        'url': metadata.get('url'),
        'duration': metadata.get('duration'),
        'view_count': metadata.get('view_count'),
        'description': metadata.get('description'),
        'upload_date': metadata.get('upload_date'),
        'channel': metadata.get('channel'),
        'channel_id': metadata.get('channel_id'),
        'likes': metadata.get('like_count'),
        # 'thumbnail': metadata.get('thumbnail'),
    }

    display(metadata)
    

## Main

### Naming

In [3]:
path_out_root = '../out'
path_out_cache = '../out/.cache'

Function to split file names, path and extension

In [None]:

def get_filename_and_extension(file_path: str, expected_extension: str = None) -> tuple | None:
    ''' 
        Parse a filename into its path, name, and extension.

        Args:
            file_path (str): Path to the file to parse.
            expected_extension (str, optional): The expected file extension. Defaults to None.

        Returns:
            tuple | None: A tuple containing the file path, name, and extension. If the filename does not match the expected pattern, returns None.

        Raises:
            ValueError
    ''' 

    # Build regex
    extension_pattern = expected_extension or '\\w+'
    patter_str = r'^(.*[\\\/])(.+)\.(' + extension_pattern + r')$'
    # print(f'{patter_str=}')
    regex = re.compile(patter_str)
    
    # Test string for pattern matching
    match = re.match(regex, file_path)

    if not match:
        if expected_extension:
            raise ValueError(f'File parsing failed for file "{file_path}" with extension ".{expected_extension}"')
        return None

    # Split file path, file name and extension
    dir_path = match.group(1)
    file_name = match.group(2)
    extension = match.group(3)
    
    return dir_path, file_name, extension


def get_filename_and_extension_test() -> None:
    ''' 
        Test for the get_filename_and_extension function.
    ''' 

    n_errors = 0
    
    # Test 01"
    try:
        
        filename1 = 'heber\Documents\_code\hjcostabr76\rename\transcription\src\Introducao a LIBRAS - Apresentação Hebert Costa [6ERinp7x3kc].mp3'
        result1 = get_filename_and_extension(file_path=filename1)
        
        assert result1[0] == 'heber\\Documents\\_code\\hjcostabr76\rename\transcription\\src\\'
        assert result1[1] == 'Introducao a LIBRAS - Apresentação Hebert Costa [6ERinp7x3kc]'
        assert result1[2] == 'mp3'

    except AssertionError:
        display('"get_filename_and_extension_test::" 01 FAILED!')
        print(f'{result1=}')
        n_errors += 1
    else:
        display('"get_filename_and_extension_test::" 01 OK')

    # # Test 02
    try:
        
        filename2 = 'heber\Documents\_code\hjcostabr76\rename\transcription\src\Introducao a LIBRAS - Apresentação Hebert Costa [6ERinp7x3kc].info.json'
        result2 = get_filename_and_extension(file_path=filename2, expected_extension='info.json')
        
        assert result2[0] == 'heber\\Documents\\_code\\hjcostabr76\rename\transcription\\src\\'
        assert result2[1] == 'Introducao a LIBRAS - Apresentação Hebert Costa [6ERinp7x3kc]'
        assert result2[2] == 'info.json'

    except AssertionError:
        display('"get_filename_and_extension_test::" 02 FAILED!')
        print(f'{result2=}')
        n_errors += 1
    else:
        display('"get_filename_and_extension_test::" 02 OK')

    # # Test 03
    try:
        
        filename3 = 'heber\Documents\_code\hjcostabr76\rename\transcription\src\Introducao a LIBRAS - Apresentação Hebert Costa [6ERinp7x3kc].mp4'
        
        try:
            result2 = get_filename_and_extension(file_path=filename3, expected_extension='info.json')
        except ValueError as err:
            assert str(err) == f'File parsing failed for file "{filename3}" with extension ".info.json"'
        
    except AssertionError:
        display('"get_filename_and_extension_test::" 03 FAILED!')
        n_errors += 1
    else:
        display('"get_filename_and_extension_test::" 03 OK')

    if not n_errors:
        display('"get_filename_and_extension_test::" All tests passed')
    else:
        raise AssertionError(f'{n_errors} failed for "get_filename_and_extension" function!')
    
get_filename_and_extension_test()


  filename1 = 'heber\Documents\_code\hjcostabr76\rename\transcription\src\Introducao a LIBRAS - Apresentação Hebert Costa [6ERinp7x3kc].mp3'
  filename2 = 'heber\Documents\_code\hjcostabr76\rename\transcription\src\Introducao a LIBRAS - Apresentação Hebert Costa [6ERinp7x3kc].info.json'
  filename3 = 'heber\Documents\_code\hjcostabr76\rename\transcription\src\Introducao a LIBRAS - Apresentação Hebert Costa [6ERinp7x3kc].mp4'


'"get_filename_and_extension_test::" 01 OK'

'"get_filename_and_extension_test::" 02 OK'

'"get_filename_and_extension_test::" 03 OK'

'"get_filename_and_extension_test::" All tests passed'

In [None]:
def get_std_out_filename(file_path: str, max_length=30, expected_extension: str = '') -> tuple:
    """
    Generate a standardized output filename based on the provided file path.

    Args:
        file_path (str): The original file path.
        max_length (int, optional): The maximum length of the standardized filename. Defaults to 30.
        expected_extension (str, optional): The expected file extension.
            - If it is present, an error is raised in case of a mismatch.
            - It if is not present, the function tries automatically detect the file extension.

    Returns:
        tuple: A tuple containing the standardized filename and a dictionary with additional details.
    """
    
    dir_path, file_name, extension = get_filename_and_extension(file_path=file_path, expected_extension=expected_extension)
    
    # Remove special characters except spaces
    file_name = file_name.lower()
    file_name = re.sub(r'[^\w\s-]', '', file_name)
    file_name = re.sub(r'áàã', 'a', file_name)
    file_name = re.sub(r'éê', 'e', file_name)
    file_name = re.sub(r'íì', 'i', file_name)
    file_name = re.sub(r'óôõ', 'o', file_name)
    file_name = re.sub(r'ç', 'c', file_name)
    
    # Replace spaces with dashes
    file_name = re.sub(r'[\s\-]+', '-', file_name) 
    
    # Assure max length
    if len(file_name) > max_length:
        file_name = file_name[:(max_length - 1)] + '-'
    
    # Build and return standardized path
    std_name_full = file_name
    if extension:
        std_name_full += f'.{extension}'

    result_details = {
        'full_name': std_name_full,
        'name': file_name,
        'extension': extension
    }

    return std_name_full, result_details


def get_out_file_full_path(file_path: str, max_length=30, prefix: str = '', extension: str = '') -> str:
    ''' 
        # TODO: 2024-11-15 - ADD Description
    '''

    _, std_name_details = get_std_out_filename(file_path=file_path, max_length=max_length, expected_extension=extension)
    print(f'{std_name_details=}')

    # Create target folder
    # NOTE: 2024-11-19 - All output files are grouped and have the same of their parent folder
    full_path = pathlib.Path(prefix) / std_name_details.get('name') / std_name_details.get('full_name')
    full_path.parent.mkdir(parents=True, exist_ok=True)
    return str(full_path.absolute())

def standardize_out_files(video_filename: str, path_out=path_out_root, max_out_length=30, ) -> dict:
    ''' 
        # TODO: 2024-11-19 - ADD Description
    ''' 

    print(f'{video_filename=}')

    dir_path, video_title, media_extension = get_filename_and_extension(file_path=video_filename)
    print(f'{dir_path=}')
    print(f'{video_title=}')
    print(f'{media_extension=}')

    extra_file_ext_map = {
        'media': media_extension,
        'description': 'description',
        'info': 'info.json',
    }
    
    return_map = {}

    for file_type, ext in extra_file_ext_map.items():
        
        # Parse default file name
        default_file_name = f'{video_title}.{ext}'
        # path_default = pathlib.Path(default_file_name).absolute()
        path_default = (pathlib.Path(dir_path) / default_file_name).absolute()
        print(f'{default_file_name=}')
        print(f'{path_default=}')

        if not path_default.exists():
            warnings.warn(f'File not found: {str(path_default)}')
            continue

        # Rename to standard naming
        path_new_str = get_out_file_full_path(file_path=str(path_default), max_length=max_out_length, prefix=path_out, extension=ext)
        print(f'{path_new_str=}')

        path_new = path_default.replace(path_new_str)
        print(f'{path_new=}')
        return_map[file_type] = str(path_new.absolute())

    return return_map

# test_title = 'Apresentação para processo seletivo Americanas Futuro Polo Tech'
# t1 = get_std_out_filename(title=test_title)
# print(f'{t1=}')

# t1 = get_out_file_full_path(title=test_title, prefix=path_out_root)
# print(f'{t1=}')

In [None]:

def get_yt_video_info(
        video_id_or_url: str, max_out_length=30,
        download_audio=False, is_verbose=True, is_windows=True,
        path_out=path_out_root, path_cache=path_out_cache,
) -> dict:
    try: 

        ''' 
            # NOTE: 2024-11-13 - See 'Format Selection' descriptions
            @link https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#format-selection
        ''' 

        # Set options for YouTube downloader
        ydl_opts = {
            
            'format': 'bestaudio' if download_audio else None,  # Download best audio-only format
            
            'quiet': not is_verbose,
            'no_warnings': False,
            'consoletitle': True,       # Display progress in the console window's titlebar.

            'windowsfilenames': is_windows,
            'outtmpl': f'{path_cache}/%(title)s.%(ext)s',
            'extract_flat': False, # Flattens the output directory structure
            
            'skip_download': not download_audio,  # Skip the actual download of the video file
            'noplaylist': True,         # Download single video instead of a playlist if in doubt.
            # 'simulate': True,       # Don't download video files. If unset (or None) -> Simulate only if listsubtitles, listformats or list_thumbnails is used
            # 'getcomments': False,        # Will not be written to disk unless writeinfojson is also given
            
            'writedescription': True,   # Write the video description to a .description file
            'forcejson': True,  # Force printing info_dict as JSON.
            'writeinfojson': True,      # Write the video description to a .info.json file
            # 'clean_infojson': _,        # Remove internal metadata from the infojson

            'keepvideo': True,          # Keep the video file after post-processing
            'cachedir': path_cache,      # Location of the cache files in the filesystem. False to disable filesystem cache.
            # 'logger': _ # Log messages to a logging.Logger instance.': True,
            
            # 'cookiefile': 'cookies.txt',
            # 'cookiesfrombrowser': (_, _, _),    # (tuple) name of the browser + profile name/path from where cookies are loaded + the name of the keyring

        }

        # TODO: 2024-11-13 - Will we really need this?
        # ydl_opts['postprocessors'] = [{
        #     'key': 'FFmpegExtractAudio',  # Extract audio using FFmpeg
        #     'preferredcodec': 'mp3',      # Convert to mp3
        #     'preferredquality': '192',    # Set audio quality
        # }]

        # Run search
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            
            metadata = ydl.extract_info(video_id_or_url, download=download_audio)
            
            # Adjust output file names
            path_out_media_default = ydl.prepare_filename(metadata)
            print(f'{path_out_media_default=}')
            out_files_map = standardize_out_files(video_filename=path_out_media_default, path_out=path_out, max_out_length=max_out_length)

        return out_files_map

    except FileNotFoundError as err:
        print(f"File not found: {err=}")
        raise err

    except Exception as err:
        pprint( f"Error extracting metadata: {str(err)}")
        raise err
    
get_yt_video_info(video_id_or_url=YT_ID_HNAC_01, download_audio=True, )
# pprint(foo)

[youtube] Extracting URL: 6ERinp7x3kc
[youtube] 6ERinp7x3kc: Downloading webpage
[youtube] 6ERinp7x3kc: Downloading ios player API JSON
[youtube] 6ERinp7x3kc: Downloading mweb player API JSON
[youtube] 6ERinp7x3kc: Downloading player 5f315c3d
[youtube] 6ERinp7x3kc: Downloading m3u8 information
[info] 6ERinp7x3kc: Downloading 1 format(s): 251
{"id": "6ERinp7x3kc", "title": "Introducao a LIBRAS - Apresenta\u00e7\u00e3o: Hebert Costa", "formats": [{"format_id": "sb2", "format_note": "storyboard", "ext": "mhtml", "protocol": "mhtml", "acodec": "none", "vcodec": "none", "url": "https://i.ytimg.com/sb/6ERinp7x3kc/storyboard3_L0/default.jpg?sqp=-oaymwENSDfyq4qpAwVwAcABBqLzl_8DBginhujABQ==&sigh=rs$AOn4CLAfKL5PcI6BkESi_0K_gfSdjdE0Ng", "width": 48, "height": 27, "fps": 1.5873015873015872, "rows": 10, "columns": 10, "fragments": [{"url": "https://i.ytimg.com/sb/6ERinp7x3kc/storyboard3_L0/default.jpg?sqp=-oaymwENSDfyq4qpAwVwAcABBqLzl_8DBginhujABQ==&sigh=rs$AOn4CLAfKL5PcI6BkESi_0K_gfSdjdE0Ng", "dur

In [4]:
def get_transcript(video_id: str, display_metadata=False) -> None:

    try:

        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id=video_id, cookies=None)
        
        for transcript in transcript_list:

            if display_metadata:
                show_video_transcript_obj(transcript=transcript, show_dirs=True, show_vars=True)
            
            # transcript_text = transcript.fetch()
            # open('transcript.json', 'w').write(json.dumps(transcript_text, indent=4))
            
            ''' 
                # TODO: 2024-11-06 - Check this
            ''' 
            return transcript

    except Exception as e:
        print('fu')
        raise e


### Test

In [None]:
transcript1 = get_transcript(video_id=YT_ID_OLA_CIENCIA_01)
show_video_transcript_obj(transcript=transcript1, show_dirs=True, show_vars=True)

transcript2 = get_transcript(video_id=YT_ID_HNAC_01)
show_video_transcript_obj(transcript=transcript2, show_dirs=True, show_vars=True)
