In [None]:
from youtube_transcript_api import YouTubeTranscriptApi, Transcript
from pprint import pprint

import pathlib
import yt_dlp
import re

In [2]:
YT_ID_OLA_CIENCIA_01 = 'hVNEKpSZ_dE'
YT_ID_HNAC_01 = '6ERinp7x3kc'
YT_ID_HNAC_02 = 'fJOlZxNbRaY'

### Utils

In [3]:
def show_video_transcript_obj(transcript: Transcript, show_dirs=False, show_vars=False) -> None:
    """
    Display information about a YouTube transcript object.

    Args:
        transcript (Transcript): The transcript object to display information about.
        show_dirs (bool, optional): Whether to display the transcript object's methods and attributes. Defaults to False.
        show_vars (bool, optional): Whether to display the transcript object's variables and their values. Defaults to False.

    Returns:
        None
    """
    
    display_str = '\n\t'.join([
        'Transcript:',
        f'{transcript.video_id=}',
        f'{transcript.is_generated=}',
        f'{transcript.language=}',
        f'{transcript.language_code=}',
        f'translation_languages: {len(transcript.translation_languages)}',
    ])

    print(display_str)

    if show_dirs:
        display(transcript.__dir__())
    if show_vars:
        display(vars(transcript))


def show_video_metadata(metadata: dict, show_everything=False) -> None:

    # Show all (the many) available information
    if show_everything:
        pprint(metadata)
        return

    # Show only main information
    metadata = {
        'id': metadata.get('id'),
        'title': metadata.get('title'),
        'url': metadata.get('url'),
        'duration': metadata.get('duration'),
        'view_count': metadata.get('view_count'),
        'description': metadata.get('description'),
        'upload_date': metadata.get('upload_date'),
        'channel': metadata.get('channel'),
        'channel_id': metadata.get('channel_id'),
        'likes': metadata.get('like_count'),
        # 'thumbnail': metadata.get('thumbnail'),
    }

    display(metadata)
    

## Main

### Naming

In [14]:
path_out_root = '../out'
path_out_cache = '../out/.cache'

In [25]:
import re


def adsfa():

    import re

    # Example filename
    filename_with_extension = "example_file.txt"

    

    if match:
        filename = match.group(1)  # Filename without extension
        extension = match.group(2)  # File extension
    else:
        print("No match found")




def get_std_out_filename(title: str, max_length=20) -> str:
    ''' 
        # TODO: 2024-11-15 - ADD Description
    ''' 
    
    # Separate filename and extension
    match = re.match(r"(.+)\.(.+)", title)
    title = match.group(1) if match else title
    extension = match.group(2) if match else ''
    
    # Remove special characters except spaces
    title = title.lower()
    
    title = re.sub(r'[^a-z0-9\s]', '', title) 
    title = re.sub(r'áàã', 'a', title)
    title = re.sub(r'éê', 'e', title)
    title = re.sub(r'íì', 'i', title)
    title = re.sub(r'óôõ', 'o', title)
    title = re.sub(r'ç', 'c', title)
    
    # Replace spaces with dashes
    title = re.sub(r'[\s\-]+', '-', title) 
    
    # Assure max length
    if len(title) > max_length:
        title = title[:(max_length - 1)] + '-'
    
    # Return
    if extension:
        title += f'.{extension}'
    
    return title


def get_out_file_full_path(title: str, max_length=20, prefix: str = '') -> str:
    ''' 
        # TODO: 2024-11-15 - ADD Description
    ''' 

    std_name = get_std_out_filename(title=title, max_length=max_length)

    # All output files are grouped and have the same of their parent folder
    full_path = pathlib.Path(prefix) / std_name / std_name
    full_path.parent.mkdir(parents=True, exist_ok=True)
    return full_path.absolute()



# pathlib.Path.mkdir()

# test_title = 'Apresentação para processo seletivo Americanas Futuro Polo Tech'
# t1 = get_std_out_filename(title=test_title)
# print(f'{t1=}')

# t1 = get_out_file_full_path(title=test_title, prefix=path_out_root)
# print(f'{t1=}')

In [26]:
def get_yt_video_info(
        video_id_or_url: str, download_audio=False, display_metadata=False, is_verbose=True, is_windows=True,
        path_out=path_out_root, path_cache=path_out_cache,
        max_out_length=20,
) -> None:
    try:

        # Set options for YouTube downloader
        ydl_opts = {
            
            'format': 'bestaudio' if download_audio else None,  # Download best audio-only format
            
            'quiet': not is_verbose,
            'no_warnings': False,
            'consoletitle': True,       # Display progress in the console window's titlebar.

            'windowsfilenames': is_windows,
            'outtmpl': f'{path_cache}/%(title)s.%(ext)s',
            'extract_flat': False, # Flattens the output directory structure
            
            'skip_download': not download_audio,  # Skip the actual download of the video file
            'noplaylist': True,         # Download single video instead of a playlist if in doubt.
            # 'simulate': True,       # Don't download video files. If unset (or None) -> Simulate only if listsubtitles, listformats or list_thumbnails is used
            # 'getcomments': False,        # Will not be written to disk unless writeinfojson is also given
            
            'writedescription': True,   # Write the video description to a .description file
            'forcejson': True,  # Force printing info_dict as JSON.
            'writeinfojson': True,      # Write the video description to a .info.json file
            # 'clean_infojson': _,        # Remove internal metadata from the infojson

            'keepvideo': True,          # Keep the video file after post-processing
            'cachedir': path_cache,      # Location of the cache files in the filesystem. False to disable filesystem cache.
            # 'logger': _ # Log messages to a logging.Logger instance.': True,
            
            # 'cookiefile': 'cookies.txt',
            # 'cookiesfrombrowser': (_, _, _),    # (tuple) name of the browser + profile name/path from where cookies are loaded + the name of the keyring

        }

        extra_file_ext_map = {
            'description': 'description',
            'infojson': 'info.json',
        }

        # Run search
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            
            metadata = ydl.extract_info(video_id_or_url, download=download_audio)
            # if display_metadata:
                # show_video_metadata(metadata=metadata, show_everything=False)

            for _, ext in extra_file_ext_map.items():
                default_file_name = f'{metadata.get("title")}.{ext}'
                print(f'{default_file_name=}')
                # path_default = pathlib.Path(default_file_name).absolute()
                path_new = get_out_file_full_path(title=default_file_name, max_length=max_out_length, prefix=path_out_root)
                print(f'{path_new=}')
                pathlib.Path(default_file_name).replace(path_new)

            path_out_media = ydl.prepare_filename(metadata)
            pprint('path_out_media')
            pprint(path_out_media)

        
        # return metadata
    
    except Exception as e:
        # return f"Error extracting metadata: {str(e)}"
        print(f"Error extracting metadata: {str(e)}")
        raise
    
get_yt_video_info(video_id_or_url=YT_ID_HNAC_01, download_audio=True, )

[youtube] Extracting URL: 6ERinp7x3kc
[youtube] 6ERinp7x3kc: Downloading webpage
[youtube] 6ERinp7x3kc: Downloading ios player API JSON
[youtube] 6ERinp7x3kc: Downloading mweb player API JSON
[youtube] 6ERinp7x3kc: Downloading m3u8 information
[info] 6ERinp7x3kc: Downloading 1 format(s): 251
{"id": "6ERinp7x3kc", "title": "Introducao a LIBRAS - Apresenta\u00e7\u00e3o: Hebert Costa", "formats": [{"format_id": "sb2", "format_note": "storyboard", "ext": "mhtml", "protocol": "mhtml", "acodec": "none", "vcodec": "none", "url": "https://i.ytimg.com/sb/6ERinp7x3kc/storyboard3_L0/default.jpg?sqp=-oaymwENSDfyq4qpAwVwAcABBqLzl_8DBginhujABQ==&sigh=rs$AOn4CLAfKL5PcI6BkESi_0K_gfSdjdE0Ng", "width": 48, "height": 27, "fps": 1.5873015873015872, "rows": 10, "columns": 10, "fragments": [{"url": "https://i.ytimg.com/sb/6ERinp7x3kc/storyboard3_L0/default.jpg?sqp=-oaymwENSDfyq4qpAwVwAcABBqLzl_8DBginhujABQ==&sigh=rs$AOn4CLAfKL5PcI6BkESi_0K_gfSdjdE0Ng", "duration": 63.0}], "audio_ext": "none", "video_ext": "

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'Introducao a LIBRAS - Apresentação: Hebert Costa.description' -> 'c:\\Users\\hcosta\\Documents\\__code\\rename\\transcription\\src\\..\\out\\introducao-a-libras-.description\\introducao-a-libras-.description'

In [4]:
def get_transcript(video_id: str, display_metadata=False) -> None:

    try:

        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id=video_id, cookies=None)
        
        for transcript in transcript_list:

            if display_metadata:
                show_video_transcript_obj(transcript=transcript, show_dirs=True, show_vars=True)
            
            # transcript_text = transcript.fetch()
            # open('transcript.json', 'w').write(json.dumps(transcript_text, indent=4))
            
            ''' 
                # TODO: 2024-11-06 - Check this
            ''' 
            return transcript

    except Exception as e:
        print('fu')
        raise e


### Test

In [None]:
transcript1 = get_transcript(video_id=YT_ID_OLA_CIENCIA_01)
show_video_transcript_obj(transcript=transcript1, show_dirs=True, show_vars=True)

transcript2 = get_transcript(video_id=YT_ID_HNAC_01)
show_video_transcript_obj(transcript=transcript2, show_dirs=True, show_vars=True)
