In [7]:
import yt_dlp
import os
import re

def extract_text_from_vtt(vtt_file_path):
    """
    Extrai o texto de um arquivo .vtt, removendo as marcações de tempo, formatação e duplicatas.
    """
    if not os.path.exists(vtt_file_path):
        return None

    subtitle_text = []
    with open(vtt_file_path, 'r', encoding='utf-8') as f:
        previous_line = None
        for line in f:
            # Remove as marcações de tempo e formatação usando expressão regular
            cleaned_line = re.sub(r'<[^>]+>', '', line).strip()
            # Ignora linhas vazias e continua adicionando o texto limpo
            if cleaned_line and not '-->' in cleaned_line:
                # Evita adicionar linhas duplicadas consecutivamente
                if cleaned_line != previous_line:
                    subtitle_text.append(cleaned_line)
                previous_line = cleaned_line
    
    # Junta o texto das legendas em uma única string, removendo possíveis duplicatas
    subtitle_text = "\n".join(subtitle_text)
    
    # Remover duplicatas em linhas não consecutivas
    unique_lines = list(dict.fromkeys(subtitle_text.splitlines()))
    return "\n".join(unique_lines)

def download_video_and_subtitles(url):
    try:
        # Opções de download para vídeo e legendas
        ydl_opts = {
            'outtmpl': 'downloaded_video.%(ext)s',  # Nome do arquivo de saída
            'format': 'best',  # Baixa a melhor qualidade disponível
            'subtitleslangs': ['en'],  # Baixa as legendas em inglês
            'writesubtitles': True,  # Especifica que queremos as legendas
            'writeautomaticsub': True,  # Tenta pegar as legendas automáticas, se disponíveis
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=True)
            video_filename = ydl.prepare_filename(info_dict)

            # Verifica se as legendas foram baixadas
            subtitle_filename = video_filename.rsplit('.', 1)[0] + ".en.vtt"
            subtitle_text = None
            if os.path.exists(subtitle_filename):
                subtitle_text = extract_text_from_vtt(subtitle_filename)

            return video_filename, subtitle_text

    except Exception as e:
        print(f"Failed to download video or subtitles: {str(e)}")
        return None, None

# Exemplo de uso:
url = 'https://www.youtube.com/watch?v=-gYpCIbZjUQ'
video_filename, subtitles_text = download_video_and_subtitles(url)

if video_filename:
    print(f"Video downloaded: {video_filename}")
    if subtitles_text:
        print("Subtitles text extracted:")
        print(subtitles_text)
    else:
        print("No subtitles found.")
else:
    print("Failed to download video.")

[youtube] Extracting URL: https://www.youtube.com/watch?v=-gYpCIbZjUQ
[youtube] -gYpCIbZjUQ: Downloading webpage
[youtube] -gYpCIbZjUQ: Downloading ios player API JSON
[youtube] -gYpCIbZjUQ: Downloading web creator player API JSON
[youtube] -gYpCIbZjUQ: Downloading m3u8 information
[info] -gYpCIbZjUQ: Downloading subtitles: en
[info] -gYpCIbZjUQ: Downloading 1 format(s): 18
Deleting existing file downloaded_video.en.vtt
[info] Writing video subtitles to: downloaded_video.en.vtt
[download] Destination: downloaded_video.en.vtt
[download] 100% of   94.17KiB in 00:00:00 at 633.78KiB/s
[download] downloaded_video.mp4 has already been downloaded
[download] 100% of   14.95MiB
Video downloaded: downloaded_video.mp4
Subtitles text extracted:
WEBVTT
Kind: captions
Language: en
foreign
[Music]
part of Spiritual Development is to
recognize the satanic tendencies that
characterize you
and to fully wrestle with them and to
and to integrate them that's the thing
it's it's not so much to cast them awa