# In [23]:
import nltk
from youtube_transcript_api import YouTubeTranscriptApi

def get_captions(youtube_video_id):
    """Fetch the transcript of a YouTube video.

    Args:
        youtube_video_id: Video ID passed to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The value returned by ``get_transcript`` on success. If the call
        raises, the single-entry placeholder
        ``[{'text': 'TranscriptsDisabled'}]`` is returned instead.
    """
    try:
        return YouTubeTranscriptApi.get_transcript(youtube_video_id)
    except Exception:
        # Deliberate best-effort fallback. ``except Exception`` (rather than a
        # bare ``except``) lets KeyboardInterrupt and SystemExit propagate so
        # the user can still abort a hanging request.
        # Note the placeholder entry has a 'text' key only -- no 'start'.
        return [{'text': 'TranscriptsDisabled'}]

def get_english_captions(captions):
    """Select the caption entries tagged with an English-like language code.

    The codes 'en', 'a.en' and 'und' are tried in that order; the entries
    matching the first code that yields any result are returned. Entries
    without a 'language' key never match. Returns an empty list when no code
    matches.
    """
    for language_code in ('en', 'a.en', 'und'):
        matching = [
            entry for entry in captions
            if 'language' in entry and entry['language'] == language_code
        ]
        if matching:
            return matching
    return []

def captions_to_list(captions):
    """Return the 'text' value of every caption entry, in order, as a list."""
    return [entry['text'] for entry in captions]

def stem_text(text_list):
    """Stem every string in ``text_list`` with NLTK's Porter stemmer.

    Each string is tokenized with ``nltk.word_tokenize``, every token is
    stemmed, and the stems are re-joined with single spaces. Returns a list
    with one stemmed string per input string, in the same order.
    """
    stemmer = nltk.PorterStemmer()
    return [
        ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(sentence))
        for sentence in text_list
    ]

def save_files(youtube_video_id, captions, text_list, stemmed_list):
    """Write the captions and their derived forms to four UTF-8 text files.

    All file names are prefixed with ``youtube_video_id`` and are opened by
    relative name for writing (existing files are overwritten):

    * ``-captions-without-timestamps.txt``: the text of each entry selected
      by ``get_english_captions(captions)``, one per line.
    * ``-captions-with-timestamps.txt``: ``"<start> <text>"`` for each entry
      of ``captions``; entries without a 'start' key are written as text only.
    * ``-list.txt``: ``str(text_list)``.
    * ``-stemmed-list.txt``: each item of ``stemmed_list`` on its own line.

    Args:
        youtube_video_id: Prefix used for the output file names.
        captions: Iterable of caption dicts, each with a 'text' key.
        text_list: Object whose ``str()`` form is written to the list file.
        stemmed_list: Iterable of strings.
    """
    # NOTE(review): get_english_captions keeps only entries that carry a
    # 'language' key. If the transcript entries do not include that key, this
    # file will always be empty -- confirm against the transcript entry schema.
    english_captions = get_english_captions(captions)
    with open(f"{youtube_video_id}-captions-without-timestamps.txt", "w", encoding="utf-8") as file:
        file.writelines(entry['text'] + "\n" for entry in english_captions)

    with open(f"{youtube_video_id}-captions-with-timestamps.txt", "w", encoding="utf-8") as file:
        for entry in captions:
            if 'start' in entry:
                file.write(str(entry['start']) + " " + entry['text'] + "\n")
            else:
                # The placeholder returned by get_captions on failure has no
                # 'start' key; write the text alone instead of raising
                # KeyError and leaving the remaining files unwritten.
                file.write(entry['text'] + "\n")

    with open(f"{youtube_video_id}-list.txt", "w", encoding="utf-8") as file:
        file.write(str(text_list))

    with open(f"{youtube_video_id}-stemmed-list.txt", "w", encoding="utf-8") as file:
        file.writelines(stemmed + "\n" for stemmed in stemmed_list)

# Example usage: runs the whole pipeline for one hard-coded video ID.
# These statements execute at module level, so running this file calls
# get_captions and writes the output files as a side effect.
youtube_video_id = "U_n41RLJu_w"
# Transcript entries, or the placeholder entry if get_captions hit an error.
captions = get_captions(youtube_video_id)
# The 'text' value of each caption entry.
text_list = captions_to_list(captions)
# The same texts after tokenizing and Porter-stemming.
stemmed_list = stem_text(text_list)
# Write the four "<video id>-*.txt" output files.
save_files(youtube_video_id, captions, text_list, stemmed_list)