In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import CouldNotRetrieveTranscript
import json
import os

In [2]:
def get_transcripts(video_id, is_generated=False, languages=('en', 'th')):
    """Collect the transcripts of one video, keyed by language code.

    Lists every transcript available for ``video_id`` and keeps those whose
    ``is_generated`` flag equals the requested one and whose language code is
    in ``languages``.

    Args:
        video_id: YouTube video ID to look up.
        is_generated: ``False`` (default) selects manually created
            transcripts, ``True`` selects auto-generated ones.
        languages: Language codes to keep. A single string is treated as one
            code. Defaults to English and Thai.

    Returns:
        dict: ``{language_code: transcript}``. The dict is empty when nothing
        matches or when listing fails. Each value is whatever the library
        returns for the transcript -- presumably a list of
        ``{'text', 'start', 'duration'}`` dicts; confirm against the
        installed youtube_transcript_api version.

    Errors are reported with ``print`` and never propagate: a failure on one
    transcript skips only that language, a failure while listing yields an
    empty result.
    """
    if isinstance(languages, str):
        languages = [languages]
    wanted_languages = set(languages)

    transcripts = {}
    try:
        # List all available transcripts for the video
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        for transcript in transcript_list:
            try:
                if (transcript.is_generated != is_generated
                        or transcript.language_code.strip() not in wanted_languages):
                    continue

                # Fetch through the transcript object that was just matched.
                # Looking it up again by language alone may resolve to the
                # manually created transcript even when the generated one was
                # requested, and it repeats the listing request.
                fetched = transcript.fetch()

                # NOTE(review): newer library releases appear to return a
                # result object instead of a plain list; convert it so the
                # value stays JSON-serialisable -- confirm against the
                # installed version.
                if hasattr(fetched, 'to_raw_data'):
                    fetched = fetched.to_raw_data()

                transcripts[transcript.language_code] = fetched
            except Exception as e:
                # Best effort: one broken transcript must not abort the rest.
                print(f"Error fetching transcript for video ID {video_id} in language {transcript.language_code}: {e}")

    except CouldNotRetrieveTranscript as e:
        print(f"Transcript could not be retrieved for video ID {video_id}: {e}")
    except Exception as e:
        print(f"An error occurred while processing video ID {video_id}: {e}")

    return transcripts


In [3]:
def dump_transcript(channel, base_dir='all_transcript'):
    """Write one JSON file per video listed in a channel's video index.

    Reads ``<base_dir>/<channel>/unique_videos.json``, which is iterated as
    ``title -> video_id`` pairs, fetches the transcripts of each video with
    ``get_transcripts`` and writes
    ``<base_dir>/<channel>/transcripts/<video_id>.json`` containing the keys
    ``title``, ``video_id`` and ``transcript``.

    Args:
        channel: Name of the channel sub-directory.
        base_dir: Root directory holding the per-channel folders. Defaults to
            ``'all_transcript'`` relative to the working directory.

    Raises:
        FileNotFoundError: If the channel's ``unique_videos.json`` is missing.
        json.JSONDecodeError: If that file is not valid JSON.

    The result of ``get_transcripts`` is stored as-is, so a video for which
    nothing could be fetched still gets a file with an empty ``transcript``.
    """
    channel_dir = os.path.join(base_dir, channel)
    transcripts_dir = os.path.join(channel_dir, 'transcripts')
    # makedirs creates every missing parent, so one call covers all levels.
    os.makedirs(transcripts_dir, exist_ok=True)

    # Read the title -> video ID index
    index_path = os.path.join(channel_dir, 'unique_videos.json')
    with open(index_path, 'r', encoding='utf-8') as index_file:
        unique_videos = json.load(index_file)

    # Process each video and write its own file
    for title, video_id in unique_videos.items():
        video_data = {
            'title': title,
            'video_id': video_id,
            'transcript': get_transcripts(video_id=video_id),
        }
        output_file_path = os.path.join(transcripts_dir, f'{video_id}.json')
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(video_data, output_file, ensure_ascii=False, indent=4)

    print("Transcripts saved")

In [4]:
# Export one JSON file per video listed in all_transcript/ted-ed/unique_videos.json.
dump_transcript('ted-ed')

Error fetching transcript for video ID LBALm7CeEG4 in language th: no element found: line 1, column 0
Error fetching transcript for video ID 9sNpMb4M7XM in language en: no element found: line 1, column 0
Error fetching transcript for video ID deNGkzUlhZU in language en: no element found: line 1, column 0
Error fetching transcript for video ID QkZCPMVgR4g in language en: no element found: line 1, column 0
Transcripts saved
