In [23]:
import configparser

# Load the Google API key from a local INI file so the secret is not written
# into the notebook itself. The lookup below expects this layout:
#   [google]
#   api = <key>
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that result so a missing config file fails
# with a clear message instead of an opaque KeyError on the lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini was not found (or could not be read) in the working directory"
    )

google_api_key = config['google']['api']


In [24]:
import os
import json
import re
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build

# YouTube API setup: build one shared YouTube Data API v3 client,
# authenticated with the developer key loaded from the config file.
youtube = build(
    serviceName="youtube",
    version="v3",
    developerKey=google_api_key,
)

def sanitize_filename(name, default="untitled"):
    """Turn an arbitrary title into a string that is safe to use as a file name.

    Characters that are invalid in file names on common platforms
    (backslash, ``/ : * ? < > |`` and the double quote) and ASCII control
    characters (tab, newline, ...) are removed, surrounding whitespace is
    trimmed, and the remaining spaces are replaced by underscores.

    Args:
        name: The raw title to clean up.
        default: Value returned when nothing usable is left after cleaning
            (an empty title, or one made only of unsafe characters), so
            callers never end up with an empty file stem such as ``.json``.

    Returns:
        The sanitized name, or ``default`` if the result would be empty.
    """
    # \x00-\x1f covers control characters, which are either illegal or
    # troublesome in file names on every major platform.
    cleaned = re.sub(r'[\\/:"*?<>|\x00-\x1f]+', "", name)
    cleaned = cleaned.strip().replace(" ", "_")
    return cleaned or default

def get_playlist_name(playlist_id):
    """Look up the title of a playlist through the YouTube Data API.

    Args:
        playlist_id: ID of the playlist to look up.

    Returns:
        The playlist title from the first returned item. If the API returns
        no items, or any error occurs during the lookup (the error is
        printed), the placeholder name "captions" is returned instead.
    """
    fallback_name = "captions"
    try:
        request = youtube.playlists().list(part="snippet", id=playlist_id)
        matches = request.execute().get("items", [])
        if not matches:
            return fallback_name
        return matches[0]["snippet"]["title"]
    except Exception as e:
        # Best effort: report the problem and fall through to the placeholder.
        print(f"Error retrieving playlist name: {e}")
        return fallback_name

def get_video_captions(video_id):
    """Return the caption text of a video as a list of strings.

    Only the text of each transcript segment is kept; timestamps and
    durations are dropped.

    Supports both generations of ``youtube_transcript_api``: releases that
    expose the instance method ``fetch`` and older releases that only offer
    the static ``get_transcript``. Without this check, a missing
    ``get_transcript`` would raise ``AttributeError``, be swallowed by the
    handler below, and silently yield empty captions for every video.

    Args:
        video_id: ID of the YouTube video.

    Returns:
        The caption strings in transcript order, or an empty list if the
        transcript could not be retrieved (the error is printed).
    """
    try:
        if hasattr(YouTubeTranscriptApi, "fetch"):
            # Newer API: fetch() returns a transcript object; to_raw_data()
            # converts it to the same list-of-dicts shape as the legacy call.
            transcript = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
        else:
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
        # Extract only the caption text from each segment, ignoring timestamps and duration.
        return [segment["text"] for segment in transcript]
    except Exception as e:
        # Best effort: one video without captions must not abort the whole export.
        print(f"Error retrieving captions for {video_id}: {e}")
        return []

def get_playlist_videos(playlist_id):
    """Collect title, watch URL and captions for every item in a playlist.

    The playlist is read page by page (up to 50 items per request), following
    ``nextPageToken`` until the API stops returning one. Captions are fetched
    for each item via ``get_video_captions`` as the item is processed.

    Args:
        playlist_id: ID of the playlist to walk.

    Returns:
        A list of dicts in playlist order, each with the keys "title",
        "url" and "captions".
    """
    collected = []
    page_token = None
    has_more = True

    while has_more:
        request = youtube.playlistItems().list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=50,
            pageToken=page_token,
        )
        page = request.execute()

        for entry in page["items"]:
            snippet = entry["snippet"]
            vid = snippet["resourceId"]["videoId"]
            collected.append({
                "title": snippet["title"],
                "url": f"https://www.youtube.com/watch?v={vid}",
                "captions": get_video_captions(vid),
            })

        # An absent or empty token means this was the last page.
        page_token = page.get("nextPageToken")
        has_more = bool(page_token)

    return collected

def save_videos_to_json(videos, output_filename):
    """Write the collected videos to a UTF-8 JSON file and report the count.

    Args:
        videos: JSON-serializable list of video records.
        output_filename: Path of the file to create or overwrite.

    Raises:
        TypeError: If ``videos`` contains a value that JSON cannot encode.
            In that case the target file is left untouched.
    """
    # Serialize before opening the file: "w" mode truncates immediately, so a
    # serialization error raised mid-dump would otherwise replace any previous
    # export with an empty or half-written file.
    # ensure_ascii=False keeps non-ASCII caption text readable in the output.
    payload = json.dumps(videos, ensure_ascii=False, indent=4)
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(payload)
    print(f"Saved {len(videos)} videos to {output_filename}")


In [25]:
# ID of the YouTube playlist whose videos and captions are exported.
playlist_id = "PLHutrxqbP1BzzTi8odV40RhLZQjK8Iy6_"


# Get the playlist name to name the file accordingly
# (get_playlist_name returns "captions" when the lookup fails or finds nothing).
playlist_title = get_playlist_name(playlist_id)
sanitized_title = sanitize_filename(playlist_title)
output_filename = f"{sanitized_title}.json"

# Fetch every playlist item together with its captions, then write them all
# to the JSON file named after the playlist.
videos = get_playlist_videos(playlist_id)
save_videos_to_json(videos, output_filename)

Saved 22 videos to Data_Analytics_with_Generative_AI.json
