In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import requests
import time
from youtube_transcript_api import YouTubeTranscriptApi
import re
import yt_dlp
import json
import glob
import os
import csv
from dotenv import load_dotenv

In [11]:
AUDIO_DIR = "audio"

load_dotenv()
API_KEY = os.getenv("API_KEY")

In [12]:
def get_transcript(id: str) -> None:
    transcript = YouTubeTranscriptApi.get_transcript(
        id, languages=('en', 'en-US', 'en-GB'))
    return transcript

In [13]:
def download_audio(url):
    print("in download_audio")
    try:
        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            },
            {
                'key': 'FFmpegSplitChapters',  # Adding split chapters processor
                'force_keyframes': True,  # Optional: force keyframes at the start of each chapter
                }],
            'noplaylist': True,
            'writeinfojson': True,  # Optional: write metadata into a JSON file
            'writeannotations': True,  # Optional: write annotations into a file
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except Exception as e:
        print("Error downloading audio: ", e)

In [14]:
def extract_text(transcript, start_time, end_time):
    # Initialize an empty string to hold the extracted text
    extracted_text = ""
    
    # Loop through each entry in the transcript
    for entry in transcript:
        # Calculate the end time of the current entry
        entry_end_time = entry['start'] + entry['duration']
        
        # Check if the entry overlaps with the given time range
        if entry['start'] < end_time and entry_end_time > start_time:
            # Add the text to the extracted text string
            extracted_text += entry['text'] + " "
    
    return extracted_text.strip()


In [15]:
def load_podcast_urls(filename):
    urls = []  # List to hold the URLs
    try:
        with open(filename, mode='r', newline='', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for i, row in enumerate(reader):
                if i >= 10:  # Break the loop after the first 10 entries
                    break
                urls.append(row['URL'])
    except FileNotFoundError:
        print(f"Error: The file '{filename}' does not exist.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return urls

In [16]:
def get_title_and_channel(url):
    video_id = url.split('v=')[1].split('&')[0]

    print("video_id: ", video_id)

    # YouTube Data API endpoint
    youtube_api_url = f"https://www.googleapis.com/youtube/v3/videos?id={video_id}&key={API_KEY}&part=snippet"

    # Make the API request
    response = requests.get(youtube_api_url)
    data = response.json()

    print("data: ", data)

    # Check if the API call returns items
    if not data.get('items'):
        return "No video found with the given ID."

    # Extract title and channel title
    video_title = data['items'][0]['snippet']['title']
    channel_title = data['items'][0]['snippet']['channelTitle']

    return video_title, channel_title

In [17]:
video_urls_raw = load_podcast_urls("podcast_list.csv")

video_urls = []
for url in video_urls_raw:
    res = get_title_and_channel(url)
    print(str(res))
    video_title = res[0]
    channel_title = res[1]
    video_urls.append({"link": url, "channel": channel_title, "episode" : video_title})

for vid in video_urls:
        print(str(vid))

video_id:  dX7d6bRJI9k
data:  {'kind': 'youtube#videoListResponse', 'etag': 'tByUidWgh05ODITM9I655sJ03NA', 'items': [{'kind': 'youtube#video', 'etag': 'NAJ3urhBqPm8JMSlG5ocWFDfZUk', 'id': 'dX7d6bRJI9k', 'snippet': {'publishedAt': '2024-04-02T22:53:38Z', 'channelId': 'UC9cn0TuPq4dnbTY-CBsm8XA', 'title': 'Politics & the Future of Tech', 'description': "“If America is going to be America in the next hundred years, we have to get this right.” - Ben Horowitz\n\nWelcome to “The Ben & Marc Show”, featuring a16z co-founders Ben Horowitz and Marc Andreessen.  In this latest episode, Marc and Ben take on one of the most hot button issues facing technology today: regulation and policy.\n\nIn this one-on-one conversation, Ben and Marc delve into why the political interests of “Big Tech” often conflict with a positive technological future, the necessity of decentralized AI, and how the future of American innovation is at its most critical point. They also answer YOUR questions from X (formerly Twit

In [18]:
def main():

    all_results = []  # Start with an empty list to store dictionaries

    episodes_to_transcripts = {}

    curr_base_idx = 0

    for video in video_urls:
        episode = video['episode']
        channel = video['channel']
        id = video['link'].split("v=")[1]
        print("id is: " + id)

        if episode not in episodes_to_transcripts:
            transcript_data = get_transcript(id)
            episodes_to_transcripts[episode] = transcript_data
        
        #download_audio(video['link'])

        files = glob.glob(f'{episode[:10]}*.json')
        if not files:
            raise FileNotFoundError("No JSON file found matching the pattern.")
        json_file = files[0]
        with open(json_file, 'r') as file:
            data = json.load(file)
            chapters = data['chapters']
        
        for index, chapter in enumerate(chapters):
            extracted_text = extract_text(episodes_to_transcripts[episode], chapter['start_time'], chapter['end_time'])

            chapter_dict = {
                "channel": channel,
                "episode": episode,
                "chapter": chapter['title'],
                "start_time": chapter['start_time'],
                "end_time": chapter['end_time'],
                "text": extracted_text,
                "file_name_prefix": f"{episode} - {str(index + 1).zfill(3)}",
                "prev_index": curr_base_idx + index - 1 if index > 0 else None,
                "next_index": curr_base_idx + index + 1 if index < len(chapters) - 1 else None
            }
            
            all_results.append(chapter_dict)  # Add dictionaries to the list
        curr_base_idx += len(chapters)

    columns = ['channel', 'episode', 'chapter', 'start_time', 'end_time', 'text', 'file_name_prefix', 'prev_index', 'next_index']
    return pd.DataFrame(all_results, columns=columns)  # Create DataFrame from list of dictionaries

# Run the script
df = main()
print(df)

id is: dX7d6bRJI9k&ab_channel=a16z
in download_audio
[youtube] Extracting URL: https://www.youtube.com/watch?v=dX7d6bRJI9k&ab_channel=a16z
[youtube] dX7d6bRJI9k: Downloading webpage
[youtube] dX7d6bRJI9k: Downloading ios player API JSON
[youtube] dX7d6bRJI9k: Downloading android player API JSON
[youtube] dX7d6bRJI9k: Downloading m3u8 information
[info] dX7d6bRJI9k: Downloading 1 format(s): 251
[info] Writing video metadata as JSON to: Politics & the Future of Tech [dX7d6bRJI9k].info.json




[download] Destination: Politics & the Future of Tech [dX7d6bRJI9k].webm
[download] 100% of  105.43MiB in 00:00:03 at 27.93MiB/s    
[ExtractAudio] Destination: Politics & the Future of Tech [dX7d6bRJI9k].mp3
Deleting original file Politics & the Future of Tech [dX7d6bRJI9k].webm (pass -k to keep)
[SplitChapters] Re-encoding "Politics & the Future of Tech [dX7d6bRJI9k].mp3" with appropriate keyframes
[SplitChapters] Splitting video by chapters; 21 chapters found
[SplitChapters] Chapter 001; Destination: Politics & the Future of Tech - 001 Teaser [dX7d6bRJI9k].mp3
[SplitChapters] Chapter 002; Destination: Politics & the Future of Tech - 002 Intro [dX7d6bRJI9k].mp3
[SplitChapters] Chapter 003; Destination: Politics & the Future of Tech - 003 Why get involved in politics now？ [dX7d6bRJI9k].mp3
[SplitChapters] Chapter 004; Destination: Politics & the Future of Tech - 004 Big Tech's involvement in political process [dX7d6bRJI9k].mp3
[SplitChapters] Chapter 005; Destination: Politics & the F



[download] Destination: A Nuclear Comeback： Are New Reactors the Answer？ [BZUMZl4OsPI].webm
[download] 100% of   45.59MiB in 00:00:02 at 20.26MiB/s    
[ExtractAudio] Destination: A Nuclear Comeback： Are New Reactors the Answer？ [BZUMZl4OsPI].mp3
Deleting original file A Nuclear Comeback： Are New Reactors the Answer？ [BZUMZl4OsPI].webm (pass -k to keep)
[SplitChapters] Re-encoding "A Nuclear Comeback： Are New Reactors the Answer？ [BZUMZl4OsPI].mp3" with appropriate keyframes
[SplitChapters] Splitting video by chapters; 10 chapters found
[SplitChapters] Chapter 001; Destination: A Nuclear Comeback： Are New Reactors the Answer？ - 001 The Promise of Advanced Nuclear Reactors [BZUMZl4OsPI].mp3
[SplitChapters] Chapter 002; Destination: A Nuclear Comeback： Are New Reactors the Answer？ - 002 Nuclear Energy's Current Landscape [BZUMZl4OsPI].mp3
[SplitChapters] Chapter 003; Destination: A Nuclear Comeback： Are New Reactors the Answer？ - 003 Vulnerabilities in Fuel Delivery [BZUMZl4OsPI].mp3
[Sp



[download] Destination: Safety in Numbers： Keeping AI Open [NhASk7rZsmU].webm
[download] 100% of   30.42MiB in 00:00:01 at 19.20MiB/s    
[ExtractAudio] Destination: Safety in Numbers： Keeping AI Open [NhASk7rZsmU].mp3
Deleting original file Safety in Numbers： Keeping AI Open [NhASk7rZsmU].webm (pass -k to keep)
[SplitChapters] Re-encoding "Safety in Numbers： Keeping AI Open [NhASk7rZsmU].mp3" with appropriate keyframes
[SplitChapters] Splitting video by chapters; 10 chapters found
[SplitChapters] Chapter 001; Destination: Safety in Numbers： Keeping AI Open - 001 Introduction to scaling laws and their impact [NhASk7rZsmU].mp3
[SplitChapters] Chapter 002; Destination: Safety in Numbers： Keeping AI Open - 002 Arthur Mensch and the Founding of Mistral [NhASk7rZsmU].mp3
[SplitChapters] Chapter 003; Destination: Safety in Numbers： Keeping AI Open - 003 Mistral 7b and the launch of Mixtral [NhASk7rZsmU].mp3
[SplitChapters] Chapter 004; Destination: Safety in Numbers： Keeping AI Open - 004 Mi



[download] Destination: Yann Lecun： Meta AI, Open Source, Limits of LLMs, AGI & the Future of AI ｜ Lex Fridman Podcast #416 [5t1vTLU7s40].webm
[download] 100% of  139.62MiB in 00:00:06 at 20.15MiB/s    
[ExtractAudio] Destination: Yann Lecun： Meta AI, Open Source, Limits of LLMs, AGI & the Future of AI ｜ Lex Fridman Podcast #416 [5t1vTLU7s40].mp3
Deleting original file Yann Lecun： Meta AI, Open Source, Limits of LLMs, AGI & the Future of AI ｜ Lex Fridman Podcast #416 [5t1vTLU7s40].webm (pass -k to keep)
[SplitChapters] Re-encoding "Yann Lecun： Meta AI, Open Source, Limits of LLMs, AGI & the Future of AI ｜ Lex Fridman Podcast #416 [5t1vTLU7s40].mp3" with appropriate keyframes
[SplitChapters] Splitting video by chapters; 23 chapters found
[SplitChapters] Chapter 001; Destination: Yann Lecun： Meta AI, Open Source, Limits of LLMs, AGI & the Future of AI ｜ Lex Fridman Podcast #416 - 001 Introduction [5t1vTLU7s40].mp3
[SplitChapters] Chapter 002; Destination: Yann Lecun： Meta AI, Open Source



[download] Destination: Sam Altman： OpenAI, GPT-5, Sora, Board Saga, Elon Musk, Ilya, Power & AGI ｜ Lex Fridman Podcast #419 [jvqFAi7vkBc].webm
[download] 100% of   91.73MiB in 00:00:03 at 23.76MiB/s    
[ExtractAudio] Destination: Sam Altman： OpenAI, GPT-5, Sora, Board Saga, Elon Musk, Ilya, Power & AGI ｜ Lex Fridman Podcast #419 [jvqFAi7vkBc].mp3
Deleting original file Sam Altman： OpenAI, GPT-5, Sora, Board Saga, Elon Musk, Ilya, Power & AGI ｜ Lex Fridman Podcast #419 [jvqFAi7vkBc].webm (pass -k to keep)
[SplitChapters] Re-encoding "Sam Altman： OpenAI, GPT-5, Sora, Board Saga, Elon Musk, Ilya, Power & AGI ｜ Lex Fridman Podcast #419 [jvqFAi7vkBc].mp3" with appropriate keyframes
[SplitChapters] Splitting video by chapters; 14 chapters found
[SplitChapters] Chapter 001; Destination: Sam Altman： OpenAI, GPT-5, Sora, Board Saga, Elon Musk, Ilya, Power & AGI ｜ Lex Fridman Podcast #419 - 001 Introduction [jvqFAi7vkBc].mp3
[SplitChapters] Chapter 002; Destination: Sam Altman： OpenAI, GPT-5, 



[download] Destination: Anthropic CEO on Leaving OpenAI and Predictions for Future of AI [gAaCqj6j5sQ].webm
[download] 100% of   92.04MiB in 00:00:03 at 24.00MiB/s    
[ExtractAudio] Destination: Anthropic CEO on Leaving OpenAI and Predictions for Future of AI [gAaCqj6j5sQ].mp3
Deleting original file Anthropic CEO on Leaving OpenAI and Predictions for Future of AI [gAaCqj6j5sQ].webm (pass -k to keep)
[SplitChapters] Re-encoding "Anthropic CEO on Leaving OpenAI and Predictions for Future of AI [gAaCqj6j5sQ].mp3" with appropriate keyframes
[SplitChapters] Splitting video by chapters; 24 chapters found
[SplitChapters] Chapter 001; Destination: Anthropic CEO on Leaving OpenAI and Predictions for Future of AI - 001 Intro [gAaCqj6j5sQ].mp3
[SplitChapters] Chapter 002; Destination: Anthropic CEO on Leaving OpenAI and Predictions for Future of AI - 002 Joining OpenAI [gAaCqj6j5sQ].mp3
[SplitChapters] Chapter 003; Destination: Anthropic CEO on Leaving OpenAI and Predictions for Future of AI - 0



[download] Destination: Emmett Shear on the Future of AI and YC Days with Sam Altman [ICnFtfN-sUc].webm
[download] 100% of  119.56MiB in 00:00:04 at 25.20MiB/s    
[ExtractAudio] Destination: Emmett Shear on the Future of AI and YC Days with Sam Altman [ICnFtfN-sUc].mp3
Deleting original file Emmett Shear on the Future of AI and YC Days with Sam Altman [ICnFtfN-sUc].webm (pass -k to keep)
[SplitChapters] Re-encoding "Emmett Shear on the Future of AI and YC Days with Sam Altman [ICnFtfN-sUc].mp3" with appropriate keyframes
[SplitChapters] Splitting video by chapters; 18 chapters found
[SplitChapters] Chapter 001; Destination: Emmett Shear on the Future of AI and YC Days with Sam Altman - 001 Intro [ICnFtfN-sUc].mp3
[SplitChapters] Chapter 002; Destination: Emmett Shear on the Future of AI and YC Days with Sam Altman - 002 Welcome Emmett Shear [ICnFtfN-sUc].mp3
[SplitChapters] Chapter 003; Destination: Emmett Shear on the Future of AI and YC Days with Sam Altman - 003 Products leading to



[download] Destination: Building to $100B： The CEO that Revolutionized Palo Alto Networks [oO1OBs2zaWU].webm
[download] 100% of   55.80MiB in 00:00:03 at 18.12MiB/s    
[ExtractAudio] Destination: Building to $100B： The CEO that Revolutionized Palo Alto Networks [oO1OBs2zaWU].mp3
Deleting original file Building to $100B： The CEO that Revolutionized Palo Alto Networks [oO1OBs2zaWU].webm (pass -k to keep)
[SplitChapters] Re-encoding "Building to $100B： The CEO that Revolutionized Palo Alto Networks [oO1OBs2zaWU].mp3" with appropriate keyframes
[SplitChapters] Splitting video by chapters; 18 chapters found
[SplitChapters] Chapter 001; Destination: Building to $100B： The CEO that Revolutionized Palo Alto Networks - 001 Intro [oO1OBs2zaWU].mp3
[SplitChapters] Chapter 002; Destination: Building to $100B： The CEO that Revolutionized Palo Alto Networks - 002 The Benefits of Being a Generalist [oO1OBs2zaWU].mp3
[SplitChapters] Chapter 003; Destination: Building to $100B： The CEO that Revolution



[download] Destination: Elon Musk ｜ LIVE Podcast ｜ In Good Company ｜ Norges Bank Investment Management [_rQBZ3vKRA0].webm
[download] 100% of   46.25MiB in 00:00:02 at 16.53MiB/s    
[ExtractAudio] Destination: Elon Musk ｜ LIVE Podcast ｜ In Good Company ｜ Norges Bank Investment Management [_rQBZ3vKRA0].mp3
Deleting original file Elon Musk ｜ LIVE Podcast ｜ In Good Company ｜ Norges Bank Investment Management [_rQBZ3vKRA0].webm (pass -k to keep)
[SplitChapters] Re-encoding "Elon Musk ｜ LIVE Podcast ｜ In Good Company ｜ Norges Bank Investment Management [_rQBZ3vKRA0].mp3" with appropriate keyframes
[SplitChapters] Splitting video by chapters; 37 chapters found
[SplitChapters] Chapter 001; Destination: Elon Musk ｜ LIVE Podcast ｜ In Good Company ｜ Norges Bank Investment Management - 001 Introduction [_rQBZ3vKRA0].mp3
[SplitChapters] Chapter 002; Destination: Elon Musk ｜ LIVE Podcast ｜ In Good Company ｜ Norges Bank Investment Management - 002 Where are we in the AI race [_rQBZ3vKRA0].mp3
[Split



[download] Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].webm
[download] 100% of   31.81MiB in 00:00:01 at 30.38MiB/s    
[ExtractAudio] Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].mp3
Deleting original file Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].webm (pass -k to keep)
[SplitChapters] Re-encoding "Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].mp3" with appropriate keyframes
[SplitChapters] Splitting video by chapters; 18 chapters found
[SplitChapters] Chapter 001; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 001 Introduction [yNuLPWu38IU].mp3
[SplitChapters] Chapter 002; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Comp



[download] Destination: Google fires protestors, NPR chaos, Humane's AI Pin, Startup tax crisis, sports betting scandal [HKtlezdPNAI].webm
[download] 100% of   84.07MiB in 00:00:03 at 25.44MiB/s    
[ExtractAudio] Destination: Google fires protestors, NPR chaos, Humane's AI Pin, Startup tax crisis, sports betting scandal [HKtlezdPNAI].mp3
Deleting original file Google fires protestors, NPR chaos, Humane's AI Pin, Startup tax crisis, sports betting scandal [HKtlezdPNAI].webm (pass -k to keep)
[SplitChapters] Re-encoding "Google fires protestors, NPR chaos, Humane's AI Pin, Startup tax crisis, sports betting scandal [HKtlezdPNAI].mp3" with appropriate keyframes
[SplitChapters] Splitting video by chapters; 8 chapters found
[SplitChapters] Chapter 001; Destination: Google fires protestors, NPR chaos, Humane's AI Pin, Startup tax crisis, sports betting scandal - 001 Bestie Intros： Chamath recaps the Breakthrough Prize Ceremony, ＂High IQ foods＂ [HKtlezdPNAI].mp3
[SplitChapters] Chapter 002; 



[download] Destination: E174： Inflation stays hot, AI disclosure bill, Drone warfare, defense startups & more [hZp80SYIRlY].webm
[download] 100% of   76.93MiB in 00:00:03 at 20.18MiB/s    
[ExtractAudio] Destination: E174： Inflation stays hot, AI disclosure bill, Drone warfare, defense startups & more [hZp80SYIRlY].mp3
Deleting original file E174： Inflation stays hot, AI disclosure bill, Drone warfare, defense startups & more [hZp80SYIRlY].webm (pass -k to keep)
[SplitChapters] Re-encoding "E174： Inflation stays hot, AI disclosure bill, Drone warfare, defense startups & more [hZp80SYIRlY].mp3" with appropriate keyframes
[SplitChapters] Splitting video by chapters; 6 chapters found
[SplitChapters] Chapter 001; Destination: E174： Inflation stays hot, AI disclosure bill, Drone warfare, defense startups & more - 001 Bestie Intros： J-Cal is out this week! [hZp80SYIRlY].mp3
[SplitChapters] Chapter 002; Destination: E174： Inflation stays hot, AI disclosure bill, Drone warfare, defense startup

FileNotFoundError: No JSON file found matching the pattern.

In [None]:
def delete_unmatched_mp3_files():
    # Pattern to match files that should NOT be deleted
    # This regex matches files that include a hyphen, space, three digits, possibly followed by more characters, before ending with ".mp3"
    pattern = re.compile(r'.* - \d{3}.*\.mp3$')

    # Get all files in the current directory
    files = os.listdir('.')

    # Iterate through files
    for file in files:
        # Check if the file is an MP3 and does not match the pattern
        if file.endswith('.mp3') and not pattern.match(file):
            # If it is an MP3 and doesn't match, delete the file
            os.remove(file)
            print(f"Deleted MP3 file: {file}")

# Call the function to perform the operation
# delete_unmatched_mp3_files()