In [245]:
import glob
import json
import os
import re
import time
from urllib.parse import parse_qs, urlparse

import pandas as pd
import requests
import yt_dlp
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from youtube_transcript_api import YouTubeTranscriptApi

In [246]:
# Directory name intended for audio files.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
AUDIO_DIR = "audio"

In [247]:
def get_transcript(id: str) -> list:
    """Fetch the English transcript of a YouTube video.

    Args:
        id: YouTube video id passed straight to ``YouTubeTranscriptApi``.

    Returns:
        The transcript as returned by ``YouTubeTranscriptApi.get_transcript``.
        Callers in this notebook treat it as a sequence of entries with
        ``'text'``, ``'start'`` and ``'duration'`` keys.

    Raises:
        Whatever ``YouTubeTranscriptApi.get_transcript`` raises; nothing is
        caught here.
    """
    # Languages are tried in the order given: generic English first, then the
    # US and GB variants.
    transcript = YouTubeTranscriptApi.get_transcript(
        id, languages=('en', 'en-US', 'en-GB'))
    return transcript

In [248]:
def download_audio(url):
    """Download the audio of ``url`` as MP3 and split it into chapter files.

    The download is best-effort: any exception raised while configuring or
    running yt-dlp is caught and printed, and the function returns ``None``
    either way. No output template is configured here, so file naming and
    location are left to yt-dlp.

    Args:
        url: Video URL handed to ``yt_dlp.YoutubeDL.download``.
    """
    print("in download_audio")

    # Step 1 of post-processing: convert the downloaded stream to 192 kbps MP3.
    to_mp3 = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }
    # Step 2: cut the MP3 into one file per chapter, forcing keyframes at the
    # start of each chapter.
    split_by_chapter = {
        'key': 'FFmpegSplitChapters',
        'force_keyframes': True,
    }
    options = {
        'format': 'bestaudio/best',
        'postprocessors': [to_mp3, split_by_chapter],
        'noplaylist': True,
        'writeinfojson': True,     # also write the video metadata to a JSON file
        'writeannotations': True,  # also request an annotations file
    }

    try:
        with yt_dlp.YoutubeDL(options) as downloader:
            downloader.download([url])
    except Exception as err:
        print("Error downloading audio: ", err)

In [249]:
def extract_text(transcript, start_time, end_time):
    """Join the text of every transcript entry that overlaps a time window.

    An entry spans ``[start, start + duration)`` and is included when that
    span intersects the open interval ``(start_time, end_time)``; entries that
    merely touch a boundary are excluded.

    Args:
        transcript: Iterable of mappings with ``'start'``, ``'duration'`` and
            ``'text'`` keys.
        start_time: Lower bound of the window, in the same unit as ``'start'``.
        end_time: Upper bound of the window, in the same unit as ``'start'``.

    Returns:
        The matching texts separated by single spaces, with surrounding
        whitespace stripped; an empty string when nothing overlaps.
    """
    overlapping = (
        segment['text']
        for segment in transcript
        if segment['start'] < end_time
        and segment['start'] + segment['duration'] > start_time
    )
    return " ".join(overlapping).strip()


In [250]:
def _video_id_from_url(url: str) -> str:
    """Return the ``v`` query parameter of a YouTube watch URL."""
    values = parse_qs(urlparse(url).query).get("v")
    if not values:
        raise ValueError(f"No video id ('v' parameter) found in URL: {url}")
    return values[0]


def _load_chapters(video_id: str) -> list:
    """Read the chapter list from the info JSON whose name contains ``[video_id]``."""
    # The metadata file is located by the bracketed video id in its name
    # (e.g. "... [yNuLPWu38IU].info.json"). glob.escape keeps the brackets
    # from being interpreted as a character class.
    pattern = f"*{glob.escape('[' + video_id + ']')}.info.json"
    files = glob.glob(pattern)
    if not files:
        raise FileNotFoundError(
            f"No JSON file found matching the pattern {pattern!r}.")
    with open(files[0], "r", encoding="utf-8") as file:
        data = json.load(file)
    # Videos without chapters may have the key missing or set to null.
    return data.get("chapters") or []


def main() -> pd.DataFrame:
    """Build a per-chapter table of transcript text for a fixed list of videos.

    For every configured video the transcript is fetched (once per episode),
    the audio is downloaded and split via ``download_audio``, and the chapter
    boundaries are read from the metadata JSON written during the download.
    Each chapter becomes one row holding the transcript text that overlaps
    the chapter's time range.

    Returns:
        A DataFrame with the columns ``channel``, ``episode``, ``chapter``,
        ``start_time``, ``end_time``, ``text``, ``file_name_prefix``,
        ``prev_index`` and ``next_index``. ``prev_index`` / ``next_index`` are
        the row positions of the neighbouring chapters of the same episode,
        or ``None`` at the first / last chapter of an episode.

    Raises:
        ValueError: If a configured link has no ``v`` query parameter.
        FileNotFoundError: If no metadata JSON exists for a video (for
            example because the download failed).
    """
    all_results = []  # one dict per chapter, in output row order

    video_urls = [
        {"link": "https://www.youtube.com/watch?v=yNuLPWu38IU&ab_channel=NorgesBankInvestmentManagement", "channel": "In Good Company", "episode": "Satya Nadella - CEO of Microsoft | In Good Company | Podcast | Norges Bank Investment Management"},
        {"link": "https://www.youtube.com/watch?v=rsa1_d8Z0Zc&ab_channel=NoPriors%3AAI%2CMachineLearning%2CTech%2C%26Startups", "channel": "No Priors", "episode": "No Priors Ep. 57 | With LangChain CEO and Co-Founder Harrison Chase"},
    ]

    episodes_to_transcripts = {}

    # Row position of the current episode's first chapter in all_results.
    curr_base_idx = 0

    for video in video_urls:
        episode = video['episode']
        channel = video['channel']
        # Parse the id from the query string so trailing parameters such as
        # "&ab_channel=..." are not carried along.
        video_id = _video_id_from_url(video['link'])
        print("id is: " + video_id)

        if episode not in episodes_to_transcripts:
            episodes_to_transcripts[episode] = get_transcript(video_id)
        transcript = episodes_to_transcripts[episode]

        download_audio(video['link'])

        chapters = _load_chapters(video_id)
        if not chapters:
            # Nothing to split on; skip instead of failing on a null list.
            print("No chapters found for: " + episode)
            continue

        last_index = len(chapters) - 1
        for index, chapter in enumerate(chapters):
            extracted_text = extract_text(
                transcript, chapter['start_time'], chapter['end_time'])

            chapter_dict = {
                "channel": channel,
                "episode": episode,
                "chapter": chapter['title'],
                "start_time": chapter['start_time'],
                "end_time": chapter['end_time'],
                "text": extracted_text,
                # NOTE(review): presumably meant to match the per-chapter audio
                # file names; the download log shows '|' rendered as a
                # different character in file names — confirm before relying
                # on it for lookups.
                "file_name_prefix": f"{episode} - {str(index + 1).zfill(3)}",
                "prev_index": curr_base_idx + index - 1 if index > 0 else None,
                "next_index": curr_base_idx + index + 1 if index < last_index else None
            }

            all_results.append(chapter_dict)
        curr_base_idx += len(chapters)

    columns = ['channel', 'episode', 'chapter', 'start_time', 'end_time', 'text', 'file_name_prefix', 'prev_index', 'next_index']
    # Passing columns explicitly keeps the schema stable even when no rows
    # were collected.
    return pd.DataFrame(all_results, columns=columns)

# Run the pipeline defined by main() and print the resulting chapter table.
df = main()
print(df)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


id is: yNuLPWu38IU&ab_channel=NorgesBankInvestmentManagement
in download_audio
[youtube] Extracting URL: https://www.youtube.com/watch?v=yNuLPWu38IU&ab_channel=NorgesBankInvestmentManagement
[youtube] yNuLPWu38IU: Downloading webpage
[youtube] yNuLPWu38IU: Downloading ios player API JSON
[youtube] yNuLPWu38IU: Downloading android player API JSON
[youtube] yNuLPWu38IU: Downloading m3u8 information
[info] yNuLPWu38IU: Downloading 1 format(s): 251
[info] Writing video metadata as JSON to: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].info.json




[download] Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].webm
[download] 100% of   31.81MiB in 00:00:01 at 27.27MiB/s    
[ExtractAudio] Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Deleting original file Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].webm (pass -k to keep)
[SplitChapters] Re-encoding "Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].mp3" with appropriate keyframes


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Splitting video by chapters; 18 chapters found
[SplitChapters] Chapter 001; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 001 Introduction [yNuLPWu38IU].mp3
[SplitChapters] Chapter 002; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 002 Whats on your mind [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 003; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 003 How can technology be a driver for economic growth [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 004; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 004 Microsofts partnership with Open AI [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 005; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 005 Where is Microsoft in the AI ecosystem [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 006; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 006 Year 2 of the Paradigm Shift [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 007; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 007 Scaling Laws [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 008; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 008 Small Language Models [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 009; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 009 Tech in Geopolitics [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 010; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 010 Quantum Computing [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 011; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 011 Gaming [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 012; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 012 Changes at Microsoft [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 013; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 013 Empathy [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 014; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 014 Lifes experiences [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 015; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 015 Empathy and execution [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 016; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 016 Humility [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 017; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 017 Poetry [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 018; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 018 Advice to young people [yNuLPWu38IU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


id is: 8USI98_5GeU&ab_channel=TalkingSasquach
in download_audio
[youtube] Extracting URL: https://www.youtube.com/watch?v=8USI98_5GeU&ab_channel=TalkingSasquach
[youtube] 8USI98_5GeU: Downloading webpage
[youtube] 8USI98_5GeU: Downloading ios player API JSON
[youtube] 8USI98_5GeU: Downloading android player API JSON
[youtube] 8USI98_5GeU: Downloading m3u8 information
[info] 8USI98_5GeU: Downloading 1 format(s): 251
[info] Writing video metadata as JSON to: Momentum Firmware for Flipper Zero ：  The Next Generation of Flipper Custom Firmware!! [8USI98_5GeU].info.json




[download] Destination: Momentum Firmware for Flipper Zero ：  The Next Generation of Flipper Custom Firmware!! [8USI98_5GeU].webm
[download] 100% of   12.21MiB in 00:00:00 at 20.01MiB/s    
[ExtractAudio] Destination: Momentum Firmware for Flipper Zero ：  The Next Generation of Flipper Custom Firmware!! [8USI98_5GeU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Deleting original file Momentum Firmware for Flipper Zero ：  The Next Generation of Flipper Custom Firmware!! [8USI98_5GeU].webm (pass -k to keep)
[SplitChapters] Re-encoding "Momentum Firmware for Flipper Zero ：  The Next Generation of Flipper Custom Firmware!! [8USI98_5GeU].mp3" with appropriate keyframes


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Splitting video by chapters; 8 chapters found
[SplitChapters] Chapter 001; Destination: Momentum Firmware for Flipper Zero ：  The Next Generation of Flipper Custom Firmware!! - 001 Intro [8USI98_5GeU].mp3
[SplitChapters] Chapter 002; Destination: Momentum Firmware for Flipper Zero ：  The Next Generation of Flipper Custom Firmware!! - 002 Where to Download [8USI98_5GeU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 003; Destination: Momentum Firmware for Flipper Zero ：  The Next Generation of Flipper Custom Firmware!! - 003 Installation [8USI98_5GeU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 004; Destination: Momentum Firmware for Flipper Zero ：  The Next Generation of Flipper Custom Firmware!! - 004 USB Mass Storage [8USI98_5GeU].mp3
[SplitChapters] Chapter 005; Destination: Momentum Firmware for Flipper Zero ：  The Next Generation of Flipper Custom Firmware!! - 005 Running the Script [8USI98_5GeU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 006; Destination: Momentum Firmware for Flipper Zero ：  The Next Generation of Flipper Custom Firmware!! - 006 Find My Flipper [8USI98_5GeU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 007; Destination: Momentum Firmware for Flipper Zero ：  The Next Generation of Flipper Custom Firmware!! - 007 Video Game Module [8USI98_5GeU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SplitChapters] Chapter 008; Destination: Momentum Firmware for Flipper Zero ：  The Next Generation of Flipper Custom Firmware!! - 008 RGB Sync [8USI98_5GeU].mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


              channel                                            episode  \
0     In Good Company  Satya Nadella - CEO of Microsoft | In Good Com...   
1     In Good Company  Satya Nadella - CEO of Microsoft | In Good Com...   
2     In Good Company  Satya Nadella - CEO of Microsoft | In Good Com...   
3     In Good Company  Satya Nadella - CEO of Microsoft | In Good Com...   
4     In Good Company  Satya Nadella - CEO of Microsoft | In Good Com...   
5     In Good Company  Satya Nadella - CEO of Microsoft | In Good Com...   
6     In Good Company  Satya Nadella - CEO of Microsoft | In Good Com...   
7     In Good Company  Satya Nadella - CEO of Microsoft | In Good Com...   
8     In Good Company  Satya Nadella - CEO of Microsoft | In Good Com...   
9     In Good Company  Satya Nadella - CEO of Microsoft | In Good Com...   
10    In Good Company  Satya Nadella - CEO of Microsoft | In Good Com...   
11    In Good Company  Satya Nadella - CEO of Microsoft | In Good Com...   
12    In Goo

In [251]:
# Write the chapter table to a CSV file, omitting the DataFrame index column.
df.to_csv('ballsack.csv', index=False)