In [147]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import requests
import time
from youtube_transcript_api import YouTubeTranscriptApi
import re
import yt_dlp
import json
import glob

In [148]:
AUDIO_DIR = "audio"

In [149]:
def get_transcript(id: str) -> None:
    transcript = YouTubeTranscriptApi.get_transcript(
        id, languages=('en', 'en-US', 'en-GB'))
    return transcript

In [150]:
def download_audio(url):
    print("in download_audio")
    try:
        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            },
            {
                'key': 'FFmpegSplitChapters',  # Adding split chapters processor
                'force_keyframes': True,  # Optional: force keyframes at the start of each chapter
                }],
            'noplaylist': True,
            'writeinfojson': True,  # Optional: write metadata into a JSON file
            'writeannotations': True,  # Optional: write annotations into a file
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except Exception as e:
        print("Error downloading audio: ", e)

In [151]:
def extract_text(transcript, start_time, end_time):
    # Initialize an empty string to hold the extracted text
    extracted_text = ""
    
    # Loop through each entry in the transcript
    for entry in transcript:
        # Calculate the end time of the current entry
        entry_end_time = entry['start'] + entry['duration']
        
        # Check if the entry overlaps with the given time range
        if entry['start'] < end_time and entry_end_time > start_time:
            # Add the text to the extracted text string
            extracted_text += entry['text'] + " "
    
    return extracted_text.strip()


In [152]:
def main():
    driver = webdriver.Chrome()

    all_results = []
    video_urls = []
    video_urls.append({"link": "https://www.youtube.com/watch?v=yNuLPWu38IU&ab_channel=NorgesBankInvestmentManagement", "channel": "In Good Company", "episode": "Satya Nadella - CEO of Microsoft | In Good Company | Podcast | Norges Bank Investment Management"})

    episodes_to_transcripts = {}

    for video in video_urls:
        episode = video['episode']
        channel = video['channel']
        id = video['link'].split("v=")[1]
        print("id is: " + id)

        if episode not in episodes_to_transcripts:
            transcript_data = get_transcript(id)
            episodes_to_transcripts[episode] = transcript_data
        
        # Downloads audio and splits into chapters
        download_audio(video['link'])

        files = glob.glob(f'{episode[:10]}*.json')
        if not files:
            raise FileNotFoundError("No JSON file found matching the pattern.")
        json_file = files[0]  # Assuming you want the first matching file
        with open(json_file, 'r') as file:
            data = json.load(file)
            chapters = data['chapters']
            print("length of chapters: " + str(len(chapters)))
        count = 0

        for chapter in chapters:
            print("start time: " + str(chapter['start_time']))
            print("end time: " + str(chapter['end_time']))
            # Identify corresponding portion of transcript
            extracted_text = extract_text(episodes_to_transcripts[episode], chapter['start_time'], chapter['end_time'])

            chapter_dict = {
            "channel": channel,
            "episode": episode,
            "chapter": chapter['title'],
            "start_time": chapter['start_time'],
            "end_time": chapter['end_time'],
            "text": extracted_text,
            "file_name_prefix": episode + " - " + "00" + str(count)
            }
            count += 1
    # Here, assuming chapter titles are unique, using title as key:
            all_results.append(chapter_dict)
        

    driver.quit()
    return pd.DataFrame(all_results)

# Run the script
df = main()
print(df)

id is: yNuLPWu38IU&ab_channel=NorgesBankInvestmentManagement
in download_audio
[youtube] Extracting URL: https://www.youtube.com/watch?v=yNuLPWu38IU&ab_channel=NorgesBankInvestmentManagement
[youtube] yNuLPWu38IU: Downloading webpage
[youtube] yNuLPWu38IU: Downloading ios player API JSON
[youtube] yNuLPWu38IU: Downloading android player API JSON
[youtube] yNuLPWu38IU: Downloading m3u8 information
[info] yNuLPWu38IU: Downloading 1 format(s): 251
[info] Writing video metadata as JSON to: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].info.json




[download] Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].webm
[download] 100% of   31.81MiB in 00:00:01 at 25.48MiB/s    
[ExtractAudio] Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].mp3
Deleting original file Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].webm (pass -k to keep)
[SplitChapters] Re-encoding "Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management [yNuLPWu38IU].mp3" with appropriate keyframes
[SplitChapters] Splitting video by chapters; 18 chapters found
[SplitChapters] Chapter 001; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Company ｜ Podcast ｜ Norges Bank Investment Management - 001 Introduction [yNuLPWu38IU].mp3
[SplitChapters] Chapter 002; Destination: Satya Nadella - CEO of Microsoft ｜ In Good Comp

In [153]:
df.to_csv('ballsack.csv', index=False)