In [None]:
import os
import openai
from dotenv import load_dotenv

# Run python-dotenv's loader first so the environment lookup below can see
# any values it provides.
load_dotenv()

# Configure the OpenAI client from the OPEN_AI_KEY environment variable
# (the value is None when the variable is not set).
openai.api_key = os.environ.get("OPEN_AI_KEY")

Get each video's transcript, or download its audio file if the video has no captions

In [None]:
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
from datetime import date

# Cell-local imports: the caption errors we recover from, and pytube's own
# filename sanitiser (the same one pytube applies to downloaded audio files).
from pytube.helpers import safe_filename
from youtube_transcript_api import NoTranscriptFound, TranscriptsDisabled

# Make sure the transcript folder exists before anything is written to it.
os.makedirs("transcripts", exist_ok=True)

# Metadata for every processed video. Keys are the filesystem-safe title so
# that later cells can look an entry up from a transcript/audio filename;
# the untouched title is kept under "title".
video_obj = {}

# `with` guarantees inputs.txt is closed even if a video fails part-way.
with open("inputs.txt", "r") as videos:
    for video in videos:
        # Each line carries a trailing newline; blank lines are ignored.
        url = video.strip()
        if not url:
            continue

        # Get YouTube video
        yt = YouTube(url)
        safe_title = safe_filename(yt.title)
        video_obj[safe_title] = {
            "title": yt.title,
            "author": yt.author,
            "transcribed_on": date.today(),
        }

        # get_transcript raises (rather than returning an empty result) when
        # a video has captions disabled or no English track, so the audio
        # fallback has to be driven by the exception.
        try:
            caption = YouTubeTranscriptApi.get_transcript(yt.video_id, languages=['en'])
        except (TranscriptsDisabled, NoTranscriptFound):
            caption = None

        if not caption:
            # No usable captions: download an audio-only mp4 for local
            # transcription. NOTE(review): the original comment states the
            # last stream is guaranteed to be English - not verified here.
            streams = yt.streams.filter(only_audio=True, file_extension="mp4")
            if len(streams) == 0:
                print("No captions or mp4 audio stream available for: " + yt.title)
                continue
            streams[-1].download("audio_files")
        else:
            # Join caption segments with spaces so words at segment
            # boundaries do not run together.
            transcript_path = os.path.join("transcripts", safe_title + ".txt")
            with open(transcript_path, "w", encoding="utf-8") as f:
                f.write(" ".join(obj['text'] for obj in caption))

Transcribe audio files (if they exist)

In [None]:
import subprocess
import sys

# Folders used by this cell; the transcript folder is created on demand.
audio_dir = "audio_files"
transcript_dir = "transcripts"
os.makedirs(transcript_dir, exist_ok=True)

# The audio folder only exists when at least one video needed an audio
# download, so a missing folder simply means there is nothing to transcribe.
audio_names = os.listdir(audio_dir) if os.path.isdir(audio_dir) else []

for filename in audio_names:
    # splitext removes only the trailing extension (str.replace would also
    # strip ".mp4" occurring in the middle of a title).
    title, extension = os.path.splitext(filename)
    if extension != ".mp4":
        continue

    # convert mp4 file to wav (16 kHz, mono, 16-bit PCM)
    wav_path = os.path.join(audio_dir, title + ".wav")
    subprocess.run([
        'ffmpeg',
        '-y',  # overwrite an existing wav instead of prompting on re-runs
        '-i', os.path.join(audio_dir, filename),
        '-ar', '16000',
        '-ac', '1',
        '-c:a', 'pcm_s16le',
        wav_path
    ], check=True)  # stop here if the conversion failed

    # call local whisper model; the binary runs from ../whisper.cpp, so the
    # input is passed as an absolute path rather than assuming the name of
    # this project's folder.
    result = subprocess.check_output([
        './main',
        '-f', os.path.abspath(wav_path)
    ], cwd="../whisper.cpp")

    # NOTE(review): assumes the whisper binary emits UTF-8; undecodable
    # bytes are replaced rather than aborting the run.
    with open(os.path.join(transcript_dir, title + ".txt"), "w", encoding="utf-8") as f:
        f.write(result.decode("utf-8", errors="replace").strip())

print(video_obj)

Summarize Transcripts

In [None]:
# Path to your 'transcripts' directory
directory_path = 'transcripts'

# Model and prompts shared by every summarisation request.
SUMMARY_MODEL = "gpt-3.5-turbo"
SYSTEM_PROMPT = "You are an investment analyst for a family office who has been tasked with summarizing podcast transcripts for your boss so he can make strategic investment insights."
SUMMARY_REQUEST = "Please summarize the following podcast transcript for me in as much detail as possible - approximately 4000 words. Focus on the key points discussed, including the main arguments, insights, and takeaways. Additionally, identify any interesting quotes or counter-arguments presented. Please present in very detailed bullet point form, be sure to never miss a single point"
SECTION_REQUEST = "there should be two sections in the output, the high level takeaways of each subject which includes key quotes, and then a detailed breakdown section as noted by the next prompt, try to stay away from generally saying what was talked about and instead focus on the specific details mentioned with some quotes and further information"

# The formatting example is the same for every transcript, so it is read once
# (and the file closed) instead of being reopened on each loop iteration.
with open(os.path.join('other', 'example.txt'), 'r') as example_file:
    combined_message = "use the following to help dictate formatting and how to structure answers, do not use any of the specific content" + "\n" + example_file.read()


def _summarize_chunk(chunk):
    """Request a summary of one transcript chunk from the chat model.

    Args:
        chunk: The transcript text to summarise.

    Returns:
        The text of the first completion choice, or an empty string when the
        response carries no content.
    """
    response = openai.chat.completions.create(
        model=SUMMARY_MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": SUMMARY_REQUEST},
            {"role": "user", "content": chunk},
            {"role": "user", "content": SECTION_REQUEST},
            # Disabled alternative formatting prompt, kept from the original cell:
            #  {"role": "user", "content": "it should include a section of high level key takeaways, some of which are crucial quotes but also include some specific takeaways from the whole thing - approximately 10 of them, then a section of a deep breakdown of the entire transcript that includes a header for each subject that is talked about in the podcast and include 5 detailed bullets for each subject, try to stay away from generally saying what was talked about and instead focus on the specific details mentioned with some quotes and further information"},
            {"role": "user", "content": combined_message},
        ],
    )
    return response.choices[0].message.content or ""


# Iterate over each file in the directory
for filename in os.listdir(directory_path):
    # Only .txt files are transcripts
    if not filename.endswith('.txt'):
        continue

    # Resolve the video's metadata before spending any API calls: the summary
    # is filed under the author's name, so a transcript with no metadata
    # cannot be saved.
    title = os.path.splitext(filename)[0]
    video_info = video_obj.get(title)
    if video_info is None:
        print("Skipping " + filename + ": no entry in video_obj for this transcript")
        continue

    with open(os.path.join(directory_path, filename), 'r', encoding='utf-8') as file:
        content = file.read()

    # Split the transcript into two halves and summarise each one separately.
    midpoint = len(content) // 2  # integer division to find the middle index
    first_summary = _summarize_chunk(content[:midpoint])
    print(first_summary)
    print("first half done")
    second_summary = _summarize_chunk(content[midpoint:])
    print(second_summary)

    # makedirs also creates the top-level "summaries" folder when missing.
    author_dir = os.path.join("summaries", video_info['author'])
    os.makedirs(author_dir, exist_ok=True)

    # A blank line keeps the end of the first half from fusing with the start
    # of the second half.
    with open(os.path.join(author_dir, filename), "w", encoding="utf-8") as f:
        f.write(first_summary + "\n\n" + second_summary)