In [None]:
!pip install youtube-transcript-api yt-dlp faster-whisper

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
import yt_dlp
from faster_whisper import WhisperModel
import os
import torch

In [None]:
def _extract_video_id(video_url):
    """Return the YouTube video ID embedded in ``video_url``.

    Handles ``watch?v=<id>`` URLs, ``youtu.be/<id>`` short links and
    ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>`` paths.
    Anything else (for example a bare video ID) goes through the original
    ``v=`` split, so inputs that worked before yield the same result.
    """
    # Local import keeps this cell self-contained without touching the
    # notebook's import cell.
    from urllib.parse import parse_qs, urlparse

    candidate = video_url.strip()
    parsed = urlparse(candidate)

    # Standard watch URLs carry the ID in the "v" query parameter; urlparse
    # has already split off any "#fragment".
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    host = parsed.netloc.lower()
    segments = [part for part in parsed.path.split("/") if part]
    # Scheme-less input such as "youtu.be/<id>" ends up entirely in the path.
    if not host and segments and "." in segments[0]:
        host = segments.pop(0).lower()

    if host.endswith("youtu.be") and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
        return segments[1]

    # Fallback: the original extraction logic.
    return candidate.split("v=")[-1].split("&")[0]


def get_transcript(video_url, model_size="medium"):
    """Return the transcript of a YouTube video as a single string.

    The published captions are tried first via ``YouTubeTranscriptApi``. If
    that fails for any reason, the audio is downloaded with yt-dlp and
    transcribed locally with Faster-Whisper.

    Args:
        video_url: URL of the video. The video ID is extracted from it for
            the caption lookup; the URL itself is handed to yt-dlp.
        model_size: Faster-Whisper model name used by the speech-to-text
            fallback.

    Returns:
        The caption snippets (or Whisper segments) joined with single spaces.

    Side effects:
        Prints which path produced the transcript. The fallback writes
        ``audio.mp3`` into the current working directory and leaves it there.
    """
    video_id = _extract_video_id(video_url)

    try:
        # Try YouTube captions first. Older youtube-transcript-api releases
        # expose a static ``get_transcript``; newer ones only offer the
        # instance method ``fetch``, whose result converts to the same
        # list-of-dicts shape through ``to_raw_data()``.
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
        else:
            transcript = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
        text = " ".join(t["text"] for t in transcript)
        print("Transcript fetched from YouTube captions")
        return text
    except Exception as e:
        # Deliberate best-effort: any caption failure triggers the fallback.
        print(f"No captions available. Falling back to STT. Reason: {e}")

    # Download the best audio stream and convert it to MP3 through yt-dlp's
    # FFmpegExtractAudio post-processor.
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": "audio.%(ext)s",
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "mp3",
            "preferredquality": "192",
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

    # The post-processor stores its result under the template name with the
    # preferred codec as extension.
    audio_file = "audio.mp3"

    # Load Faster-Whisper on the GPU when one is available.
    model = WhisperModel(model_size, device="cuda" if torch.cuda.is_available() else "cpu")

    segments, _ = model.transcribe(audio_file, beam_size=5)

    # Strip each segment so that joining with " " does not double up spaces
    # when a segment's text already carries leading whitespace.
    text = " ".join(seg.text.strip() for seg in segments)
    print("Transcript generated with Faster-Whisper")
    return text


In [None]:

# Example usage
# NOTE: "Video Url" is a placeholder; replace it with a real YouTube link
# before running this cell.
video_url = "Video Url"
transcript_text = get_transcript(video_url)

# Persist the transcript so later cells can reload it from disk.
with open("final_transcript.txt", "w", encoding="utf-8") as f:
    f.write(transcript_text)

In [None]:
# Reload the saved transcript from disk into `transcript_text`.
with open("final_transcript.txt", mode="r", encoding="utf-8") as transcript_file:
    transcript_text = transcript_file.read()

# Confirmation only; the transcript itself is not echoed here.
print("Transcript read from file.")

In [None]:
# Number of characters in the transcript, shown as the cell output.
len(transcript_text)

In [None]:
# Display the entire transcript string as the cell output.
transcript_text

In [None]:
# Greedy word-based chunking: words are appended (each followed by a space)
# to the current chunk until adding the next word would push it past
# max_chunk_size characters, at which point a new chunk is started.
words = transcript_text.split()
chunks = []
current_chunk = ""
max_chunk_size = 800  # upper bound on chunk length in characters, trailing space included

for word in words:
    if len(current_chunk) + len(word) + 1 <= max_chunk_size:
        current_chunk += word + " "
    else:
        # Guard against emitting an empty chunk: this happens when the current
        # chunk is still empty and the word alone exceeds max_chunk_size.
        if current_chunk:
            chunks.append(current_chunk)
        # An over-long word still becomes its own (over-sized) chunk.
        current_chunk = word + " "

# Flush whatever is left after the last word.
if current_chunk:
    chunks.append(current_chunk)

print(f"Transcript split into {len(chunks)} chunks.")

In [None]:
# Inspect the first chunk (raises IndexError if `chunks` is empty).
chunks[0]

In [None]:
from transformers import pipeline
import torch

# Inference device for the summarizer. It is pinned to the CPU for debugging;
# the GPU-aware alternative is kept here for reference:
# device = 0 if torch.cuda.is_available() else -1
device = "cpu"

summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device,
)

# Summarize every chunk on its own. A chunk that raises is reported and
# skipped, so one bad chunk does not abort the whole run.
final_summary_chunks = []
for i, chunk in enumerate(chunks):
    try:
        result = summarizer(chunk, max_length=100, min_length=30, do_sample=False)
        chunk_summary = result[0]['summary_text']
    except Exception as e:
        print(f"Error summarizing chunk {i+1}: {e}")
    else:
        final_summary_chunks.append(chunk_summary)
        print(f"Successfully summarized chunk {i+1}")

In [None]:
# Display the list of per-chunk summaries as the cell output.
final_summary_chunks

In [None]:
# Stitch the per-chunk summaries into one string, separated by single spaces.
combined_summary = " ".join(final_summary_chunks)
print(combined_summary)

In [None]:
!pip install edge-tts

import edge_tts
import asyncio
from IPython.display import Audio

In [None]:
async def tts_edge(text, filename="podcast.mp3", voice="en-US-JennyNeural"):
    """Synthesize ``text`` to speech with edge-tts and save it to ``filename``.

    Args:
        text: The text to speak. Must contain at least one non-whitespace
            character.
        filename: Path of the audio file to write.
        voice: Name of the edge-tts voice to use. Defaults to the voice that
            was previously hard-coded, so existing calls behave the same.

    Raises:
        ValueError: If ``text`` is empty or whitespace only. Checked up front
            so the failure is reported before any synthesis request is made.
    """
    if not text or not text.strip():
        raise ValueError("tts_edge() needs non-empty text to synthesize")

    communicate = edge_tts.Communicate(text, voice=voice)
    await communicate.save(filename)

# Relies on the notebook's support for top-level `await`. No filename is
# passed, so the audio is written to tts_edge's default, "podcast.mp3".
await tts_edge(combined_summary)
# Embed an audio player for the generated file as the cell output.
Audio("podcast.mp3")