In [1]:
# Installations required for Colab
!pip install pytube pydub gradio
!apt-get install ffmpeg
!pip install git+https://github.com/linto-ai/whisper-timestamped
!pip install nltk


from pytube import YouTube
from pydub import AudioSegment
import os
import whisper_timestamped as whisper
import nltk
import gradio as gr

# Fetch the NLTK 'punkt' tokenizer data at cell start-up.
# NOTE(review): none of the functions visible in this cell use nltk — confirm
# whether this download (and the nltk install/import) is still needed.
nltk.download('punkt')

# Function to download and extract audio from YouTube
def download_and_extract_audio(video_url, output_audio_file):
    """Download the audio track of a YouTube video and save it as a WAV file.

    The first audio-only stream is downloaded to a temporary file, converted
    to WAV with pydub, and the temporary download is removed afterwards.

    Args:
        video_url: URL of the YouTube video.
        output_audio_file: Path the converted WAV file is written to.

    Raises:
        ValueError: If the video offers no audio-only stream.
    """
    # Download the audio-only stream
    yt = YouTube(video_url)
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        # .first() yields None for an empty query; fail with a clear message
        # instead of an AttributeError on None.download().
        raise ValueError(f"No audio-only stream available for {video_url!r}")
    audio_file = stream.download()

    try:
        # Convert the audio file to WAV format using pydub
        audio = AudioSegment.from_file(audio_file)
        audio.export(output_audio_file, format="wav")
    finally:
        # Clean up the original download even when the conversion fails
        os.remove(audio_file)

# Function to transcribe the audio using whisper-timestamped
def transcribe_audio(audio_file):
    """Transcribe an audio file as English with the "base" model.

    Args:
        audio_file: Path of the audio file handed to ``whisper.load_audio``.

    Returns:
        The value returned by ``whisper.transcribe``; ``semantic_chunking``
        reads its "segments" entry.
    """
    # The model is loaded on every call, before the audio is read.
    base_model = whisper.load_model("base")
    waveform = whisper.load_audio(audio_file)
    return whisper.transcribe(base_model, waveform, language="en")

# Function for semantic chunking of the transcript
def semantic_chunking(transcription_result, max_duration=15):
    """Group consecutive transcript segments into chunks of bounded duration.

    Segments are appended to the running chunk until adding the next one would
    push the summed segment durations past ``max_duration``; the running chunk
    is then closed and that segment opens a new one. A single segment longer
    than ``max_duration`` is never split: it becomes a chunk on its own.

    Args:
        transcription_result: Mapping with a "segments" list; every segment is
            a mapping with "start", "end" and "text" entries.
        max_duration: Upper bound on the summed segment durations of one
            chunk, in the same unit as the segment timestamps.

    Returns:
        A list of dicts, in transcript order, with the keys "chunk_id",
        "chunk_length", "text", "start_time" and "end_time". "chunk_length" is
        ``end_time - start_time``, so it includes any gaps between segments
        and may exceed ``max_duration``.
    """
    chunks = []

    def _flush(start, end, text):
        # Emit one output record; ids simply follow the output order.
        chunks.append({
            "chunk_id": len(chunks),
            "chunk_length": end - start,
            "text": text.strip(),
            "start_time": start,
            "end_time": end
        })

    start = end = None
    text = ""
    duration = 0

    for segment in transcription_result["segments"]:
        segment_duration = segment["end"] - segment["start"]

        # Close the running chunk only when it already holds a segment.
        # Flushing an empty chunk (e.g. when the very first segment is longer
        # than max_duration) would evaluate None - None and raise TypeError.
        if start is not None and duration + segment_duration > max_duration:
            _flush(start, end, text)
            start, end, text, duration = None, None, "", 0

        if start is None:
            start = segment["start"]
        end = segment["end"]
        text += segment["text"] + " "
        duration += segment_duration

    # Emit the trailing chunk unless it carries no text at all.
    if text.strip():
        _flush(start, end, text)

    return chunks

# Function to process the YouTube video URL
def process_video_url(video_url):
    """Run the whole pipeline for one video: download, transcribe, chunk.

    The extracted audio is written to 'audio.wav' in the working directory.

    Args:
        video_url: URL of the YouTube video to process.

    Returns:
        The chunk list produced by ``semantic_chunking``.
    """
    wav_path = 'audio.wav'
    download_and_extract_audio(video_url, wav_path)
    return semantic_chunking(transcribe_audio(wav_path))

# Gradio interface
def gradio_interface(video_url):
    """Render the chunks of a video's transcript as one Markdown string.

    Each chunk contributes a line with its id and length, a line with its
    start and end times, and its text followed by a blank line.

    Args:
        video_url: URL of the YouTube video, as typed into the UI.

    Returns:
        The concatenated Markdown for all chunks ("" when there are none).
    """
    rendered = [
        f"**Chunk ID:** {entry['chunk_id']} **Length:** {entry['chunk_length']}s\n"
        f"**Start:** {entry['start_time']} **End:** {entry['end_time']}\n"
        f"{entry['text']}\n\n"
        for entry in process_video_url(video_url)
    ]
    return "".join(rendered)

# Build the Gradio UI: one text input (the video URL) that is passed to
# gradio_interface, whose returned string is shown as Markdown.
iface = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs="markdown",
    title="YouTube Video Transcription",
    description="Enter the YouTube video URL to get the transcribed text in chunks.",
)

# Start the app. In the recorded Colab run, Gradio switched on share=True by
# itself and printed a public URL; its log also suggests launch(debug=True) to
# surface errors inside the notebook.
iface.launch()


Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 57.6/57.6 kB 967.5 kB/s eta 0:00:00
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting gradio
  Downloading gradio-4.37.2-py3-none-any.whl (12.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.3/12.3 MB 27.3 MB/s eta 0:00:00
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 92.0/92.0 kB 4.7 MB/s eta 0:00:00
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... done
Collecting gradio-client==1.0.2 (from gradio)
  Downloading gradio_client-1.0.2-py3-non… [output truncated]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://745a13968fa613446c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


