<a href="https://colab.research.google.com/github/hoodini/openai-whisper-colab-by-yuval-avidani/blob/main/OpenAI_Whisper_by_Yuval_Avidani.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **OpenAI's Whisper transcriber by Yuval Avidani - יובל אבידני**

**Please support with Beer: https://linktree.com/hackit.co.il**

**IMPORTANT: V100/A100 GPU IS REQUIRED TO USE THIS NOTEBOOK! OTHERWISE THE NOTEBOOK WILL CRASH AND WILL SHOW CUDA MEMORY ERROR MESSAGES**

This notebook has the following capabilities:

1. Select between YouTube URL and Media Files Upload
2. YouTube videos are downloaded and converted to WAV
3. Uploaded media files are also being converted to WAV
4. File size check is made to adhere to Whisper's file limit size of 25MB
5. If the file is larger, the notebook uses Smart Chunking
6. It then transcribes each chunk and concatenates the results into one SRT / TXT file
7. The files can be downloaded using the last cell

**Instructions:**
1. Run cells 1-2
2. Run cell 3 and note to choose your media source (YouTube URL / Upload Media File)
3. Run cell 4 to get transcription in SRT / TXT format
4. Run cell 5 to download SRT / TXT
Note: the files can also be downloaded from the file explorer on the sidebar.

Enjoy!

**Don't forget to stand with the truth! Stand with Israel against Hamas!**

# **1. Install dependencies and import packages**

In [1]:
!pip install git+https://github.com/openai/whisper.git
!pip install pydub
!pip install tqdm
!pip install moviepy
!pip install ipywidgets
!pip install pytube

import ipywidgets as widgets
from IPython.display import display, clear_output
import whisper
import os
from pydub import AudioSegment
from google.colab import files
import moviepy.editor as mp
from pytube import YouTube
from tqdm.notebook import tqdm
import time
import wave

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-ipgh8o29
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-ipgh8o29
  Resolved https://github.com/openai/whisper.git to commit 1cea4357687b676b293cb5473e1ade25f5b1cef7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20231106)
  Downloading tiktoken-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20231106-py3-none-a

# **2. Setting up the logic**

In [2]:
def get_wav_duration(filename):
    """Return the playing time of a WAV file in seconds.

    The duration is the number of frames divided by the frame rate, both read
    from the file through the standard ``wave`` module.
    """
    with wave.open(filename, 'r') as wav_reader:
        frame_total = wav_reader.getnframes()
        frames_per_second = wav_reader.getframerate()
    return frame_total / float(frames_per_second)

def combine_transcriptions(transcriptions, chunk_filenames):
    """Merge per-chunk transcriptions into one timeline.

    Each transcription is a dict with a 'segments' list. Every segment is
    copied and its 'start'/'end' values are shifted by the total duration of
    the chunks that precede it, where a chunk's duration is read from its WAV
    file with ``get_wav_duration``.

    Returns a dict ``{'segments': [...]}`` holding the shifted copies; the
    input segments are not modified.
    """
    merged = []
    elapsed = 0.0  # seconds of audio covered by the chunks processed so far
    last_index = len(chunk_filenames) - 1

    for index, result in enumerate(transcriptions):
        for original in result['segments']:
            shifted = original.copy()
            shifted['start'] += elapsed
            shifted['end'] += elapsed
            merged.append(shifted)

        # The final chunk has nothing after it, so its length is not needed
        if index < last_index:
            elapsed += get_wav_duration(chunk_filenames[index])

    return {'segments': merged}

def convert_to_wav(filename):
    """Convert an audio or video file to WAV and return the WAV path.

    The output is written next to the input, using the input's base name with
    a ``.wav`` extension. Audio inputs (.mp3, .ogg, .m4a, .wav) are re-exported
    through pydub; video inputs (.mp4, .mov, .avi, .mpeg) have their audio
    track written out through moviepy.

    Raises:
        ValueError: if the extension is not one of the supported formats, or
            if a video file has no audio track.
    """
    file_name, file_extension = os.path.splitext(filename)
    file_extension = file_extension.lower()

    output_filename = f"{file_name}.wav"

    if file_extension in ['.mp3', '.ogg', '.m4a', '.wav']:
        audio = AudioSegment.from_file(filename)
        audio.export(output_filename, format="wav")
    elif file_extension in ['.mp4', '.mov', '.avi', '.mpeg']:
        video_clip = mp.VideoFileClip(filename)
        audio_clip = None
        try:
            audio_clip = video_clip.audio
            if audio_clip is None:
                # Fail with a clear message instead of an AttributeError on None
                raise ValueError(f"No audio track found in: {filename}")
            audio_clip.write_audiofile(output_filename)
        finally:
            # Release the clips even when the export fails
            if audio_clip is not None:
                audio_clip.close()
            video_clip.close()
    else:
        raise ValueError("Unsupported file format")

    return output_filename

def format_as_srt(segments):
    """Render transcription segments as SRT text.

    Each segment's text is cut into cues of at most 10 words, shown as two
    lines of up to 5 words each. A segment's time span is shared out across
    its cues in proportion to the number of words consumed so far, and each
    cue starts where the previous cue of the same segment ended.
    """
    entries = []

    for segment in segments:
        cue_start = format_timestamp(segment["start"])
        words = segment["text"].split()
        word_total = len(words)

        for first in range(0, word_total, 10):
            stop = min(first + 10, word_total)
            # Interpolate the cue end from the share of words already shown
            cue_end = format_timestamp(
                segment["start"] + (segment["end"] - segment["start"]) * stop / word_total
            )

            top_line = " ".join(words[first:first + 5])
            bottom_line = " ".join(words[first + 5:stop])
            caption = f"{top_line}\n{bottom_line}" if bottom_line else f"{top_line}"

            entries.append(f"{len(entries) + 1}\n{cue_start} --> {cue_end}\n{caption}\n")
            cue_start = cue_end  # the next cue begins where this one stops

    return "\n".join(entries)

def format_as_text(segments):
    """Return the plain transcript: each segment's text on its own line."""
    lines = []
    for segment in segments:
        lines.append(segment["text"])
    return "\n".join(lines)

def format_timestamp(seconds):
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds before being split into
    fields. Truncating the fractional part of a float instead loses a
    millisecond on common values (2.3 s would render as ``,299``).
    Negative inputs are clamped to zero.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{milliseconds:03}"

def split_wav_file(filename, max_size_mb=20, size_limit_mb=25):
    """Split a WAV file into chunks and return the list of files to transcribe.

    Args:
        filename: path of the WAV file.
        max_size_mb: maximum amount of audio data per chunk, in MB.
        size_limit_mb: files whose size on disk is at most this many MB are
            not split.

    Returns:
        ``[filename]`` when the file is within ``size_limit_mb``, so that small
        files are still transcribed; otherwise the list of chunk files
        written next to the original, in playback order.
    """
    file_size_mb = os.path.getsize(filename) / 1024 / 1024

    if file_size_mb <= size_limit_mb:
        # Nothing to split: the original file is the one and only "chunk"
        return [filename]

    chunk_filenames = []
    with wave.open(filename, 'rb') as wav:
        frames_per_second = wav.getframerate()
        channels = wav.getnchannels()
        sampwidth = wav.getsampwidth()
        # int() keeps the range() step an integer for fractional max_size_mb
        max_bytes = int(max_size_mb * 1024 * 1024)

        # One frame holds one sample for every channel
        bytes_per_frame = channels * sampwidth
        # Always advance by at least one frame so the loop step is never zero
        max_frames_per_chunk = max(1, max_bytes // bytes_per_frame)

        frame_count = wav.getnframes()

        print(f"Splitting: {filename}")
        print(f"Frames per second: {frames_per_second}, Channels: {channels}, Sample width: {sampwidth}")
        print(f"Max frames per chunk: {max_frames_per_chunk}, Total frames: {frame_count}")

        for i in range(0, frame_count, max_frames_per_chunk):
            chunk_filename = f"{filename}_chunk_{i}.wav"
            chunk_filenames.append(chunk_filename)

            with wave.open(chunk_filename, 'wb') as chunk:
                chunk.setnchannels(channels)
                chunk.setsampwidth(sampwidth)
                chunk.setframerate(frames_per_second)
                frames_to_write = min(max_frames_per_chunk, frame_count - i)
                # readframes continues from where the previous chunk stopped
                chunk.writeframes(wav.readframes(frames_to_write))
                print(f"Created chunk: {chunk_filename}, Frames: {frames_to_write}")

    return chunk_filenames

# Function to upload a file
def upload_file():
    """Prompt for a file upload inside the output widget.

    Returns the name of the first uploaded file after printing its size, or
    None when nothing was uploaded.
    """
    with output:
        clear_output()
        uploaded = files.upload()
        if not uploaded:
            return None

        filename = next(iter(uploaded))
        size_mb = os.path.getsize(filename) / (1024 * 1024)
        print(f"Uploaded File: {filename}, Size: {size_mb:.2f} MB")
        return filename

# Function to download a video from YouTube with progress bar
def download_youtube_video(url):
    """Download a YouTube video as MP4 and return the local file path.

    The highest-resolution progressive MP4 stream is selected. Progress is
    shown through the module-level ``progress_bar``, which ``on_progress``
    updates during the download.

    Returns:
        The path of the downloaded file, or None when the video offers no
        progressive MP4 stream.
    """
    global progress_bar

    with output:
        clear_output()
        yt = YouTube(url, on_progress_callback=on_progress)

        stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
        if stream is None:
            # first() yields None for an empty query; report it instead of crashing
            print("No downloadable MP4 stream was found for this video.")
            return None

        # Create the bar before downloading because on_progress writes to it
        progress_bar = tqdm(total=100, desc='Downloading', unit='%')
        try:
            filename = stream.download()
        finally:
            # Close the bar even if the download fails so it does not linger
            progress_bar.close()

        size_mb = os.path.getsize(filename) / (1024 * 1024)
        print(f"Downloaded Video: {filename}, Size: {size_mb:.2f} MB")
        return filename

# Progress callback function for YouTube download
def on_progress(stream, chunk, bytes_remaining):
    """Download callback: set the global progress bar to the percent done.

    The percentage is derived from the stream's total size and the number of
    bytes still to be fetched; ``chunk`` is not used.
    """
    total_size = stream.filesize
    fraction_done = (total_size - bytes_remaining) / total_size
    progress_bar.n = fraction_done * 100
    progress_bar.refresh()


# Handlers for the UI elements
def handle_upload_button_click(b):
    """Click handler for the upload button.

    Stores the uploaded file's name in ``global_filename`` and, when a file
    was actually uploaded, passes it on to ``process_file``.
    """
    global global_filename
    global_filename = upload_file()

    if not global_filename:
        return
    process_file(global_filename)

def handle_download_button_click(b):
    """Click handler for the YouTube download button.

    Downloads the URL typed into ``youtube_input``, stores the resulting file
    name in ``global_filename`` and, when a file was obtained, passes it on to
    ``process_file``.
    """
    global global_filename
    global_filename = download_youtube_video(youtube_input.value)

    if not global_filename:
        return
    process_file(global_filename)

def process_file(filename):
    """Run a media file through ``handle_media_file`` and report the outcome.

    On success ``global_filename`` is pointed at the audio file returned by
    the conversion; on failure it is left as it was.
    """
    global global_filename

    audio, audio_filename = handle_media_file(filename)
    if not audio:
        print("Conversion failed.")
        return

    print("Conversion successful.")
    # Later cells read the audio file, not the original upload/download
    global_filename = audio_filename

def on_dropdown_change(change):
    """Show only the controls that belong to the selected media source.

    'upload' reveals the upload button, 'youtube' reveals the URL box and the
    download button, and any other value hides all three.
    """
    selection = change['new']
    upload_state = 'visible' if selection == 'upload' else 'hidden'
    youtube_state = 'visible' if selection == 'youtube' else 'hidden'

    upload_button.layout.visibility = upload_state
    youtube_input.layout.visibility = youtube_state
    download_button.layout.visibility = youtube_state

def handle_media_file(filename):
    """Load a media file as audio, extracting the audio track from videos.

    Audio files (.mp3, .wav, .ogg, .m4a) are loaded as they are. For video
    files (.mov, .avi, .mpeg, .mp4) the audio track is first written to a WAV
    file next to the video and then loaded from there.

    Returns:
        ``(audio, audio_filename)`` on success, where ``audio`` is a pydub
        ``AudioSegment``; ``(None, None)`` when the format is unsupported or
        anything fails, after printing the error.
    """
    try:
        file_name, file_extension = os.path.splitext(filename)
        file_extension = file_extension.lower()

        if file_extension in ['.mp3', '.wav', '.ogg', '.m4a']:
            return AudioSegment.from_file(filename), filename
        elif file_extension in ['.mov', '.avi', '.mpeg', '.mp4']:
            video = mp.VideoFileClip(filename)
            try:
                audio = video.audio
                audio_filename = f"{file_name}.wav"
                audio.write_audiofile(audio_filename)
            finally:
                # Release the clip whether or not the export succeeded
                video.close()
            return AudioSegment.from_file(audio_filename), audio_filename
        else:
            raise ValueError("Unsupported file format")
    except Exception as e:
        # Best effort by design: callers test the result for None
        print(f"An error occurred: {e}")
        return None, None

# Function to estimate the chunk duration based on file size
def estimate_chunk_duration(file_size_bytes, total_duration_ms, target_chunk_size_mb=25):
    """Estimate how many milliseconds of audio fit into the target chunk size.

    The file's average bitrate is computed from its size and duration, then
    used to work out how long a chunk of ``target_chunk_size_mb`` megabytes
    would play for. The result is an int and never less than 1.
    """
    duration_seconds = total_duration_ms / 1000
    bits_per_second = (file_size_bytes * 8) / duration_seconds
    chunk_budget_bytes = target_chunk_size_mb * 1024 * 1024
    chunk_ms = int((chunk_budget_bytes * 1000) / bits_per_second)

    # Never return a zero-length chunk
    return chunk_ms if chunk_ms > 1 else 1


# Function to split the audio file into smaller chunks
def split_audio(filename, target_chunk_size_mb=25):
    """Split a media file into WAV chunks of roughly the target size.

    The audio is loaded with ``handle_media_file``, a chunk duration is
    estimated from the source file's size and duration, and consecutive
    slices of that duration are exported as ``<filename>_part<offset>.wav``.

    Returns:
        The list of chunk file names in playback order, or an empty list when
        the file cannot be loaded or contains no audio.
    """
    # handle_media_file returns an (audio, filename) pair; the pair itself is
    # always truthy, so the audio element has to be checked on its own.
    audio, _ = handle_media_file(filename)
    if audio is None:
        print(f"Failed to process the file: {filename}")
        return []

    total_duration_ms = len(audio)
    if total_duration_ms == 0:
        # A zero duration would divide by zero in the bitrate estimate
        print(f"No audio content found in: {filename}")
        return []

    # NOTE(review): the estimate uses the size of the source file, while the
    # chunks are exported as WAV; for compressed sources the chunks may come
    # out larger than the target - confirm this is acceptable.
    file_size_bytes = os.path.getsize(filename)
    estimated_chunk_duration_ms = estimate_chunk_duration(file_size_bytes, total_duration_ms, target_chunk_size_mb)

    chunks = []
    for i in range(0, total_duration_ms, estimated_chunk_duration_ms):
        chunk = audio[i:i + estimated_chunk_duration_ms]
        chunk_filename = f"{filename}_part{i}.wav"
        chunk.export(chunk_filename, format="wav")
        chunks.append(chunk_filename)
        print(f"Created chunk: {chunk_filename}")

    return chunks


# Function to format time for SRT file
def format_time(milliseconds):
    """Format a duration in milliseconds as an SRT timestamp ``HH:MM:SS,mmm``."""
    whole_seconds, ms_part = divmod(milliseconds, 1000)
    whole_minutes, sec_part = divmod(whole_seconds, 60)
    hour_part, min_part = divmod(whole_minutes, 60)

    clock = ":".join(f"{int(field):02}" for field in (hour_part, min_part, sec_part))
    return f"{clock},{int(ms_part):03}"

def split_text(text):
    """Split text into display lines without losing any words.

    The first line takes up to 4 words and the second up to 5. Any words
    beyond those nine are kept together on a final line instead of being
    dropped. Empty or whitespace-only text gives an empty list.
    """
    words = text.split()
    groups = (words[:4], words[4:9], words[9:])
    return [' '.join(group) for group in groups if group]

def format_srt_segment(counter, start_time, end_time, text):
    """Build one SRT entry: index line, time range line, then the text lines.

    The text is broken into lines by ``split_text`` and both times are
    rendered with ``format_time``. The entry ends with a blank line.
    """
    body = "\n".join(split_text(text))
    start_stamp = format_time(start_time)
    end_stamp = format_time(end_time)
    header = f"{counter}\n{start_stamp} --> {end_stamp}\n"
    return header + f"{body}\n\n"

def transcribe_file(filename, model_name="large"):
    """Transcribe a media file with Whisper and return the result.

    Args:
        filename: path of the file to transcribe.
        model_name: name passed to ``whisper.load_model``. Each model is
            loaded once and then reused, so transcribing several chunks does
            not reload the weights for every chunk.

    Returns:
        Whatever ``model.transcribe(filename)`` returns.
    """
    _, file_extension = os.path.splitext(filename)
    file_extension = file_extension.lower()

    # Same extension groups that the conversion helpers in this notebook accept
    audio_extensions = ('.wav', '.mp3', '.ogg', '.m4a')
    video_extensions = ('.mp4', '.avi', '.mov', '.mpeg')

    if file_extension in audio_extensions:
        print(f"Processing an audio file: {filename}")
    elif file_extension in video_extensions:
        print(f"Processing a video file: {filename}")
    else:
        print(f"Processing an unknown file type: {filename}")

    model = _get_whisper_model(model_name)
    return model.transcribe(filename)


# Loaded Whisper models keyed by model name; re-running this cell empties it.
_WHISPER_MODEL_CACHE = {}


def _get_whisper_model(model_name):
    """Return the Whisper model for model_name, loading it on first use only."""
    if model_name not in _WHISPER_MODEL_CACHE:
        _WHISPER_MODEL_CACHE[model_name] = whisper.load_model(model_name)
    return _WHISPER_MODEL_CACHE[model_name]

# **3. Select between YouTube URL or Upload Media File**

In [3]:
# Media source selector. No source is chosen at first, so every action
# control starts hidden; on_dropdown_change reveals the matching ones.
dropdown = widgets.Dropdown(
    options=[
        ('Select Option', None),
        ('Upload File', 'upload'),
        ('YouTube Video', 'youtube'),
    ],
    description='Action:',
)
upload_button = widgets.Button(
    description="Upload File",
    layout=widgets.Layout(visibility='hidden'),
)
youtube_input = widgets.Text(
    placeholder='Enter YouTube URL here',
    layout=widgets.Layout(visibility='hidden'),
)
download_button = widgets.Button(
    description="Download YouTube Video",
    layout=widgets.Layout(visibility='hidden'),
)
# Upload/download messages are rendered inside this output area
output = widgets.Output()

# Wire the controls to their handlers
dropdown.observe(on_dropdown_change, names='value')
upload_button.on_click(handle_upload_button_click)
download_button.on_click(handle_download_button_click)

# Render everything in a fixed order
display(dropdown, upload_button, youtube_input, download_button, output)

Dropdown(description='Action:', options=(('Select Option', None), ('Upload File', 'upload'), ('YouTube Video',…

Button(description='Upload File', layout=Layout(visibility='hidden'), style=ButtonStyle())

Text(value='', layout=Layout(visibility='hidden'), placeholder='Enter YouTube URL here')

Button(description='Download YouTube Video', layout=Layout(visibility='hidden'), style=ButtonStyle())

Output()

MoviePy - Writing audio in /content/Shivers (Lyric Video).wav



chunk:   0%|          | 0/4589 [00:00<?, ?it/s, now=None][A
chunk:   7%|▋         | 307/4589 [00:00<00:01, 3069.61it/s, now=None][A
chunk:  13%|█▎        | 614/4589 [00:00<00:01, 2789.35it/s, now=None][A
chunk:  20%|█▉        | 895/4589 [00:00<00:01, 2767.09it/s, now=None][A
chunk:  26%|██▌       | 1173/4589 [00:00<00:01, 2720.14it/s, now=None][A
chunk:  32%|███▏      | 1454/4589 [00:00<00:01, 2749.86it/s, now=None][A
chunk:  38%|███▊      | 1741/4589 [00:00<00:01, 2789.55it/s, now=None][A
chunk:  44%|████▍     | 2021/4589 [00:00<00:00, 2699.80it/s, now=None][A
chunk:  50%|████▉     | 2292/4589 [00:00<00:00, 2651.99it/s, now=None][A
chunk:  56%|█████▌    | 2558/4589 [00:00<00:00, 2634.30it/s, now=None][A
chunk:  61%|██████▏   | 2822/4589 [00:01<00:00, 2627.14it/s, now=None][A
chunk:  68%|██████▊   | 3114/4589 [00:01<00:00, 2712.99it/s, now=None][A
chunk:  74%|███████▍  | 3386/4589 [00:01<00:00, 2649.84it/s, now=None][A
chunk:  80%|████████  | 3687/4589 [00:01<00:00, 2754.

MoviePy - Done.
Conversion successful.


# **4. Transcribe and save SRT / TXT Files**

In [4]:
# Convert the media chosen in step 3 (stored in global_filename) to WAV
uploaded_filename = global_filename
wav_filename = convert_to_wav(uploaded_filename)

# Split the file into chunks if necessary. When no chunk list comes back
# (the file did not need splitting), transcribe the WAV file itself so that
# short recordings are not silently skipped.
chunk_filenames = split_wav_file(wav_filename) or [wav_filename]

# Transcribe each chunk
all_transcriptions = [transcribe_file(chunk) for chunk in chunk_filenames]

# Combine the transcriptions into one timeline
combined_result = combine_transcriptions(all_transcriptions, chunk_filenames)

# Format the combined transcriptions
formatted_srt = format_as_srt(combined_result['segments'])
formatted_text = format_as_text(combined_result['segments'])

# Save the formatted transcriptions to files
with open("transcription.srt", "w", encoding='utf-8') as srt_file:
    srt_file.write(formatted_srt)

with open("transcription.txt", "w", encoding='utf-8') as text_file:
    text_file.write(formatted_text)


Splitting: /content/Shivers (Lyric Video).wav
Frames per second: 44100, Channels: 2, Sample width: 2
Max frames per chunk: 5242880, Total frames: 9177210
Created chunk: /content/Shivers (Lyric Video).wav_chunk_0.wav, Frames: 5242880
Created chunk: /content/Shivers (Lyric Video).wav_chunk_5242880.wav, Frames: 3934330
Processing an audio file: /content/Shivers (Lyric Video).wav_chunk_0.wav



  0%|                                              | 0.00/2.88G [00:00<?, ?iB/s][A
  0%|                                     | 5.50M/2.88G [00:00<00:53, 57.6MiB/s][A
  0%|▏                                    | 12.4M/2.88G [00:00<00:46, 66.3MiB/s][A
  1%|▏                                    | 18.9M/2.88G [00:00<00:45, 66.8MiB/s][A
  1%|▎                                    | 27.0M/2.88G [00:00<00:41, 73.8MiB/s][A
  1%|▍                                    | 39.1M/2.88G [00:00<00:32, 92.7MiB/s][A
  2%|▌                                    | 48.0M/2.88G [00:00<00:37, 80.5MiB/s][A
  2%|▋                                    | 55.9M/2.88G [00:00<00:39, 76.0MiB/s][A
  2%|▊                                    | 66.8M/2.88G [00:00<00:34, 86.5MiB/s][A
  3%|▉                                    | 75.7M/2.88G [00:00<00:33, 88.5MiB/s][A
  3%|█                                    | 84.3M/2.88G [00:01<00:35, 84.3MiB/s][A
  3%|█▏                                   | 92.5M/2.88G [00:01<00:36, 81.9M

Processing an audio file: /content/Shivers (Lyric Video).wav_chunk_5242880.wav


TypeError: ignored

# **5. Download results as SRT / TXT**

In [19]:
import ipywidgets as widgets
import base64
from IPython.display import display, Javascript

def create_download_link(filename, content):
    """Return a Javascript object that downloads ``content`` as ``filename``.

    The text is base64-encoded into a ``data:`` URL; the script creates a
    temporary link element pointing at it, clicks the link and removes it.
    """
    encoded = base64.b64encode(content.encode()).decode()
    script = f"""
    var link = document.createElement('a');
    link.href = "data:text/plain;base64,{encoded}";
    link.download = "{filename}";
    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);
    """
    return Javascript(script)

# Function to handle download
def download_file(b):
    """Click handler: download the transcription in the selected format.

    Reads the format from ``download_dropdown`` and sends the SRT text for
    'SRT' or the plain text for anything else, named
    ``transcription.<format in lower case>``.
    """
    format_choice = download_dropdown.value
    filename = f"transcription.{format_choice.lower()}"

    # Anything other than SRT falls back to the plain-text transcript
    content = formatted_srt if format_choice == 'SRT' else formatted_text

    display(create_download_link(filename, content))

# Dropdown for selecting file format to download
download_dropdown = widgets.Dropdown(
    options=['SRT', 'TXT'],
    description='Download:',
    disabled=False,
)

# Button to trigger the download. It has its own name so that it does not
# replace `download_button`, the YouTube download button that
# on_dropdown_change shows and hides.
download_file_button = widgets.Button(description="Download File")

# Display the dropdown and button
display(download_dropdown, download_file_button)

# Bind the button click event to the download function
download_file_button.on_click(download_file)


Dropdown(description='Download:', options=('SRT', 'TXT'), value='SRT')

Button(description='Download File', style=ButtonStyle())

<IPython.core.display.Javascript object>