# Setup
- Upload the MP3 audio file(s) to Google Drive
- Change the runtime type: set the hardware accelerator to T4 GPU
- Install Whisper and mount Google Drive

# whisper-large-v3 (openai-whisper CLI)
- [https://github.com/openai/whisper](https://github.com/openai/whisper)

In [None]:
# Install the openai-whisper package directly from its GitHub repository.
!pip install git+https://github.com/openai/whisper.git

In [None]:
# Mount Google Drive
# The Drive is mounted at /content/drive; the transcription cell below reads
# its audio files from /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Define the input directory containing audio files
input_dir = '/content/drive/MyDrive/'  #@param {type: "string"}

# Iterate over audio files in the input directory
for audio_file in os.listdir(input_dir):
    if audio_file.endswith('.mp3'):  # Adjust the file extension as needed
        audio_path = os.path.join(input_dir, audio_file)

        # Transcribe the audio file
        !whisper "{audio_path}" --model large-v3 --output_dir "{input_dir}"

# whisper-large-v3
- https://huggingface.co/openai/whisper-large-v3

In [None]:
# Install necessary libraries
# 1) Upgrade pip itself.
!pip install --upgrade pip
# 2) Install torch and torchaudio 2.6.0 built for CUDA 11.8 from the PyTorch wheel index.
!pip install torch==2.6.0+cu118 torchaudio==2.6.0+cu118 --index-url https://download.pytorch.org/whl/cu118
# 3) Install or upgrade the Hugging Face / audio helper packages used by the transcription cell.
!pip install -U transformers librosa soundfile tqdm

In [None]:
# Mount Google Drive
# The Drive is mounted at /content/drive; the transcription cell below reads
# its audio files from /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# STEP 0: (Before importing transformers) disable torchvision usage inside transformers
# NOTE(review): this relies on transformers honouring the variable below —
# confirm against the installed transformers version.
import os
os.environ["TRANSFORMERS_NO_TORCHVISION"] = "1"

# STEP 1: Imports
import torch
import librosa
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
from tqdm import tqdm

# STEP 2: Set device
# Use the first CUDA GPU with half precision when available; otherwise fall
# back to the CPU with full precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(f"\nDevice set to use {device}")

# STEP 3: Load model & processor
model_id = "openai/whisper-large-v3"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
).to(device)

processor = AutoProcessor.from_pretrained(model_id)

# STEP 4: Parameters
chunk_duration = 30.0  # seconds
generate_kwargs = {
    "max_length": 448,
    # `model.generate` expects a boolean here; the string "word" is only
    # understood by the high-level ASR pipeline. The per-line times in the
    # transcript come from the chunk boundaries, not from these timestamps.
    "return_timestamps": True,
    "do_sample": False,  # greedy decoding
}

# STEP 5: Input directory
input_dir = '/content/drive/MyDrive/'  # @param {type: "string"}

# STEP 6: Find audio files
# Extensions are matched case-insensitively; sorted() makes the processing
# order reproducible because os.listdir() returns entries in arbitrary order.
audio_files = sorted(
    os.path.join(input_dir, f)
    for f in os.listdir(input_dir)
    if f.lower().endswith(('.mp3', '.wav', '.flac', '.m4a'))
)

# STEP 7: Transcribe
# For every audio file: load it with librosa at sr=16000, walk through it in
# consecutive windows of `chunk_duration` seconds, run the model on each
# window, and write one "[start --> end] text" line per window to
# "<audio file>.txt" next to the source file.
for audio_path in tqdm(audio_files, desc="Transcribing audio files"):
    print(f"\n🗂️ Transcribing: {audio_path}")

    waveform, sample_rate = librosa.load(audio_path, sr=16000)
    duration_sec = librosa.get_duration(y=waveform, sr=sample_rate)
    n_chunks = int(np.ceil(duration_sec / chunk_duration))

    print(f"📏 Total duration: {duration_sec:.2f} sec → {n_chunks} chunks of {chunk_duration:.0f} sec")

    transcript_lines = []

    for chunk_idx in tqdm(range(n_chunks), desc="Chunks", leave=False):
        # Window boundaries in seconds; the last window is clipped to the
        # end of the recording.
        window_start = chunk_idx * chunk_duration
        window_stop = (chunk_idx + 1) * chunk_duration
        clipped_stop = min(window_stop, duration_sec)

        # Same boundaries expressed as sample indices into the waveform.
        first_sample = int(window_start * sample_rate)
        last_sample = int(min(window_stop * sample_rate, len(waveform)))
        segment = waveform[first_sample:last_sample]

        # Turn the raw samples into model input features on the right
        # device / dtype.
        features = processor(segment, sampling_rate=16000, return_tensors="pt")
        input_features = features.input_features.to(device, dtype=torch_dtype)

        with torch.inference_mode():
            token_ids = model.generate(input_features=input_features, **generate_kwargs)

        # One sequence per window, so only the first decoded string is used.
        texts = processor.batch_decode(token_ids, skip_special_tokens=True)
        text = texts[0].strip()

        entry = f"[{window_start:06.2f} --> {clipped_stop:06.2f}] {text}"
        print(entry)
        transcript_lines.append(entry)

    transcript_path = audio_path + ".txt"
    transcript_body = "\n".join(transcript_lines)
    with open(transcript_path, "w", encoding="utf-8") as out_file:
        out_file.write(transcript_body)
    print(f"\n✅ Saved transcript to: {transcript_path}\n")