# Speech-to-Text

In [2]:
import os
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import librosa
import IPython.display as ipd
import pandas as pd
from tqdm import tqdm
import torch.nn.functional as F
import numpy as np
import torchaudio
import re

In [7]:
clips_path = "../Dataset/cv-corpus-20.0-delta-2024-12-06/en/clips"
output_dir = "../Dataset/Transcriptions"

# Keep only the MP3 recordings found in the clips folder.
clips = [name for name in os.listdir(clips_path) if name.endswith(".mp3")]
len(clips)

27408

In [None]:
# Find clips without transcription.
# Every per-batch CSV in output_dir is read and merged in a single concat.
# The merged export ("transcriptions_complete.csv") is skipped so its rows are
# not counted twice, and non-CSV entries are ignored so stray files in the
# folder cannot break read_csv.
transcript_files = os.listdir(output_dir)
batch_frames = [
    pd.read_csv(os.path.join(output_dir, file))
    for file in transcript_files
    if file.endswith(".csv") and file != "transcriptions_complete.csv"
]

if batch_frames:
    df = pd.concat(batch_frames, axis=0, ignore_index=True)
else:
    # First run: nothing transcribed yet. Keep the columns the batch files use
    # so the "clip" lookup below still works.
    df = pd.DataFrame(columns=["clip", "transcription", "confidence"])

transcripted_clips = df["clip"].unique()
# Sorted so the processing order is reproducible across kernel restarts.
remaining_clips = sorted(set(clips) - set(transcripted_clips))

# Fraction of clips still to be transcribed (0.0 when there are no clips).
len(remaining_clips) / len(clips) if clips else 0.0

0.0

In [8]:
# Persist the merged transcriptions as one CSV inside the output folder.
complete_csv_path = os.path.join(output_dir, "transcriptions_complete.csv")
df.to_csv(complete_csv_path, index=False)

In [4]:
# Index used to name the next "transcriptions_<n>.csv" file.
# File names without any digits (e.g. "transcriptions_complete.csv") are
# skipped instead of contributing None, and numbering starts at 1 when no
# numbered file exists yet.
number_matches = (re.search(r"\d+", name) for name in transcript_files)
numbers = [int(match.group()) for match in number_matches if match]
batch_num = max(numbers, default=0) + 1

## Whisper

In [5]:
# Use the first GPU in half precision when CUDA is available; otherwise run
# on the CPU in full precision.
cuda_available = torch.cuda.is_available()
device = "cuda:0" if cuda_available else "cpu"
torch_dtype = torch.float16 if cuda_available else torch.float32

model_id = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# High-level ASR pipeline built from the same model and processor parts.
asr_pipeline_kwargs = {
    "model": model,
    "tokenizer": processor.tokenizer,
    "feature_extractor": processor.feature_extractor,
    "torch_dtype": torch_dtype,
    "device": device,
}
pipe = pipeline("automatic-speech-recognition", **asr_pipeline_kwargs)

# Whisper expects a sample rate of 16 kHz
target_sr = 16000
transcriptions = []

Device set to use cuda:0


In [6]:
batch_size = 1  # Adjust based on your GPU memory
save_every_n_batches = 50  # Write a CSV after this many processed batches
transcriptions = []
batches_since_save = 0


def _mean_token_confidence(step_scores, num_sequences):
    """Return one confidence value per sequence of the batch.

    Args:
        step_scores: The per-step score tensors from ``outputs.scores``.
            Assumed to be one ``(batch, vocab)`` tensor per generated token
            (greedy decoding) -- TODO confirm if beam search is ever enabled.
        num_sequences: Number of clips in the batch; used for the fallback.

    Returns:
        A list with, for every sequence, the mean over decoding steps of the
        highest token probability, or ``[None] * num_sequences`` when no
        scores were returned.
    """
    if not step_scores:
        return [None] * num_sequences
    # Softmax in float32 for numerical stability; take the max over the vocab
    # axis only, so each sequence keeps its own value -> shape (steps, batch).
    top_probs = torch.stack(
        [F.softmax(logits.float(), dim=-1).max(dim=-1).values for logits in step_scores]
    )
    # NOTE(review): with batch_size > 1, sequences that finish early keep
    # contributing their padding steps to the mean -- verify if that matters.
    return top_probs.mean(dim=0).tolist()


def _write_batch_file(rows, file_index):
    """Save (clip, transcription, confidence) rows as transcriptions_<file_index>.csv."""
    output_file = os.path.join(output_dir, f"transcriptions_{file_index}.csv")
    pd.DataFrame(rows, columns=["clip", "transcription", "confidence"]).to_csv(
        output_file, index=False
    )


# One progress bar for the whole run, advanced by the number of clips done.
progress = tqdm(total=len(remaining_clips))
try:
    while remaining_clips:
        batch_clips = remaining_clips[:batch_size]  # Take a batch

        batch_audio = []
        for clip in batch_clips:
            wav, sr = torchaudio.load(os.path.join(clips_path, clip))

            # Convert to MONO (ensure single channel)
            if wav.shape[0] > 1:
                wav = wav.mean(dim=0, keepdim=True)  # Convert stereo -> mono

            # Resample to 16 kHz (required by Whisper)
            wav = torchaudio.transforms.Resample(sr, target_sr)(wav)

            # Drop the channel dim -> 1-D array. No manual padding is done
            # here: the feature extractor zero-pads every clip itself.
            batch_audio.append(wav.squeeze(0).cpu().numpy())

        # Cast the features to the dtype the model was loaded with
        # (float16 on GPU, float32 on CPU) so inputs and weights match.
        batch_input_features = processor.feature_extractor(
            batch_audio, sampling_rate=target_sr, return_tensors="pt"
        ).input_features.to(device, dtype=torch_dtype)

        # Generate transcriptions; language/task replace the deprecated
        # forced_decoder_ids prompt.
        with torch.no_grad():
            outputs = model.generate(
                batch_input_features,
                return_dict_in_generate=True,
                output_scores=True,
                language="en",
                task="transcribe",
            )

        # Decode results
        batch_transcriptions = processor.tokenizer.batch_decode(
            outputs.sequences, skip_special_tokens=True
        )

        # Compute one confidence score per clip
        batch_avg_confidence = _mean_token_confidence(outputs.scores, len(batch_clips))

        transcriptions.extend(zip(batch_clips, batch_transcriptions, batch_avg_confidence))

        # Remove processed clips so a re-run of this cell resumes where it stopped
        remaining_clips = remaining_clips[batch_size:]
        progress.update(len(batch_clips))
        batches_since_save += 1

        # Save every N batches
        if batches_since_save >= save_every_n_batches:
            _write_batch_file(transcriptions, batch_num)
            transcriptions = []
            batch_num += 1
            batches_since_save = 0
finally:
    progress.close()
    # Flush whatever is pending: the final partial chunk on success, or the
    # finished batches if the loop was interrupted or raised.
    if transcriptions:
        _write_batch_file(transcriptions, batch_num)
        transcriptions = []
        batch_num += 1

0it [00:00, ?it/s]
