# STT Cloned Speech

In [31]:
import os
import re
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch.nn.functional as F
from audioseal import AudioSeal
from tqdm import tqdm
import librosa
import torch
import pandas as pd
import numpy as np
import torchaudio

In [32]:
# Input audio folders
unwatermarked_path = "../Dataset/Unwatermarked Audio"
watermarked_path = "../Dataset/Watermarked Audio"

# CSV with the transcriptions of the original clips
transcription_path = "../Dataset/Transcriptions/transcriptions_complete.csv"

# Folder where the result CSVs are read from and written to
results_path = "../Dataset/Results"

In [33]:
# Whisper and AudioSeal expect a sample rate of 16 kHz
target_sr = 16000

# 1. Load the AudioSeal watermark detector
detector = AudioSeal.load_detector("audioseal_detector_16bits")

# 2. Load the transcription model: half precision on GPU, full precision on CPU
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3-turbo"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor bundles the tokenizer and the feature extractor for the model
processor = AutoProcessor.from_pretrained(model_id)

# Ready-made ASR pipeline built on the model and processor loaded above
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

Device set to use cuda:0


In [34]:
# List the MP3 clips in each folder whose name contains "audioseal"
unwatermarked_files = [
    name
    for name in os.listdir(unwatermarked_path)
    if name.endswith(".mp3") and "audioseal" in name
]

watermarked_files = [
    name
    for name in os.listdir(watermarked_path)
    if name.endswith(".mp3") and "audioseal" in name
]

In [35]:
# Helper to extract the ID from the filename
def extract_id(filename):
    """Return the digits that follow ``common_voice_en_`` in ``filename``.

    The ID is returned as a string. ``None`` is returned when the pattern
    does not occur anywhere in ``filename``.
    """
    found = re.search(r'common_voice_en_(\d+)', filename)
    if found is None:
        return None
    return found.group(1)

In [36]:
def _index_by_id(filenames):
    """Map clip ID -> filename, skipping names that carry no clip ID.

    ``extract_id`` returns ``None`` for names without a Common Voice ID.
    Keeping those would collapse every such file onto the single key ``None``
    and later pair an unrelated unwatermarked/watermarked file as one clip.
    If several names share an ID, the last one listed wins.
    """
    indexed = {}
    for name in filenames:
        clip_id = extract_id(name)
        if clip_id is not None:
            indexed[clip_id] = name
    return indexed


# Build dicts by ID
un_dict = _index_by_id(unwatermarked_files)
w_dict = _index_by_id(watermarked_files)

# Find common IDs and build a dict with (un, w) tuples.
# Sorting makes the clip order reproducible: iteration order of a set of
# strings can differ from one kernel session to the next.
common_ids = sorted(un_dict.keys() & w_dict.keys())
matched = {id_: (un_dict[id_], w_dict[id_]) for id_ in common_ids}

print(f"Audios to evaluate: {len(matched):0,.0f}")

Audios to evaluate: 14,124


In [40]:
# Transcriptions of the original files
transcriptions = pd.read_csv(transcription_path)
# Create ID
transcriptions["id"] = transcriptions["clip"].apply(extract_id)

# Make sure the results folder exists so a first run does not fail on listdir
os.makedirs(results_path, exist_ok=True)

# Find IDs already processed.
# Only files named exactly results_transcription_<number>.csv are partial
# results; anything else in the folder is ignored instead of crashing the cell.
result_file_pattern = re.compile(r'results_transcription_(\d+)\.csv')
processed_ids = []
last_file_num = 0
for file in os.listdir(results_path):
    file_match = result_file_pattern.fullmatch(file)
    if file_match is None:
        continue
    last_file_num = max(int(file_match.group(1)), last_file_num)
    # Read IDs as text so they compare equal to the string keys of `matched`
    # (parsing them as integers would drop any leading zeros).
    temp = pd.read_csv(os.path.join(results_path, file), usecols=["id"], dtype={"id": str})
    processed_ids.extend(temp["id"].tolist())

# Keep the order of `matched` for the clips that still need processing
already_processed = set(processed_ids)
remaining_clips = [id_ for id_ in matched if id_ not in already_processed]

# Guard the percentage against an empty `matched`
remaining_share = len(remaining_clips) / len(matched) if matched else 0.0
print(f"Remaining clips {len(remaining_clips):0,.0f} ({remaining_share:0.1%})")
print("Last file number: ", last_file_num)

Remaining clips 0 (0.0%)
Last file number:  36


In [38]:
# Configuration
batch_size = 2**4  # Adjust based on your GPU memory
save_every = 25    # Write accumulated results to disk every N batches


def _sequence_confidences(scores, eos_token_id=None):
    """Return one mean top-token probability per sequence in the batch.

    Parameters
    ----------
    scores : sequence of tensors or None
        Per-step logits from ``generate(output_scores=True)``. Each entry is
        assumed to have shape (batch, vocab) -- TODO confirm for the installed
        transformers version and decoding mode (beam search differs).
    eos_token_id : int or None
        When given, steps after the first step whose most likely token is
        ``eos_token_id`` are excluded, so padding steps of sequences that
        finished early do not affect the mean. This matches the emitted tokens
        only under greedy decoding.

    Returns
    -------
    list of float
        One value per row of the score tensors; empty if ``scores`` is empty.
    """
    if not scores:
        return []

    step_probs, step_tokens = [], []
    for step_logits in scores:
        # Reduce over the vocabulary only, keeping one value per sequence
        probs, tokens = F.softmax(step_logits.float(), dim=-1).max(dim=-1)
        step_probs.append(probs)
        step_tokens.append(tokens)
    top_probs = torch.stack(step_probs, dim=1)    # (batch, steps)
    top_tokens = torch.stack(step_tokens, dim=1)  # (batch, steps)

    valid = torch.ones_like(top_probs)
    if eos_token_id is not None:
        is_eos = (top_tokens == eos_token_id).long()
        # Number of EOS predictions strictly before each step; zero means the
        # sequence had not finished yet (the EOS step itself is still counted).
        valid = ((is_eos.cumsum(dim=1) - is_eos) == 0).to(top_probs.dtype)

    # The first step is always valid, so the denominator is never zero
    return ((top_probs * valid).sum(dim=1) / valid.sum(dim=1)).tolist()


# The language/task prompt is identical for every batch, so build it once
forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
eos_token_id = processor.tokenizer.eos_token_id

# Store results incrementally
results_batch = []     # Stores intermediate batch results

# Work on a snapshot: `remaining_clips` is only trimmed once results are on
# disk, so an interrupted run can be resumed without losing unsaved clips.
clips_todo = list(remaining_clips)
batch_starts = range(0, len(clips_todo), batch_size)

# Iterate over remaining clips in batches
for batch_num, start in enumerate(tqdm(batch_starts), start=1):
    batch_ids = clips_todo[start:start + batch_size]  # Take a batch of clip IDs
    batch_audio = []

    # --- Step 1: Load and normalise the unwatermarked audio ---
    for id_ in batch_ids:
        un, _ = matched[id_]
        clip_path = os.path.join(unwatermarked_path, un)

        # Load audio using torchaudio for Whisper compatibility
        wav, sr = torchaudio.load(clip_path)

        # Convert to mono if needed
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)

        # Resample to target sampling rate
        wav = torchaudio.transforms.Resample(sr, target_sr)(wav)

        # --- Step 2: Add audio to batch for transcription ---
        batch_audio.append(wav)

    # --- Step 3: Pad audio to equal length for batch processing ---
    max_len = max(wav.shape[1] for wav in batch_audio)
    batch_padded = [F.pad(wav, (0, max_len - wav.shape[1])) for wav in batch_audio]
    batch_tensor = torch.stack(batch_padded).squeeze(1)  # (batch, time)

    # --- Step 4: Convert to Whisper input format (list of numpy arrays) ---
    batch_np = [wav.cpu().numpy() for wav in batch_tensor]
    # Cast to the dtype the model was loaded with (float32 on CPU, float16 on
    # GPU); a hard-coded float16 would not match the model weights on CPU.
    input_features = processor.feature_extractor(
        batch_np, sampling_rate=target_sr, return_tensors="pt"
    ).input_features.to(device, dtype=torch_dtype)

    # --- Step 5: Generate transcriptions with Whisper ---
    with torch.no_grad():
        outputs = model.generate(
            input_features,
            return_dict_in_generate=True,
            output_scores=True,
            forced_decoder_ids=forced_decoder_ids,
        )

    # Decode the generated tokens
    decoded = processor.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)

    # --- Step 6: Compute one confidence score per clip ---
    confidences = _sequence_confidences(outputs.scores, eos_token_id)
    if len(confidences) != len(batch_ids):
        # No scores, or a layout that does not map one row to one clip:
        # record "unknown" rather than letting zip() silently drop clips.
        confidences = [None] * len(batch_ids)

    # --- Step 7: Store results in memory ---
    for id_, transcription, confidence in zip(batch_ids, decoded, confidences):
        results_batch.append({
            "id": id_,
            "unwatermarked_transcription": transcription,
            "unwatermarked_confidence": confidence,
        })

    # --- Step 8: Save batch results every N batches or at the end ---
    if (batch_num % save_every == 0) or (batch_num == len(batch_starts)):
        last_file_num += 1
        batch_df = pd.DataFrame(results_batch)
        output_filename = f"results_transcription_{last_file_num}.csv"
        batch_df.to_csv(os.path.join(results_path, output_filename), index=False)
        results_batch = []  # Reset batch storage
        # Everything up to and including this batch is now on disk
        remaining_clips = clips_todo[start + batch_size:]

100%|██████████| 846/846 [15:19<00:00,  1.09s/it]


In [39]:
# Merge every partial result file into a single table.
# Files are read in sorted name order so that, when an ID appears in more than
# one file, the row kept by drop_duplicates is the same on every run
# (os.listdir returns names in arbitrary order).
result_files = sorted(
    file for file in os.listdir(results_path)
    if file.startswith("results_transcription_")
)

# Read all parts first and concatenate once, rather than growing a DataFrame
# inside the loop.
result_parts = [pd.read_csv(os.path.join(results_path, file)) for file in result_files]
all_results = (
    pd.concat(result_parts, ignore_index=True)
    .drop_duplicates(subset=["id"])
    .reset_index(drop=True)
)
all_results.to_csv(os.path.join(results_path, "all_results_transcription.csv"), index=False)
all_results.shape

(14124, 4)