# Validate Watermarking

In [33]:
import os
import re
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch.nn.functional as F
from audioseal import AudioSeal
from tqdm import tqdm
import librosa
import torch
import pandas as pd
import numpy as np
import torchaudio
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Dataset locations, all relative to the notebook's working directory:
# the two audio folders, the transcription table, and the results folder.
watermarked_path = "../Dataset/Watermarked Audio"
unwatermarked_path = "../Dataset/Unwatermarked Audio"
transcription_path = "../Dataset/Transcriptions/transcriptions_complete.csv"
results_path = "../Dataset/Results"

In [3]:
# 1. Load the AudioSeal watermark detector checkpoint
detector = AudioSeal.load_detector("audioseal_detector_16bits")

# 2. Load the Whisper model used for transcription.
#    Half precision on the first CUDA device when one is available,
#    full precision on CPU otherwise.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3-turbo"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# Whisper and AudioSeal expect a sample rate of 16 kHz
target_sr = 16000

Device set to use cpu


In [4]:
# Collect the clip filenames from both folders, keeping only ".mp3" files
# whose name contains "audioseal".
unwatermarked_files = [
    name
    for name in os.listdir(unwatermarked_path)
    if name.endswith(".mp3") and "audioseal" in name
]

watermarked_files = [
    name
    for name in os.listdir(watermarked_path)
    if name.endswith(".mp3") and "audioseal" in name
]

In [5]:
# Helper to extract the ID from the filename
def extract_id(filename):
    """Return the run of digits that follows ``common_voice_en_`` in ``filename``.

    The ID is returned as a string. ``None`` is returned when the pattern
    does not occur anywhere in ``filename``.
    """
    found = re.search(r'common_voice_en_(\d+)', filename)
    if found is None:
        return None
    return found.group(1)

In [6]:
def _index_by_id(filenames):
    """Map each clip ID to its filename, skipping names that carry no ID.

    extract_id returns None for names without the expected pattern. Keeping
    those would store them all under one shared None key, and two unrelated
    files (one per folder) would then be paired as if they were the same clip.
    When several names share an ID, the last one wins.
    """
    indexed = {}
    for name in filenames:
        clip_id = extract_id(name)
        if clip_id is not None:
            indexed[clip_id] = name
    return indexed


# Build dicts by ID
un_dict = _index_by_id(unwatermarked_files)
w_dict = _index_by_id(watermarked_files)

# Find common IDs and build a dict with (un, w) tuples
matched = {id_: (un_dict[id_], w_dict[id_]) for id_ in un_dict.keys() & w_dict.keys()}

print(f"Audios to evaluate: {len(matched):0,.0f}")

Audios to evaluate: 14,124


In [None]:
# Find IDs already processed by scanning the numbered batch files left by
# earlier runs, and remember the highest batch number so new files continue
# the sequence.
os.makedirs(results_path, exist_ok=True)  # a fresh checkout has no results folder yet

processed_ids = []
last_file_num = 0
batch_file_pattern = re.compile(r'results_watermark_prob_(\d+)\.csv')
for file in os.listdir(results_path):
    found = batch_file_pattern.fullmatch(file)
    if found is None:
        # Not a numbered batch file (merged CSV, backup copy, ...): ignore it
        # rather than failing on a missing regex match.
        continue
    file_path = os.path.join(results_path, file)
    # Built-in max keeps the counter a plain int
    last_file_num = max(int(found.group(1)), last_file_num)
    temp = pd.read_csv(file_path, usecols=["id"])
    # IDs extracted from filenames are strings, so compare as strings
    processed_ids.extend(temp["id"].astype(str).tolist())

remaining_clips = list(matched.keys() - set(processed_ids))
# Guard the ratio: with no matched clips there is nothing to divide by
remaining_share = len(remaining_clips) / len(matched) if matched else 0.0
print(f"Remaining clips {len(remaining_clips):0,.0f} ({remaining_share:0.1%})")
print("Last file number: ", last_file_num)

Remaining clips 0 (0.0%)
Last file number:  16


In [8]:
# Score every remaining unwatermarked clip with the detector and checkpoint the
# scores to numbered CSV files, so an interrupted run can resume where it stopped.
save_every = 1000  # number of clips written per checkpoint file

all_results = []
# Inference only: turning autograd off avoids building a graph on every forward pass
with torch.no_grad():
    for n, i in tqdm(enumerate(remaining_clips), total=len(remaining_clips)):

        # Only the unwatermarked file of the (un, w) pair is scored here
        un = matched[i][0]
        # Load the unwatermarked audio file, resampled to the target rate
        un_path = os.path.join(unwatermarked_path, un)
        un_wav, sr = librosa.load(un_path, sr=target_sr)
        # Convert to a float32 tensor and add two leading dimensions
        # (presumably batch and channel, as the detector expects -- confirm against AudioSeal docs)
        un_wav_tensor = torch.tensor(un_wav, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
        # Detect the watermark; the second return value is not needed
        prob, _ = detector.detect_watermark(un_wav_tensor, sr)
        # Save result
        all_results.append([i, prob])

        # Flush after every `save_every` clips and once more for the final partial batch
        is_last_clip = n == len(remaining_clips) - 1
        if (n + 1) % save_every == 0 or is_last_clip:
            last_file_num += 1
            results_df = pd.DataFrame(all_results, columns=["id", "prob_w"])
            output_filename = f"results_watermark_prob_{last_file_num}.csv"
            results_df.to_csv(os.path.join(results_path, output_filename), index=False)
            all_results = []

0it [00:00, ?it/s]


In [10]:
# Merge every batch file into one de-duplicated table and persist it.
# Files are read in sorted order so that the row kept by drop_duplicates does
# not depend on the arbitrary order returned by os.listdir.
batch_frames = [
    pd.read_csv(os.path.join(results_path, file))
    for file in sorted(os.listdir(results_path))
    if file.startswith("results_watermark_prob_")
]
if batch_frames:
    # One concat over all frames instead of growing the table inside a loop
    all_results = pd.concat(batch_frames)
else:
    # No batch files yet: keep the expected columns so the steps below still work
    all_results = pd.DataFrame(columns=["id", "prob_w"])
all_results = all_results.drop_duplicates(subset=["id"])
all_results = all_results.reset_index(drop=True)
all_results.to_csv(os.path.join(results_path, "all_results_watermark_prob.csv"), index=False)
all_results.shape

(14124, 2)

In [43]:
# Summarize the detection scores. The statistics are shown as percentages, but
# "count" is a number of rows, not a probability, so it is formatted as a plain
# integer instead of being rendered as e.g. "1412400.00%".
prob_summary = all_results["prob_w"].describe()
prob_summary_fmt = prob_summary.apply(lambda x: f"{x:0.2%}")
prob_summary_fmt["count"] = f"{prob_summary['count']:,.0f}"
prob_summary_fmt

count    1412400.00%
mean           0.25%
std            0.82%
min            0.00%
25%            0.00%
50%            0.00%
75%            0.17%
max           28.43%
Name: prob_w, dtype: object