# Text-To-Speech (TTS)

In [1]:
# !huggingface-cli download fishaudio/fish-speech-1.5 --local-dir checkpoints/fish-speech-1.5/

In [22]:
import os
import pandas as pd
from IPython.display import Audio, display
from audioseal import AudioSeal
import librosa
import torch
from tqdm import tqdm

In [2]:
# Read the transcription table and order it from the highest to the lowest
# "confidence" value. The index is rebuilt afterwards so that rows are
# addressable by position (0, 1, 2, ...) in the sorted order.
transcriptions = (
    pd.read_csv("../Dataset/Transcriptions/transcriptions_complete.csv")
    .sort_values("confidence", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Import the detector
# Loads the AudioSeal detector identified by "audioseal_detector_16bits".
# `detector` is used by the evaluation loop below, which calls
# `detector.detect_watermark(...)` on each watermarked and re-synthesised clip.
detector = AudioSeal.load_detector("audioseal_detector_16bits")

In [None]:
# Parameters
watermark_path = "../Dataset/Watermarked Audio"
target_sr = 16000
device = "cpu"

# Save results
results_unwater_audioseal = []


for i in tqdm(len(transcriptions)):
    # Find the name of the clip
    clip_path = transcriptions.loc[i, "clip"]
    # Retrieve the transcription of the clip
    transcript_i = transcriptions.loc[i, "transcription"]
    # Find the watermarked clip associated with the original clip
    clip_water_path = [i for i in os.listdir(watermark_path) if clip_path[:-4] in i][0]

    # Import the watermarked audio
    # audio_water = Audio(filename= os.path.join(watermark_path, clip_water_path))

    # Detect the probability of watermark
    # Load the watermarked audio file
    wav_watermarked, sr = librosa.load(os.path.join(watermark_path, clip_water_path), sr = target_sr)
    # Convert to a PyTorch tensor
    wav_watermarked_tensor = torch.tensor(wav_watermarked).unsqueeze(0).unsqueeze(0)  # Shape: (1, 1, samples)
    # Detect the watermark
    result_water, _ = detector.detect_watermark(wav_watermarked_tensor, sr)

    # Clone the audio
    # Enter the path to the audio file here
    src_audio = os.path.join(watermark_path, clip_water_path)

    !python fish-speech-main/fish_speech/models/vqgan/inference.py \
        -i "{src_audio}" \
        --checkpoint-path "checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
        --device "{device}"

    # Change the name of the file
    clip_unwater_path = clip_water_path.replace("watermarked", "unwatermarked")
    os.rename("fake.wav", clip_unwater_path)

    # Find the watermark
    wav_unwater, sr = librosa.load(clip_unwater_path, sr = target_sr)
    # Convert to a PyTorch tensor
    wav_unwater_tensor = torch.tensor(wav_unwater).unsqueeze(0).unsqueeze(0)  # Shape: (1, 1, samples)
    # Detect the watermark
    result_unwater, _ = detector.detect_watermark(wav_unwater_tensor, sr)

    results_unwater_audioseal.append([clip_water_path, result_water, clip_unwater_path, result_unwater])

In [68]:
!cd "/Users/lucas/Library/CloudStorage/OneDrive-UniversityofWaterloo/2025-W/CS858 - Security & Privacy in Machine Learning/Project/Code/fish-speech-main" && \
export PYTHONPATH="${PYTHONPATH}:/Users/lucas/Library/CloudStorage/OneDrive-UniversityofWaterloo/2025-W/CS858 - Security & Privacy in Machine Learning/Project/Code/fish-speech-main" && \
    python fish_speech/models/text2semantic/inference.py \
    --text "{transcript_i}" \
    --prompt-text "{transcript_i}" \
    --prompt-tokens "/Users/lucas/Library/CloudStorage/OneDrive-UniversityofWaterloo/2025-W/CS858 - Security & Privacy in Machine Learning/Project/Code/fake.npy" \
    --checkpoint-path "/Users/lucas/Library/CloudStorage/OneDrive-UniversityofWaterloo/2025-W/CS858 - Security & Privacy in Machine Learning/Project/Code/checkpoints/fish-speech-1.5" \
    --num-samples 2 \
    --device "cpu"
    # --compile

[32m2025-03-16 21:32:06.471[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m1056[0m - [1mLoading model ...[0m
[32m2025-03-16 21:32:14.409[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m681[0m - [1mRestored model from checkpoint[0m
[32m2025-03-16 21:32:14.409[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m687[0m - [1mUsing DualARTransformer[0m
[32m2025-03-16 21:32:14.415[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m1070[0m - [1mTime to load model: 7.94 seconds[0m
[32m2025-03-16 21:32:14.430[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_long[0m:[36m788[0m - [1mEncoded text: {transcript_i}[0m
[32m2025-03-16 21:32:14.430[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_long[0m:[36m806[0m - [1mGenerating sentence 1/1 of sample 1/2[0m
  0%|                                         | 24/8055 [00:03<16:51,  7.94it/s]
[32m2025-03-16 21:32:21.347[0m | [1mINFO    [0m | [36m_

In [76]:
!cd "/Users/lucas/Library/CloudStorage/OneDrive-UniversityofWaterloo/2025-W/CS858 - Security & Privacy in Machine Learning/Project/Code/fish-speech-main" && \
python fish_speech/models/vqgan/inference.py \
    -i '/Users/lucas/Library/CloudStorage/OneDrive-UniversityofWaterloo/2025-W/CS858 - Security & Privacy in Machine Learning/Project/Code/fish-speech-main/temp/codes_1.npy' \
    --checkpoint-path '/Users/lucas/Library/CloudStorage/OneDrive-UniversityofWaterloo/2025-W/CS858 - Security & Privacy in Machine Learning/Project/Code/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth' \
    --device "cpu"

19510.77s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
[32m2025-03-16 21:36:57.043[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m46[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-03-16 21:36:57.043[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m99[0m - [1mProcessing precomputed indices from /Users/lucas/Library/CloudStorage/OneDrive-UniversityofWaterloo/2025-W/CS858 - Security & Privacy in Machine Learning/Project/Code/fish-speech-main/temp/codes_1.npy[0m
[32m2025-03-16 21:36:58.087[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m113[0m - [1mGenerated audio of shape torch.Size([1, 1, 157696]), equivalent to 3.58 seconds from 77 features, features/second: 21.53[0m
[32m2025-03-16 21:36:58.089[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m120[0m - [1mSaved audio to fake.wav[0m


In [77]:
# Play the audio file written into fish-speech-main/ by the decoding cell above.
# `os`, `Audio` and `display` come from the imports cell at the top of the
# notebook; the path is relative to the notebook's working directory rather
# than a user-specific absolute path.
audio = Audio(filename=os.path.join("fish-speech-main", "fake.wav"))
display(audio)